diff --git a/.DS_Store b/.DS_Store index 15f707a1859d2244436a931beff7b8f9109b1916..e3a7dce1816d0b828ff67eb2cb5fbc634b33542e 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/handler.py b/handler.py index d273937d9f65c663f520e69bcf609fd62099a4f8..4cb67880080d5282085dd8950a5223c2fab07189 100644 --- a/handler.py +++ b/handler.py @@ -14,6 +14,10 @@ from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel from huggingface_hub import hf_hub_download +import sys +root_local = './' +sys.path.insert(0, root_local) + from insightface.app import FaceAnalysis from style_template import styles @@ -48,7 +52,7 @@ class EndpointHandler(): # providers=["CPUExecutionProvider"], # ) self.app = FaceAnalysis( - name="antelopev2", + name="buffalo_l", root="./", providers=["CPUExecutionProvider"], ) diff --git a/insightface/.gitignore b/insightface/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4f61b8cbb5309a14e86d03eaf0e3c1ad3c451a0d --- /dev/null +++ b/insightface/.gitignore @@ -0,0 +1,103 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +.DS_Store diff --git a/insightface/CODE_OF_CONDUCT.md b/insightface/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..a1d657f0068040ecb5a6ad19873f7e54bb702cea --- /dev/null +++ b/insightface/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +contact@insightface.ai. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/insightface/README.md b/insightface/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8266b3ab756e80c619a736ab1e21083bda17a62a --- /dev/null +++ b/insightface/README.md @@ -0,0 +1,258 @@ + +# InsightFace: 2D and 3D Face Analysis Project + +
+ +
+ +InsightFace project is mainly maintained By [Jia Guo](mailto:guojia@gmail.com?subject=[GitHub]%20InsightFace%20Project) and [Jiankang Deng](https://jiankangdeng.github.io/). + +For all main contributors, please check [contributing](#contributing). + +## License + +The code of InsightFace is released under the MIT License. There is no limitation for both academic and commercial usage. + +The training data containing the annotation (and the models trained with these data) are available for non-commercial research purposes only. + +Both manual-downloading models from our github repo and auto-downloading models with our [python-library](python-package) follow the above license policy(which is for non-commercial research purposes only). + +## Top News + +**`2023-08-08`**: We released the implementation of [Generalizing Gaze Estimation with Weak-Supervision from Synthetic Views](https://arxiv.org/abs/2212.02997) at [reconstruction/gaze](reconstruction/gaze). + +**`2023-05-03`**: We have launched the ongoing version of wild face anti-spoofing challenge. See details [here](https://github.com/deepinsight/insightface/tree/master/challenges/cvpr23-fas-wild#updates). + +**`2023-04-01`**: We move the swapping demo to Discord bot, which support editing on Midjourney generated images, see detail at [web-demos/swapping_discord](web-demos/swapping_discord). + +**`2023-02-13`**: We launch a large scale in the wild face anti-spoofing challenge on CVPR23 Workshop, see details at [challenges/cvpr23-fas-wild](challenges/cvpr23-fas-wild). + +**`2022-11-28`**: Single line code for facial identity swapping in our python packge ver 0.7, please check the example [here](examples/in_swapper). + +**`2022-10-28`**: [MFR-Ongoing](http://iccv21-mfr.com) website is refactored, please create issues if there's any bug. 
+ +**`2022-09-22`**: Now we have [web-demos](web-demos): [face-localization](http://demo.insightface.ai:7007/), [face-recognition](http://demo.insightface.ai:7008/), and [face-swapping](http://demo.insightface.ai:7009/). + +**`2022-08-12`**: We achieved Rank-1st of +[Perspective Projection Based Monocular 3D Face Reconstruction Challenge](https://tianchi.aliyun.com/competition/entrance/531961/introduction) +of [ECCV-2022 WCPA Workshop](https://sites.google.com/view/wcpa2022), [paper](https://arxiv.org/abs/2208.07142) and [code](reconstruction/jmlr). + +**`2022-03-30`**: [Partial FC](https://arxiv.org/abs/2203.15565) accepted by CVPR-2022. + +**`2022-02-23`**: [SCRFD](detection/scrfd) accepted by [ICLR-2022](https://iclr.cc/Conferences/2022). + +**`2021-11-30`**: [MFR-Ongoing](challenges/mfr) challenge launched(same with IFRT), which is an extended version of [iccv21-mfr](challenges/iccv21-mfr). + +**`2021-10-29`**: We achieved 1st place on the [VISA track](https://pages.nist.gov/frvt/plots/11/visa.html) of [NIST-FRVT 1:1](https://pages.nist.gov/frvt/html/frvt11.html) by using Partial FC (Xiang An, Jiankang Deng, Jia Guo). + +**`2021-10-11`**: [Leaderboard](https://insightface.ai/mfr21) of [ICCV21 - Masked Face Recognition Challenge](challenges/iccv21-mfr) released. Video: [Youtube](https://www.youtube.com/watch?v=lL-7l5t6x2w), [Bilibili](https://www.bilibili.com/video/BV15b4y1h79N/). + +**`2021-06-05`**: We launch a [Masked Face Recognition Challenge & Workshop](challenges/iccv21-mfr) on ICCV 2021. + + + +## Introduction + +[InsightFace](https://insightface.ai) is an open source 2D&3D deep face analysis toolbox, mainly based on PyTorch and MXNet. + +Please check our [website](https://insightface.ai) for detail. + +The master branch works with **PyTorch 1.6+** and/or **MXNet=1.6-1.8**, with **Python 3.x**. 
+ +InsightFace efficiently implements a rich variety of state of the art algorithms of face recognition, face detection and face alignment, which optimized for both training and deployment. + +## Quick Start + +Please start with our [python-package](python-package/), for testing detection, recognition and alignment models on input images. + + +### ArcFace Video Demo + + +[](https://www.youtube.com/watch?v=y-D1tReryGA&t=81s) + + +Please click the image to watch the Youtube video. For Bilibili users, click [here](https://www.bilibili.com/video/av38041494?from=search&seid=11501833604850032313). + + + +## Projects + +The [page](https://insightface.ai/projects) on InsightFace website also describes all supported projects in InsightFace. + +You may also interested in some [challenges](https://insightface.ai/challenges) hold by InsightFace. + + + +## Face Recognition + +### Introduction + +In this module, we provide training data, network settings and loss designs for deep face recognition. + +The supported methods are as follows: + +- [x] [ArcFace_mxnet (CVPR'2019)](recognition/arcface_mxnet) +- [x] [ArcFace_torch (CVPR'2019)](recognition/arcface_torch) +- [x] [SubCenter ArcFace (ECCV'2020)](recognition/subcenter_arcface) +- [x] [PartialFC_mxnet (CVPR'2022)](recognition/partial_fc) +- [x] [PartialFC_torch (CVPR'2022)](recognition/arcface_torch) +- [x] [VPL (CVPR'2021)](recognition/vpl) +- [x] [Arcface_oneflow](recognition/arcface_oneflow) +- [x] [ArcFace_Paddle (CVPR'2019)](recognition/arcface_paddle) + +Commonly used network backbones are included in most of the methods, such as IResNet, MobilefaceNet, MobileNet, InceptionResNet_v2, DenseNet, etc.. + + +### Datasets + +The training data includes, but not limited to the cleaned MS1M, VGG2 and CASIA-Webface datasets, which were already packed in MXNet binary format. Please [dataset](recognition/_datasets_) page for detail. 
+ +### Evaluation + +We provide standard IJB and Megaface evaluation pipelines in [evaluation](recognition/_evaluation_) + + +### Pretrained Models + +**Please check [Model-Zoo](https://github.com/deepinsight/insightface/wiki/Model-Zoo) for more pretrained models.** + +### Third-party Re-implementation of ArcFace + +- TensorFlow: [InsightFace_TF](https://github.com/auroua/InsightFace_TF) +- TensorFlow: [tf-insightface](https://github.com/AIInAi/tf-insightface) +- TensorFlow:[insightface](https://github.com/Fei-Wang/insightface) +- PyTorch: [InsightFace_Pytorch](https://github.com/TreB1eN/InsightFace_Pytorch) +- PyTorch: [arcface-pytorch](https://github.com/ronghuaiyang/arcface-pytorch) +- Caffe: [arcface-caffe](https://github.com/xialuxi/arcface-caffe) +- Caffe: [CombinedMargin-caffe](https://github.com/gehaocool/CombinedMargin-caffe) +- Tensorflow: [InsightFace-tensorflow](https://github.com/luckycallor/InsightFace-tensorflow) +- TensorRT: [wang-xinyu/tensorrtx](https://github.com/wang-xinyu/tensorrtx) +- TensorRT: [InsightFace-REST](https://github.com/SthPhoenix/InsightFace-REST) +- ONNXRuntime C++: [ArcFace-ONNXRuntime](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/ort/cv/glint_arcface.cpp) +- ONNXRuntime Go: [arcface-go](https://github.com/jack139/arcface-go) +- MNN: [ArcFace-MNN](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/mnn/cv/mnn_glint_arcface.cpp) +- TNN: [ArcFace-TNN](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/tnn/cv/tnn_glint_arcface.cpp) +- NCNN: [ArcFace-NCNN](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/ncnn/cv/ncnn_glint_arcface.cpp) + +## Face Detection + +### Introduction + +
+ +
+ +In this module, we provide training data with annotation, network settings and loss designs for face detection training, evaluation and inference. + +The supported methods are as follows: + +- [x] [RetinaFace (CVPR'2020)](detection/retinaface) +- [x] [SCRFD (Arxiv'2021)](detection/scrfd) +- [x] [blazeface_paddle](detection/blazeface_paddle) + +[RetinaFace](detection/retinaface) is a practical single-stage face detector which is accepted by [CVPR 2020](https://openaccess.thecvf.com/content_CVPR_2020/html/Deng_RetinaFace_Single-Shot_Multi-Level_Face_Localisation_in_the_Wild_CVPR_2020_paper.html). We provide training code, training dataset, pretrained models and evaluation scripts. + +[SCRFD](detection/scrfd) is an efficient high accuracy face detection approach which is initialy described in [Arxiv](https://arxiv.org/abs/2105.04714). We provide an easy-to-use pipeline to train high efficiency face detectors with NAS supporting. + + +## Face Alignment + +### Introduction + +
+ +
+ +In this module, we provide datasets and training/inference pipelines for face alignment. + +Supported methods: + +- [x] [SDUNets (BMVC'2018)](alignment/heatmap) +- [x] [SimpleRegression](alignment/coordinate_reg) + + +[SDUNets](alignment/heatmap) is a heatmap based method which accepted on [BMVC](http://bmvc2018.org/contents/papers/0051.pdf). + +[SimpleRegression](alignment/coordinate_reg) provides very lightweight facial landmark models with fast coordinate regression. The input of these models is loose cropped face image while the output is the direct landmark coordinates. + + +## Citation + +If you find *InsightFace* useful in your research, please consider to cite the following related papers: + +``` +@inproceedings{ren2023pbidr, + title={Facial Geometric Detail Recovery via Implicit Representation}, + author={Ren, Xingyu and Lattas, Alexandros and Gecer, Baris and Deng, Jiankang and Ma, Chao and Yang, Xiaokang}, + booktitle={2023 IEEE 17th International Conference on Automatic Face and Gesture Recognition (FG)}, + year={2023} + } + +@article{guo2021sample, + title={Sample and Computation Redistribution for Efficient Face Detection}, + author={Guo, Jia and Deng, Jiankang and Lattas, Alexandros and Zafeiriou, Stefanos}, + journal={arXiv preprint arXiv:2105.04714}, + year={2021} +} + +@inproceedings{gecer2021ostec, + title={OSTeC: One-Shot Texture Completion}, + author={Gecer, Baris and Deng, Jiankang and Zafeiriou, Stefanos}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2021} +} + +@inproceedings{an2020partical_fc, + title={Partial FC: Training 10 Million Identities on a Single Machine}, + author={An, Xiang and Zhu, Xuhan and Xiao, Yang and Wu, Lan and Zhang, Ming and Gao, Yuan and Qin, Bin and + Zhang, Debing and Fu Ying}, + booktitle={Arxiv 2010.05222}, + year={2020} +} + +@inproceedings{deng2020subcenter, + title={Sub-center ArcFace: Boosting Face Recognition by Large-scale Noisy Web 
Faces}, + author={Deng, Jiankang and Guo, Jia and Liu, Tongliang and Gong, Mingming and Zafeiriou, Stefanos}, + booktitle={Proceedings of the IEEE Conference on European Conference on Computer Vision}, + year={2020} +} + +@inproceedings{Deng2020CVPR, +title = {RetinaFace: Single-Shot Multi-Level Face Localisation in the Wild}, +author = {Deng, Jiankang and Guo, Jia and Ververas, Evangelos and Kotsia, Irene and Zafeiriou, Stefanos}, +booktitle = {CVPR}, +year = {2020} +} + +@inproceedings{guo2018stacked, + title={Stacked Dense U-Nets with Dual Transformers for Robust Face Alignment}, + author={Guo, Jia and Deng, Jiankang and Xue, Niannan and Zafeiriou, Stefanos}, + booktitle={BMVC}, + year={2018} +} + +@article{deng2018menpo, + title={The Menpo benchmark for multi-pose 2D and 3D facial landmark localisation and tracking}, + author={Deng, Jiankang and Roussos, Anastasios and Chrysos, Grigorios and Ververas, Evangelos and Kotsia, Irene and Shen, Jie and Zafeiriou, Stefanos}, + journal={IJCV}, + year={2018} +} + +@inproceedings{deng2018arcface, +title={ArcFace: Additive Angular Margin Loss for Deep Face Recognition}, +author={Deng, Jiankang and Guo, Jia and Niannan, Xue and Zafeiriou, Stefanos}, +booktitle={CVPR}, +year={2019} +} +``` + +## Contributing + +Main contributors: + +- [Jia Guo](https://github.com/nttstar), ``guojia[at]gmail.com`` +- [Jiankang Deng](https://github.com/jiankangdeng) ``jiankangdeng[at]gmail.com`` +- [Xiang An](https://github.com/anxiangsir) ``anxiangsir[at]gmail.com`` +- [Jack Yu](https://github.com/szad670401) ``jackyu961127[at]gmail.com`` +- [Baris Gecer](https://barisgecer.github.io/) ``barisgecer[at]msn.com`` diff --git a/insightface/alignment/README.md b/insightface/alignment/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f261f644c1a5dc319f699ceda7edaeb60286b1c5 --- /dev/null +++ b/insightface/alignment/README.md @@ -0,0 +1,42 @@ +## Face Alignment + + +
+ +
+ + +## Introduction + +These are the face alignment methods of [InsightFace](https://insightface.ai) + + +
+ +
+ + +### Datasets + + Please refer to [datasets](_datasets_) page for the details of face alignment datasets used for training and evaluation. + +### Evaluation + + Please refer to [evaluation](_evaluation_) page for the details of face alignment evaluation. + + +## Methods + + +Supported methods: + +- [x] [SDUNets (BMVC'2018)](heatmap) +- [x] [SimpleRegression](coordinate_reg) +- [x] [Alignment By Face Synthetics](synthetics) + + +## Contributing + +We appreciate all contributions to improve the face alignment model zoo of InsightFace. + + diff --git a/insightface/alignment/_datasets_/README.md b/insightface/alignment/_datasets_/README.md new file mode 100644 index 0000000000000000000000000000000000000000..af269cc81723d0394fe2f39dff27ce3410fece4b --- /dev/null +++ b/insightface/alignment/_datasets_/README.md @@ -0,0 +1,57 @@ +# Face Alignment Datasets + +(Updating) + +## Training Datasets + +### Menpo2D-Train + +https://ibug.doc.ic.ac.uk/resources/2nd-facial-landmark-tracking-competition-menpo-ben/ + +### 300W-Train + +https://ibug.doc.ic.ac.uk/resources/300-W/ + + +### LFPW + +https://neerajkumar.org/databases/lfpw/ + +### Helen + +http://www.ifp.illinois.edu/~vuongle2/helen/ + +### AFW + +### AFLW + +https://www.tugraz.at/institute/icg/research/team-bischof/lrs/downloads/aflw/ + +### FDDB + + +### Face Synthetics + +https://github.com/microsoft/FaceSynthetics + +### 300W-LP (3D annotation) + +http://www.cbsr.ia.ac.cn/users/xiangyuzhu/projects/3DDFA/main.htm + +## Test Datasets + +### 300W-Test + +https://ibug.doc.ic.ac.uk/resources/300-W/ + +### COFW + +http://www.vision.caltech.edu/xpburgos/ICCV13/#dataset + +### Menpo2D-Test + +https://ibug.doc.ic.ac.uk/resources/2nd-facial-landmark-tracking-competition-menpo-ben/ + +### AFLW2000-3D (3D annotation) + +http://www.cbsr.ia.ac.cn/users/xiangyuzhu/projects/3DDFA/main.htm diff --git a/insightface/alignment/coordinate_reg/README.md b/insightface/alignment/coordinate_reg/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..c448cdb4b47f4d4ec50850c26b8215b39606600e --- /dev/null +++ b/insightface/alignment/coordinate_reg/README.md @@ -0,0 +1,58 @@ +### Introduction + + Here we provide some lightweight facial landmark models with fast coordinate regression. + The input of these models is loose cropped face image while the output is the direct landmark coordinates. + + +### Pretrained Models + +- **Model ``2d106det``** + + **2021.07: We now support model inference by our `insightface` python package, please check [image_infer.py](image_infer.py) for detail.** + + Given face detection bounding box, predict 2d-106 landmarks. Mainly used for static image inference. + + Backbone: MobileNet-0.5, size 5MB. + + Input: size 192x192, loose cropped detection bounding-box. + + Download link: + + [baidu cloud](https://pan.baidu.com/s/10m5GmtNV5snynDrq3KqIdg) (code: ``lqvv``) + + [google drive](https://drive.google.com/file/d/13Pz8mH-a1s7RXpq_jFUXxaqCpDUE0oSr/view?usp=sharing) + + + +- **Model ``2d106track``** + + Given landmarks bounding box, predict 2d-106 landmarks. Used for video landmarks tracking. + + Download link: coming soon + +### Visualization + + +

Points mark-up (ordered by point names):

+ +
+ markup +
+ + +

Image result:

+ +
+ imagevis +
+ + +

Video result:

+ +
+ videovis +
+ + +### FAQ + diff --git a/insightface/alignment/coordinate_reg/image_infer.py b/insightface/alignment/coordinate_reg/image_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..a336221e8a2e92a727777d323c479e5d41d5b486 --- /dev/null +++ b/insightface/alignment/coordinate_reg/image_infer.py @@ -0,0 +1,23 @@ +import cv2 +import numpy as np +import os +import insightface +from insightface.app import FaceAnalysis +from insightface.data import get_image as ins_get_image + +if __name__ == '__main__': + app = FaceAnalysis(allowed_modules=['detection', 'landmark_2d_106']) + app.prepare(ctx_id=0, det_size=(640, 640)) + img = ins_get_image('t1') + faces = app.get(img) + #assert len(faces)==6 + tim = img.copy() + color = (200, 160, 75) + for face in faces: + lmk = face.landmark_2d_106 + lmk = np.round(lmk).astype(np.int) + for i in range(lmk.shape[0]): + p = tuple(lmk[i]) + cv2.circle(tim, p, 1, color, 1, cv2.LINE_AA) + cv2.imwrite('./test_out.jpg', tim) + diff --git a/insightface/alignment/heatmap/README.md b/insightface/alignment/heatmap/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3bf4a58ef24afa5f65b9f7b929a24533bd2c743 --- /dev/null +++ b/insightface/alignment/heatmap/README.md @@ -0,0 +1,10 @@ +We provide our implementation of ``Stacked Dense U-Nets with Dual Transformers for Robust Face Alignment`` here at [BMVC](http://bmvc2018.org/contents/papers/0051.pdf) or link at [Arxiv](https://arxiv.org/abs/1812.01936). + +We also provide some popular heatmap based approaches like stacked hourglass, etc.. You can define different loss-type/network structure/dataset in ``config.py``(from ``sample_config.py``). + +For example, by default, you can train our approach by ``train.py --network sdu`` or train hourglass network by ``train.py --network hourglass``. 
+ +2D training/validation dataset is now available at [baiducloud](https://pan.baidu.com/s/1kdquiIGTlK7l26SPWO_cmw) or [dropbox](https://www.dropbox.com/s/por6mbguegmywo6/bmvc_sdu_data2d.zip?dl=0) + +3D training/validation dataset is now available at [baiducloud](https://pan.baidu.com/s/1VjFWm6eEtIqGKk92GE2rgw) or [dropbox](https://www.dropbox.com/s/tjze176lh76nciw/bmvc_sdu_data3d.zip?dl=0) + diff --git a/insightface/alignment/heatmap/data.py b/insightface/alignment/heatmap/data.py new file mode 100644 index 0000000000000000000000000000000000000000..eeaca9c519ff1e38677a1240467347b289c36d1b --- /dev/null +++ b/insightface/alignment/heatmap/data.py @@ -0,0 +1,354 @@ +# pylint: skip-file +import mxnet as mx +import numpy as np +import sys, os +import random +import math +import scipy.misc +import cv2 +import logging +import sklearn +import datetime +import img_helper +from mxnet.io import DataIter +from mxnet import ndarray as nd +from mxnet import io +from mxnet import recordio +from PIL import Image +from config import config +from skimage import transform as tf + + +class FaceSegIter(DataIter): + def __init__(self, + batch_size, + per_batch_size=0, + path_imgrec=None, + aug_level=0, + force_mirror=False, + exf=1, + use_coherent=0, + args=None, + data_name="data", + label_name="softmax_label"): + self.aug_level = aug_level + self.force_mirror = force_mirror + self.use_coherent = use_coherent + self.exf = exf + self.batch_size = batch_size + self.per_batch_size = per_batch_size + self.data_name = data_name + self.label_name = label_name + assert path_imgrec + logging.info('loading recordio %s...', path_imgrec) + path_imgidx = path_imgrec[0:-4] + ".idx" + self.imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, + 'r') # pylint: disable=redefined-variable-type + self.oseq = list(self.imgrec.keys) + print('train size', len(self.oseq)) + self.cur = 0 + self.reset() + self.data_shape = (3, config.input_img_size, config.input_img_size) + self.num_classes = 
config.num_classes + self.input_img_size = config.input_img_size + #self.label_classes = self.num_classes + if config.losstype == 'heatmap': + if aug_level > 0: + self.output_label_size = config.output_label_size + self.label_shape = (self.num_classes, self.output_label_size, + self.output_label_size) + else: + self.output_label_size = self.input_img_size + #self.label_shape = (self.num_classes, 2) + self.label_shape = (self.num_classes, self.output_label_size, + self.output_label_size) + else: + if aug_level > 0: + self.output_label_size = config.output_label_size + self.label_shape = (self.num_classes, 2) + else: + self.output_label_size = self.input_img_size + #self.label_shape = (self.num_classes, 2) + self.label_shape = (self.num_classes, 2) + self.provide_data = [(data_name, (batch_size, ) + self.data_shape)] + self.provide_label = [(label_name, (batch_size, ) + self.label_shape)] + self.img_num = 0 + self.invalid_num = 0 + self.mode = 1 + self.vis = 0 + self.stats = [0, 0] + self.flip_order = [ + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, 27, 28, 29, 30, 35, 34, 33, 32, 31, + 45, 44, 43, 42, 47, 46, 39, 38, 37, 36, 41, 40, 54, 53, 52, 51, 50, + 49, 48, 59, 58, 57, 56, 55, 64, 63, 62, 61, 60, 67, 66, 65 + ] + #self.mirror_set = [ + # (22,23), + # (21,24), + # (20,25), + # (19,26), + # (18,27), + # (40,43), + # (39,44), + # (38,45), + # (37,46), + # (42,47), + # (41,48), + # (33,35), + # (32,36), + # (51,53), + # (50,54), + # (62,64), + # (61,65), + # (49,55), + # (49,55), + # (68,66), + # (60,56), + # (59,57), + # (1,17), + # (2,16), + # (3,15), + # (4,14), + # (5,13), + # (6,12), + # (7,11), + # (8,10), + # ] + + def get_data_shape(self): + return self.data_shape + + #def get_label_shape(self): + # return self.label_shape + + def get_shape_dict(self): + D = {} + for (k, v) in self.provide_data: + D[k] = v + for (k, v) in self.provide_label: + D[k] = v + return D + + def get_label_names(self): + D = [] 
+ for (k, v) in self.provide_label: + D.append(k) + return D + + def reset(self): + #print('reset') + if self.aug_level == 0: + self.seq = self.oseq + else: + self.seq = [] + for _ in range(self.exf): + _seq = self.oseq[:] + random.shuffle(_seq) + self.seq += _seq + print('train size after reset', len(self.seq)) + self.cur = 0 + + def next_sample(self): + """Helper function for reading in next sample.""" + if self.cur >= len(self.seq): + raise StopIteration + idx = self.seq[self.cur] + self.cur += 1 + s = self.imgrec.read_idx(idx) + header, img = recordio.unpack(s) + img = mx.image.imdecode(img).asnumpy() + hlabel = np.array(header.label).reshape((self.num_classes, 2)) + if not config.label_xfirst: + hlabel = hlabel[:, ::-1] #convert to X/W first + annot = {'scale': config.base_scale} + + #ul = np.array( (50000,50000), dtype=np.int32) + #br = np.array( (0,0), dtype=np.int32) + #for i in range(hlabel.shape[0]): + # h = int(hlabel[i][0]) + # w = int(hlabel[i][1]) + # key = np.array((h,w)) + # ul = np.minimum(key, ul) + # br = np.maximum(key, br) + + return img, hlabel, annot + + def get_flip(self, data, label): + data_flip = np.zeros_like(data) + label_flip = np.zeros_like(label) + for k in range(data_flip.shape[2]): + data_flip[:, :, k] = np.fliplr(data[:, :, k]) + for k in range(label_flip.shape[0]): + label_flip[k, :] = np.fliplr(label[k, :]) + #print(label[0,:].shape) + label_flip = label_flip[self.flip_order, :] + return data_flip, label_flip + + def get_data(self, data, label, annot): + if self.vis: + self.img_num += 1 + #if self.img_num<=self.vis: + # filename = './vis/raw_%d.jpg' % (self.img_num) + # print('save', filename) + # draw = data.copy() + # for i in range(label.shape[0]): + # cv2.circle(draw, (label[i][1], label[i][0]), 1, (0, 0, 255), 2) + # scipy.misc.imsave(filename, draw) + + rotate = 0 + #scale = 1.0 + if 'scale' in annot: + scale = annot['scale'] + else: + scale = max(data.shape[0], data.shape[1]) + if 'center' in annot: + center = 
annot['center'] + else: + center = np.array((data.shape[1] / 2, data.shape[0] / 2)) + max_retry = 3 + if self.aug_level == 0: #validation mode + max_retry = 6 + retry = 0 + found = False + base_scale = scale + while retry < max_retry: + retry += 1 + succ = True + _scale = base_scale + if self.aug_level > 0: + rotate = np.random.randint(-40, 40) + scale_config = 0.2 + #rotate = 0 + #scale_config = 0.0 + scale_ratio = min( + 1 + scale_config, + max(1 - scale_config, + (np.random.randn() * scale_config) + 1)) + _scale = int(base_scale * scale_ratio) + #translate = np.random.randint(-5, 5, size=(2,)) + #center += translate + data_out, trans = img_helper.transform(data, center, + self.input_img_size, _scale, + rotate) + #data_out = img_helper.crop2(data, center, _scale, (self.input_img_size, self.input_img_size), rot=rotate) + label_out = np.zeros(self.label_shape, dtype=np.float32) + #print('out shapes', data_out.shape, label_out.shape) + for i in range(label.shape[0]): + pt = label[i].copy() + #pt = pt[::-1] + npt = img_helper.transform_pt(pt, trans) + if npt[0] >= data_out.shape[1] or npt[1] >= data_out.shape[ + 0] or npt[0] < 0 or npt[1] < 0: + succ = False + #print('err npt', npt) + break + if config.losstype == 'heatmap': + pt_scale = float( + self.output_label_size) / self.input_img_size + npt *= pt_scale + npt = npt.astype(np.int32) + img_helper.gaussian(label_out[i], npt, config.gaussian) + else: + label_out[i] = (npt / self.input_img_size) + #print('before gaussian', label_out[i].shape, pt.shape) + #trans = img_helper.transform(pt, center, _scale, (self.output_label_size, self.output_label_size), rot=rotate) + #print(trans.shape) + #if not img_helper.gaussian(label_out[i], trans, _g): + # succ = False + # break + if not succ: + if self.aug_level == 0: + base_scale += 20 + continue + + flip_data_out = None + flip_label_out = None + if config.net_coherent: + flip_data_out, flip_label_out = self.get_flip( + data_out, label_out) + elif ((self.aug_level > 0 and 
np.random.rand() < 0.5) + or self.force_mirror): #flip aug + flip_data_out, flip_label_out = self.get_flip( + data_out, label_out) + data_out, label_out = flip_data_out, flip_label_out + + found = True + break + + #self.stats[0]+=1 + if not found: + #self.stats[1]+=1 + #print('find aug error', retry) + #print(self.stats) + #print('!!!ERR') + return None + #print('found with scale', _scale, rotate) + + if self.vis > 0 and self.img_num <= self.vis: + print('crop', data.shape, center, _scale, rotate, data_out.shape) + filename = './vis/cropped_%d.jpg' % (self.img_num) + print('save', filename) + draw = data_out.copy() + alabel = label_out.copy() + for i in range(label.shape[0]): + a = cv2.resize(alabel[i], + (self.input_img_size, self.input_img_size)) + ind = np.unravel_index(np.argmax(a, axis=None), a.shape) + cv2.circle(draw, (ind[1], ind[0]), 1, (0, 0, 255), 2) + scipy.misc.imsave(filename, draw) + filename = './vis/raw_%d.jpg' % (self.img_num) + scipy.misc.imsave(filename, data) + + return data_out, label_out, flip_data_out, flip_label_out + + def next(self): + """Returns the next batch of data.""" + #print('next') + batch_size = self.batch_size + batch_data = nd.empty((batch_size, ) + self.data_shape) + batch_label = nd.empty((batch_size, ) + self.label_shape) + i = 0 + #self.cutoff = random.randint(800,1280) + try: + while i < batch_size: + #print('N', i) + data, label, annot = self.next_sample() + R = self.get_data(data, label, annot) + if R is None: + continue + data_out, label_out, flip_data_out, flip_label_out = R + if not self.use_coherent: + data = nd.array(data_out) + data = nd.transpose(data, axes=(2, 0, 1)) + label = nd.array(label_out) + #print(data.shape, label.shape) + batch_data[i][:] = data + batch_label[i][:] = label + i += 1 + else: + data = nd.array(data_out) + data = nd.transpose(data, axes=(2, 0, 1)) + label = nd.array(label_out) + data2 = nd.array(flip_data_out) + data2 = nd.transpose(data2, axes=(2, 0, 1)) + label2 = nd.array(flip_label_out) 
+ #M = nd.array(M) + #print(data.shape, label.shape) + batch_data[i][:] = data + batch_label[i][:] = label + #i+=1 + j = i + self.per_batch_size // 2 + batch_data[j][:] = data2 + batch_label[j][:] = label2 + i += 1 + if j % self.per_batch_size == self.per_batch_size - 1: + i = j + 1 + except StopIteration: + if i < batch_size: + raise StopIteration + + #return {self.data_name : batch_data, + # self.label_name : batch_label} + #print(batch_data.shape, batch_label.shape) + return mx.io.DataBatch([batch_data], [batch_label], batch_size - i) diff --git a/insightface/alignment/heatmap/img_helper.py b/insightface/alignment/heatmap/img_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..b37123ff9f5c93cb347b58cdeab76e48d34b65cb --- /dev/null +++ b/insightface/alignment/heatmap/img_helper.py @@ -0,0 +1,86 @@ +import numpy as np +import math +import cv2 +from skimage import transform as stf + + +def transform(data, center, output_size, scale, rotation): + scale_ratio = float(output_size) / scale + rot = float(rotation) * np.pi / 180.0 + #translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio) + t1 = stf.SimilarityTransform(scale=scale_ratio) + cx = center[0] * scale_ratio + cy = center[1] * scale_ratio + t2 = stf.SimilarityTransform(translation=(-1 * cx, -1 * cy)) + t3 = stf.SimilarityTransform(rotation=rot) + t4 = stf.SimilarityTransform(translation=(output_size / 2, + output_size / 2)) + t = t1 + t2 + t3 + t4 + trans = t.params[0:2] + #print('M', scale, rotation, trans) + cropped = cv2.warpAffine(data, + trans, (output_size, output_size), + borderValue=0.0) + return cropped, trans + + +def transform_pt(pt, trans): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(trans, new_pt) + #print('new_pt', new_pt.shape, new_pt) + return new_pt[:2] + + +def gaussian(img, pt, sigma): + # Draw a 2D gaussian + assert (sigma >= 0) + if sigma == 0: + img[pt[1], pt[0]] = 1.0 + return True + #assert pt[0]<=img.shape[1] + 
#assert pt[1]<=img.shape[0] + + # Check that any part of the gaussian is in-bounds + ul = [int(pt[0] - 3 * sigma), int(pt[1] - 3 * sigma)] + br = [int(pt[0] + 3 * sigma + 1), int(pt[1] + 3 * sigma + 1)] + if (ul[0] > img.shape[1] or ul[1] >= img.shape[0] or br[0] < 0 + or br[1] < 0): + # If not, just return the image as is + #print('gaussian error') + return False + #return img + + # Generate gaussian + size = 6 * sigma + 1 + x = np.arange(0, size, 1, float) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + # The gaussian is not normalized, we want the center value to equal 1 + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], img.shape[1]) + img_y = max(0, ul[1]), min(br[1], img.shape[0]) + + img[img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + return True + #return img + + +def estimate_trans_bbox(face, input_size, s=2.0): + w = face[2] - face[0] + h = face[3] - face[1] + wc = int((face[2] + face[0]) / 2) + hc = int((face[3] + face[1]) / 2) + im_size = max(w, h) + #size = int(im_size*1.2) + scale = input_size / (max(w, h) * s) + M = [ + [scale, 0, input_size / 2 - wc * scale], + [0, scale, input_size / 2 - hc * scale], + ] + M = np.array(M) + return M diff --git a/insightface/alignment/heatmap/metric.py b/insightface/alignment/heatmap/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..2ddc96c9fb4fb92b3648bbb59c62f4eb46289f47 --- /dev/null +++ b/insightface/alignment/heatmap/metric.py @@ -0,0 +1,107 @@ +import mxnet as mx +import numpy as np +import math +import cv2 +from config import config + + +class LossValueMetric(mx.metric.EvalMetric): + def __init__(self): + self.axis = 1 + super(LossValueMetric, self).__init__('lossvalue', + axis=self.axis, + output_names=None, + label_names=None) + self.losses = [] + + def 
update(self, labels, preds): + loss = preds[0].asnumpy()[0] + self.sum_metric += loss + self.num_inst += 1.0 + + +class NMEMetric(mx.metric.EvalMetric): + def __init__(self): + self.axis = 1 + super(NMEMetric, self).__init__('NME', + axis=self.axis, + output_names=None, + label_names=None) + #self.losses = [] + self.count = 0 + + def cal_nme(self, label, pred_label): + nme = [] + for b in range(pred_label.shape[0]): + record = [None] * 6 + item = [] + if label.ndim == 4: + _heatmap = label[b][36] + if np.count_nonzero(_heatmap) == 0: + continue + else: #ndim==3 + #print(label[b]) + if np.count_nonzero(label[b]) == 0: + continue + for p in range(pred_label.shape[1]): + if label.ndim == 4: + heatmap_gt = label[b][p] + ind_gt = np.unravel_index(np.argmax(heatmap_gt, axis=None), + heatmap_gt.shape) + ind_gt = np.array(ind_gt) + else: + ind_gt = label[b][p] + #ind_gt = ind_gt.astype(np.int) + #print(ind_gt) + heatmap_pred = pred_label[b][p] + heatmap_pred = cv2.resize( + heatmap_pred, + (config.input_img_size, config.input_img_size)) + ind_pred = np.unravel_index(np.argmax(heatmap_pred, axis=None), + heatmap_pred.shape) + ind_pred = np.array(ind_pred) + #print(ind_gt.shape) + #print(ind_pred) + if p == 36: + #print('b', b, p, ind_gt, np.count_nonzero(heatmap_gt)) + record[0] = ind_gt + elif p == 39: + record[1] = ind_gt + elif p == 42: + record[2] = ind_gt + elif p == 45: + record[3] = ind_gt + if record[4] is None or record[5] is None: + record[4] = ind_gt + record[5] = ind_gt + else: + record[4] = np.minimum(record[4], ind_gt) + record[5] = np.maximum(record[5], ind_gt) + #print(ind_gt.shape, ind_pred.shape) + value = np.sqrt(np.sum(np.square(ind_gt - ind_pred))) + item.append(value) + _nme = np.mean(item) + if config.landmark_type == '2d': + left_eye = (record[0] + record[1]) / 2 + right_eye = (record[2] + record[3]) / 2 + _dist = np.sqrt(np.sum(np.square(left_eye - right_eye))) + #print('eye dist', _dist, left_eye, right_eye) + _nme /= _dist + else: + #_dist = 
np.sqrt(float(label.shape[2]*label.shape[3])) + _dist = np.sqrt(np.sum(np.square(record[5] - record[4]))) + #print(_dist) + _nme /= _dist + nme.append(_nme) + return np.mean(nme) + + def update(self, labels, preds): + self.count += 1 + label = labels[0].asnumpy() + pred_label = preds[-1].asnumpy() + nme = self.cal_nme(label, pred_label) + + #print('nme', nme) + #nme = np.mean(nme) + self.sum_metric += np.mean(nme) + self.num_inst += 1.0 diff --git a/insightface/alignment/heatmap/optimizer.py b/insightface/alignment/heatmap/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..36290de84bd54236e98775033756fc67dd73b082 --- /dev/null +++ b/insightface/alignment/heatmap/optimizer.py @@ -0,0 +1,65 @@ +import mxnet as mx +import mxnet.optimizer as optimizer +from mxnet.ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as + NDabs) +#from mxnet.ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, +# mp_sgd_update, mp_sgd_mom_update, square, ftrl_update) + + +class ONadam(optimizer.Optimizer): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + schedule_decay=0.004, + **kwargs): + super(ONadam, self).__init__(learning_rate=learning_rate, **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.schedule_decay = schedule_decay + self.m_schedule = 1. 
+ + def create_state(self, index, weight): + return ( + zeros(weight.shape, weight.context, dtype=weight.dtype), # mean + zeros(weight.shape, weight.context, + dtype=weight.dtype)) # variance + + def update(self, index, weight, grad, state): + assert (isinstance(weight, NDArray)) + assert (isinstance(grad, NDArray)) + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + t = self._index_update_count[index] + + # preprocess grad + #grad = grad * self.rescale_grad + wd * weight + grad *= self.rescale_grad + wd * weight + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + + # warming momentum schedule + momentum_t = self.beta1 * (1. - 0.5 * + (pow(0.96, t * self.schedule_decay))) + momentum_t_1 = self.beta1 * (1. - 0.5 * + (pow(0.96, + (t + 1) * self.schedule_decay))) + self.m_schedule = self.m_schedule * momentum_t + m_schedule_next = self.m_schedule * momentum_t_1 + + # update m_t and v_t + m_t, v_t = state + m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad + v_t[:] = self.beta2 * v_t + (1. - self.beta2) * grad * grad + + grad_prime = grad / (1. - self.m_schedule) + m_t_prime = m_t / (1. - m_schedule_next) + v_t_prime = v_t / (1. - pow(self.beta2, t)) + m_t_bar = (1. 
- momentum_t) * grad_prime + momentum_t_1 * m_t_prime + + # update weight + weight[:] -= lr * m_t_bar / (sqrt(v_t_prime) + self.epsilon) diff --git a/insightface/alignment/heatmap/sample_config.py b/insightface/alignment/heatmap/sample_config.py new file mode 100644 index 0000000000000000000000000000000000000000..bb9ada18540f5996c5bef77de4845d93c9efc794 --- /dev/null +++ b/insightface/alignment/heatmap/sample_config.py @@ -0,0 +1,98 @@ +import numpy as np +from easydict import EasyDict as edict + +config = edict() + +#default training/dataset config +config.num_classes = 68 +config.record_img_size = 384 +config.base_scale = 256 +config.input_img_size = 128 +config.output_label_size = 64 +config.label_xfirst = False +config.losstype = 'heatmap' +config.net_coherent = False +config.multiplier = 1.0 + +config.gaussian = 0 + +# network settings +network = edict() + +network.hourglass = edict() +network.hourglass.net_coherent = False +network.hourglass.net_sta = 0 +network.hourglass.net_n = 3 +network.hourglass.net_dcn = 0 +network.hourglass.net_stacks = 2 +network.hourglass.net_block = 'resnet' +network.hourglass.net_binarize = False +network.hourglass.losstype = 'heatmap' + +network.sdu = edict() +network.sdu.net_coherent = False +network.sdu.net_sta = 1 +network.sdu.net_n = 3 +network.sdu.net_dcn = 3 +network.sdu.net_stacks = 2 +network.sdu.net_block = 'cab' +network.sdu.net_binarize = False +network.sdu.losstype = 'heatmap' + +# dataset settings +dataset = edict() + +dataset.i2d = edict() +dataset.i2d.dataset = '2D' +dataset.i2d.landmark_type = '2d' +dataset.i2d.dataset_path = './data_2d' +dataset.i2d.num_classes = 68 +dataset.i2d.record_img_size = 384 +dataset.i2d.base_scale = 256 +dataset.i2d.input_img_size = 128 +dataset.i2d.output_label_size = 64 +dataset.i2d.label_xfirst = False +dataset.i2d.val_targets = ['ibug', 'cofw_testset', '300W'] + +dataset.i3d = edict() +dataset.i3d.dataset = '3D' +dataset.i3d.landmark_type = '3d' +dataset.i3d.dataset_path = 
'./data_3d' +dataset.i3d.num_classes = 68 +dataset.i3d.record_img_size = 384 +dataset.i3d.base_scale = 256 +dataset.i3d.input_img_size = 128 +dataset.i3d.output_label_size = 64 +dataset.i3d.label_xfirst = False +dataset.i3d.val_targets = ['AFLW2000-3D'] + +# default settings +default = edict() + +# default network +default.network = 'hourglass' +default.pretrained = '' +default.pretrained_epoch = 0 +# default dataset +default.dataset = 'i2d' +default.frequent = 20 +default.verbose = 200 +default.kvstore = 'device' + +default.prefix = 'model/A' +default.end_epoch = 10000 +default.lr = 0.00025 +default.wd = 0.0 +default.per_batch_size = 20 +default.lr_step = '16000,24000,30000' + + +def generate_config(_network, _dataset): + for k, v in network[_network].items(): + config[k] = v + default[k] = v + for k, v in dataset[_dataset].items(): + config[k] = v + default[k] = v + config.network = _network + config.dataset = _dataset diff --git a/insightface/alignment/heatmap/symbol/sym_heatmap.py b/insightface/alignment/heatmap/symbol/sym_heatmap.py new file mode 100644 index 0000000000000000000000000000000000000000..338411292529e79cdab4b99a698e8b75563441f5 --- /dev/null +++ b/insightface/alignment/heatmap/symbol/sym_heatmap.py @@ -0,0 +1,1085 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import mxnet as mx +import numpy as np +from config import config + +ACT_BIT = 1 +bn_mom = 0.9 +workspace = 256 +memonger = False + + +def Conv(**kwargs): + body = mx.sym.Convolution(**kwargs) + return body + + +def Act(data, act_type, name): + if act_type == 'prelu': + body = mx.sym.LeakyReLU(data=data, act_type='prelu', name=name) + else: + body = mx.symbol.Activation(data=data, act_type=act_type, name=name) + return body + + +#def lin(data, num_filter, workspace, name, binarize, dcn): +# bit = 1 +# if not binarize: +# if not dcn: +# conv1 = Conv(data=data, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), +# 
no_bias=True, workspace=workspace, name=name + '_conv') +# bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn') +# act1 = Act(data=bn1, act_type='relu', name=name + '_relu') +# return act1 +# else: +# bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn') +# act1 = Act(data=bn1, act_type='relu', name=name + '_relu') +# conv1_offset = mx.symbol.Convolution(name=name+'_conv_offset', data = act1, +# num_filter=18, pad=(1, 1), kernel=(3, 3), stride=(1, 1)) +# conv1 = mx.contrib.symbol.DeformableConvolution(name=name+"_conv", data=act1, offset=conv1_offset, +# num_filter=num_filter, pad=(1,1), kernel=(3, 3), num_deformable_group=1, stride=(1, 1), dilate=(1, 1), no_bias=False) +# #conv1 = Conv(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), +# # no_bias=False, workspace=workspace, name=name + '_conv') +# return conv1 +# else: +# bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn') +# act1 = Act(data=bn1, act_type='relu', name=name + '_relu') +# conv1 = mx.sym.QConvolution_v1(data=act1, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), +# no_bias=True, workspace=workspace, name=name + '_conv', act_bit=ACT_BIT, weight_bit=bit) +# conv1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2') +# return conv1 + + +def lin3(data, num_filter, workspace, name, k, g=1, d=1): + if k != 3: + conv1 = Conv(data=data, + num_filter=num_filter, + kernel=(k, k), + stride=(1, 1), + pad=((k - 1) // 2, (k - 1) // 2), + num_group=g, + no_bias=True, + workspace=workspace, + name=name + '_conv') + else: + conv1 = Conv(data=data, + num_filter=num_filter, + kernel=(k, k), + stride=(1, 1), + pad=(d, d), + num_group=g, + dilate=(d, d), + no_bias=True, + workspace=workspace, + name=name + '_conv') + bn1 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name 
+ '_bn') + act1 = Act(data=bn1, act_type='relu', name=name + '_relu') + ret = act1 + return ret + + +def ConvFactory(data, + num_filter, + kernel, + stride=(1, 1), + pad=(0, 0), + act_type="relu", + mirror_attr={}, + with_act=True, + dcn=False, + name=''): + if not dcn: + conv = mx.symbol.Convolution(data=data, + num_filter=num_filter, + kernel=kernel, + stride=stride, + pad=pad, + no_bias=True, + workspace=workspace, + name=name + '_conv') + else: + conv_offset = mx.symbol.Convolution(name=name + '_conv_offset', + data=data, + num_filter=18, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1)) + conv = mx.contrib.symbol.DeformableConvolution(name=name + "_conv", + data=data, + offset=conv_offset, + num_filter=num_filter, + pad=(1, 1), + kernel=(3, 3), + num_deformable_group=1, + stride=stride, + dilate=(1, 1), + no_bias=False) + bn = mx.symbol.BatchNorm(data=conv, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn') + if with_act: + act = Act(bn, act_type, name=name + '_relu') + #act = mx.symbol.Activation( + # data=bn, act_type=act_type, attr=mirror_attr, name=name+'_relu') + return act + else: + return bn + + +class CAB: + def __init__(self, data, nFilters, nModules, n, workspace, name, dilate, + group): + self.data = data + self.nFilters = nFilters + self.nModules = nModules + self.n = n + self.workspace = workspace + self.name = name + self.dilate = dilate + self.group = group + self.sym_map = {} + + def get_output(self, w, h): + key = (w, h) + if key in self.sym_map: + return self.sym_map[key] + ret = None + if h == self.n: + if w == self.n: + ret = (self.data, self.nFilters) + else: + x = self.get_output(w + 1, h) + f = int(x[1] * 0.5) + if w != self.n - 1: + body = lin3(x[0], f, self.workspace, + "%s_w%d_h%d_1" % (self.name, w, h), 3, + self.group, 1) + else: + body = lin3(x[0], f, self.workspace, + "%s_w%d_h%d_1" % (self.name, w, h), 3, + self.group, self.dilate) + ret = (body, f) + else: + x = self.get_output(w + 1, h + 1) + y = 
self.get_output(w, h + 1) + if h % 2 == 1 and h != w: + xbody = lin3(x[0], x[1], self.workspace, + "%s_w%d_h%d_2" % (self.name, w, h), 3, x[1]) + #xbody = xbody+x[0] + else: + xbody = x[0] + #xbody = x[0] + #xbody = lin3(x[0], x[1], self.workspace, "%s_w%d_h%d_2"%(self.name, w, h), 3, x[1]) + if w == 0: + ybody = lin3(y[0], y[1], self.workspace, + "%s_w%d_h%d_3" % (self.name, w, h), 3, self.group) + else: + ybody = y[0] + ybody = mx.sym.concat(y[0], ybody, dim=1) + body = mx.sym.add_n(xbody, + ybody, + name="%s_w%d_h%d_add" % (self.name, w, h)) + body = body / 2 + ret = (body, x[1]) + self.sym_map[key] = ret + return ret + + def get(self): + return self.get_output(1, 1)[0] + + +def conv_resnet(data, num_filter, stride, dim_match, name, binarize, dcn, + dilate, **kwargs): + bit = 1 + #print('in unit2') + # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper + bn1 = mx.sym.BatchNorm(data=data, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + if not binarize: + act1 = Act(data=bn1, act_type='relu', name=name + '_relu1') + conv1 = Conv(data=act1, + num_filter=int(num_filter * 0.5), + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + else: + act1 = mx.sym.QActivation(data=bn1, + act_bit=ACT_BIT, + name=name + '_relu1', + backward_only=True) + conv1 = mx.sym.QConvolution(data=act1, + num_filter=int(num_filter * 0.5), + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv1', + act_bit=ACT_BIT, + weight_bit=bit) + bn2 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + if not binarize: + act2 = Act(data=bn2, act_type='relu', name=name + '_relu2') + conv2 = Conv(data=act2, + num_filter=int(num_filter * 0.5), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + else: + act2 = 
mx.sym.QActivation(data=bn2, + act_bit=ACT_BIT, + name=name + '_relu2', + backward_only=True) + conv2 = mx.sym.QConvolution(data=act2, + num_filter=int(num_filter * 0.5), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2', + act_bit=ACT_BIT, + weight_bit=bit) + bn3 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + if not binarize: + act3 = Act(data=bn3, act_type='relu', name=name + '_relu3') + conv3 = Conv(data=act3, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + else: + act3 = mx.sym.QActivation(data=bn3, + act_bit=ACT_BIT, + name=name + '_relu3', + backward_only=True) + conv3 = mx.sym.QConvolution(data=act3, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3', + act_bit=ACT_BIT, + weight_bit=bit) + #if binarize: + # conv3 = mx.sym.BatchNorm(data=conv3, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn4') + if dim_match: + shortcut = data + else: + if not binarize: + shortcut = Conv(data=act1, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_sc') + else: + shortcut = mx.sym.QConvolution(data=act1, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_sc', + act_bit=ACT_BIT, + weight_bit=bit) + if memonger: + shortcut._set_attr(mirror_stage='True') + return conv3 + shortcut + + +def conv_hpm(data, num_filter, stride, dim_match, name, binarize, dcn, + dilation, **kwargs): + bit = 1 + #print('in unit2') + # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper + bn1 = mx.sym.BatchNorm(data=data, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + if not binarize: + act1 = 
Act(data=bn1, act_type='relu', name=name + '_relu1') + if not dcn: + conv1 = Conv(data=act1, + num_filter=int(num_filter * 0.5), + kernel=(3, 3), + stride=(1, 1), + pad=(dilation, dilation), + dilate=(dilation, dilation), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + else: + conv1_offset = mx.symbol.Convolution(name=name + '_conv1_offset', + data=act1, + num_filter=18, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1)) + conv1 = mx.contrib.symbol.DeformableConvolution( + name=name + '_conv1', + data=act1, + offset=conv1_offset, + num_filter=int(num_filter * 0.5), + pad=(1, 1), + kernel=(3, 3), + num_deformable_group=1, + stride=(1, 1), + dilate=(1, 1), + no_bias=True) + else: + act1 = mx.sym.QActivation(data=bn1, + act_bit=ACT_BIT, + name=name + '_relu1', + backward_only=True) + conv1 = mx.sym.QConvolution_v1(data=act1, + num_filter=int(num_filter * 0.5), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv1', + act_bit=ACT_BIT, + weight_bit=bit) + bn2 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + if not binarize: + act2 = Act(data=bn2, act_type='relu', name=name + '_relu2') + if not dcn: + conv2 = Conv(data=act2, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=(1, 1), + pad=(dilation, dilation), + dilate=(dilation, dilation), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + else: + conv2_offset = mx.symbol.Convolution(name=name + '_conv2_offset', + data=act2, + num_filter=18, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1)) + conv2 = mx.contrib.symbol.DeformableConvolution( + name=name + '_conv2', + data=act2, + offset=conv2_offset, + num_filter=int(num_filter * 0.25), + pad=(1, 1), + kernel=(3, 3), + num_deformable_group=1, + stride=(1, 1), + dilate=(1, 1), + no_bias=True) + else: + act2 = mx.sym.QActivation(data=bn2, + act_bit=ACT_BIT, + name=name + '_relu2', + backward_only=True) + conv2 = 
mx.sym.QConvolution_v1(data=act2, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2', + act_bit=ACT_BIT, + weight_bit=bit) + bn3 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + if not binarize: + act3 = Act(data=bn3, act_type='relu', name=name + '_relu3') + if not dcn: + conv3 = Conv(data=act3, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=(1, 1), + pad=(dilation, dilation), + dilate=(dilation, dilation), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + else: + conv3_offset = mx.symbol.Convolution(name=name + '_conv3_offset', + data=act3, + num_filter=18, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1)) + conv3 = mx.contrib.symbol.DeformableConvolution( + name=name + '_conv3', + data=act3, + offset=conv3_offset, + num_filter=int(num_filter * 0.25), + pad=(1, 1), + kernel=(3, 3), + num_deformable_group=1, + stride=(1, 1), + dilate=(1, 1), + no_bias=True) + else: + act3 = mx.sym.QActivation(data=bn3, + act_bit=ACT_BIT, + name=name + '_relu3', + backward_only=True) + conv3 = mx.sym.QConvolution_v1(data=act3, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv3', + act_bit=ACT_BIT, + weight_bit=bit) + conv4 = mx.symbol.Concat(*[conv1, conv2, conv3]) + if binarize: + conv4 = mx.sym.BatchNorm(data=conv4, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn4') + if dim_match: + shortcut = data + else: + if not binarize: + shortcut = Conv(data=act1, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_sc') + else: + #assert(False) + shortcut = mx.sym.QConvolution_v1(data=act1, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_sc', + 
act_bit=ACT_BIT, + weight_bit=bit) + shortcut = mx.sym.BatchNorm(data=shortcut, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_sc_bn') + if memonger: + shortcut._set_attr(mirror_stage='True') + return conv4 + shortcut + #return bn4 + shortcut + #return act4 + shortcut + + +def block17(net, + input_num_channels, + scale=1.0, + with_act=True, + act_type='relu', + mirror_attr={}, + name=''): + tower_conv = ConvFactory(net, 192, (1, 1), name=name + '_conv') + tower_conv1_0 = ConvFactory(net, 129, (1, 1), name=name + '_conv1_0') + tower_conv1_1 = ConvFactory(tower_conv1_0, + 160, (1, 7), + pad=(1, 2), + name=name + '_conv1_1') + tower_conv1_2 = ConvFactory(tower_conv1_1, + 192, (7, 1), + pad=(2, 1), + name=name + '_conv1_2') + tower_mixed = mx.symbol.Concat(*[tower_conv, tower_conv1_2]) + tower_out = ConvFactory(tower_mixed, + input_num_channels, (1, 1), + with_act=False, + name=name + '_conv_out') + net = net + scale * tower_out + if with_act: + act = mx.symbol.Activation(data=net, + act_type=act_type, + attr=mirror_attr) + return act + else: + return net + + +def block35(net, + input_num_channels, + scale=1.0, + with_act=True, + act_type='relu', + mirror_attr={}, + name=''): + M = 1.0 + tower_conv = ConvFactory(net, + int(input_num_channels * 0.25 * M), (1, 1), + name=name + '_conv') + tower_conv1_0 = ConvFactory(net, + int(input_num_channels * 0.25 * M), (1, 1), + name=name + '_conv1_0') + tower_conv1_1 = ConvFactory(tower_conv1_0, + int(input_num_channels * 0.25 * M), (3, 3), + pad=(1, 1), + name=name + '_conv1_1') + tower_conv2_0 = ConvFactory(net, + int(input_num_channels * 0.25 * M), (1, 1), + name=name + '_conv2_0') + tower_conv2_1 = ConvFactory(tower_conv2_0, + int(input_num_channels * 0.375 * M), (3, 3), + pad=(1, 1), + name=name + '_conv2_1') + tower_conv2_2 = ConvFactory(tower_conv2_1, + int(input_num_channels * 0.5 * M), (3, 3), + pad=(1, 1), + name=name + '_conv2_2') + tower_mixed = mx.symbol.Concat(*[tower_conv, tower_conv1_1, 
tower_conv2_2]) + tower_out = ConvFactory(tower_mixed, + input_num_channels, (1, 1), + with_act=False, + name=name + '_conv_out') + + net = net + scale * tower_out + if with_act: + act = mx.symbol.Activation(data=net, + act_type=act_type, + attr=mirror_attr) + return act + else: + return net + + +def conv_inception(data, num_filter, stride, dim_match, name, binarize, dcn, + dilate, **kwargs): + assert not binarize + if stride[0] > 1 or not dim_match: + return conv_resnet(data, num_filter, stride, dim_match, name, binarize, + dcn, dilate, **kwargs) + conv4 = block35(data, num_filter, name=name + '_block35') + return conv4 + + +def conv_cab(data, num_filter, stride, dim_match, name, binarize, dcn, dilate, + **kwargs): + if stride[0] > 1 or not dim_match: + return conv_hpm(data, num_filter, stride, dim_match, name, binarize, + dcn, dilate, **kwargs) + cab = CAB(data, num_filter, 1, 4, workspace, name, dilate, 1) + return cab.get() + + +def conv_block(data, num_filter, stride, dim_match, name, binarize, dcn, + dilate): + if config.net_block == 'resnet': + return conv_resnet(data, num_filter, stride, dim_match, name, binarize, + dcn, dilate) + elif config.net_block == 'inception': + return conv_inception(data, num_filter, stride, dim_match, name, + binarize, dcn, dilate) + elif config.net_block == 'hpm': + return conv_hpm(data, num_filter, stride, dim_match, name, binarize, + dcn, dilate) + elif config.net_block == 'cab': + return conv_cab(data, num_filter, stride, dim_match, name, binarize, + dcn, dilate) + + +def hourglass(data, nFilters, nModules, n, workspace, name, binarize, dcn): + s = 2 + _dcn = False + up1 = data + for i in range(nModules): + up1 = conv_block(up1, nFilters, (1, 1), True, "%s_up1_%d" % (name, i), + binarize, _dcn, 1) + low1 = mx.sym.Pooling(data=data, + kernel=(s, s), + stride=(s, s), + pad=(0, 0), + pool_type='max') + for i in range(nModules): + low1 = conv_block(low1, nFilters, (1, 1), True, + "%s_low1_%d" % (name, i), binarize, _dcn, 1) + if n 
> 1: + low2 = hourglass(low1, nFilters, nModules, n - 1, workspace, + "%s_%d" % (name, n - 1), binarize, dcn) + else: + low2 = low1 + for i in range(nModules): + low2 = conv_block(low2, nFilters, (1, 1), True, + "%s_low2_%d" % (name, i), binarize, _dcn, + 1) #TODO + low3 = low2 + for i in range(nModules): + low3 = conv_block(low3, nFilters, (1, 1), True, + "%s_low3_%d" % (name, i), binarize, _dcn, 1) + up2 = mx.symbol.UpSampling(low3, + scale=s, + sample_type='nearest', + workspace=512, + name='%s_upsampling_%s' % (name, n), + num_args=1) + return mx.symbol.add_n(up1, up2) + + +class STA: + def __init__(self, data, nFilters, nModules, n, workspace, name): + self.data = data + self.nFilters = nFilters + self.nModules = nModules + self.n = n + self.workspace = workspace + self.name = name + self.sym_map = {} + + def get_conv(self, data, name, dilate=1, group=1): + cab = CAB(data, self.nFilters, self.nModules, 4, self.workspace, name, + dilate, group) + return cab.get() + + def get_output(self, w, h): + #print(w,h) + assert w >= 1 and w <= config.net_n + 1 + assert h >= 1 and h <= config.net_n + 1 + s = 2 + bn_mom = 0.9 + key = (w, h) + if key in self.sym_map: + return self.sym_map[key] + ret = None + if h == self.n: + if w == self.n: + ret = self.data, 64 + else: + x = self.get_output(w + 1, h) + body = self.get_conv(x[0], "%s_w%d_h%d_1" % (self.name, w, h)) + body = mx.sym.Pooling(data=body, + kernel=(s, s), + stride=(s, s), + pad=(0, 0), + pool_type='max') + body = self.get_conv(body, "%s_w%d_h%d_2" % (self.name, w, h)) + ret = body, x[1] // 2 + else: + x = self.get_output(w + 1, h + 1) + y = self.get_output(w, h + 1) + + HC = False + + if h % 2 == 1 and h != w: + xbody = lin3(x[0], self.nFilters, self.workspace, + "%s_w%d_h%d_x" % (self.name, w, h), 3, + self.nFilters, 1) + HC = True + #xbody = x[0] + else: + xbody = x[0] + if x[1] // y[1] == 2: + if w > 1: + ybody = mx.symbol.Deconvolution( + data=y[0], + num_filter=self.nFilters, + kernel=(s, s), + stride=(s, 
s), + name='%s_upsampling_w%d_h%d' % (self.name, w, h), + attr={'lr_mult': '1.0'}, + workspace=self.workspace) + ybody = mx.sym.BatchNorm(data=ybody, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name="%s_w%d_h%d_y_bn" % + (self.name, w, h)) + ybody = Act(data=ybody, + act_type='relu', + name="%s_w%d_h%d_y_act" % (self.name, w, h)) + else: + if h >= 1: + ybody = mx.symbol.UpSampling( + y[0], + scale=s, + sample_type='nearest', + workspace=512, + name='%s_upsampling_w%d_h%d' % (self.name, w, h), + num_args=1) + ybody = self.get_conv( + ybody, "%s_w%d_h%d_4" % (self.name, w, h)) + else: + ybody = mx.symbol.Deconvolution( + data=y[0], + num_filter=self.nFilters, + kernel=(s, s), + stride=(s, s), + name='%s_upsampling_w%d_h%d' % (self.name, w, h), + attr={'lr_mult': '1.0'}, + workspace=self.workspace) + ybody = mx.sym.BatchNorm(data=ybody, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name="%s_w%d_h%d_y_bn" % + (self.name, w, h)) + ybody = Act(data=ybody, + act_type='relu', + name="%s_w%d_h%d_y_act" % + (self.name, w, h)) + ybody = Conv(data=ybody, + num_filter=self.nFilters, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + name="%s_w%d_h%d_y_conv2" % + (self.name, w, h), + workspace=self.workspace) + ybody = mx.sym.BatchNorm(data=ybody, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name="%s_w%d_h%d_y_bn2" % + (self.name, w, h)) + ybody = Act(data=ybody, + act_type='relu', + name="%s_w%d_h%d_y_act2" % + (self.name, w, h)) + else: + ybody = self.get_conv(y[0], "%s_w%d_h%d_5" % (self.name, w, h)) + #if not HC: + if config.net_sta == 2 and h == 3 and w == 2: + z = self.get_output(w + 1, h) + zbody = z[0] + zbody = mx.sym.Pooling(data=zbody, + kernel=(z[1], z[1]), + stride=(z[1], z[1]), + pad=(0, 0), + pool_type='avg') + body = xbody + ybody + body = body / 2 + body = mx.sym.broadcast_mul(body, zbody) + else: #sta==1 + body = xbody + ybody + body = body / 2 + ret = body, x[1] + + assert ret is not None + self.sym_map[key] = ret + return 
ret + + def get(self): + return self.get_output(1, 1)[0] + + +class SymCoherent: + def __init__(self, per_batch_size): + self.per_batch_size = per_batch_size + self.flip_order = [ + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, 27, 28, 29, 30, 35, 34, 33, 32, 31, + 45, 44, 43, 42, 47, 46, 39, 38, 37, 36, 41, 40, 54, 53, 52, 51, 50, + 49, 48, 59, 58, 57, 56, 55, 64, 63, 62, 61, 60, 67, 66, 65 + ] + + def get(self, data): + #data.shape[0]==per_batch_size + b = self.per_batch_size // 2 + ux = mx.sym.slice_axis(data, axis=0, begin=0, end=b) + dx = mx.sym.slice_axis(data, axis=0, begin=b, end=b * 2) + ux = mx.sym.flip(ux, axis=3) + #ux = mx.sym.take(ux, indices = self.flip_order, axis=0) + ux_list = [] + for o in self.flip_order: + _ux = mx.sym.slice_axis(ux, axis=1, begin=o, end=o + 1) + ux_list.append(_ux) + ux = mx.sym.concat(*ux_list, dim=1) + return ux, dx + + +def l2_loss(x, y): + loss = x - y + loss = mx.symbol.smooth_l1(loss, scalar=1.0) + #loss = loss*loss + loss = mx.symbol.mean(loss) + return loss + + +def ce_loss(x, y): + #loss = mx.sym.SoftmaxOutput(data = x, label = y, normalization='valid', multi_output=True) + x_max = mx.sym.max(x, axis=[2, 3], keepdims=True) + x = mx.sym.broadcast_minus(x, x_max) + body = mx.sym.exp(x) + sums = mx.sym.sum(body, axis=[2, 3], keepdims=True) + body = mx.sym.broadcast_div(body, sums) + loss = mx.sym.log(body) + loss = loss * y * -1.0 + loss = mx.symbol.mean(loss, axis=[1, 2, 3]) + #loss = mx.symbol.mean(loss) + return loss + + +def get_symbol(num_classes): + m = config.multiplier + sFilters = max(int(64 * m), 32) + mFilters = max(int(128 * m), 32) + nFilters = int(256 * m) + + nModules = 1 + nStacks = config.net_stacks + binarize = config.net_binarize + input_size = config.input_img_size + label_size = config.output_label_size + use_coherent = config.net_coherent + use_STA = config.net_sta + N = config.net_n + DCN = config.net_dcn + per_batch_size = config.per_batch_size 
+ print('binarize', binarize) + print('use_coherent', use_coherent) + print('use_STA', use_STA) + print('use_N', N) + print('use_DCN', DCN) + print('per_batch_size', per_batch_size) + #assert(label_size==64 or label_size==32) + #assert(input_size==128 or input_size==256) + coherentor = SymCoherent(per_batch_size) + D = input_size // label_size + print(input_size, label_size, D) + data = mx.sym.Variable(name='data') + data = data - 127.5 + data = data * 0.0078125 + gt_label = mx.symbol.Variable(name='softmax_label') + losses = [] + closses = [] + ref_label = gt_label + if D == 4: + body = Conv(data=data, + num_filter=sFilters, + kernel=(7, 7), + stride=(2, 2), + pad=(3, 3), + no_bias=True, + name="conv0", + workspace=workspace) + else: + body = Conv(data=data, + num_filter=sFilters, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type='relu', name='relu0') + + dcn = False + body = conv_block(body, mFilters, (1, 1), sFilters == mFilters, 'res0', + False, dcn, 1) + + body = mx.sym.Pooling(data=body, + kernel=(2, 2), + stride=(2, 2), + pad=(0, 0), + pool_type='max') + + body = conv_block(body, mFilters, (1, 1), True, 'res1', False, dcn, + 1) #TODO + body = conv_block(body, nFilters, (1, 1), mFilters == nFilters, 'res2', + binarize, dcn, 1) #binarize=True? 
+ + heatmap = None + + for i in range(nStacks): + shortcut = body + if config.net_sta > 0: + sta = STA(body, nFilters, nModules, config.net_n + 1, workspace, + 'sta%d' % (i)) + body = sta.get() + else: + body = hourglass(body, nFilters, nModules, config.net_n, workspace, + 'stack%d_hg' % (i), binarize, dcn) + for j in range(nModules): + body = conv_block(body, nFilters, (1, 1), True, + 'stack%d_unit%d' % (i, j), binarize, dcn, 1) + _dcn = True if config.net_dcn >= 2 else False + ll = ConvFactory(body, + nFilters, (1, 1), + dcn=_dcn, + name='stack%d_ll' % (i)) + _name = "heatmap%d" % (i) if i < nStacks - 1 else "heatmap" + _dcn = True if config.net_dcn >= 2 else False + if not _dcn: + out = Conv(data=ll, + num_filter=num_classes, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=_name, + workspace=workspace) + else: + out_offset = mx.symbol.Convolution(name=_name + '_offset', + data=ll, + num_filter=18, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1)) + out = mx.contrib.symbol.DeformableConvolution( + name=_name, + data=ll, + offset=out_offset, + num_filter=num_classes, + pad=(1, 1), + kernel=(3, 3), + num_deformable_group=1, + stride=(1, 1), + dilate=(1, 1), + no_bias=False) + #out = Conv(data=ll, num_filter=num_classes, kernel=(3,3), stride=(1,1), pad=(1,1), + # name=_name, workspace=workspace) + if i == nStacks - 1: + heatmap = out + loss = ce_loss(out, ref_label) + #loss = loss/nStacks + #loss = l2_loss(out, ref_label) + losses.append(loss) + if config.net_coherent > 0: + ux, dx = coherentor.get(out) + closs = l2_loss(ux, dx) + closs = closs / nStacks + closses.append(closs) + + if i < nStacks - 1: + ll2 = Conv(data=ll, + num_filter=nFilters, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name="stack%d_ll2" % (i), + workspace=workspace) + out2 = Conv(data=out, + num_filter=nFilters, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name="stack%d_out2" % (i), + workspace=workspace) + body = mx.symbol.add_n(shortcut, ll2, out2) + _dcn = True if 
(config.net_dcn == 1 + or config.net_dcn == 3) else False + if _dcn: + _name = "stack%d_out3" % (i) + out3_offset = mx.symbol.Convolution(name=_name + '_offset', + data=body, + num_filter=18, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1)) + out3 = mx.contrib.symbol.DeformableConvolution( + name=_name, + data=body, + offset=out3_offset, + num_filter=nFilters, + pad=(1, 1), + kernel=(3, 3), + num_deformable_group=1, + stride=(1, 1), + dilate=(1, 1), + no_bias=False) + body = out3 + + pred = mx.symbol.BlockGrad(heatmap) + #loss = mx.symbol.add_n(*losses) + #loss = mx.symbol.MakeLoss(loss) + #syms = [loss] + syms = [] + for loss in losses: + loss = mx.symbol.MakeLoss(loss) + syms.append(loss) + if len(closses) > 0: + coherent_weight = 0.0001 + closs = mx.symbol.add_n(*closses) + closs = mx.symbol.MakeLoss(closs, grad_scale=coherent_weight) + syms.append(closs) + syms.append(pred) + sym = mx.symbol.Group(syms) + return sym + + +def init_weights(sym, data_shape_dict): + #print('in hg') + arg_name = sym.list_arguments() + aux_name = sym.list_auxiliary_states() + arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(arg_name, arg_shape)) + aux_shape_dict = dict(zip(aux_name, aux_shape)) + #print(aux_shape) + #print(aux_params) + #print(arg_shape_dict) + arg_params = {} + aux_params = {} + for k in arg_shape_dict: + v = arg_shape_dict[k] + #print(k,v) + if k.endswith('offset_weight') or k.endswith('offset_bias'): + print('initializing', k) + arg_params[k] = mx.nd.zeros(shape=v) + elif k.startswith('fc6_'): + if k.endswith('_weight'): + print('initializing', k) + arg_params[k] = mx.random.normal(0, 0.01, shape=v) + elif k.endswith('_bias'): + print('initializing', k) + arg_params[k] = mx.nd.zeros(shape=v) + elif k.find('upsampling') >= 0: + print('initializing upsampling_weight', k) + arg_params[k] = mx.nd.zeros(shape=arg_shape_dict[k]) + init = mx.init.Initializer() + init._init_bilinear(k, arg_params[k]) + return arg_params, aux_params 
diff --git a/insightface/alignment/heatmap/test.py b/insightface/alignment/heatmap/test.py new file mode 100644 index 0000000000000000000000000000000000000000..160e5acaff080787e1596b3a56d190c2bf6dda9e --- /dev/null +++ b/insightface/alignment/heatmap/test.py @@ -0,0 +1,100 @@ +import argparse +import cv2 +import sys +import numpy as np +import os +import mxnet as mx +import datetime +import img_helper +sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'deploy')) +from mtcnn_detector import MtcnnDetector + + +class Handler: + def __init__(self, prefix, epoch, ctx_id=0): + print('loading', prefix, epoch) + if ctx_id >= 0: + ctx = mx.gpu(ctx_id) + else: + ctx = mx.cpu() + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + all_layers = sym.get_internals() + sym = all_layers['heatmap_output'] + image_size = (128, 128) + self.image_size = image_size + model = mx.mod.Module(symbol=sym, context=ctx, label_names=None) + #model = mx.mod.Module(symbol=sym, context=ctx) + model.bind(for_training=False, + data_shapes=[('data', (1, 3, image_size[0], image_size[1])) + ]) + model.set_params(arg_params, aux_params) + self.model = model + mtcnn_path = os.path.join(os.path.dirname(__file__), '..', 'deploy', + 'mtcnn-model') + self.det_threshold = [0.6, 0.7, 0.8] + self.detector = MtcnnDetector(model_folder=mtcnn_path, + ctx=ctx, + num_worker=1, + accurate_landmark=True, + threshold=self.det_threshold) + + def get(self, img): + ret = self.detector.detect_face(img, det_type=0) + if ret is None: + return None + bbox, points = ret + if bbox.shape[0] == 0: + return None + bbox = bbox[0, 0:4] + points = points[0, :].reshape((2, 5)).T + M = img_helper.estimate_trans_bbox(bbox, self.image_size[0], s=2.0) + rimg = cv2.warpAffine(img, M, self.image_size, borderValue=0.0) + img = cv2.cvtColor(rimg, cv2.COLOR_BGR2RGB) + img = np.transpose(img, (2, 0, 1)) #3*112*112, RGB + input_blob = np.zeros((1, 3, self.image_size[1], self.image_size[0]), + dtype=np.uint8) + 
input_blob[0] = img + ta = datetime.datetime.now() + data = mx.nd.array(input_blob) + db = mx.io.DataBatch(data=(data, )) + self.model.forward(db, is_train=False) + alabel = self.model.get_outputs()[-1].asnumpy()[0] + tb = datetime.datetime.now() + print('module time cost', (tb - ta).total_seconds()) + ret = np.zeros((alabel.shape[0], 2), dtype=np.float32) + for i in range(alabel.shape[0]): + a = cv2.resize(alabel[i], (self.image_size[1], self.image_size[0])) + ind = np.unravel_index(np.argmax(a, axis=None), a.shape) + #ret[i] = (ind[0], ind[1]) #h, w + ret[i] = (ind[1], ind[0]) #w, h + return ret, M + + +ctx_id = 4 +img_path = '../deploy/Tom_Hanks_54745.png' +img = cv2.imread(img_path) +#img = np.zeros( (128,128,3), dtype=np.uint8 ) + +handler = Handler('./model/HG', 1, ctx_id) +for _ in range(10): + ta = datetime.datetime.now() + landmark, M = handler.get(img) + tb = datetime.datetime.now() + print('get time cost', (tb - ta).total_seconds()) +#visualize landmark +IM = cv2.invertAffineTransform(M) +for i in range(landmark.shape[0]): + p = landmark[i] + point = np.ones((3, ), dtype=np.float32) + point[0:2] = p + point = np.dot(IM, point) + landmark[i] = point[0:2] + +for i in range(landmark.shape[0]): + p = landmark[i] + point = (int(p[0]), int(p[1])) + cv2.circle(img, point, 1, (0, 255, 0), 2) + +filename = './landmark_test.png' +print('writing', filename) +cv2.imwrite(filename, img) diff --git a/insightface/alignment/heatmap/test_rec_nme.py b/insightface/alignment/heatmap/test_rec_nme.py new file mode 100644 index 0000000000000000000000000000000000000000..baad7c7ba9e0ebf9fde52db926e4159400f91885 --- /dev/null +++ b/insightface/alignment/heatmap/test_rec_nme.py @@ -0,0 +1,71 @@ +import argparse +import cv2 +import sys +import numpy as np +import os +import mxnet as mx +import datetime +import img_helper +from config import config +from data import FaceSegIter +from metric import LossValueMetric, NMEMetric + +parser = argparse.ArgumentParser(description='test nme 
on rec data') +# general +parser.add_argument('--rec', + default='./data_2d/ibug.rec', + help='rec data path') +parser.add_argument('--prefix', default='', help='model prefix') +parser.add_argument('--epoch', type=int, default=1, help='model epoch') +parser.add_argument('--gpu', type=int, default=0, help='') +parser.add_argument('--landmark-type', default='2d', help='') +parser.add_argument('--image-size', type=int, default=128, help='') +args = parser.parse_args() + +rec_path = args.rec +ctx_id = args.gpu +prefix = args.prefix +epoch = args.epoch +image_size = (args.image_size, args.image_size) +config.landmark_type = args.landmark_type +config.input_img_size = image_size[0] + +if ctx_id >= 0: + ctx = mx.gpu(ctx_id) +else: + ctx = mx.cpu() +sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) +all_layers = sym.get_internals() +sym = all_layers['heatmap_output'] +#model = mx.mod.Module(symbol=sym, context=ctx, data_names=['data'], label_names=['softmax_label']) +model = mx.mod.Module(symbol=sym, + context=ctx, + data_names=['data'], + label_names=None) +#model = mx.mod.Module(symbol=sym, context=ctx) +model.bind(for_training=False, + data_shapes=[('data', (1, 3, image_size[0], image_size[1]))]) +model.set_params(arg_params, aux_params) + +val_iter = FaceSegIter( + path_imgrec=rec_path, + batch_size=1, + aug_level=0, +) +_metric = NMEMetric() +#val_metric = mx.metric.create(_metric) +#val_metric.reset() +#val_iter.reset() +nme = [] +for i, eval_batch in enumerate(val_iter): + if i % 10 == 0: + print('processing', i) + #print(eval_batch.data[0].shape, eval_batch.label[0].shape) + batch_data = mx.io.DataBatch(eval_batch.data) + model.forward(batch_data, is_train=False) + #model.update_metric(val_metric, eval_batch.label, True) + pred_label = model.get_outputs()[-1].asnumpy() + label = eval_batch.label[0].asnumpy() + _nme = _metric.cal_nme(label, pred_label) + nme.append(_nme) +print(np.mean(nme)) diff --git a/insightface/alignment/heatmap/train.py 
b/insightface/alignment/heatmap/train.py new file mode 100644 index 0000000000000000000000000000000000000000..81412e47f45ea95911bcffe3a5e7cceafbf42d54 --- /dev/null +++ b/insightface/alignment/heatmap/train.py @@ -0,0 +1,236 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging +import argparse +from data import FaceSegIter +import mxnet as mx +import mxnet.optimizer as optimizer +import numpy as np +import os +import sys +import math +import random +import cv2 +from config import config, default, generate_config +from optimizer import ONadam +from metric import LossValueMetric, NMEMetric +sys.path.append(os.path.join(os.path.dirname(__file__), 'symbol')) +import sym_heatmap +#import sym_fc +#from symbol import fc + +args = None +logger = logging.getLogger() +logger.setLevel(logging.INFO) + + +def main(args): + _seed = 727 + random.seed(_seed) + np.random.seed(_seed) + mx.random.seed(_seed) + ctx = [] + cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip() + if len(cvd) > 0: + for i in range(len(cvd.split(','))): + ctx.append(mx.gpu(i)) + if len(ctx) == 0: + ctx = [mx.cpu()] + print('use cpu') + else: + print('gpu num:', len(ctx)) + #ctx = [mx.gpu(0)] + args.ctx_num = len(ctx) + + args.batch_size = args.per_batch_size * args.ctx_num + config.per_batch_size = args.per_batch_size + + print('Call with', args, config) + train_iter = FaceSegIter( + path_imgrec=os.path.join(config.dataset_path, 'train.rec'), + batch_size=args.batch_size, + per_batch_size=args.per_batch_size, + aug_level=1, + exf=args.exf, + args=args, + ) + + data_shape = train_iter.get_data_shape() + #label_shape = train_iter.get_label_shape() + sym = sym_heatmap.get_symbol(num_classes=config.num_classes) + if len(args.pretrained) == 0: + #data_shape_dict = {'data' : (args.per_batch_size,)+data_shape, 'softmax_label' : (args.per_batch_size,)+label_shape} + data_shape_dict = train_iter.get_shape_dict() + arg_params, aux_params = 
sym_heatmap.init_weights(sym, data_shape_dict) + else: + vec = args.pretrained.split(',') + print('loading', vec) + _, arg_params, aux_params = mx.model.load_checkpoint( + vec[0], int(vec[1])) + #sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params) + + model = mx.mod.Module( + context=ctx, + symbol=sym, + label_names=train_iter.get_label_names(), + ) + #lr = 1.0e-3 + #lr = 2.5e-4 + _rescale_grad = 1.0 / args.ctx_num + #_rescale_grad = 1.0/args.batch_size + #lr = args.lr + #opt = optimizer.Nadam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale_grad, clip_gradient=5.0) + if args.optimizer == 'onadam': + opt = ONadam(learning_rate=args.lr, + wd=args.wd, + rescale_grad=_rescale_grad, + clip_gradient=5.0) + elif args.optimizer == 'nadam': + opt = optimizer.Nadam(learning_rate=args.lr, + rescale_grad=_rescale_grad) + elif args.optimizer == 'rmsprop': + opt = optimizer.RMSProp(learning_rate=args.lr, + rescale_grad=_rescale_grad) + elif args.optimizer == 'adam': + opt = optimizer.Adam(learning_rate=args.lr, rescale_grad=_rescale_grad) + else: + opt = optimizer.SGD(learning_rate=args.lr, + momentum=0.9, + wd=args.wd, + rescale_grad=_rescale_grad) + initializer = mx.init.Xavier(rnd_type='gaussian', + factor_type="in", + magnitude=2) + _cb = mx.callback.Speedometer(args.batch_size, args.frequent) + _metric = LossValueMetric() + #_metric = NMEMetric() + #_metric2 = AccMetric() + #eval_metrics = [_metric, _metric2] + eval_metrics = [_metric] + lr_steps = [int(x) for x in args.lr_step.split(',')] + print('lr-steps', lr_steps) + global_step = [0] + + def val_test(): + all_layers = sym.get_internals() + vsym = all_layers['heatmap_output'] + vmodel = mx.mod.Module(symbol=vsym, context=ctx, label_names=None) + #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))]) + vmodel.bind(data_shapes=[('data', (args.batch_size, ) + data_shape)]) + arg_params, aux_params = 
model.get_params() + vmodel.set_params(arg_params, aux_params) + for target in config.val_targets: + _file = os.path.join(config.dataset_path, '%s.rec' % target) + if not os.path.exists(_file): + continue + val_iter = FaceSegIter( + path_imgrec=_file, + batch_size=args.batch_size, + #batch_size = 4, + aug_level=0, + args=args, + ) + _metric = NMEMetric() + val_metric = mx.metric.create(_metric) + val_metric.reset() + val_iter.reset() + for i, eval_batch in enumerate(val_iter): + #print(eval_batch.data[0].shape, eval_batch.label[0].shape) + batch_data = mx.io.DataBatch(eval_batch.data) + model.forward(batch_data, is_train=False) + model.update_metric(val_metric, eval_batch.label) + nme_value = val_metric.get_name_value()[0][1] + print('[%d][%s]NME: %f' % (global_step[0], target, nme_value)) + + def _batch_callback(param): + _cb(param) + global_step[0] += 1 + mbatch = global_step[0] + for _lr in lr_steps: + if mbatch == _lr: + opt.lr *= 0.2 + print('lr change to', opt.lr) + break + if mbatch % 1000 == 0: + print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch) + if mbatch > 0 and mbatch % args.verbose == 0: + val_test() + if args.ckpt == 1: + msave = mbatch // args.verbose + print('saving', msave) + arg, aux = model.get_params() + mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg, + aux) + if mbatch == lr_steps[-1]: + if args.ckpt == 2: + #msave = mbatch//args.verbose + msave = 1 + print('saving', msave) + arg, aux = model.get_params() + mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg, + aux) + sys.exit(0) + + train_iter = mx.io.PrefetchingIter(train_iter) + + model.fit( + train_iter, + begin_epoch=0, + num_epoch=9999, + #eval_data = val_iter, + eval_data=None, + eval_metric=eval_metrics, + kvstore='device', + optimizer=opt, + initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + allow_missing=True, + batch_end_callback=_batch_callback, + epoch_end_callback=None, + ) + + +if __name__ == '__main__': + parser = 
argparse.ArgumentParser(description='Train face alignment') + # general + parser.add_argument('--network', + help='network name', + default=default.network, + type=str) + parser.add_argument('--dataset', + help='dataset name', + default=default.dataset, + type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--prefix', + default=default.prefix, + help='directory to save model.') + parser.add_argument('--pretrained', default=default.pretrained, help='') + parser.add_argument('--optimizer', default='nadam', help='') + parser.add_argument('--lr', type=float, default=default.lr, help='') + parser.add_argument('--wd', type=float, default=default.wd, help='') + parser.add_argument('--per-batch-size', + type=int, + default=default.per_batch_size, + help='') + parser.add_argument('--lr-step', + help='learning rate steps (in epoch)', + default=default.lr_step, + type=str) + parser.add_argument('--ckpt', type=int, default=1, help='') + parser.add_argument('--norm', type=int, default=0, help='') + parser.add_argument('--exf', type=int, default=1, help='') + parser.add_argument('--frequent', + type=int, + default=default.frequent, + help='') + parser.add_argument('--verbose', + type=int, + default=default.verbose, + help='') + args = parser.parse_args() + main(args) diff --git a/insightface/alignment/synthetics/README.md b/insightface/alignment/synthetics/README.md new file mode 100644 index 0000000000000000000000000000000000000000..698cc3ee7b7569b36ff4ba31d17433376769a068 --- /dev/null +++ b/insightface/alignment/synthetics/README.md @@ -0,0 +1,63 @@ +# Introduction + +We provide training and testing tools on synthetics data. + + +## Dataset + +### Training dataset + +Download `Face Synthetics dataset` from [https://github.com/microsoft/FaceSynthetics](https://github.com/microsoft/FaceSynthetics) and put it somewhere. + +
+ +
+
+ +Then use [tools/prepare_synthetics.py](tools/prepare_synthetics.py) for training data preparation. + + +### Testing dataset + +[300-W](https://ibug.doc.ic.ac.uk/resources/300-W/) + + +## Pretrained Model + +[ResNet50d](https://drive.google.com/file/d/1kNP7qEl3AYNbaHFUg_ZiyRB1CtfDWXR4/view?usp=sharing) + + +## Train and Test + +### Prerequisites + +- pytorch_lightning +- timm +- albumentations + +### Training + +`` python -u trainer_synthetics.py `` + +which uses `resnet50d` as backbone by default, please check the [code](trainer_synthetics.py) for detail. + +### Testing + +Please check [test_synthetics.py](test_synthetics.py) for detail. + + +## Result Visualization(3D 68 Keypoints) + +
+ +
+ +
+ +
+ +
+ +
+ + diff --git a/insightface/alignment/synthetics/datasets/augs.py b/insightface/alignment/synthetics/datasets/augs.py new file mode 100644 index 0000000000000000000000000000000000000000..08f47fccdc526f069494fc85b5c53d2153161c79 --- /dev/null +++ b/insightface/alignment/synthetics/datasets/augs.py @@ -0,0 +1,40 @@ +import numpy as np +import albumentations as A +from albumentations.core.transforms_interface import ImageOnlyTransform + +class RectangleBorderAugmentation(ImageOnlyTransform): + + def __init__( + self, + fill_value = 0, + limit = 0.3, + always_apply=False, + p=1.0, + ): + super(RectangleBorderAugmentation, self).__init__(always_apply, p) + assert limit>0.0 and limit<1.0 + self.fill_value = 0 + self.limit = limit + + + def apply(self, image, border_size_limit, **params): + assert len(border_size_limit)==4 + border_size = border_size_limit.copy() + border_size[0] *= image.shape[1] + border_size[2] *= image.shape[1] + border_size[1] *= image.shape[0] + border_size[3] *= image.shape[0] + border_size = border_size.astype(np.int) + image[:,:border_size[0],:] = self.fill_value + image[:border_size[1],:,:] = self.fill_value + image[:,-border_size[2]:,:] = self.fill_value + image[-border_size[3]:,:,:] = self.fill_value + return image + + def get_params(self): + border_size_limit = np.random.uniform(0.0, self.limit, size=4) + return {'border_size_limit': border_size_limit} + + def get_transform_init_args_names(self): + return ('fill_value', 'limit') + diff --git a/insightface/alignment/synthetics/datasets/dataset_synthetics.py b/insightface/alignment/synthetics/datasets/dataset_synthetics.py new file mode 100644 index 0000000000000000000000000000000000000000..eed440c0c3da385ba39d77701d4a37e78d760548 --- /dev/null +++ b/insightface/alignment/synthetics/datasets/dataset_synthetics.py @@ -0,0 +1,163 @@ +import os +import os.path as osp +import queue as Queue +import pickle +import threading +import logging +import numpy as np +import torch +from torch.utils.data 
import DataLoader, Dataset +from torchvision import transforms +import cv2 +import albumentations as A +from albumentations.pytorch import ToTensorV2 +from .augs import RectangleBorderAugmentation + +class BackgroundGenerator(threading.Thread): + def __init__(self, generator, local_rank, max_prefetch=6): + super(BackgroundGenerator, self).__init__() + self.queue = Queue.Queue(max_prefetch) + self.generator = generator + self.local_rank = local_rank + self.daemon = True + self.start() + + def run(self): + torch.cuda.set_device(self.local_rank) + for item in self.generator: + self.queue.put(item) + self.queue.put(None) + + def next(self): + next_item = self.queue.get() + if next_item is None: + raise StopIteration + return next_item + + def __next__(self): + return self.next() + + def __iter__(self): + return self + + +class DataLoaderX(DataLoader): + def __init__(self, local_rank, **kwargs): + super(DataLoaderX, self).__init__(**kwargs) + self.stream = torch.cuda.Stream(local_rank) + self.local_rank = local_rank + + def __iter__(self): + self.iter = super(DataLoaderX, self).__iter__() + self.iter = BackgroundGenerator(self.iter, self.local_rank) + self.preload() + return self + + def preload(self): + self.batch = next(self.iter, None) + if self.batch is None: + return None + with torch.cuda.stream(self.stream): + for k in range(len(self.batch)): + self.batch[k] = self.batch[k].to(device=self.local_rank, + non_blocking=True) + + def __next__(self): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + if batch is None: + raise StopIteration + self.preload() + return batch + + + +class FaceDataset(Dataset): + def __init__(self, root_dir, is_train): + super(FaceDataset, self).__init__() + + #self.local_rank = local_rank + self.is_train = is_train + self.input_size = 256 + self.num_kps = 68 + transform_list = [] + if is_train: + transform_list += \ + [ + A.ColorJitter(brightness=0.8, contrast=0.5, p=0.5), + A.ToGray(p=0.1), + A.ISONoise(p=0.1), + 
A.MedianBlur(blur_limit=(1,7), p=0.1), + A.GaussianBlur(blur_limit=(1,7), p=0.1), + A.MotionBlur(blur_limit=(5,12), p=0.1), + A.ImageCompression(quality_lower=50, quality_upper=90, p=0.05), + A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=40, interpolation=cv2.INTER_LINEAR, + border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0, p=0.8), + A.HorizontalFlip(p=0.5), + RectangleBorderAugmentation(limit=0.33, fill_value=0, p=0.2), + ] + transform_list += \ + [ + A.geometric.resize.Resize(self.input_size, self.input_size, interpolation=cv2.INTER_LINEAR, always_apply=True), + A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ToTensorV2(), + ] + self.transform = A.ReplayCompose( + transform_list, + keypoint_params=A.KeypointParams(format='xy', remove_invisible=False) + ) + self.root_dir = root_dir + with open(osp.join(root_dir, 'annot.pkl'), 'rb') as f: + annot = pickle.load(f) + self.X, self.Y = annot + train_size = int(len(self.X)*0.99) + + if is_train: + self.X = self.X[:train_size] + self.Y = self.Y[:train_size] + else: + self.X = self.X[train_size:] + self.Y = self.Y[train_size:] + #if local_rank==0: + # logging.info('data_transform_list:%s'%transform_list) + flip_parts = ([1, 17], [2, 16], [3, 15], [4, 14], [5, 13], [6, 12], [7, 11], [8, 10], + [18, 27], [19, 26], [20, 25], [21, 24], [22, 23], + [32, 36], [33, 35], + [37, 46], [38, 45], [39, 44], [40, 43], [41, 48], [42, 47], + [49, 55], [50, 54], [51, 53], [62, 64], [61, 65], [68, 66], [59, 57], [60, 56]) + self.flip_order = np.arange(self.num_kps) + for pair in flip_parts: + self.flip_order[pair[1]-1] = pair[0]-1 + self.flip_order[pair[0]-1] = pair[1]-1 + logging.info('len:%d'%len(self.X)) + print('!!!len:%d'%len(self.X)) + + def __getitem__(self, index): + x = self.X[index] + y = self.Y[index] + image_path = os.path.join(self.root_dir, x) + img = cv2.imread(image_path)[:,:,::-1] + label = y + if self.transform is not None: + t = self.transform(image=img, keypoints=label) + flipped = 
False + for trans in t["replay"]["transforms"]: + if trans["__class_fullname__"].endswith('HorizontalFlip'): + if trans["applied"]: + flipped = True + img = t['image'] + label = t['keypoints'] + label = np.array(label, dtype=np.float32) + #print(img.shape) + if flipped: + #label[:, 0] = self.input_size - 1 - label[:, 0] #already applied in horizantal flip aug + label = label[self.flip_order,:] + label /= (self.input_size/2) + label -= 1.0 + label = label.flatten() + label = torch.tensor(label, dtype=torch.float32) + return img, label + + def __len__(self): + return len(self.X) + diff --git a/insightface/alignment/synthetics/test_synthetics.py b/insightface/alignment/synthetics/test_synthetics.py new file mode 100644 index 0000000000000000000000000000000000000000..e1a46f810b54e9bc19c5821d4215c6b4e07d1239 --- /dev/null +++ b/insightface/alignment/synthetics/test_synthetics.py @@ -0,0 +1,104 @@ + +from trainer_synthetics import FaceSynthetics +import sys +import glob +import torch +import os +import numpy as np +import cv2 +import os.path as osp +import insightface +from insightface.app import FaceAnalysis +from insightface.utils import face_align + +flip_parts = ([1, 17], [2, 16], [3, 15], [4, 14], [5, 13], [6, 12], [7, 11], [8, 10], + [18, 27], [19, 26], [20, 25], [21, 24], [22, 23], + [32, 36], [33, 35], + [37, 46], [38, 45], [39, 44], [40, 43], [41, 48], [42, 47], + [49, 55], [50, 54], [51, 53], [62, 64], [61, 65], [68, 66], [59, 57], [60, 56]) + +app = FaceAnalysis() +app.prepare(ctx_id=0, det_size=(224, 224)) +input_size = 256 +USE_FLIP = False + +root = 'data/300W/Validation' +output_dir = 'outputs/' + +if not osp.exists(output_dir): + os.makedirs(output_dir) + +outf = open(osp.join(output_dir, 'pred.txt'), 'w') + +model = FaceSynthetics.load_from_checkpoint(sys.argv[1]).cuda() +model.eval() +for line in open(osp.join(root, '300W_validation.txt'), 'r'): + line = line.strip().split() + img_path = osp.join(root, line[0]) + gt = line[1:] + #print(len(gt)) + name = 
img_path.split('/')[-1] + img = cv2.imread(img_path) + dimg = img.copy() + faces = app.get(img, max_num=1) + if len(faces)!=1: + continue + bbox = faces[0].bbox + w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1]) + center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2 + rotate = 0 + _scale = input_size / (max(w, h)*1.5) + aimg, M = face_align.transform(img, center, input_size, _scale, rotate) + #cv2.imwrite("outputs/a_%s"%name, aimg) + aimg = cv2.cvtColor(aimg, cv2.COLOR_BGR2RGB) + kps = None + flips = [0, 1] if USE_FLIP else [0] + for flip in flips: + input = aimg.copy() + if flip: + input = input[:,::-1,:].copy() + input = np.transpose(input, (2, 0, 1)) + input = np.expand_dims(input, 0) + imgs = torch.Tensor(input).cuda() + imgs.div_(255).sub_(0.5).div_(0.5) + pred = model(imgs).detach().cpu().numpy().flatten().reshape( (-1, 2) ) + pred[:, 0:2] += 1 + pred[:, 0:2] *= (input_size // 2) + if flip: + pred_flip = pred.copy() + pred_flip[:, 0] = input_size - 1 - pred_flip[:, 0] + for pair in flip_parts: + tmp = pred_flip[pair[0] - 1, :].copy() + pred_flip[pair[0] - 1, :] = pred_flip[pair[1] - 1, :] + pred_flip[pair[1] - 1, :] = tmp + pred = pred_flip + if kps is None: + kps = pred + else: + kps += pred + kps /= 2.0 + #print(pred.shape) + + IM = cv2.invertAffineTransform(M) + kps = face_align.trans_points(kps, IM) + outf.write(line[0]) + outf.write(' ') + outf.write(' '.join(["%.5f"%x for x in kps.flatten()])) + outf.write("\n") + box = bbox.astype(np.int) + color = (0, 0, 255) + cv2.rectangle(dimg, (box[0], box[1]), (box[2], box[3]), color, 2) + kps = kps.astype(np.int) + #print(landmark.shape) + for l in range(kps.shape[0]): + color = (0, 0, 255) + cv2.circle(dimg, (kps[l][0], kps[l][1]), 1, color, 2) + + cv2.imwrite("outputs/%s"%name, dimg) + + #ret = np.argmax(feat) + #print(feat) + #outf.write("%s %.4f %.4f %.4f\n"%(line[0], feat[0], feat[1], feat[2])) + +outf.close() + diff --git a/insightface/alignment/synthetics/tools/prepare_synthetics.py 
b/insightface/alignment/synthetics/tools/prepare_synthetics.py new file mode 100644 index 0000000000000000000000000000000000000000..5e977d6110a82cccf300bd7d4845c67f87251733 --- /dev/null +++ b/insightface/alignment/synthetics/tools/prepare_synthetics.py @@ -0,0 +1,70 @@ + +import sys +import glob +import torch +import pickle +import os +import numpy as np +import cv2 +import os.path as osp +import insightface +from insightface.app import FaceAnalysis +from insightface.utils import face_align + +app = FaceAnalysis() +app.prepare(ctx_id=0, det_size=(224, 224)) +output_size = 384 + +input_dir = '/root/codebase/FaceSynthetics' +output_dir = 'data/synthetics' + +if not osp.exists(output_dir): + os.makedirs(output_dir) + +X = [] +Y = [] + +for i in range(0, 100000): + if i%1000==0: + print('loading', i) + x = "%06d.png"%i + img_path = osp.join(input_dir, x) + img = cv2.imread(img_path) + dimg = img.copy() + ylines = open(osp.join(input_dir, "%06d_ldmks.txt"%i)).readlines() + ylines = ylines[:68] + y = [] + for yline in ylines: + lmk = [float(x) for x in yline.strip().split()] + y.append( tuple(lmk) ) + pred = np.array(y) + faces = app.get(img, max_num=1) + if len(faces)!=1: + continue + bbox = faces[0].bbox + w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1]) + center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2 + rotate = 0 + _scale = output_size / (max(w, h)*1.5) + aimg, M = face_align.transform(dimg, center, output_size, _scale, rotate) + pred = face_align.trans_points(pred, M) + #box = bbox.astype(np.int) + #color = (0, 0, 255) + #cv2.rectangle(dimg, (box[0], box[1]), (box[2], box[3]), color, 2) + + #kps = pred.astype(np.int) + #for l in range(kps.shape[0]): + # color = (0, 0, 255) + # cv2.circle(aimg, (kps[l][0], kps[l][1]), 1, color, 2) + x = x.replace('png', 'jpg') + X.append(x) + y = [] + for k in range(pred.shape[0]): + y.append( (pred[k][0], pred[k][1]) ) + Y.append(y) + cv2.imwrite("%s/%s"%(output_dir, x), aimg) + + +with open(osp.join(output_dir, 
'annot.pkl'), 'wb') as pfile: + pickle.dump((X, Y), pfile, protocol=pickle.HIGHEST_PROTOCOL) + diff --git a/insightface/alignment/synthetics/trainer_synthetics.py b/insightface/alignment/synthetics/trainer_synthetics.py new file mode 100644 index 0000000000000000000000000000000000000000..5de7743dbb9a9b517ffc99248ab6a37bf96a793d --- /dev/null +++ b/insightface/alignment/synthetics/trainer_synthetics.py @@ -0,0 +1,140 @@ +from argparse import ArgumentParser + +import os +import os.path as osp +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.utils.data import DataLoader +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import LearningRateMonitor +from pytorch_lightning.loggers import TensorBoardLogger +import timm +from datasets.dataset_synthetics import FaceDataset, DataLoaderX + + +class FaceSynthetics(pl.LightningModule): + def __init__(self, backbone): + super().__init__() + self.save_hyperparameters() + backbone = timm.create_model(backbone, num_classes=68*2) + self.backbone = backbone + self.loss = nn.L1Loss(reduction='mean') + self.hard_mining = False + + def forward(self, x): + # use forward for inference/predictions + y = self.backbone(x) + return y + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.backbone(x) + if self.hard_mining: + loss = torch.abs(y_hat - y) #(B,K) + loss = torch.mean(loss, dim=1) #(B,) + B = len(loss) + S = int(B*0.5) + loss, _ = torch.sort(loss, descending=True) + loss = loss[:S] + loss = torch.mean(loss) * 5.0 + else: + loss = self.loss(y_hat, y) * 5.0 + self.log('train_loss', loss, on_epoch=True) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.backbone(x) + loss = self.loss(y_hat, y) + self.log('val_loss', loss, on_step=True) + + def test_step(self, batch, batch_idx): + x, y = batch + y_hat = self.backbone(x) + loss = self.loss(y_hat, y) + 
self.log('test_loss', loss) + + def configure_optimizers(self): + #return torch.optim.Adam(self.parameters(), lr=0.0002) + opt = torch.optim.SGD(self.parameters(), lr = 0.1, momentum=0.9, weight_decay = 0.0005) + def lr_step_func(epoch): + return 0.1 ** len([m for m in [15, 25, 28] if m <= epoch]) + scheduler = torch.optim.lr_scheduler.LambdaLR( + optimizer=opt, lr_lambda=lr_step_func) + lr_scheduler = { + 'scheduler': scheduler, + 'name': 'learning_rate', + 'interval':'epoch', + 'frequency': 1} + return [opt], [lr_scheduler] + + + +def cli_main(): + pl.seed_everything(727) + + # ------------ + # args + # ------------ + parser = ArgumentParser() + parser.add_argument('--backbone', default='resnet50d', type=str) + parser.add_argument('--batch_size', default=64, type=int) + parser.add_argument('--root', default='data/synthetics', type=str) + parser.add_argument('--num-gpus', default=2, type=int) + parser.add_argument('--tf32', action='store_true') + parser = pl.Trainer.add_argparse_args(parser) + args = parser.parse_args() + + if not args.tf32: + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + else: + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + torch.backends.cudnn.benchmark = True + + # ------------ + # data + # ------------ + train_set = FaceDataset(root_dir=args.root, is_train=True) + val_set = FaceDataset(root_dir=args.root, is_train=False) + + train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=3, pin_memory=True) + val_loader = DataLoader(val_set, batch_size=args.batch_size, shuffle=False) + + # ------------ + # model + # ------------ + model = FaceSynthetics(backbone=args.backbone) + ckpt_path = 'work_dirs/synthetics' + if not os.path.exists(ckpt_path): + os.makedirs(ckpt_path) + + # ------------ + # training + # ------------ + checkpoint_callback = ModelCheckpoint( + monitor='val_loss', + dirpath=ckpt_path, + 
filename='{epoch:02d}-{val_loss:.6f}', + save_top_k=10, + mode='min', + ) + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer( + gpus = args.num_gpus, + accelerator="ddp", + benchmark=True, + logger=TensorBoardLogger(osp.join(ckpt_path, 'logs')), + callbacks=[checkpoint_callback, lr_monitor], + check_val_every_n_epoch=1, + progress_bar_refresh_rate=1, + max_epochs=30, + ) + trainer.fit(model, train_loader, val_loader) + +if __name__ == '__main__': + cli_main() + diff --git a/insightface/attribute/README.md b/insightface/attribute/README.md new file mode 100644 index 0000000000000000000000000000000000000000..43fb2b2bd85559f025f42befcae5658dbba0152b --- /dev/null +++ b/insightface/attribute/README.md @@ -0,0 +1,33 @@ +## Face Attribute + + +
+ +
+ + +## Introduction + +These are the face attribute methods of [InsightFace](https://insightface.ai) + + +
+ +
+ + + +## Methods + + +Supported methods: + +- [x] [Gender_Age](gender_age) + + + +## Contributing + +We appreciate all contributions to improve the face attribute module of InsightFace. + + diff --git a/insightface/attribute/_datasets_/README.md b/insightface/attribute/_datasets_/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7b0096cf8310702121c2dcb53a4bd01db9b3ee6f --- /dev/null +++ b/insightface/attribute/_datasets_/README.md @@ -0,0 +1,15 @@ +# Face Attribute Datasets + +(Updating) + +## Training Datasets + +### CelebA + +https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html + + + +## Test Datasets + + diff --git a/insightface/attribute/gender_age/test.py b/insightface/attribute/gender_age/test.py new file mode 100644 index 0000000000000000000000000000000000000000..a92b21645480f85fb696eecad53f49d78af0fdaf --- /dev/null +++ b/insightface/attribute/gender_age/test.py @@ -0,0 +1,24 @@ +import argparse +import cv2 +import sys +import numpy as np +import insightface +from insightface.app import FaceAnalysis +from insightface.data import get_image as ins_get_image + + +parser = argparse.ArgumentParser(description='insightface gender-age test') +# general +parser.add_argument('--ctx', default=0, type=int, help='ctx id, <0 means using cpu') +args = parser.parse_args() + +app = FaceAnalysis(allowed_modules=['detection', 'genderage']) +app.prepare(ctx_id=args.ctx, det_size=(640,640)) + +img = ins_get_image('t1') +faces = app.get(img) +assert len(faces)==6 +for face in faces: + print(face.bbox) + print(face.sex, face.age) + diff --git a/insightface/benchmarks/train/nvidia_a10.md b/insightface/benchmarks/train/nvidia_a10.md new file mode 100644 index 0000000000000000000000000000000000000000..f18620bddc1fc4e2725672dfa20b7b7325839535 --- /dev/null +++ b/insightface/benchmarks/train/nvidia_a10.md @@ -0,0 +1,48 @@ +# Training performance report on NVIDIA A10 + +[NVIDIA A10 Tensor Core GPU](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) 
+ +We can use A10 to train deep learning models by its FP16 and TF32 supports. + + + +## Test Server Spec + +| Key | Value | +| ------------ | ------------------------------------------------ | +| System | ServMax G408-X2 Rackmountable Server | +| CPU | 2 x Intel(R) Xeon(R) Gold 5220R CPU @ 2.20GHz | +| Memory | 384GB, 12 x Samsung 32GB DDR4-2933 | +| GPU | 8 x NVIDIA A10 22GB | +| Cooling | 2x Customized GPU Kit for GPU support FAN-1909L2 | +| Hard Drive | Intel SSD S4500 1.9TB/SATA/TLC/2.5" | +| OS | Ubuntu 16.04.7 LTS | +| Installation | CUDA 11.1, cuDNN 8.0.5 | +| Installation | Python 3.7.10 | +| Installation | PyTorch 1.9.0 (conda) | + +This server is donated by [AMAX](https://www.amaxchina.com/), many thanks! + + + +## Experiments on arcface_torch + +We report training speed in following table, please also note that: + +1. The training dataset is in mxnet record format and located on SSD hard drive. + +2. Embedding-size are all set to 512. + +3. We use a large dataset which contains about 618K identities to simulate real cases. 
+ +| Dataset | Classes | Backbone | Batch-size | FP16 | TF32 | Samples/sec | +| ----------- | ------- | ----------- | ---------- | ---- | ---- | ----------- | +| WebFace600K | 618K | IResNet-50 | 1024 | × | × | ~2040 | +| WebFace600K | 618K | IResNet-50 | 1024 | × | √ | ~2255 | +| WebFace600K | 618K | IResNet-50 | 1024 | √ | × | ~3300 | +| WebFace600K | 618K | IResNet-50 | 1024 | √ | √ | ~3360 | +| WebFace600K | 618K | IResNet-50 | 2048 | √ | √ | ~3940 | +| WebFace600K | 618K | IResNet-100 | 1024 | √ | √ | ~2210 | +| WebFace600K | 618K | IResNet-180 | 1024 | √ | √ | ~1410 | + + diff --git a/insightface/benchmarks/train/nvidia_a100.md b/insightface/benchmarks/train/nvidia_a100.md new file mode 100644 index 0000000000000000000000000000000000000000..9af56fcfd3e5d4eb4cd85f9f745ef608fc46703a --- /dev/null +++ b/insightface/benchmarks/train/nvidia_a100.md @@ -0,0 +1,53 @@ +# Training performance report on NVIDIA A100 + +[NVIDIA A100 Tensor Core GPU](https://www.nvidia.com/en-us/data-center/a100/) + + + +## Test Server Spec + +| Key | Value | +| ------------ | ------------------------------------------------ | +| System | ServMax G408-X2 Rackmountable Server | +| CPU | 2 x Intel(R) Xeon(R) Gold 5220R CPU @ 2.20GHz | +| Memory | 384GB, 12 x Samsung 32GB DDR4-2933 | +| GPU | 8 x NVIDIA A100 80GB | +| Cooling | 2x Customized GPU Kit for GPU support FAN-1909L2 | +| Hard Drive | Intel SSD S4500 1.9TB/SATA/TLC/2.5" | +| OS | Ubuntu 16.04.7 LTS | +| Installation | CUDA 11.1, cuDNN 8.0.5 | +| Installation | Python 3.7.10 | +| Installation | PyTorch 1.9.0 (conda) | + +This server is donated by [AMAX](https://www.amaxchina.com/), many thanks! + + + +## Experiments on arcface_torch + +We report training speed in following table, please also note that: + +1. The training dataset is in mxnet record format and located on SSD hard drive. +2. Embedding-size are all set to 512. +3. We use large datasets with about 618K/2M identities to simulate real cases. +4. 
We test the 10K batch-size on real dataset to take the full advantage of 80GB memory. +5. We also test on huge synthetic datasets which include 50M~80M classes. + +| Dataset | Classes | Backbone | Batch-size | PFC | FP16 | TF32 | Samples/sec | GPU Mem(GB) | +| ----------- | ------- | ----------- | ---------- | ---- | ---- | ---- | ----------- | ----------- | +| WebFace600K | 618K | IResNet-50 | 1024 | × | × | × | ~3670 | ~18.2 | +| WebFace600K | 618K | IResNet-50 | 1024 | × | × | √ | ~4760 | ~15.0 | +| WebFace600K | 618K | IResNet-50 | 1024 | × | √ | × | ~5170 | ~10.1 | +| WebFace600K | 618K | IResNet-50 | 1024 | × | √ | √ | ~5400 | ~10.1 | +| WebFace600K | 618K | IResNet-50 | 2048 | × | √ | √ | ~7780 | ~16.4 | +| WebFace600K | 618K | IResNet-50 | 10240 | × | √ | √ | ~9400 | ~66.7 | +| WebFace600K | 618K | IResNet-100 | 1024 | × | √ | √ | ~3700 | ~13.1 | +| WebFace600K | 618K | IResNet-180 | 1024 | × | √ | √ | ~2380 | ~17.5 | +| WebFace2M | 2M | IResNet-100 | 1024 | × | √ | √ | ~3480 | ~20.5 | +| WebFace2M | 2M | IResNet-180 | 1024 | × | √ | √ | ~2350 | ~25.0 | +| WebFace2M | 2M | IResNet-300 | 1024 | × | √ | √ | ~1541 | ~32.6 | +| Virtual | 50M | IResNet-50 | 1024 | 0.1 | √ | √ | ~2700 | ~54.1 | +| Virtual | 70M | IResNet-50 | 1024 | 0.1 | √ | √ | ~2170 | ~73.7 | +| Virtual | 80M | IResNet-50 | 1024 | 0.1 | √ | √ | ~1080 | ~79.6 | + + diff --git a/insightface/benchmarks/train/nvidia_a30.md b/insightface/benchmarks/train/nvidia_a30.md new file mode 100644 index 0000000000000000000000000000000000000000..2053300a1a0fd4487e66b7ee4c6d4f5962ea8395 --- /dev/null +++ b/insightface/benchmarks/train/nvidia_a30.md @@ -0,0 +1,52 @@ +# Training performance report on NVIDIA A30 + +[NVIDIA A30 Tensor Core GPU](https://www.nvidia.com/en-us/data-center/products/a30-gpu/) is the most versatile mainstream +compute GPU for AI inference and mainstream enterprise +workloads. + +Besides, we can also use A30 to train deep learning models by its FP16 and TF32 supports. 
+ + + +## Test Server Spec + +| Key | Value | +| ------------ | ------------------------------------------------ | +| System | ServMax G408-X2 Rackmountable Server | +| CPU | 2 x Intel(R) Xeon(R) Gold 5220R CPU @ 2.20GHz | +| Memory | 384GB, 12 x Samsung 32GB DDR4-2933 | +| GPU | 8 x NVIDIA A30 24GB | +| Cooling | 2x Customized GPU Kit for GPU support FAN-1909L2 | +| Hard Drive | Intel SSD S4500 1.9TB/SATA/TLC/2.5" | +| OS | Ubuntu 16.04.7 LTS | +| Installation | CUDA 11.1, cuDNN 8.0.5 | +| Installation | Python 3.7.10 | +| Installation | PyTorch 1.9 (conda) | + +This server is donated by [AMAX](https://www.amaxchina.com/), many thanks! + + + +## Experiments on arcface_torch + +We report training speed in following table, please also note that: + +1. The training dataset is in mxnet record format and located on SSD hard drive. + +2. Embedding-size are all set to 512. + +3. We use a large dataset which contains about 618K identities to simulate real cases. + +| Dataset | Classes | Backbone | Batch-size | FP16 | TF32 | Samples/sec | +| ----------- | ------- | ----------- | ---------- | ---- | ---- | ----------- | +| WebFace600K | 618K | IResNet-50 | 1024 | × | × | ~2230 | +| WebFace600K | 618K | IResNet-50 | 1024 | × | √ | ~3200 | +| WebFace600K | 618K | IResNet-50 | 1024 | √ | × | ~3940 | +| WebFace600K | 618K | IResNet-50 | 1024 | √ | √ | ~4350 | +| WebFace600K | 618K | IResNet-50 | 2048 | √ | √ | ~5100 | +| WebFace600K | 618K | IResNet-100 | 1024 | √ | √ | ~2810 | +| WebFace600K | 618K | IResNet-180 | 1024 | √ | √ | ~1800 | + + + + diff --git a/insightface/benchmarks/train/nvidia_rtx3080.md b/insightface/benchmarks/train/nvidia_rtx3080.md new file mode 100644 index 0000000000000000000000000000000000000000..80af02d58547dea33301b6f9fd4c9bafd79f37d1 --- /dev/null +++ b/insightface/benchmarks/train/nvidia_rtx3080.md @@ -0,0 +1,58 @@ +# Training performance report on NVIDIA RTX3080 + +[GeForce RTX 
3080](https://www.nvidia.com/en-us/geforce/graphics-cards/30-series/rtx-3080-3080ti/) +The GeForce RTX™ 3080 Ti and RTX 3080 graphics cards deliver the ultra performance that gamers crave, powered by Ampere—NVIDIA’s 2nd gen RTX architecture. They are built with enhanced RT Cores and Tensor Cores, new streaming multiprocessors, and superfast G6X memory for an amazing gaming experience. + +Besides, we can also use GeForce RTX™ 3080 to train deep learning models by its FP16 and TF32 supports. + + + +## Test Server Spec + +| Key | Value | +|--------------|---------------------------------------------------| +| CPU | 2 x Intel(R) Xeon(R) Platinum 8255C CPU @ 2.50GHz | +| Memory | 384GB | +| GPU | 8 x GeForce RTX™ 3080 | +| OS | Ubuntu 18.04.4 LTS | +| Installation | CUDA 11.1, | +| Installation | Python 3.7.3 | +| Installation | PyTorch 1.9.0 (pip) | + + +## Experiments on arcface_torch + +We report training speed in following table, please also note that: + +1. The training dataset is SyntheticDataset. + +2. Embedding-size are all set to 512. + + +### 1. 2 Million Identities + +We use a large dataset which contains about 2 millions identities to simulate real cases. + + +| Dataset | Classes | Backbone | Batch-size | FP16 | Partial FC | Samples/sec | +|------------|------------|------------|------------|------|------------|-------------| +| WebFace40M | 2 Millions | IResNet-50 | 512 | × | × | Fail | +| WebFace40M | 2 Millions | IResNet-50 | 512 | x | √ | ~2190 | +| WebFace40M | 2 Millions | IResNet-50 | 512 | √ | × | Fail | +| WebFace40M | 2 Millions | IResNet-50 | 512 | √ | √ | ~2620 | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | × | × | Fail | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | x | √ | Fail | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | √ | × | Fail | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | √ | √ | ~3800 | + +### 2. 600K Identities + +We use a large dataset which contains about 600k identities to simulate real cases. 
+ +| Dataset | Classes | Backbone | Batch-size | Partial FC | FP16 | Samples/sec | +|-------------|---------|------------|------------|------------|------|-------------| +| WebFace600K | 618K | IResNet-50 | 512 | × | × | ~2023 | +| WebFace600K | 618K | IResNet-50 | 512 | × | √ | ~2392 | +| WebFace600K | 618K | IResNet-50 | 1024 | × | × | Fail | +| WebFace600K | 618K | IResNet-50 | 1024 | × | √ | Fail | +| WebFace600K | 618K | IResNet-50 | 1024 | √ | √ | ~4010 | diff --git a/insightface/benchmarks/train/nvidia_rtx3090.md b/insightface/benchmarks/train/nvidia_rtx3090.md new file mode 100644 index 0000000000000000000000000000000000000000..cbc15c84f4a9c0574ce5d8a4414cb27e4a63eb1e --- /dev/null +++ b/insightface/benchmarks/train/nvidia_rtx3090.md @@ -0,0 +1,57 @@ +# Training performance report on NVIDIA RTX3090 + +[GEFORCE RTX 3090](https://www.nvidia.com/en-us/geforce/graphics-cards/30-series/rtx-3090/) +The GeForce RTX™ 3090 is a big ferocious GPU (BFGPU) with TITAN class performance. + +Besides, we can also use GeForce RTX™ 3090 to train deep learning models by its FP16 and TF32 supports. + + + +## Test Server Spec + +| Key | Value | +|--------------|---------------------------------------------------| +| CPU | 2 x Intel(R) Xeon(R) Platinum 8255C CPU @ 2.50GHz | +| Memory | 384GB | +| GPU | 8 x GeForce RTX™ 3090 | +| OS | Ubuntu 18.04.4 LTS | +| Installation | CUDA 11.1, | +| Installation | Python 3.7.3 | +| Installation | PyTorch 1.9.0 (pip) | + + +## Experiments on arcface_torch + +We report training speed in following table, please also note that: + +1. The training dataset is SyntheticDataset. + +2. Embedding-size are all set to 512. + + +### 1. 2 Million Identities + +We use a large dataset which contains about 2 millions identities to simulate real cases. 
+ +| Dataset | Classes | Backbone | Batch-size | FP16 | TF32 | Partial FC | Samples/sec | +|------------|------------|------------|------------|------|------|------------|-------------| +| WebFace40M | 2 Millions | IResNet-50 | 512 | × | × | × | ~1750 | +| WebFace40M | 2 Millions | IResNet-50 | 512 | × | √ | × | ~1810 | +| WebFace40M | 2 Millions | IResNet-50 | 512 | √ | √ | × | ~2056 | +| WebFace40M | 2 Millions | IResNet-50 | 512 | √ | √ | √ | ~2850 | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | √ | √ | × | ~2810 | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | √ | √ | √ | ~4220 | +| WebFace40M | 2 Millions | IResNet-50 | 2048 | √ | √ | √ | ~5330 | + + +### 2. 600K Identities + +We use a large dataset which contains about 600k identities to simulate real cases. + +| Dataset | Classes | Backbone | Batch-size | FP16 | Samples/sec | +|-------------|---------|------------|------------|------|-------------| +| WebFace600K | 618K | IResNet-50 | 512 | × | ~2220 | +| WebFace600K | 618K | IResNet-50 | 512 | √ | ~2610 | +| WebFace600K | 618K | IResNet-50 | 1024 | × | ~2940 | +| WebFace600K | 618K | IResNet-50 | 1024 | √ | ~3790 | +| WebFace600K | 618K | IResNet-50 | 2048 | √ | ~4680 | \ No newline at end of file diff --git a/insightface/benchmarks/train/nvidia_v100.md b/insightface/benchmarks/train/nvidia_v100.md new file mode 100644 index 0000000000000000000000000000000000000000..6f3aee107e7245555928d2a1c43a843999d59dfb --- /dev/null +++ b/insightface/benchmarks/train/nvidia_v100.md @@ -0,0 +1,54 @@ +# Training performance report on NVIDIA® V100 + +[NVIDIA® V100](https://www.nvidia.com/en-us/data-center/v100/) +NVIDIA® V100 Tensor Core is the most advanced data center GPU ever built to accelerate AI, high performance computing (HPC), data science and graphics. It’s powered by NVIDIA Volta architecture, comes in 16 and 32GB configurations, and offers the performance of up to 32 CPUs in a single GPU. 
+ +Besides, we can also use NVIDIA® V100 to train deep learning models by its FP16 and FP32 supports. + +## Test Server Spec + +| Key | Value | +|--------------|----------------------------------------------| +| CPU | 2 x Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz | +| Memory | 384GB | +| GPU | 8 x Tesla V100-SXM2-32GB | +| OS | Ubuntu 16.04 LTS | +| Installation | CUDA 10.2 | +| Installation | Python 3.7.3 | +| Installation | PyTorch 1.9.0 (pip) | + +## Experiments on arcface_torch + +We report training speed in following table, please also note that: + +1. The training dataset is SyntheticDataset. + +2. Embedding-size are all set to 512. + +### 1. 2 Million Identities + +We use a large dataset which contains about 2 millions identities to simulate real cases. + +| Dataset | Classes | Backbone | Batch-size | FP16 | Partial FC | Samples/sec | +|------------|------------|------------|------------|------|------------|-------------| +| WebFace40M | 2 Millions | IResNet-50 | 512 | × | × | ~1868 | +| WebFace40M | 2 Millions | IResNet-50 | 512 | x | √ | ~2712 | +| WebFace40M | 2 Millions | IResNet-50 | 512 | √ | × | ~2576 | +| WebFace40M | 2 Millions | IResNet-50 | 512 | √ | √ | ~4501 | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | × | × | ~1960 | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | x | √ | ~2922 | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | √ | × | ~2810 | +| WebFace40M | 2 Millions | IResNet-50 | 1024 | √ | √ | ~5430 | +| WebFace40M | 2 Millions | IResNet-50 | 2048 | √ | √ | ~6095 | + +### 2. 600K Identities + +We use a large dataset which contains about 600k identities to simulate real cases. 
+ +| Dataset | Classes | Backbone | Batch-size | FP16 | Samples/sec | +|-------------|---------|------------|------------|------|-------------| +| WebFace600K | 618K | IResNet-50 | 512 | × | ~2430 | +| WebFace600K | 618K | IResNet-50 | 512 | √ | ~3889 | +| WebFace600K | 618K | IResNet-50 | 1024 | × | ~2607 | +| WebFace600K | 618K | IResNet-50 | 1024 | √ | ~4322 | +| WebFace600K | 618K | IResNet-50 | 2048 | √ | ~4921 | diff --git a/insightface/body/human_pose/ambiguity_aware/README.md b/insightface/body/human_pose/ambiguity_aware/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cd0fac66bf16f35620c4e4910285692f74a363ca --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/README.md @@ -0,0 +1,94 @@ +# Towards Alleviating the Modeling Ambiguity of Unsupervised Monocular 3D Human Pose Estimation + +## Introduction + +**Ambiguity-Aware** studies the ambiguity problem in the task of unsupervised 3D human pose estimation from 2D counterpart, please refer to [ICCV2022](https://openaccess.thecvf.com/content/ICCV2021/papers/Yu_Towards_Alleviating_the_Modeling_Ambiguity_of_Unsupervised_Monocular_3D_Human_ICCV_2021_paper.pdf) for more details. + + +
+ videovis +
+ + +## Installation +``` +conda create -n uvhpe python=3.6 +conda activate uvhpe +pip install -r requirements.txt +# for output, tensorboard, visualization +mkdir log output vis models data +``` + +## Dataset And Pretrained Models +Download our preprocessed dataset into `data` and pretrained models into `models` from [webpage](https://sites.google.com/view/ambiguity-aware-hpe) + +This part will be updated soon. +## Inference +We put some samples with preprocessed 2d keypoints at `scripts/demo_input`. Run inference with command `sh demo.sh` and output can be found at `scripts/demo_output`. + +## Evaluation +### Evaluation on Human3.6M +##### 2D ground-truth as inputs +* baseline `python main.py --cfg ../cfg/h36m_gt_adv.yaml --pretrain ../models/adv.pth.tar --gpu 0 --eval ` +* scale `python main.py --cfg ../cfg/h36m_gt_scale.yaml --pretrain ../models/tmc_klbone.pth.tar --eval --gpu 0` + +##### 2D predictions as inputs +* baseline `python main.py --cfg ../cfg/pre_adv.yaml --pretrain ../models/pre_adv.pth.tar --gpu 0 --eval ` +* scale `python main.py --cfg ../cfg/pre_tmc_klbone.yaml --pretrain ../models/pre_tmc_klbone.pth.tar --gpu 0 --eval ` + +**Note:** baseline is our reproduced version fo "Unsupervised 3d pose estimation with geometric self-supervision" + +### Evaluation on LSP +use the pretrained model from Human3.6M + +`python eval_lsp.py --cfg ../cfg/h36m_gt_scale.yaml --pretrain ../models/tmc_klbone.pth.tar` + +### Results + +The expected **MPJPE** and **P-MPJPE** results on **Human36M** dataset are shown here: + +| Input | Model | MPJPE | PMPJPE | +| :--------- | :------------ | :------------: | :------------: | +| GT | baseline | 105.0 | 46.0 | +| GT | best | 87.85 | 42.0 | +| Pre | baseline | 113.3 | 54.9 | +| Pre | best | 93.1 | 52.3 | + + +**Note:** MPJPE from the evaluation is slightly different from the performance we release in the paper. This is because MPJPE in the paper is the best MPJPE during training process. 
+ + + +## Training +### Human3.6M +* Using ground-truth 2D as inputs: + + baseline `python main.py --cfg ../cfg/h36m_gt_adv.yaml --gpu 0 ` + + best `python main.py --cfg ../cfg/h36m_gt_scale.yaml --gpu 0` + +* Using predicted 2D as inputs: + + baseline `python main.py --cfg ../cfg/pre_adv.yaml --gpu 0 ` + + best `python main.py --cfg ../cfg/pre_tmc_klbone.yaml --gpu 0` + +## Visualization + +### Human3.6M +
+ +
### SURREAL
+ +
+ +### MPI-3DHP +
+ +
### The code of our other ICCV 2021 paper, Skeleton2Mesh, will be coming soon!
SCHEDULER_STEP_SIZE: 5 + USE_CYCLE: true + USE_NEW_ROT: false + USE_NEW_TEMP: true + USE_SCALER: true +USE_GT: true +FIX: + FIX_TRAJ: true + FIX_TRAJ_BY_ROT: false diff --git a/insightface/body/human_pose/ambiguity_aware/cfg/pre_adv.yaml b/insightface/body/human_pose/ambiguity_aware/cfg/pre_adv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..603af24ebaad7f610f80b6a36e107d872ed47f80 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/cfg/pre_adv.yaml @@ -0,0 +1,36 @@ +BATCH_SIZE: 512 +DATA: + NUM_FRAMES: 1 + SCALE_MID_MEAN: 0.720643 + SCALE_MID_STD: 0.058 + USE_RANDOM_DIFF: true +NETWORK: + DIS_RES_BLOCKS: 2 + DIS_TEMP_RES_BLOCKS: 2 + DIS_USE_SPECTRAL_NORM: false + SCALER_INPUT_SIZE: 34 +TRAIN: + BOUND_AZIM: 2.44346 + BOUND_ELEV: 0.34906585 + DIS_LR: 0.0001 + LOSS_TYPE: ss_adv + LOSS_WEIGHTS: + - 0.5 + - 5.0 + - 1.0 + - 1.0 + MAINNET_CRITICS: 4 + NUM_CRITICS: 3 + NUM_CRITICS_TEMP: 3 + POSE_LR: 0.0001 + PRETRAIN_LIFTER: false + SCALE_LOSS_WEIGHTS: + - 0.001 + - 1.0 + SUBNET_CRITICS: 1 + TEMP_LR: 0.0002 + USE_CYCLE: false + USE_NEW_ROT: false + USE_NEW_TEMP: false + USE_SCALER: false +USE_GT: false diff --git a/insightface/body/human_pose/ambiguity_aware/cfg/pre_tmc_klbone.yaml b/insightface/body/human_pose/ambiguity_aware/cfg/pre_tmc_klbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95c3fda6649093d661244998207127fa0d59d6c5 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/cfg/pre_tmc_klbone.yaml @@ -0,0 +1,39 @@ +BATCH_SIZE: 512 +DATA: + EXP_TMC: true + EXP_TMC_DETERMINISTIC: true + EXP_TMC_INTERVAL: 3 + NUM_FRAMES: 1 + SCALE_MID_MEAN: 0.720643 + SCALE_MID_STD: 0.058 + USE_RANDOM_DIFF: true +NETWORK: + DIS_RES_BLOCKS: 2 + DIS_TEMP_RES_BLOCKS: 2 + DIS_USE_SPECTRAL_NORM: false + SCALER_INPUT_SIZE: 34 +TRAIN: + BOUND_AZIM: 2.44346 + BOUND_ELEV: 0.34906585 + DIS_LR: 0.0001 + LOSS_TYPE: ss_adv + LOSS_WEIGHTS: + - 0.5 + - 5.0 + - 1.0 + - 1.0 + MAINNET_CRITICS: 4 + NUM_CRITICS: 3 + 
NUM_CRITICS_TEMP: 3 + POSE_LR: 0.0001 + PRETRAIN_LIFTER: false + SCALE_LOSS_WEIGHTS: + - 0.01 + - 1.0 + SUBNET_CRITICS: 1 + TEMP_LR: 0.0002 + USE_CYCLE: true + USE_NEW_ROT: false + USE_NEW_TEMP: false + USE_SCALER: true +USE_GT: false diff --git a/insightface/body/human_pose/ambiguity_aware/requirements.txt b/insightface/body/human_pose/ambiguity_aware/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e11323eeca2257ef7db28116c2cd306240b8121 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/requirements.txt @@ -0,0 +1,12 @@ +torch==1.13.1 +torchvision==0.5.0 +tqdm +opencv-python +scikit-learn +matplotlib +h5py +pyyaml +seaborn +imageio +easydict +tensorboardX diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/_init_paths.py b/insightface/body/human_pose/ambiguity_aware/scripts/_init_paths.py new file mode 100644 index 0000000000000000000000000000000000000000..125a82bdd4b024a1ea4f9646c69fb8639c97b7a4 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/_init_paths.py @@ -0,0 +1,13 @@ +import os.path as osp +import sys + + +def add_path(path): + if path not in sys.path: + sys.path.insert(0, path) + + +this_dir = osp.dirname(__file__) + +lib_path = osp.join(this_dir, '..') +add_path(lib_path) diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo.sh b/insightface/body/human_pose/ambiguity_aware/scripts/demo.sh new file mode 100755 index 0000000000000000000000000000000000000000..008e0ec348e874f2fb4825805ffb13c12182656a --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +rm -rf demo_output + +python inference.py --indir demo_input --outdir demo_output --cfg ../cfg/h36m_gt_scale.yaml --pretrain ../models/tmc_klbone.pth.tar diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/0.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/0.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..bb0433aae2e61a367dd6e910491543efda4e0c6a Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/0.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/0.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..847c5d1ac6e0b166a5d61cb1c41d72de27ebf314 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15e90d8305cb0946a6f3c08c625dad617032102e520eb9c5d4c17d0af1609482 +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/1.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e25834de127bfea3b75ad613ae96d3d7bec7a861 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/1.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/1.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..59afaca4f05f52c0453afeb92cbcf525ad1dd1c5 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aa3178631c3dd2613b3def95a2c17e284bb262ee62cefdf647fe768a8c6efc6 +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/10.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/10.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1652d95b5a518d6ee310846e8175f2adc41aa6dd Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/10.jpg differ diff --git 
a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/10.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/10.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2a729c25ef6e5cb49a8931fa87be2092c71a3b3e --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/10.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01f9a4cc1b06719e65b4af28a3ac62e27afaadacc1028c9ae8e8e96d5724e23e +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/11.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/11.jpg new file mode 100644 index 0000000000000000000000000000000000000000..85e098f6d9a7a9679d866bd534b7e1b5f9d75503 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/11.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/11.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/11.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0fde1dee90296709c407d34e451fcfaa9fbeea34 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/11.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88c1b12e23512a66e8cb7802c06b447924ad5efae06098caf9de42ec996cd1ef +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/2.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9a629ee9bb0e412a588e8b7b0f170b4df05531e7 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/2.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/2.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a03b5b2f46cd387f507eebb25918aef62327bb9f --- 
/dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65880459ce2507018ead98d3e6a137c4267c5a81c7b5a77bf4d7224222dc3351 +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/3.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..98774475e5a3c5ef7080c38ef11f6e63cb97fda6 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/3.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/3.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..dcb1b7a5478ca14d1793a20579f31f8ef513ef06 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50d063215f2aa045499d9a4e80a873678c0b47308f90996d0d3933ba9db04a8a +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/4.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f48b5d16e6462d57918a312c85a84dadd3c28fdc Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/4.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/4.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1c2c9d56efa7d9f3eca9d23ada854e6e83120f0b --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e21a4df9f03717d45cdb474bcd6bb0c4c1458f09823e1232ff861b0891bf8a7a +size 513 diff --git 
a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/5.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..258a447bc5a2add14ae691ed540ed8698d2c53b2 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/5.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/5.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cabd0d62d08cf5cb5074ed50557a2984b8169a0e --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:751fa527dd29ab95ae0be4f9e9f45fd655a39eddb7a377cb570dc9df51fad740 +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/6.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/6.jpg new file mode 100644 index 0000000000000000000000000000000000000000..15fa679c91a22a900beeb18d53da52aaa596dea9 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/6.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/6.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..65285b54ca5b32d919399e106e7f995ed4ab0408 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69ca75495d43c587dfb08c753b57a493b401e090634fad55edbc04f833ea12bd +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/7.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/7.jpg new file mode 100644 index 0000000000000000000000000000000000000000..708562da70c45014b38bc8175ebcd42e6f79cb63 Binary files 
/dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/7.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/7.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a63155059847b29112a073833a427e99b40932b3 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cab1c6c82ce3e81dbaae9d0a9e8691355d5d0299bdb6f797c5a1b9d0acd55e5 +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/8.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/8.jpg new file mode 100644 index 0000000000000000000000000000000000000000..76f0b8132b15b401aefc22724d95b0f3b1f4cff3 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/8.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/8.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/8.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9d274ccb94cc89a824cfef72e38e9cf433edc264 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/8.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70be7a3513c84438d27dd9cc18d2e43cbc2e5a874cb991023bb6b8c0e7e7ec73 +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/9.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/9.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9e87baef6da59bed5ff53554a694c117f6e5450b Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/9.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/9.pkl b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/9.pkl new file mode 100644 
index 0000000000000000000000000000000000000000..9bf4632e337cd15818acb7159e3fdc4c9cb13fa7 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/demo_input/9.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3dc28b12e86c7c8359888bd0f3a43b895dfcbd6caeeb915a299ac14e7817b8c +size 513 diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/0_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/0_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c611607b8a033b61d69b97363944c8b15c06a78f Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/0_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/10_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/10_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..10360b8076428b0825490da3ddd8f177938715a6 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/10_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/11_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/11_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ae4d8db4fd18d3f6f3403986a6b743b919f9264e Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/11_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/1_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/1_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..06c545dd51cee622d01fb797c93fed89a37ba6fb Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/1_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/2_out.jpg 
b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/2_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..51d395fd7dd7091fade5f966ea4bb22eb0859350 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/2_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/3_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/3_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..34b071f954e1494e1d6d193e7badebed591fb130 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/3_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/4_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/4_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0d0c06bc935b64f3dd916628b20a768bf423e8e0 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/4_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/5_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/5_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fde6bf0921108a7d4b2549d292de6c4e5946d788 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/5_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/6_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/6_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..70bdd20d54ab9162f477b7eb4b66919a3b063f78 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/6_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/7_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/7_out.jpg new file 
mode 100644 index 0000000000000000000000000000000000000000..27f770e159c3a35eb092c65b177db1f93fe7eca2 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/7_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/8_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/8_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b7ac30eec7a98ec3bc65a8a2e48943e8fa51ba62 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/8_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/9_out.jpg b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/9_out.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1631baea2ef10a023dd129dece2483e4e59ae142 Binary files /dev/null and b/insightface/body/human_pose/ambiguity_aware/scripts/demo_output/9_out.jpg differ diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/eval_lsp.py b/insightface/body/human_pose/ambiguity_aware/scripts/eval_lsp.py new file mode 100644 index 0000000000000000000000000000000000000000..2307212dc9efea0f94c86d331804e2fc79150ca5 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/eval_lsp.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import _init_paths +import os +import os.path as osp +import cv2 +import numpy as np +import torch +import argparse +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +plt.switch_backend('agg') +plt.ioff() + +from tqdm import tqdm +from torch.utils.data import DataLoader +from lib.models.model import get_pose_model, get_discriminator +from lib.core.config import config, update_config, update_dir +from lib.dataloader.lsp import LSPDataset + +image_root = "../data/lsp_images" +pairs = [(0, 1), (1, 2), (2, 13), (3, 13), (3, 4), (4, 5), (6, 7), (7, 8), (8, 12), (9, 10),(9, 12), (10, 11),(12, 14), 
(12, 15), (13, 14), (15, 16)] +pairs_left = [(3, 13), (3, 4), (4, 5), (9, 10), (9, 12), (10, 11)] +pairs_right = [(0, 1), (1, 2), (2, 13), (6, 7), (7, 8), (8, 12)] +colors = { + 'pink': np.array([197, 27, 125]), # L lower leg + 'light_pink': np.array([233, 163, 201]), # L upper leg + 'light_green': np.array([161, 215, 106]), # L lower arm + 'green': np.array([77, 146, 33]), # L upper arm + 'red': np.array([215, 48, 39]), # head + 'light_red': np.array([252, 146, 114]), # head + 'light_orange': np.array([252, 141, 89]), # chest + 'purple': np.array([118, 42, 131]), # R lower leg + 'light_purple': np.array([175, 141, 195]), # R upper + 'light_blue': np.array([145, 191, 219]), # R lower arm + 'blue': np.array([69, 117, 180]), # R upper arm + 'gray': np.array([130, 130, 130]), # + 'white': np.array([255, 255, 255]), # +} + +parser = argparse.ArgumentParser() +parser.add_argument('--cfg', default='../cfg/h36m_gt_adv.yaml') +parser.add_argument('--pretrain', default='../models/adv.pth.tar') +# parser.add_argument('--cfg', default='../cfg/best_samenorm.yaml') +args = parser.parse_args() + +update_config(args.cfg) + +pose_model = get_pose_model(config) +print(pose_model) +# state_dict = torch.load("../output/model4118.pth.tar")['pose_model_state_dict'] +assert osp.exists(args.pretrain), "Can not find pretrained model at {}".format(args.pretrain) +state_dict = torch.load(args.pretrain)['pose_model_state_dict'] +pose_model.load_state_dict(state_dict) +pose_model.eval() +torch.set_grad_enabled(False) + +lsp_dataset = LSPDataset() +lsp_loader = DataLoader(lsp_dataset, batch_size=32, num_workers=4, drop_last=False) + +all_joints_2d = [] +all_joints_3d_pre = [] +for joints_2d, original_joints_2d in tqdm(lsp_loader): + joints_3d, _, _ = pose_model(joints_2d, is_train=False) + all_joints_2d.append(original_joints_2d.numpy()) + all_joints_3d_pre.append(joints_3d.numpy()) + +all_joints_2d = np.concatenate(all_joints_2d, axis=0) +all_joints_3d_pre = np.concatenate(all_joints_3d_pre, 
axis=0) +print(all_joints_3d_pre.shape) + + +for idx, joints_3d_pre in tqdm(enumerate(all_joints_3d_pre)): + joints_2d = all_joints_2d[idx] + joints_3d_pre = joints_3d_pre - joints_3d_pre[13:14] + image_path = osp.join(image_root, "im%04d.jpg" % (idx + 1)) + print(image_path) + image = cv2.imread(image_path) + + fig = plt.figure(figsize=(10, 10)) + ax = fig.add_subplot(111, projection='3d') + ax.view_init(elev=10., azim=45.) + + for pair in pairs: + i, j = pair + if pair in pairs_left: + color = "blue" + cv_color = (255, 0, 0) + elif pair in pairs_right: + color = "green" + cv_color = (0, 255, 0) + else: + color = "darkorange" + cv_color = (89, 141, 252) + x1, y1 = joints_2d[i].astype(np.int) + x2, y2 = joints_2d[j].astype(np.int) + + cv2.line(image, (x1, y1), (x2, y2), cv_color, 2) + x1, y1, z1 = joints_3d_pre[i] + x2, y2, z2 = joints_3d_pre[j] + ax.plot([z1, z2], [x1, x2], [-y1, -y2], c=color, linewidth=3) + + image = image[::-1, :, ::-1].copy().astype(np.float) / 255. + r = 0.95 + xroot = yroot = zroot = 0. 
+ # radius = max(4, (np.mean(image.shape[:2]) * 0.01).astype(int)) + radius = 0.75 + xx = np.linspace(-r * radius + xroot, r * radius + xroot, image.shape[1]) + yy = np.linspace(-r * radius + yroot, r * radius + yroot, image.shape[0]) + xx, yy = np.meshgrid(xx, yy) + zz = np.ones_like(xx) * (-3.2* radius + zroot) + ax.plot_surface(zz, xx, yy, rstride=1, cstride=1, facecolors=image, shade=False) + ax.set_xlabel('Z', fontsize=13) + ax.set_ylabel("X", fontsize=13) + ax.set_zlabel("Y", fontsize=13) + ax.set_ylim3d([-radius+xroot, radius+xroot]) + ax.set_zlim3d([-radius+yroot, radius+yroot]) + ax.set_xlim3d([-2.5 * radius+zroot, radius+zroot]) + ax.get_xaxis().set_ticklabels([]) + ax.get_yaxis().set_ticklabels([]) + ax.set_zticklabels([]) + + white = (1.0, 1.0, 1.0, 0.0) + ax.w_xaxis.set_pane_color(white) + ax.w_yaxis.set_pane_color(white) + + ax.w_xaxis.line.set_color(white) + ax.w_yaxis.line.set_color(white) + ax.w_zaxis.line.set_color(white) + + plt.savefig("lsp_vis/{}.png".format(idx+1)) + plt.close() diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/get_diff.py b/insightface/body/human_pose/ambiguity_aware/scripts/get_diff.py new file mode 100644 index 0000000000000000000000000000000000000000..fe41f027ad8a9ed678db0278a66b878cbfbfd6dd --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/get_diff.py @@ -0,0 +1,91 @@ +import numpy as np +import h5py +import argparse + +np.random.seed(2019) +parser = argparse.ArgumentParser(description="Generate the diff data") +parser.add_argument("--valid", action="store_true") +parser.add_argument("--use_random", action="store_true") +# specify the interval +parser.add_argument("--bound", default=1, type=int, required=False) +parser.add_argument("--use_previous", action="store_true", help="Specify whether to use previous frames or not") +parser.add_argument('--use_pre', action='store_true') +args = parser.parse_args() + +# compute the difference of frames +bound = args.bound +is_train = not 
args.valid +use_random = args.use_random +use_previous = args.use_previous +use_pre = args.use_pre +# in_filename = "../data/kinetics_final.h5" +suffix = str(bound) if bound > 1 else "" +if use_random: + suffix += "_rand" +if use_pre: + suffix += "_pre" + +in_filename = "../data/h36m_{}_pred3.h5".format("train" if is_train else "valid") +out_filename = "../data/h36m_{}_diff{}.h5".format("train" if is_train else "valid", suffix) + +f = h5py.File(in_filename, "r") +names = [name.decode() for name in f['imagename'][:]] +joints_2d = np.array(f['joint_2d_gt' if not use_pre else "joint_2d_pre"]) +f.close() +print("Load from", in_filename) + +size = joints_2d.shape[0] +splits = [name.split('/') for name in names] +sequences = ['/'.join(split[:3]) for split in splits] +indices = [int(split[-1]) for split in splits] + +# calculate the length of each sequence +seq_lens = {} +for split in splits: + seq = '/'.join(split[:3]) + if seq not in seq_lens: + seq_lens[seq] = 0 + seq_lens[seq] += 1 + +intervals = np.random.randint(1, bound + 1, (size, )) +if not use_random: + intervals.fill(bound) + +if use_previous: + spec_indices = [i for i, index in enumerate(indices) if index < intervals[i]] + diff_indices = np.arange(0, size, 1) - intervals + diff_indices[spec_indices] += 2 * intervals[spec_indices] +else: + spec_indices = [i for i, index in enumerate(indices) if index >= seq_lens[sequences[i]] - intervals[i]] + diff_indices = np.arange(0, size, 1) + intervals + diff_indices[spec_indices] -= 2 * intervals[spec_indices] + +# before_joints = np.concatenate((joints_2d[:1].copy(), joints_2d[:-1].copy()), axis=0) +# after_joints = np.concatenate((joints_2d[1:].copy(), joints_2d[-1:].copy()), axis=0) +# print(before_joints.shape, after_joints.shape) + +# diff_before = joints_2d - before_joints +# diff_after = joints_2d - after_joints +# diff_before, diff_after = before_joints, after_joints +# diff_before, diff_after = diff_before[:, np.newaxis], diff_after[:, np.newaxis] + +# finally 
process the special cases +# diff_before[start_indices] = diff_after[start_indices] +# diff_after[end_indices] = diff_before[end_indices] + +# diff = np.concatenate((diff_before, diff_after), axis=1) +# print(diff.shape) + +# diff_types = np.ones((len(diff), ), dtype=np.uint8) +# diff_types[start_indices] = 0 +# diff_types[end_indices] = 2 + +diff = joints_2d[diff_indices] +dist = np.linalg.norm((joints_2d - diff).reshape(size, -1), axis=1).mean() +print("Mean distance bewteen diff and original: {:.3f}".format(dist)) + +f = h5py.File(out_filename, "w") +f['gt_diff'] = diff +# f['gt_diff_type'] = diff_types +f.close() +print("Saved to", out_filename) diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/inference.py b/insightface/body/human_pose/ambiguity_aware/scripts/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e31c9e8d2d17c7af5a174c21b8d277ecee4e83f6 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/inference.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import _init_paths +import os +import os.path as osp +import cv2 +import numpy as np +import torch +import argparse +import pickle as pkl +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +plt.switch_backend('agg') +plt.ioff() + +from tqdm import tqdm +from torch.utils.data import DataLoader +from lib.models.model import get_pose_model, get_discriminator +from lib.core.config import config, update_config, update_dir + +pairs = [(0, 1), (1, 2), (2, 13), (3, 13), (3, 4), (4, 5), (6, 7), (7, 8), (8, 12), (9, 10),(9, 12), (10, 11),(12, 14), (12, 15), (13, 14), (15, 16)] +pairs_left = [(3, 13), (3, 4), (4, 5), (9, 10), (9, 12), (10, 11)] +pairs_right = [(0, 1), (1, 2), (2, 13), (6, 7), (7, 8), (8, 12)] + +colors = { + 'pink': np.array([197, 27, 125]), # L lower leg + 'light_pink': np.array([233, 163, 201]), # L upper leg + 'light_green': np.array([161, 215, 106]), # L lower arm + 'green': np.array([77, 146, 
33]), # L upper arm + 'red': np.array([215, 48, 39]), # head + 'light_red': np.array([252, 146, 114]), # head + 'light_orange': np.array([252, 141, 89]), # chest + 'purple': np.array([118, 42, 131]), # R lower leg + 'light_purple': np.array([175, 141, 195]), # R upper + 'light_blue': np.array([145, 191, 219]), # R lower arm + 'blue': np.array([69, 117, 180]), # R upper arm + 'gray': np.array([130, 130, 130]), # + 'white': np.array([255, 255, 255]), # +} + +parser = argparse.ArgumentParser() +parser.add_argument('--indir', default='./demo_input') +parser.add_argument('--outdir', default='./demo_output') +parser.add_argument('--cfg', default='../cfg/h36m_gt/tmc_klbone.yaml') +parser.add_argument('--pretrain', default='../models/tmc_klbone.pth.tar') +# parser.add_argument('--cfg', default='../cfg/best_samenorm.yaml') +args = parser.parse_args() + +indir = args.indir +outdir = args.outdir +os.makedirs(outdir, exist_ok=True) + +update_config(args.cfg) + +pose_model = get_pose_model(config) +print(pose_model) +assert osp.exists(args.pretrain), "Can not find pretrained model at {}".format(args.pretrain) +state_dict = torch.load(args.pretrain)['pose_model_state_dict'] +pose_model.load_state_dict(state_dict) +pose_model.eval() +torch.set_grad_enabled(False) + +img_files = [osp.join(indir, _file) for _file in os.listdir(indir) if _file.endswith(".jpg") and osp.exists(osp.join(indir, _file.replace(".jpg", ".pkl")))] +kps_files = [_file.replace(".jpg", ".pkl") for _file in img_files] + +all_joints_2d = [] +all_joints_3d_pre = [] + +for kps_file in kps_files: + with open(kps_file, "rb") as f: + data = pkl.load(f) + joints_2d = torch.from_numpy(data['joints_2d'][np.newaxis]).float() + original_joints_2d = torch.from_numpy(data['original_joints_2d'][np.newaxis]).float() + joints_3d, _, _ = pose_model(joints_2d, is_train=False) + all_joints_2d.append(original_joints_2d.numpy()) + all_joints_3d_pre.append(joints_3d.numpy()) + +all_joints_2d = np.concatenate(all_joints_2d, axis=0) 
+all_joints_3d_pre = np.concatenate(all_joints_3d_pre, axis=0) +# print(all_joints_3d_pre.shape) + + +for idx, joints_3d_pre in tqdm(enumerate(all_joints_3d_pre), total=len(all_joints_3d_pre)): + joints_2d = all_joints_2d[idx] + joints_3d_pre = joints_3d_pre - joints_3d_pre[13:14] + image_path = img_files[idx] + save_path = osp.join(outdir, osp.basename(image_path).replace(".jpg", "_out.jpg")) + image = cv2.imread(image_path) + + fig = plt.figure(figsize=(10, 10)) + ax = fig.add_subplot(111, projection='3d') + ax.view_init(elev=10., azim=45.) + + for pair in pairs: + i, j = pair + if pair in pairs_left: + color = "blue" + cv_color = (255, 0, 0) + elif pair in pairs_right: + color = "green" + cv_color = (0, 255, 0) + else: + color = "darkorange" + cv_color = (89, 141, 252) + x1, y1 = joints_2d[i].astype(np.int) + x2, y2 = joints_2d[j].astype(np.int) + + cv2.line(image, (x1, y1), (x2, y2), cv_color, 2) + x1, y1, z1 = joints_3d_pre[i] + x2, y2, z2 = joints_3d_pre[j] + ax.plot([z1, z2], [x1, x2], [-y1, -y2], c=color, linewidth=3) + + image = image[::-1, :, ::-1].copy().astype(np.float) / 255. + r = 0.95 + xroot = yroot = zroot = 0. 
+ # radius = max(4, (np.mean(image.shape[:2]) * 0.01).astype(int)) + radius = 0.75 + xx = np.linspace(-r * radius + xroot, r * radius + xroot, image.shape[1]) + yy = np.linspace(-r * radius + yroot, r * radius + yroot, image.shape[0]) + xx, yy = np.meshgrid(xx, yy) + zz = np.ones_like(xx) * (-3.2* radius + zroot) + ax.plot_surface(zz, xx, yy, rstride=1, cstride=1, facecolors=image, shade=False) + ax.set_xlabel('Z', fontsize=13) + ax.set_ylabel("X", fontsize=13) + ax.set_zlabel("Y", fontsize=13) + ax.set_ylim3d([-radius+xroot, radius+xroot]) + ax.set_zlim3d([-radius+yroot, radius+yroot]) + ax.set_xlim3d([-2.5 * radius+zroot, radius+zroot]) + ax.get_xaxis().set_ticklabels([]) + ax.get_yaxis().set_ticklabels([]) + ax.set_zticklabels([]) + + white = (1.0, 1.0, 1.0, 0.0) + ax.w_xaxis.set_pane_color(white) + ax.w_yaxis.set_pane_color(white) + + ax.w_xaxis.line.set_color(white) + ax.w_yaxis.line.set_color(white) + ax.w_zaxis.line.set_color(white) + + plt.savefig(save_path) + plt.close() diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/main.py b/insightface/body/human_pose/ambiguity_aware/scripts/main.py new file mode 100644 index 0000000000000000000000000000000000000000..2ff3e5dda7878416007dff0ddfb80999a6d3d3da --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/main.py @@ -0,0 +1,317 @@ +import _init_paths +import os, sys, datetime, shutil +import os.path as osp +import random +import h5py +import pickle as pkl +import numpy as np +import argparse +import subprocess +import warnings +warnings.filterwarnings('ignore') + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.backends.cudnn as cudnn +from torch.utils.data import TensorDataset,DataLoader +from torch.optim import lr_scheduler + +from lib.dataloader.mpiinf import MPIINFDataset as mpiinf +from lib.dataloader.h36m import Human36MDataset as h36m +from lib.dataloader.surreal import SurrealDataset as surreal +from lib.models.model import get_pose_model, 
get_discriminator +from lib.core.config import config, update_config, update_dir +from lib.utils.misc import save_pickle, create_logger, load_pickle +from lib.utils.utils import load_checkpoint, save_checkpoint, get_optimizer +from lib.utils.vis import plot_scalemid_dist, plot_scalemid_seq_dist +from tensorboardX import SummaryWriter + +def set_cudnn(config): + torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC + torch.backends.cudnn.enabled = config.CUDNN.ENABLED + torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK + +def parse_args(): + parser = argparse.ArgumentParser("Train the unsupervised human pose estimation network") + parser.add_argument('--cfg', help="Specify the path of the path of the config(*.yaml)", default='../cfg/default.yaml') + parser.add_argument('--use_gt', action='store_true', help='Specify whether to use 2d gt / predictions as inputs') + parser.add_argument('--model_dir', help='Specify the directory of pretrained model', default='') + parser.add_argument('--data_dir', help="Specify the directory of data", default=config.DATA_DIR) + parser.add_argument('--log_dir', help='Specify the directory of output', default=config.LOG_DIR) + + parser.add_argument('--dataset_name', help="Specify which dataset to use", choices=["h36m", "mpi"], default="h36m") + parser.add_argument('--workers', help="Specify the number of workers for data loadering", default=config.NUM_WORKERS) + parser.add_argument('--gpu', help="Specify the gpu to use for training", default='') + parser.add_argument('--debug', action='store_true', help="Turn on the debug mode") + parser.add_argument('--print_info', action='store_true', help="Whether to print detailed information in tqdm processing") + parser.add_argument('--eval', action='store_true', help="Evaluate the model on the dataset(i.e. 
generate: joint_3d_pre)") + parser.add_argument('--eval_suffix', default=None, help="Specify the suffix to save predictions on 3D in evaluation mode") + parser.add_argument('--pretrain', default='', help="Whether to use pretrain model") + parser.add_argument('--finetune_rotater', action='store_true', help="Load pretrained model and finetune rotater") + parser.add_argument('--print_interval', type=int, default=50) + args = parser.parse_args() + if args.cfg: + update_config(args.cfg) + else: + print("Using default config...") + update_dir(args.model_dir, args.log_dir, args.data_dir, args.debug) + return args + +def reset_config(config, args): + if not config.USE_GT: + config.USE_GT = True if args.use_gt else False + if args.gpu: + config.GPU = args.gpu + config.NUM_WORKERS = args.workers + config.DEBUG = args.debug + if args.print_info: + config.PRINT_INFO = args.print_info + if args.pretrain: + config.TRAIN.PRETRAIN_LIFTER = True + config.TRAIN.LIFTER_PRETRAIN_PATH = args.pretrain + if args.finetune_rotater: + assert config.TRAIN.PRETRAIN_LIFTER + config.TRAIN.FINETUNE_ROTATER = args.finetune_rotater + +def main(): + args = parse_args() + reset_config(config, args) + set_cudnn(config) + seed = config.RANDOM_SEED + np.random.seed(seed); random.seed(seed) + torch.manual_seed(seed) ; torch.cuda.manual_seed(seed) + os.environ['CUDA_VISIBLE_DEVICES'] = config.GPU.strip() + gpus = list(range(len(config.GPU.strip().split(',')))) + + logger, final_output_dir, tb_log_dir = create_logger(config, args.cfg) + summary_writer = SummaryWriter(log_dir=tb_log_dir) + + this_dir = osp.dirname(__file__) + # backup the source code and the yaml config + if args.cfg: + shutil.copy(args.cfg, osp.join(final_output_dir, osp.basename(args.cfg))) + if not osp.exists(osp.join(final_output_dir, "lib")): + shutil.copytree(osp.join(this_dir, "../lib/"), osp.join(final_output_dir, "lib")) + for k, v in config.items(): + logger.info(f"{k}: {v}") + + # conditional import + if 
config.TRAIN.FINETUNE_ROTATER: + from lib.core.function3 import train, validate, evaluate + elif config.TRAIN.USE_CYCLE: + from lib.core.function2 import train, validate, evaluate + else: + from lib.core.function1 import train, validate, evaluate + + # build model + logger.info('start building model.') + if len(gpus) > 1: + pose_model = torch.nn.DataParallel(get_pose_model(config)).cuda(gpus[0]) + discriminator = torch.nn.DataParallel(get_discriminator(config)).cuda(gpus[0]) + temp_discriminator = torch.nn.DataParallel(get_discriminator(config)).cuda(gpus[0]) + else: + pose_model = get_pose_model(config).cuda() + discriminator = get_discriminator(config, is_temp=False).cuda() + temp_discriminator = get_discriminator(config, is_temp=True).cuda() + optimizer_g = get_optimizer(config, pose_model, is_dis=False) + optimizer_d = get_optimizer(config, discriminator, is_dis=True) + optimizer_d_temp = get_optimizer(config, temp_discriminator, is_dis=True, is_temp=True) + step_size, gamma = config.TRAIN.SCHEDULER_STEP_SIZE, config.TRAIN.SCHEDULER_GAMMA + scheduler_g = lr_scheduler.StepLR(optimizer_g, step_size=step_size, gamma=gamma) + scheduler_d = lr_scheduler.StepLR(optimizer_d, step_size=step_size, gamma=gamma) + scheduler_temp = lr_scheduler.StepLR(optimizer_d_temp, step_size=step_size, gamma=gamma) + logger.info('finished building model.') + # print out the model arch + if config.TRAIN.PRETRAIN_LIFTER: + print("Load pretrained lifter...") + state_dict = torch.load(config.TRAIN.LIFTER_PRETRAIN_PATH)['pose_model_state_dict'] + # state_dict = {k[7:]:v for k, v in state_dict.items()} + pose_model.load_state_dict(state_dict, strict=False) + + if config.DATA.DATASET_NAME == 'surreal': + loader_func = surreal + else: + loader_func = h36m if config.DATA.DATASET_NAME == "h36m" else mpiinf + dataset_train = loader_func(config, is_train=True) + dataset_test = loader_func(config, is_train=False) + + train_loader = DataLoader( + dataset=dataset_train, + 
batch_size=config.BATCH_SIZE, + shuffle=True, + drop_last=False, + pin_memory=True, + num_workers=config.NUM_WORKERS + ) + test_loader = DataLoader( + dataset=dataset_test, + batch_size=config.BATCH_SIZE, + shuffle=False, + drop_last=False, + pin_memory=True, + num_workers=config.NUM_WORKERS + ) + + if args.eval: + prefix = config.DATA.DATASET_NAME + # for mode in ['train', 'valid']: + for mode in ['valid']: + is_train = True if mode == 'train' else False + v3d_to_ours = [3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0, 7, 9, 10] if prefix == "h36m" else np.arange(config.DATA.NUM_JOINTS) + mpi2h36m = [10, 9, 8, 11, 12, 13, 4, 3, 2, 5, 6, 7, 1, 14, 15, 16, 0] + if prefix == 'surreal': + indices = np.arange(config.DATA.NUM_JOINTS) + else: + indices = v3d_to_ours if prefix == "h36m" else mpi2h36m + mode = "train" if is_train else "valid" + read_name = f"../data/{prefix}_{mode}_pred3.h5" + # read_name = f"../../unsupervised_mesh/data/h36m_{mode}_pred_3d_mesh.h5" + save_name = f"../data/{prefix}_{mode}_pred_3d.h5" + if args.eval_suffix is not None: + save_name = f"{save_name[:-3]}_{args.eval_suffix}.h5" + + # eval mode, load the pretrained model and generate the 3d prediction of all 3ds + if not config.TRAIN.PRETRAIN_LIFTER: + raise Warning("You are not using a pretrain model... 
may be you can specify --pretrain flag") + dataloader = DataLoader(dataset_train if mode == "train" else dataset_test, batch_size=config.BATCH_SIZE, \ + shuffle=False, drop_last=False, pin_memory=True, num_workers=config.NUM_WORKERS) + all_out_data = evaluate(dataloader, pose_model, config, is_train=(mode == "train")) + p1_mpjpe, p2_mpjpe = all_out_data['p1_mpjpe'], all_out_data['p2_mpjpe'] + # read out imagenames + print("Reading imagenames and joints 2d...") + fin = h5py.File(read_name, "r") + fout = h5py.File(save_name, "w") + imagenames = fin['imagename'][:].copy() + joints_2d_gt = np.array(fin['joint_2d_gt']) + fout['imagename'] = imagenames + fout['joint_2d_gt'] = joints_2d_gt[:, indices] + fout['joint_3d_gt'] = all_out_data['joint_3d_gt'] + fout['joint_3d_pre'] = all_out_data['joint_3d_pre'] + possible_same_keys = ['shape', 'pose', 'original_joint_2d_gt', 'joint_2d_pre', 'seqlen'] + + for key in possible_same_keys: + if key in fin.keys(): + if 'joint' in key: + fout[key] = np.array(fin[key])[:, indices] + else: + fout[key] = np.array(fin[key]) + if 'seqname' in fin.keys(): + fout['seqname'] = fin['seqname'][:].copy() + + if 'auc' in all_out_data.keys(): + fout['auc'] = all_out_data['auc'] + fout['pckh5'] = all_out_data['pckh5'] + fout['auc_p2'] = all_out_data['auc_p2'] + fout['pckh5_p2'] = all_out_data['pckh5_p2'] + if 'scales' in all_out_data.keys(): + fout['scale_pre'] = all_out_data['scales'] + if 'scale_mids' in all_out_data.keys(): + fout['scale_mid_pre'] = all_out_data['scale_mids'] + + fin.close() + fout.close() + print("Evaluation on the {} set finished. 
P1 Mpjpe: {:.3f}, P2 Mpjpe: {:.3f}, saved to {}".format( + "training" if is_train else "test", p1_mpjpe, p2_mpjpe, save_name + )) + if prefix == "mpi": + print("PCKh@0.5: {:.3f}, AUC: {:.3f}".format(all_out_data['pckh5'], all_out_data['auc'])) + print("P2: PCKh@0.5: {:.3f}, AUC: {:.3f}".format(all_out_data['pckh5_p2'], all_out_data['auc_p2'])) + # uncomment this if you need to plot images + # print("Rendering sequences...") + # subprocess.call(f'python render.py --seq_num 10 --in_filename ../data/{prefix}_valid_pred_3d.h5 --save_dir ../vis', shell=True) + return + + # preparation for visualization & perseq optimization(optional) + if config.USE_GT: + # note that the gt here is not the gt above(config.USE_GT) + train_path = f"../data/{config.DATA.DATASET_NAME}_train_scales.pkl" + valid_path = f"../data/{config.DATA.DATASET_NAME}_valid_scales.pkl" + else: + train_path = f"../data/{config.DATA.DATASET_NAME}_train_scales_pre.pkl" + valid_path = f"../data/{config.DATA.DATASET_NAME}_valid_scales_pre.pkl" + + train_scale_mids_gt = load_pickle(train_path)['scale_mid'] if osp.exists(train_path) else None + valid_scale_mids_gt = load_pickle(valid_path)['scale_mid'] if osp.exists(valid_path) else None + train_seqnames, valid_seqnames = dataset_train.get_seqnames(), dataset_test.get_seqnames() + best_p1_mpjpe = best_p2_mpjpe = cur_p1_mpjpe = 10000.0 + best_auc_val = best_pckh5 = 0.0 + best_auc_val_p2 = best_pckh5_p2 = 0.0 + + for epoch in range(config.TRAIN.NUM_EPOCHS): + scheduler_d.step(); scheduler_g.step();scheduler_temp.step(); # scheduler_s.step() + avg_d_loss, avg_g_loss, avg_t_loss, train_scale_mids_pre = train(train_loader, pose_model, discriminator, temp_discriminator, optimizer_g, + optimizer_d, optimizer_d_temp, epoch, config, summary_writer=summary_writer, print_interval=config.PRINT_INTERVAL) + logger.info("***** Epoch: {}, Avg G Loss: {:.3f}, Avg D Loss: {:.3f} Avg T Loss: {:.3f} *****".format( + epoch, avg_g_loss, avg_d_loss, avg_t_loss)) + p1_mpjpe, p2_mpjpe, 
vis_image, valid_scale_mids_pre, extra_dict = validate(test_loader, pose_model, epoch, config) + logger.info("Epoch: {}, P1 Mpjpe/Best P1: {:.3f}/{:.3f}, P2 Mpjpe/Best P2/Cur P1: {:.3f}/{:.3f}/{:.3f}".format(epoch, p1_mpjpe, best_p1_mpjpe, p2_mpjpe, best_p2_mpjpe, cur_p1_mpjpe)) + if p2_mpjpe < best_p2_mpjpe: + best_p2_mpjpe = p2_mpjpe + cur_p1_mpjpe = p1_mpjpe + is_best = True + else: + is_best = False + + if p1_mpjpe < best_p1_mpjpe: + best_p1_mpjpe = p1_mpjpe + + if extra_dict is not None: + auc_val, pckh5 = extra_dict['auc'], extra_dict['pckh5'] + auc_val_p2, pckh5_p2 = extra_dict['auc_p2'], extra_dict['pckh5_p2'] + if auc_val_p2 > best_auc_val_p2: + best_auc_val_p2 = auc_val_p2 + best_pckh5_p2 = pckh5_p2 + is_best = True + else: + is_best = False + + if auc_val > best_auc_val: + best_auc_val = auc_val + best_pckh5 = pckh5 + logger.info("PCKh@0.5(Best): {:.3f}({:.3f}), AUC value(Best): {:.3f}({:.3f})".format(pckh5, best_pckh5, auc_val, best_auc_val)) + logger.info("P2: PCKh@0.5(Best): {:.3f}({:.3f}), AUC value(Best): {:.3f}({:.3f})".format(pckh5_p2, best_pckh5_p2, auc_val_p2, best_auc_val_p2)) + + save_checkpoint({ + "epoch": epoch, + "auc": best_auc_val, + "pckh5": best_pckh5, + "auc_p2": best_auc_val_p2, + "pckh5_p2": best_pckh5_p2, + "p1_mpjpe": p1_mpjpe, + "p2_mpjpe": p2_mpjpe, + "pose_model_state_dict": pose_model.state_dict(), + "discriminator_state_dict": discriminator.state_dict(), + "temp_discriminator_state_dict": temp_discriminator.state_dict(), + "optimizer_d": optimizer_d.state_dict(), + "optimizer_g": optimizer_g.state_dict(), + "optimizer_d_temp": optimizer_d_temp.state_dict() + }, is_best, final_output_dir) + summary_writer.add_scalar("p1_mpjpe_3d_test/epoch", p1_mpjpe, epoch) + summary_writer.add_scalar("p2_mpjpe_3d_test/epoch", p2_mpjpe, epoch) + summary_writer.add_image("test_joints/epoch", vis_image, epoch) + if extra_dict is not None: + summary_writer.add_scalar("PCKh0.5/epoch", pckh5, epoch) + summary_writer.add_scalar("AUC/epoch", 
auc_val, epoch) + + if train_scale_mids_gt is not None and train_scale_mids_pre is not None and len(train_scale_mids_pre) > 0: + num_seq = config.VIS.SCALE_MID_NUM_SEQ + vis_image_scale_mid1 = plot_scalemid_dist(train_scale_mids_pre, train_scale_mids_gt.tolist()) + vis_image_scale_mid1 = torch.from_numpy(vis_image_scale_mid1).type(torch.float32).permute(2, 0, 1) / 255 + vis_image_scale_mid2 = plot_scalemid_seq_dist(train_scale_mids_pre, train_scale_mids_gt.tolist(), train_seqnames, num_seq=num_seq) + vis_image_scale_mid2 = torch.from_numpy(vis_image_scale_mid2).type(torch.float32).permute(2, 0, 1) / 255 + summary_writer.add_image("train_scalemid_distribution/epoch", vis_image_scale_mid1, epoch) + summary_writer.add_image("train_scalemid_seq_distribution/epoch", vis_image_scale_mid2, epoch) + if valid_scale_mids_gt is not None and valid_scale_mids_pre is not None and len(valid_scale_mids_pre) > 0: + num_seq = config.VIS.SCALE_MID_NUM_SEQ + vis_image_scale_mid1 = plot_scalemid_dist(valid_scale_mids_pre, valid_scale_mids_gt.tolist()) + vis_image_scale_mid1 = torch.from_numpy(vis_image_scale_mid1).type(torch.float32).permute(2, 0, 1) / 255 + vis_image_scale_mid2 = plot_scalemid_seq_dist(valid_scale_mids_pre, valid_scale_mids_gt.tolist(), valid_seqnames, num_seq=num_seq) + vis_image_scale_mid2 = torch.from_numpy(vis_image_scale_mid2).type(torch.float32).permute(2, 0, 1) / 255 + summary_writer.add_image("valid_scalemid_distribution/epoch", vis_image_scale_mid1, epoch) + summary_writer.add_image("valid_scalemid_seq_distribution/epoch", vis_image_scale_mid2, epoch) + + summary_writer.close() + +if __name__ == '__main__': + main() diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/mpi_get_diff.py b/insightface/body/human_pose/ambiguity_aware/scripts/mpi_get_diff.py new file mode 100644 index 0000000000000000000000000000000000000000..43ae6287ae8009e073008d8f0dff38a5516b8ea7 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/mpi_get_diff.py 
@@ -0,0 +1,100 @@ +import numpy as np +import h5py +import argparse + +np.random.seed(2019) +parser = argparse.ArgumentParser(description="Generate the diff data") +parser.add_argument("--valid", action="store_true") +parser.add_argument("--use_random", action="store_true") +# specify the interval +parser.add_argument("--bound", default=1, type=int, required=False) +parser.add_argument("--use_previous", action="store_true", help="Specify whether to use previous frames or not") +parser.add_argument('--use_pre', action='store_true') +args = parser.parse_args() + +# compute the difference of frames +bound = args.bound +is_train = not args.valid +use_random = args.use_random +use_previous = args.use_previous +use_pre = args.use_pre +# in_filename = "../data/kinetics_final.h5" +suffix = str(bound) if bound > 1 else "" +if use_random: + suffix += "_rand" +if use_pre: + suffix += "_pre" + +in_filename = "../data/mpi_{}_pred3.h5".format("train" if is_train else "valid") +out_filename = "../data/mpi_{}_diff{}.h5".format("train" if is_train else "valid", suffix) + +f = h5py.File(in_filename, "r") +names = [name.decode() for name in f['imagename'][:]] +joints_2d = np.array(f['joint_2d_gt' if not use_pre else "joint_2d_pre"]) +f.close() +print("Load from", in_filename) + +size = joints_2d.shape[0] +splits = [name.split('/') for name in names] +sequences = ['/'.join(split[:4]) for split in splits] +indices_ref = [int(split[-1].split(".")[0].split('_')[1]) for split in splits] +indices = [] +i = 0 +last_seqname = None +for index, seqname in zip(indices_ref, sequences): + if last_seqname is not None and seqname != last_seqname: + i = 0 + last_seqname = seqname + indices.append(i) + i += 1 + +# calculate the length of each sequence +seq_lens = {} +for split in splits: + seq = '/'.join(split[:4]) + if seq not in seq_lens: + seq_lens[seq] = 0 + seq_lens[seq] += 1 + +intervals = np.random.randint(1, bound + 1, (size, )) +if not use_random: + intervals.fill(bound) + +if use_previous: 
+ spec_indices = [i for i, index in enumerate(indices) if index < intervals[i]] + diff_indices = np.arange(0, size, 1) - intervals + diff_indices[spec_indices] += 2 * intervals[spec_indices] +else: + spec_indices = [i for i, index in enumerate(indices) if index >= seq_lens[sequences[i]] - intervals[i]] + diff_indices = np.arange(0, size, 1) + intervals + diff_indices[spec_indices] -= 2 * intervals[spec_indices] + +# before_joints = np.concatenate((joints_2d[:1].copy(), joints_2d[:-1].copy()), axis=0) +# after_joints = np.concatenate((joints_2d[1:].copy(), joints_2d[-1:].copy()), axis=0) +# print(before_joints.shape, after_joints.shape) + +# diff_before = joints_2d - before_joints +# diff_after = joints_2d - after_joints +# diff_before, diff_after = before_joints, after_joints +# diff_before, diff_after = diff_before[:, np.newaxis], diff_after[:, np.newaxis] + +# finally process the special cases +# diff_before[start_indices] = diff_after[start_indices] +# diff_after[end_indices] = diff_before[end_indices] + +# diff = np.concatenate((diff_before, diff_after), axis=1) +# print(diff.shape) + +# diff_types = np.ones((len(diff), ), dtype=np.uint8) +# diff_types[start_indices] = 0 +# diff_types[end_indices] = 2 + +diff = joints_2d[diff_indices] +dist = np.linalg.norm((joints_2d - diff).reshape(size, -1), axis=1).mean() +print("Mean distance bewteen diff and original: {:.3f}".format(dist)) + +f = h5py.File(out_filename, "w") +f['gt_diff'] = diff +# f['gt_diff_type'] = diff_types +f.close() +print("Saved to", out_filename) diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/mpi_plot1.py b/insightface/body/human_pose/ambiguity_aware/scripts/mpi_plot1.py new file mode 100644 index 0000000000000000000000000000000000000000..d06015705cfc36d464fe73c1b3b9e247a97edf8d --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/mpi_plot1.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import os +import cv2 +import random +import os.path as 
osp +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.animation as animation +from mpl_toolkits.mplot3d import Axes3D +plt.switch_backend('agg') +plt.ioff() + +import h5py +from tqdm import trange +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('--seq_num', type=int, default=1, help='Specify the number of sequences to render') +parser.add_argument('--save_dir', type=str, default="../vis/", help='Specify the directory the save the visualization') +parser.add_argument('--in_filename', type=str, default= "../data/h36m_valid_pred_3d.h5", help="Speicfy the dataset to load from") +args = parser.parse_args() +seq_num = args.seq_num +save_dir = args.save_dir +in_filename = args.in_filename +os.makedirs(save_dir, exist_ok=True) + +v3d_to_ours = [3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0, 7, 9, 10] +pairs = [(0, 1), (1, 2), (2, 13), (3, 13), (3, 4), (4, 5), (6, 7), (7, 8), (8, 12), (9, 10),(9, 12), (10, 11),(12, 14), (12, 15), (13, 14), (15, 16)] +pairs_left = [(3, 13), (3, 4), (4, 5), (9, 10), (9, 12), (10, 11)] +pairs_right = [(0, 1), (1, 2), (2, 13), (6, 7), (7, 8), (8, 12)] + +colors = { + 'pink': np.array([197, 27, 125]), # L lower leg + 'light_pink': np.array([233, 163, 201]), # L upper leg + 'light_green': np.array([161, 215, 106]), # L lower arm + 'green': np.array([77, 146, 33]), # L upper arm + 'red': np.array([215, 48, 39]), # head + 'light_red': np.array([252, 146, 114]), # head + 'light_orange': np.array([252, 141, 89]), # chest + 'purple': np.array([118, 42, 131]), # R lower leg + 'light_purple': np.array([175, 141, 195]), # R upper + 'light_blue': np.array([145, 191, 219]), # R lower arm + 'blue': np.array([69, 117, 180]), # R upper arm + 'gray': np.array([130, 130, 130]), # + 'white': np.array([255, 255, 255]), # +} +jcolors = [ + 'light_pink', 'light_pink', 'light_pink', 'pink', 'pink', 'pink', + 'light_blue', 'light_blue', 'light_blue', 'blue', 'blue', 'blue', + 'purple', 'purple', 'red', 'green', 
'green', 'white', 'white' +] +ecolors = { + 0: 'light_pink', + 1: 'light_pink', + 2: 'light_pink', + 3: 'pink', + 4: 'pink', + 5: 'pink', + 6: 'light_blue', + 7: 'light_blue', + 8: 'light_blue', + 9: 'blue', + 10: 'blue', + 11: 'blue', + 12: 'purple', + 13: 'light_green', + 14: 'light_green', + 15: 'purple' +} + +root = "/home/yuzhenbo/codebase/3D/multipose/data/mpi_inf/" +image_root = root + +in_filename = "../data/mpi_valid_pred_3d.h5" + +print("Read from", in_filename) +f = h5py.File(in_filename, "r") +imagenames = [name.decode() for name in f['imagename'][:]] +# 2d joints in the order of v3d convention +# poses2d = np.array(f['joint_2d_gt'])[:, v3d_to_ours] +poses2d = np.array(f['joint_2d_gt']) +poses3d = np.array(f['joint_3d_pre']) +poses3d_gt = np.array(f['joint_3d_gt']) +poses3d_gt = poses3d_gt - poses3d_gt[:, 13:14] +f.close() + +t = trange(0, len(imagenames)) +processed_video_names = [] + +def plot_skeleton_2d(all_frames, joints_2d): + out_frames = [] + radius = max(4, (np.mean(all_frames[0].shape[:2]) * 0.01).astype(int)) + for idx in range(len(all_frames)): + for pair in pairs: + i, j = pair + pt1, pt2 = joints_2d[idx, i], joints_2d[idx, j] + x11, y11 = pt1 + x22, y22 = pt2 + if pair in pairs_left: + color = (205, 0, 0) + elif pair in pairs_right: + color = (0, 205, 0) + else: + color = (0, 165, 255) + cv2.line(all_frames[idx], (int(x11), int(y11)), (int(x22), int(y22)), color, radius-2) + +def get_xxyys(names): + xxyys = [] + # should be subject, action, camera + splits = names[0].split('/') + video_name = '/'.join(splits[:-1]) + part_label_path = osp.join(root, splits[0], 'MySegmentsMat', 'PartLabels', + splits[1] + ("cam" + splits[2]).replace('cam0', '.54138969').replace('cam2','.58860488').replace('cam1', '.55011271').replace('cam3', '.60457274') + ".mat") + f = h5py.File(part_label_path, "r") + for idx, name in enumerate(names): + partmask = f[f['Feat'][idx*30, 0]][()].T + yp, xp = np.where(partmask != 0) + xmin, xmax = np.min(xp), np.max(xp) + 1 + 
ymin, ymax = np.min(yp), np.max(yp) + 1 + xxyys.append((xmin, xmax, ymin, ymax)) + f.close() + return xxyys + +def crop_image(all_frames, xxyys, scale_factor=0.25): + out_frames = [] + for frame, xxyy in zip(all_frames, xxyys): + h, w = frame.shape[:2] + xmin, xmax, ymin, ymax = xxyy + xc, yc = (xmin + xmax) / 2, (ymin + ymax) / 2 + l = max(xmax - xmin, ymax - ymin) + xmin, xmax = max(0, xc - l/2), min(w, xc + l / 2) + ymin, ymax = max(0, yc - l/2), min(h, yc + l / 2) + xmin, xmax = int(xmin), int(xmax) + ymin, ymax = int(ymin), int(ymax) + frame = frame[ymin:ymax, xmin:xmax, :].copy() + frame = cv2.resize(frame, (int(scale_factor * w), int(scale_factor * h))) + frame = frame[::-1, :, ::-1] / 255 + out_frames.append(frame) + return out_frames + +for imageid in t: + name = imagenames[imageid] + splits = name.split('/') + video_name = '/'.join(splits[:2]) + if len(processed_video_names) == seq_num: + print("Finished! Rendered {} sequences, saved to {}".format(seq_num, save_dir)) + break + if video_name in processed_video_names: + continue + else: + processed_video_names.append(video_name) + print(video_name) + recs = [(idx, name) for idx, name in enumerate(imagenames) if video_name in name] + # downsample + recs = recs[::5] + # cand_list = [x*5 for x in [440, 565, 770]] + # cand_list = [200, 250, 300, 350, 400, 450, 500, 520, 550, 590, 620, 660, 700, 740, 770, 800, 830, 845] + # recs = list(filter(lambda x: x[0] in cand_list, recs)) + # recs = list(filter(lambda x: x[0] in [65*5, 100*5, 905*5, 1160*5], recs)) + recs = sorted(recs, key=lambda x: int(x[1].split('/')[-1].split('_')[1].split('.')[0])) + names_in_video = [rec[1] for rec in recs] + indices_in_video = [rec[0] for rec in recs] + # path_format = osp.join(image_root, splits[0], splits[1], "img_{:06d}.jpg") + poses3d_in_video = poses3d[indices_in_video] + poses2d_in_video = poses2d[indices_in_video] + poses3d_gt_in_video = poses3d_gt[indices_in_video] + all_frames = [cv2.imread(osp.join(image_root, name)) for 
name in names_in_video] + # all_frames = [cv2.imread(path_format.format(int(name.split('/')[-1])+1)) for name in names_in_video] + print("Ploting 2d skeleton...") + plot_skeleton_2d(all_frames, poses2d_in_video) + scale_factor = 0.2 + all_frames = [cv2.resize(frame, (int(scale_factor * frame.shape[1]), int(scale_factor * frame.shape[0])))[::-1, :, ::-1] / 255 for frame in all_frames] + # print("Getting bounding boxes...") + # xxyys = get_xxyys(names_in_video) + # print("Cropping images...") + # all_frames = crop_image(all_frames, xxyys, scale_factor=0.2) + print("Generating gifs...") + + fig = plt.figure(figsize=(10, 10)) + ax = fig.add_subplot(111, projection='3d') + ax.view_init(elev=10., azim=45.) + lines_3d, lines_3d_gt = [], [] + radius = 0.75 + initialized = False + num_render = len(names_in_video) + print(num_render, " frames to plot") + + def update_video(frame_idx): + global initialized, lines_3d, lines_3d_gt + print("{}/{} ".format(frame_idx, num_render), end='\r') + pose2d = poses2d_in_video[frame_idx] + pose3d = poses3d_in_video[frame_idx] + pose3d_gt = poses3d_gt_in_video[frame_idx] + if not initialized: + for idx, pair in enumerate(pairs): + i, j = pair + if pair in pairs_left: + color = "blue" + elif pair in pairs_right: + color = "green" + else: + color = "darkorange" + # pt1, pt2 = pose3d[i], pose3d[j] + # x11, y11, z11 = pt1 + # x22, y22, z22 = pt2 + # lines_3d.append(ax.plot([z11, z22], [x11, x22], [-y11, -y22], c='red', linewidth=3, label="pre")) + pt1, pt2 = pose3d_gt[i], pose3d_gt[j] + x11, y11, z11 = pt1 + x22, y22, z22 = pt2 + lines_3d_gt.append(ax.plot([z11, z22], [x11, x22], [-y11, -y22], c=color, linewidth=3, label="gt")) + # pt1, pt2 = pose3d_ssadv[i], pose3d_ssadv[j] + # x11, y11, z11 = pt1 + # x22, y22, z22 = pt2 + # lines_3d_ssadv.append(ax.plot([z11, z22], [x11, x22], [-y11, -y22], c="red", linewidth=3, label="ssadv")) + initialized = True + else: + for idx, pair in enumerate(pairs): + i, j = pair + # pt1, pt2 = pose3d[i], pose3d[j] 
+ # x11, y11, z11 = pt1 + # x22, y22, z22 = pt2 + # lines_3d[idx][0].set_xdata([z11, z22]) + # lines_3d[idx][0].set_ydata([x11, x22]) + # lines_3d[idx][0].set_3d_properties([-y11, -y22]) + pt1, pt2 = pose3d_gt[i], pose3d_gt[j] + x11, y11, z11 = pt1 + x22, y22, z22 = pt2 + lines_3d_gt[idx][0].set_xdata([z11, z22]) + lines_3d_gt[idx][0].set_ydata([x11, x22]) + lines_3d_gt[idx][0].set_3d_properties([-y11, -y22]) + # pt1, pt2 = pose3d_ssadv[i], pose3d_ssadv[j] + # x11, y11, z11 = pt1 + # x22, y22, z22 = pt2 + # lines_3d_ssadv[idx][0].set_xdata([z11, z22]) + # lines_3d_ssadv[idx][0].set_ydata([x11, x22]) + # lines_3d_ssadv[idx][0].set_3d_properties([-y11, -y22]) + + xroot, yroot, zroot = pose3d_gt[13, 0], -pose3d_gt[13, 1], pose3d_gt[13, 2] + ax.set_ylim3d([-radius+xroot, radius+xroot]) + ax.set_zlim3d([-radius+yroot, radius+yroot]) + ax.set_xlim3d([-2.5 * radius+zroot, radius+zroot]) + ax.get_xaxis().set_ticklabels([]) + ax.get_yaxis().set_ticklabels([]) + ax.set_zticklabels([]) + + white = (1.0, 1.0, 1.0, 0.0) + ax.w_xaxis.set_pane_color(white) + ax.w_yaxis.set_pane_color(white) + + ax.w_xaxis.line.set_color(white) + ax.w_yaxis.line.set_color(white) + ax.w_zaxis.line.set_color(white) + + r = 0.95 + xx = np.linspace(-r * radius + xroot, r * radius + xroot, all_frames[frame_idx].shape[1]) + yy = np.linspace(-r * radius + yroot, r * radius + yroot, all_frames[frame_idx].shape[0]) + xx, yy = np.meshgrid(xx, yy) + zz = np.ones_like(xx) * (-3.2* radius + zroot) + ax.set_xlabel('Z', fontsize=13) + ax.set_ylabel("X", fontsize=13) + ax.set_zlabel("Y", fontsize=13) + ax.plot_surface(zz, xx, yy, rstride=1, cstride=1, facecolors=all_frames[frame_idx], shade=False) + plt.savefig(osp.join(save_dir, f"{video_name.replace('/', '_')}_{frame_idx}.png")) + + for idx in range(len(names_in_video)): + update_video(idx) + ani = animation.FuncAnimation(fig, update_video, range(len(names_in_video)), interval=20) + save_name = name.replace('/', '_') + ani.save(osp.join(save_dir, 
f"{save_name}.gif"), writer='imagemagick', fps=20) + t.set_postfix(index=int(imageid)) diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/mpi_plot2.py b/insightface/body/human_pose/ambiguity_aware/scripts/mpi_plot2.py new file mode 100644 index 0000000000000000000000000000000000000000..42233bae660871add68b48f23f49e0860f668927 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/mpi_plot2.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import os +import cv2 +import random +import os.path as osp +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.animation as animation +from mpl_toolkits.mplot3d import Axes3D +plt.switch_backend('agg') +plt.ioff() + +import h5py +from tqdm import trange +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('--seq_num', type=int, default=1, help='Specify the number of sequences to render') +parser.add_argument('--save_dir', type=str, default="../vis/", help='Specify the directory the save the visualization') +parser.add_argument('--in_filename', type=str, default= "../data/h36m_valid_pred_3d.h5", help="Speicfy the dataset to load from") +args = parser.parse_args() +seq_num = args.seq_num +save_dir = args.save_dir +in_filename = args.in_filename +os.makedirs(save_dir, exist_ok=True) + +v3d_to_ours = [3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0, 7, 9, 10] +pairs = [(0, 1), (1, 2), (2, 13), (3, 13), (3, 4), (4, 5), (6, 7), (7, 8), (8, 12), (9, 10),(9, 12), (10, 11),(12, 14), (12, 15), (13, 14), (15, 16)] +pairs_left = [(3, 13), (3, 4), (4, 5), (9, 10), (9, 12), (10, 11)] +pairs_right = [(0, 1), (1, 2), (2, 13), (6, 7), (7, 8), (8, 12)] + +colors = { + 'pink': np.array([197, 27, 125]), # L lower leg + 'light_pink': np.array([233, 163, 201]), # L upper leg + 'light_green': np.array([161, 215, 106]), # L lower arm + 'green': np.array([77, 146, 33]), # L upper arm + 'red': np.array([215, 48, 39]), # head + 'light_red': np.array([252, 146, 114]), # 
head + 'light_orange': np.array([252, 141, 89]), # chest + 'purple': np.array([118, 42, 131]), # R lower leg + 'light_purple': np.array([175, 141, 195]), # R upper + 'light_blue': np.array([145, 191, 219]), # R lower arm + 'blue': np.array([69, 117, 180]), # R upper arm + 'gray': np.array([130, 130, 130]), # + 'white': np.array([255, 255, 255]), # +} +jcolors = [ + 'light_pink', 'light_pink', 'light_pink', 'pink', 'pink', 'pink', + 'light_blue', 'light_blue', 'light_blue', 'blue', 'blue', 'blue', + 'purple', 'purple', 'red', 'green', 'green', 'white', 'white' +] +ecolors = { + 0: 'light_pink', + 1: 'light_pink', + 2: 'light_pink', + 3: 'pink', + 4: 'pink', + 5: 'pink', + 6: 'light_blue', + 7: 'light_blue', + 8: 'light_blue', + 9: 'blue', + 10: 'blue', + 11: 'blue', + 12: 'purple', + 13: 'light_green', + 14: 'light_green', + 15: 'purple' +} + +root = "/home/yuzhenbo/codebase/3D/multipose/data/mpi_inf/" +image_root = root + +in_filename = "../data/mpi_valid_pred_3d.h5" + +print("Read from", in_filename) +f = h5py.File(in_filename, "r") +imagenames = [name.decode() for name in f['imagename'][:]] +# 2d joints in the order of v3d convention +# poses2d = np.array(f['joint_2d_gt'])[:, v3d_to_ours] +poses2d = np.array(f['joint_2d_gt']) +poses3d = np.array(f['joint_3d_pre']) +poses3d_gt = np.array(f['joint_3d_gt']) +poses3d_gt = poses3d_gt - poses3d_gt[:, 13:14] +f.close() + +t = trange(0, len(imagenames)) +processed_video_names = [] + +def plot_skeleton_2d(all_frames, joints_2d): + out_frames = [] + radius = max(4, (np.mean(all_frames[0].shape[:2]) * 0.01).astype(int)) + for idx in range(len(all_frames)): + for pair in pairs: + i, j = pair + pt1, pt2 = joints_2d[idx, i], joints_2d[idx, j] + x11, y11 = pt1 + x22, y22 = pt2 + if pair in pairs_left: + color = (205, 0, 0) + elif pair in pairs_right: + color = (0, 205, 0) + else: + color = (0, 165, 255) + cv2.line(all_frames[idx], (int(x11), int(y11)), (int(x22), int(y22)), color, radius-2) + +def get_xxyys(names): + xxyys = [] 
+ # should be subject, action, camera + splits = names[0].split('/') + video_name = '/'.join(splits[:-1]) + part_label_path = osp.join(root, splits[0], 'MySegmentsMat', 'PartLabels', + splits[1] + ("cam" + splits[2]).replace('cam0', '.54138969').replace('cam2','.58860488').replace('cam1', '.55011271').replace('cam3', '.60457274') + ".mat") + f = h5py.File(part_label_path, "r") + for idx, name in enumerate(names): + partmask = f[f['Feat'][idx*30, 0]][()].T + yp, xp = np.where(partmask != 0) + xmin, xmax = np.min(xp), np.max(xp) + 1 + ymin, ymax = np.min(yp), np.max(yp) + 1 + xxyys.append((xmin, xmax, ymin, ymax)) + f.close() + return xxyys + +def crop_image(all_frames, xxyys, scale_factor=0.25): + out_frames = [] + for frame, xxyy in zip(all_frames, xxyys): + h, w = frame.shape[:2] + xmin, xmax, ymin, ymax = xxyy + xc, yc = (xmin + xmax) / 2, (ymin + ymax) / 2 + l = max(xmax - xmin, ymax - ymin) + xmin, xmax = max(0, xc - l/2), min(w, xc + l / 2) + ymin, ymax = max(0, yc - l/2), min(h, yc + l / 2) + xmin, xmax = int(xmin), int(xmax) + ymin, ymax = int(ymin), int(ymax) + frame = frame[ymin:ymax, xmin:xmax, :].copy() + frame = cv2.resize(frame, (int(scale_factor * w), int(scale_factor * h))) + frame = frame[::-1, :, ::-1] / 255 + out_frames.append(frame) + return out_frames + +for imageid in t: + name = imagenames[imageid] + splits = name.split('/') + video_name = '/'.join(splits[:2]) + if len(processed_video_names) == seq_num: + print("Finished! 
Rendered {} sequences, saved to {}".format(seq_num, save_dir)) + break + if video_name in processed_video_names: + continue + else: + processed_video_names.append(video_name) + print(video_name) + recs = [(idx, name) for idx, name in enumerate(imagenames) if video_name in name] + # downsample + recs = recs[::5] + # cand_list = [x*5 for x in [440, 565, 770]] + # cand_list = [200, 250, 300, 350, 400, 450, 500, 520, 550, 590, 620, 660, 700, 740, 770, 800, 830, 845] + # recs = list(filter(lambda x: x[0] in cand_list, recs)) + # recs = list(filter(lambda x: x[0] in [65*5, 100*5, 905*5, 1160*5], recs)) + recs = sorted(recs, key=lambda x: int(x[1].split('/')[-1].split('_')[1].split('.')[0])) + names_in_video = [rec[1] for rec in recs] + indices_in_video = [rec[0] for rec in recs] + # path_format = osp.join(image_root, splits[0], splits[1], "img_{:06d}.jpg") + poses3d_in_video = poses3d[indices_in_video] + poses2d_in_video = poses2d[indices_in_video] + poses3d_gt_in_video = poses3d_gt[indices_in_video] + all_frames = [cv2.imread(osp.join(image_root, name)) for name in names_in_video] + # all_frames = [cv2.imread(path_format.format(int(name.split('/')[-1])+1)) for name in names_in_video] + print("Ploting 2d skeleton...") + plot_skeleton_2d(all_frames, poses2d_in_video) + scale_factor = 0.2 + all_frames = [cv2.resize(frame, (int(scale_factor * frame.shape[1]), int(scale_factor * frame.shape[0])))[::-1, :, ::-1] / 255 for frame in all_frames] + # print("Getting bounding boxes...") + # xxyys = get_xxyys(names_in_video) + # print("Cropping images...") + # all_frames = crop_image(all_frames, xxyys, scale_factor=0.2) + print("Generating gifs...") + + fig = plt.figure(figsize=(10, 10)) + ax = fig.add_subplot(111, projection='3d') + ax.view_init(elev=10., azim=45.) 
+ lines_3d, lines_3d_gt = [], [] + radius = 0.75 + initialized = False + num_render = len(names_in_video) + print(num_render, " frames to plot") + + def update_video(frame_idx): + global initialized, lines_3d, lines_3d_gt + print("{}/{} ".format(frame_idx, num_render), end='\r') + pose2d = poses2d_in_video[frame_idx] + pose3d = poses3d_in_video[frame_idx] + pose3d_gt = poses3d_gt_in_video[frame_idx] + if not initialized: + for idx, pair in enumerate(pairs): + i, j = pair + if pair in pairs_left: + color = "blue" + elif pair in pairs_right: + color = "green" + else: + color = "darkorange" + pt1, pt2 = pose3d[i], pose3d[j] + x11, y11, z11 = pt1 + x22, y22, z22 = pt2 + lines_3d.append(ax.plot([z11, z22], [x11, x22], [-y11, -y22], c=color, linewidth=3, label="pre")) + # pt1, pt2 = pose3d_gt[i], pose3d_gt[j] + # x11, y11, z11 = pt1 + # x22, y22, z22 = pt2 + # lines_3d_gt.append(ax.plot([z11, z22], [x11, x22], [-y11, -y22], c=color, linewidth=3, label="gt")) + # pt1, pt2 = pose3d_ssadv[i], pose3d_ssadv[j] + # x11, y11, z11 = pt1 + # x22, y22, z22 = pt2 + # lines_3d_ssadv.append(ax.plot([z11, z22], [x11, x22], [-y11, -y22], c="red", linewidth=3, label="ssadv")) + initialized = True + else: + for idx, pair in enumerate(pairs): + i, j = pair + pt1, pt2 = pose3d[i], pose3d[j] + x11, y11, z11 = pt1 + x22, y22, z22 = pt2 + lines_3d[idx][0].set_xdata([z11, z22]) + lines_3d[idx][0].set_ydata([x11, x22]) + lines_3d[idx][0].set_3d_properties([-y11, -y22]) + # pt1, pt2 = pose3d_gt[i], pose3d_gt[j] + # x11, y11, z11 = pt1 + # x22, y22, z22 = pt2 + # lines_3d_gt[idx][0].set_xdata([z11, z22]) + # lines_3d_gt[idx][0].set_ydata([x11, x22]) + # lines_3d_gt[idx][0].set_3d_properties([-y11, -y22]) + # pt1, pt2 = pose3d_ssadv[i], pose3d_ssadv[j] + # x11, y11, z11 = pt1 + # x22, y22, z22 = pt2 + # lines_3d_ssadv[idx][0].set_xdata([z11, z22]) + # lines_3d_ssadv[idx][0].set_ydata([x11, x22]) + # lines_3d_ssadv[idx][0].set_3d_properties([-y11, -y22]) + + xroot, yroot, zroot = pose3d_gt[13, 0], 
-pose3d_gt[13, 1], pose3d_gt[13, 2] + ax.set_ylim3d([-radius+xroot, radius+xroot]) + ax.set_zlim3d([-radius+yroot, radius+yroot]) + ax.set_xlim3d([-2.5 * radius+zroot, radius+zroot]) + ax.get_xaxis().set_ticklabels([]) + ax.get_yaxis().set_ticklabels([]) + ax.set_zticklabels([]) + + white = (1.0, 1.0, 1.0, 0.0) + ax.w_xaxis.set_pane_color(white) + ax.w_yaxis.set_pane_color(white) + + ax.w_xaxis.line.set_color(white) + ax.w_yaxis.line.set_color(white) + ax.w_zaxis.line.set_color(white) + + r = 0.95 + xx = np.linspace(-r * radius + xroot, r * radius + xroot, all_frames[frame_idx].shape[1]) + yy = np.linspace(-r * radius + yroot, r * radius + yroot, all_frames[frame_idx].shape[0]) + xx, yy = np.meshgrid(xx, yy) + zz = np.ones_like(xx) * (-3.2* radius + zroot) + ax.set_xlabel('Z', fontsize=13) + ax.set_ylabel("X", fontsize=13) + ax.set_zlabel("Y", fontsize=13) + ax.plot_surface(zz, xx, yy, rstride=1, cstride=1, facecolors=all_frames[frame_idx], shade=False) + plt.savefig(osp.join(save_dir, f"{video_name.replace('/', '_')}_{frame_idx}.png")) + + for idx in range(len(names_in_video)): + update_video(idx) + ani = animation.FuncAnimation(fig, update_video, range(len(names_in_video)), interval=20) + save_name = name.replace('/', '_') + ani.save(osp.join(save_dir, f"{save_name}.gif"), writer='imagemagick', fps=20) + t.set_postfix(index=int(imageid)) diff --git a/insightface/body/human_pose/ambiguity_aware/scripts/mpi_validate_project.py b/insightface/body/human_pose/ambiguity_aware/scripts/mpi_validate_project.py new file mode 100644 index 0000000000000000000000000000000000000000..9eb6b0fa2d5950c0e1eb712c5d055766d693b166 --- /dev/null +++ b/insightface/body/human_pose/ambiguity_aware/scripts/mpi_validate_project.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import h5py +import numpy as np +import pickle as pkl +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('--mode', default='train') +parser.add_argument('--prefix', default="mpi") 
#!/usr/bin/env python3
# coding=utf-8
"""Validate 3D->2D projection consistency and export per-sample scales.

Reads GT 2D/3D joints from ``../data/{prefix}_{mode}_pred3.h5``, normalizes
the 2D joints, projects the (descaled, root-shifted) 3D GT onto the image
plane, reports the mean projection error, and pickles per-sample ``scale`` /
``scale_mid`` values.

FIX over the original: ``savepath`` hard-coded the ``mpi_`` prefix while
``readpath`` used ``{prefix}``, so ``--prefix h36m`` would overwrite the MPI
scales file; the save path now follows the prefix (default run unchanged).
"""

import h5py
import numpy as np
import pickle as pkl
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', default='train')
parser.add_argument('--prefix', default="mpi")
args = parser.parse_args()

prefix = args.prefix
mode = args.mode

# Joint permutation MPI -> H36M ordering; identity for non-MPI data.
mpi2h36m = [10, 9, 8, 11, 12, 13, 4, 3, 2, 5, 6, 7, 1, 14, 15, 16, 0] if prefix == "mpi" else list(range(17))
readpath = f"../data/{prefix}_{mode}_pred3.h5"
# Follows the prefix like readpath (was hard-coded to "mpi_").
savepath = f"../data/{prefix}_{mode}_scales.pkl"

f = h5py.File(readpath, "r")
joints_2d_gt = np.array(f['joint_2d_gt'])[:, mpi2h36m]
# 3D GT comes in millimeters; convert to meters.
joints_3d_gt = np.array(f['joint_3d_gt'])[:, mpi2h36m] / 1000.0
f.close()

# Dataset-specific global scale factors (precomputed constants).
if prefix == "mpi":
    factors = 0.7577316 if mode == "valid" else 0.7286965902
else:
    factors = 0.680019 if mode == "valid" else 0.6451607

# Map pixel coordinates into [-1, 1] assuming a 2048x2048 image
# (center 1024) -- TODO confirm resolution for non-MPI data.
joints_2d_gt[:, :, 0] = (joints_2d_gt[:, :, 0] - 1024.0) / 1024.0
joints_2d_gt[:, :, 1] = (joints_2d_gt[:, :, 1] - 1024.0) / 1024.0
# Root-center (joint 13), then nudge the root off exact zero.
root2d = joints_2d_gt[:, 13:14].copy()
joints_2d_gt = joints_2d_gt - root2d
joints_2d_gt[:, 13:14] = 1e-5

# Normalize so the last-joint-to-root distance becomes 1/10.
factor_2d = 1 / 10 / np.linalg.norm(joints_2d_gt[:, -1] - joints_2d_gt[:, 13], axis=1).reshape(-1, 1, 1)
joints_2d_gt = joints_2d_gt * factor_2d

# Descale the 3D GT, root-center it, and place the root at (0, 0, 10).
joints_3d_gt = joints_3d_gt - joints_3d_gt[:, 13:14].copy()
joints_3d_gt = joints_3d_gt / factors
shift = np.array([0, 0, 10]).reshape(1, 1, 3)
root3d_gt = joints_3d_gt[:, 13:14].copy()
joints_3d_gt = joints_3d_gt - root3d_gt + shift

# Perspective projection: (x, y) / z, shape N x J x 2.
project_gt_2d = joints_3d_gt[..., :2] / joints_3d_gt[..., 2:]
# Per-sample bounding extents of the normalized 2D and the projected 2D.
x1_min, x1_max = joints_2d_gt[..., 0:1].min(axis=1, keepdims=True), joints_2d_gt[..., 0:1].max(axis=1, keepdims=True)
y1_min, y1_max = joints_2d_gt[..., 1:].min(axis=1, keepdims=True), joints_2d_gt[..., 1:].max(axis=1, keepdims=True)
x2_min, x2_max = project_gt_2d[..., 0:1].min(axis=1, keepdims=True), project_gt_2d[..., 0:1].max(axis=1, keepdims=True)
y2_min, y2_max = project_gt_2d[..., 1:].min(axis=1, keepdims=True), project_gt_2d[..., 1:].max(axis=1, keepdims=True)
# Average of the width and height ratios projected/observed.
scales = ((x2_max - x2_min) / (x1_max - x1_min) + (y2_max - y2_min) / (y1_max - y1_min)) / 2
heights, widths = y1_max - y1_min, x1_max - x1_min
scale_mids = (scales + (heights + widths) / 2) / 2
print("Mean/Std of scale mid: {:.3f}/{:.3f}".format(scale_mids.mean(), scale_mids.std()))

with open(savepath, "wb") as f:
    pkl.dump({"scale": scales.reshape(-1), "scale_mid": scale_mids.reshape(-1)}, f)

err_gt = np.linalg.norm(project_gt_2d - joints_2d_gt, axis=-1).mean()
print("Projection GT error is: {:.4f}".format(err_gt))
#!/usr/bin/env python3
# coding=utf-8
"""Render per-frame PNGs and a GIF of GT 3D poses for H36M sequences.

Reads an h5 result file (2D GT joints, predicted 3D joints, GT 3D joints)
plus a second "ssadv" prediction file, draws the 2D skeleton on cropped
source frames, and animates the GT 3D skeleton in a matplotlib 3D axes
with the image as a back plane.  Plotting of the "pre" and "ssadv"
skeletons has been switched off in this variant.
"""

import os
import cv2
import random  # unused here, kept as-is
import os.path as osp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
plt.switch_backend('agg')
plt.ioff()

import h5py
from tqdm import trange
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--seq_num', type=int, default=1, help='Specify the number of sequences to render')
parser.add_argument('--save_dir', type=str, default="../vis/", help='Specify the directory the save the visualization')
parser.add_argument('--in_filename', type=str, default= "../data/h36m_valid_pred_3d.h5", help="Speicfy the dataset to load from")
args = parser.parse_args()
seq_num = args.seq_num
save_dir = args.save_dir
in_filename = args.in_filename
os.makedirs(save_dir, exist_ok=True)

# Joint-order permutation from the video3d convention to ours (reference only).
v3d_to_ours = [3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0, 7, 9, 10]
# Skeleton edges; joint 13 acts as the root throughout this script.
pairs = [(0, 1), (1, 2), (2, 13), (3, 13), (3, 4), (4, 5), (6, 7), (7, 8), (8, 12), (9, 10),(9, 12), (10, 11),(12, 14), (12, 15), (13, 14), (15, 16)]
pairs_left = [(3, 13), (3, 4), (4, 5), (9, 10), (9, 12), (10, 11)]
pairs_right = [(0, 1), (1, 2), (2, 13), (6, 7), (7, 8), (8, 12)]

# Color tables (currently unused by the drawing code below).
colors = {
    'pink': np.array([197, 27, 125]),  # L lower leg
    'light_pink': np.array([233, 163, 201]),  # L upper leg
    'light_green': np.array([161, 215, 106]),  # L lower arm
    'green': np.array([77, 146, 33]),  # L upper arm
    'red': np.array([215, 48, 39]),  # head
    'light_red': np.array([252, 146, 114]),  # head
    'light_orange': np.array([252, 141, 89]),  # chest
    'purple': np.array([118, 42, 131]),  # R lower leg
    'light_purple': np.array([175, 141, 195]),  # R upper
    'light_blue': np.array([145, 191, 219]),  # R lower arm
    'blue': np.array([69, 117, 180]),  # R upper arm
    'gray': np.array([130, 130, 130]),  #
    'white': np.array([255, 255, 255]),  #
}
jcolors = [
    'light_pink', 'light_pink', 'light_pink', 'pink', 'pink', 'pink',
    'light_blue', 'light_blue', 'light_blue', 'blue', 'blue', 'blue',
    'purple', 'purple', 'red', 'green', 'green', 'white', 'white'
]
ecolors = {
    0: 'light_pink',
    1: 'light_pink',
    2: 'light_pink',
    3: 'pink',
    4: 'pink',
    5: 'pink',
    6: 'light_blue',
    7: 'light_blue',
    8: 'light_blue',
    9: 'blue',
    10: 'blue',
    11: 'blue',
    12: 'purple',
    13: 'light_green',
    14: 'light_green',
    15: 'purple'
}

# NOTE(review): machine-specific dataset location -- adjust before running.
root = "/yzbdata/MeshTrack/Data/HMR/Human/Subject/"
image_root = osp.join(root, "datapre_all")

# NOTE(review): this overrides the --in_filename CLI flag parsed above,
# making the flag a no-op -- confirm whether that is intended.
in_filename = "../data/h36m_valid_pred_3d4118.h5"
in_filename_ssadv = "../data/h36m_valid_pred_3dssadv.h5"

print("Read from", in_filename)
f = h5py.File(in_filename, "r")
imagenames = [name.decode() for name in f['imagename'][:]]
poses2d = np.array(f['joint_2d_gt'])
poses3d = np.array(f['joint_3d_pre'])
poses3d_gt = np.array(f['joint_3d_gt'])
# Root-center the GT 3D poses (joint 13 is the root).
poses3d_gt = poses3d_gt - poses3d_gt[:, 13:14]
f.close()

f = h5py.File(in_filename_ssadv, "r")
poses3d_ssadv = np.array(f['joint_3d_pre'])
f.close()

t = trange(0, len(imagenames))
processed_video_names = []

def plot_skeleton_2d(all_frames, joints_2d):
    # Draw the 2D skeleton in place on every frame (BGR line colors).
    out_frames = []  # unused, kept as-is
    # Line width scales with image size, with a floor of 4.
    radius = max(4, (np.mean(all_frames[0].shape[:2]) * 0.01).astype(int))
    for idx in range(len(all_frames)):
        for pair in pairs:
            i, j = pair
            pt1, pt2 = joints_2d[idx, i], joints_2d[idx, j]
            x11, y11 = pt1
            x22, y22 = pt2
            if pair in pairs_left:
                color = (205, 0, 0)
            elif pair in pairs_right:
                color = (0, 205, 0)
            else:
                color = (0, 165, 255)
            cv2.line(all_frames[idx], (int(x11), int(y11)), (int(x22), int(y22)), color, radius-2)

def get_xxyys(names):
    # Per-frame (xmin, xmax, ymin, ymax) person boxes from the H36M
    # part-label .mat files (camera index mapped to the H36M camera id).
    xxyys = []
    # should be subject, action, camera
    splits = names[0].split('/')
    video_name = '/'.join(splits[:-1])
    part_label_path = osp.join(root, splits[0], 'MySegmentsMat', 'PartLabels',
            splits[1] + ("cam" + splits[2]).replace('cam0', '.54138969').replace('cam2','.58860488').replace('cam1', '.55011271').replace('cam3', '.60457274') + ".mat")
    f = h5py.File(part_label_path, "r")
    for idx, name in enumerate(names):
        # assumes part labels align with the 30-frame downsampling -- TODO confirm
        partmask = f[f['Feat'][idx*30, 0]][()].T
        yp, xp = np.where(partmask != 0)
        xmin, xmax = np.min(xp), np.max(xp) + 1
        ymin, ymax = np.min(yp), np.max(yp) + 1
        xxyys.append((xmin, xmax, ymin, ymax))
    f.close()
    return xxyys

def crop_image(all_frames, xxyys, scale_factor=0.25):
    # Square-crop each frame around its box, downscale, flip vertically and
    # convert BGR -> RGB in [0, 1] for plot_surface.
    out_frames = []
    for frame, xxyy in zip(all_frames, xxyys):
        h, w = frame.shape[:2]
        xmin, xmax, ymin, ymax = xxyy
        xc, yc = (xmin + xmax) / 2, (ymin + ymax) / 2
        l = max(xmax - xmin, ymax - ymin)
        xmin, xmax = max(0, xc - l/2), min(w, xc + l / 2)
        ymin, ymax = max(0, yc - l/2), min(h, yc + l / 2)
        xmin, xmax = int(xmin), int(xmax)
        ymin, ymax = int(ymin), int(ymax)
        frame = frame[ymin:ymax, xmin:xmax, :].copy()
        frame = cv2.resize(frame, (int(scale_factor * w), int(scale_factor * h)))
        frame = frame[::-1, :, ::-1] / 255
        out_frames.append(frame)
    return out_frames

for imageid in t:
    name = imagenames[imageid]
    splits = name.split('/')
    # subject/action/camera identifies one video
    video_name = '/'.join(splits[:3])
    if len(processed_video_names) == seq_num:
        print("Finished! Rendered {} sequences, saved to {}".format(seq_num, save_dir))
        break
    if video_name in processed_video_names:
        continue
    else:
        processed_video_names.append(video_name)
    print(video_name)
    recs = [(idx, name) for idx, name in enumerate(imagenames) if video_name in name]
    # downsample: keep every 30th frame
    recs = recs[::30]
    # sort by frame number (last path component)
    recs = sorted(recs, key=lambda x: int(x[1].split('/')[-1]))
    names_in_video = [rec[1] for rec in recs]
    indices_in_video = [rec[0] for rec in recs]
    path_format = osp.join(image_root, splits[0], splits[1].replace(' ', '_'), "cam" + splits[2], "{:06d}.jpg")
    poses3d_in_video = poses3d[indices_in_video]
    poses2d_in_video = poses2d[indices_in_video]
    poses3d_ssadv_in_video = poses3d_ssadv[indices_in_video]
    poses3d_gt_in_video = poses3d_gt[indices_in_video]
    # frame numbers in the h5 are 0-based; image files are 1-based
    all_frames = [cv2.imread(path_format.format(int(name.split('/')[-1])+1)) for name in names_in_video]
    print("Ploting 2d skeleton...")
    plot_skeleton_2d(all_frames, poses2d_in_video)
    print("Getting bounding boxes...")
    xxyys = get_xxyys(names_in_video)
    print("Cropping images...")
    all_frames = crop_image(all_frames, xxyys, scale_factor=0.2)
    print("Generating gifs...")

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(elev=10., azim=45.)
    lines_3d, lines_3d_gt = [], []
    lines_3d_ssadv = []
    radius = 0.75
    initialized = False
    num_render = len(names_in_video)
    print(num_render, " frames to plot")

    def update_video(frame_idx):
        # Draw (or update) the GT 3D skeleton for one frame and save a PNG.
        # "pre"/"ssadv" plotting is disabled in this variant; their per-frame
        # poses are still fetched below but not drawn.
        global initialized, lines_3d, lines_3d_gt, lines_3d_ssadv
        print("{}/{} ".format(frame_idx, num_render), end='\r')
        pose2d = poses2d_in_video[frame_idx]
        pose3d = poses3d_in_video[frame_idx]
        pose3d_ssadv = poses3d_ssadv_in_video[frame_idx]
        pose3d_gt = poses3d_gt_in_video[frame_idx]
        if not initialized:
            # First call: create one Line3D per skeleton edge.
            for idx, pair in enumerate(pairs):
                i, j = pair
                if pair in pairs_left:
                    color = "blue"
                elif pair in pairs_right:
                    color = "green"
                else:
                    color = "darkorange"
                pt1, pt2 = pose3d_gt[i], pose3d_gt[j]
                x11, y11, z11 = pt1
                x22, y22, z22 = pt2
                # Axes permuted (z, x, -y) so depth runs along matplotlib's x.
                lines_3d_gt.append(ax.plot([z11, z22], [x11, x22], [-y11, -y22], c=color, linewidth=3, label="gt"))
            initialized = True
        else:
            # Subsequent calls: move the existing artists instead of re-plotting.
            for idx, pair in enumerate(pairs):
                i, j = pair
                pt1, pt2 = pose3d_gt[i], pose3d_gt[j]
                x11, y11, z11 = pt1
                x22, y22, z22 = pt2
                lines_3d_gt[idx][0].set_xdata([z11, z22])
                lines_3d_gt[idx][0].set_ydata([x11, x22])
                lines_3d_gt[idx][0].set_3d_properties([-y11, -y22])

        # Center the axes limits on the GT root joint.
        xroot, yroot, zroot = pose3d_gt[13, 0], -pose3d_gt[13, 1], pose3d_gt[13, 2]
        ax.set_ylim3d([-radius+xroot, radius+xroot])
        ax.set_zlim3d([-radius+yroot, radius+yroot])
        ax.set_xlim3d([-2.5 * radius+zroot, radius+zroot])
        ax.get_xaxis().set_ticklabels([])
        ax.get_yaxis().set_ticklabels([])
        ax.set_zticklabels([])

        # Fully transparent panes/axis lines.
        # NOTE(review): `w_xaxis` etc. are deprecated/removed in matplotlib >= 3.8.
        white = (1.0, 1.0, 1.0, 0.0)
        ax.w_xaxis.set_pane_color(white)
        ax.w_yaxis.set_pane_color(white)

        ax.w_xaxis.line.set_color(white)
        ax.w_yaxis.line.set_color(white)
        ax.w_zaxis.line.set_color(white)

        # Paint the camera image on a plane behind the skeleton.
        r = 0.95
        xx = np.linspace(-r * radius + xroot, r * radius + xroot, all_frames[frame_idx].shape[1])
        yy = np.linspace(-r * radius + yroot, r * radius + yroot, all_frames[frame_idx].shape[0])
        xx, yy = np.meshgrid(xx, yy)
        zz = np.ones_like(xx) * (-3.2* radius + zroot)
        ax.set_xlabel('Z', fontsize=13)
        ax.set_ylabel("X", fontsize=13)
        ax.set_zlabel("Y", fontsize=13)
        ax.plot_surface(zz, xx, yy, rstride=1, cstride=1, facecolors=all_frames[frame_idx], shade=False)
        plt.savefig(osp.join(save_dir, f"{video_name.replace('/', '_')}_{frame_idx}.png"))

    # Save individual PNGs first, then the animated GIF.
    for idx in range(len(names_in_video)):
        update_video(idx)
    ani = animation.FuncAnimation(fig, update_video, range(len(names_in_video)), interval=20)
    save_name = name.replace('/', '_')
    ani.save(osp.join(save_dir, f"{save_name}.gif"), writer='imagemagick', fps=20)
    t.set_postfix(index=int(imageid))
#!/usr/bin/env python3
# coding=utf-8
"""Render per-frame PNGs and a GIF of predicted 3D poses for H36M sequences,
placing the camera image at the z=0 plane and the skeleton at depth
``constant_c * scale`` using the per-sample predicted scale (`scale_pre`).

Variant of plot1.py: draws the "pre" skeleton (GT/ssadv drawing disabled)
and uses whole downscaled frames instead of part-label crops.
"""

import os
import cv2
import random  # unused here, kept as-is
import os.path as osp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
plt.switch_backend('agg')
plt.ioff()

import h5py
from tqdm import trange
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--seq_num', type=int, default=1, help='Specify the number of sequences to render')
parser.add_argument('--save_dir', type=str, default="../vis/", help='Specify the directory the save the visualization')
parser.add_argument('--in_filename', type=str, default= "../data/h36m_valid_pred_3d.h5", help="Speicfy the dataset to load from")
args = parser.parse_args()
seq_num = args.seq_num
save_dir = args.save_dir
in_filename = args.in_filename
os.makedirs(save_dir, exist_ok=True)

# Joint-order permutation from the video3d convention to ours (reference only).
v3d_to_ours = [3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0, 7, 9, 10]
# Skeleton edges; joint 13 acts as the root throughout this script.
pairs = [(0, 1), (1, 2), (2, 13), (3, 13), (3, 4), (4, 5), (6, 7), (7, 8), (8, 12), (9, 10),(9, 12), (10, 11),(12, 14), (12, 15), (13, 14), (15, 16)]
pairs_left = [(3, 13), (3, 4), (4, 5), (9, 10), (9, 12), (10, 11)]
pairs_right = [(0, 1), (1, 2), (2, 13), (6, 7), (7, 8), (8, 12)]

# Color tables (currently unused by the drawing code below).
colors = {
    'pink': np.array([197, 27, 125]),  # L lower leg
    'light_pink': np.array([233, 163, 201]),  # L upper leg
    'light_green': np.array([161, 215, 106]),  # L lower arm
    'green': np.array([77, 146, 33]),  # L upper arm
    'red': np.array([215, 48, 39]),  # head
    'light_red': np.array([252, 146, 114]),  # head
    'light_orange': np.array([252, 141, 89]),  # chest
    'purple': np.array([118, 42, 131]),  # R lower leg
    'light_purple': np.array([175, 141, 195]),  # R upper
    'light_blue': np.array([145, 191, 219]),  # R lower arm
    'blue': np.array([69, 117, 180]),  # R upper arm
    'gray': np.array([130, 130, 130]),  #
    'white': np.array([255, 255, 255]),  #
}
jcolors = [
    'light_pink', 'light_pink', 'light_pink', 'pink', 'pink', 'pink',
    'light_blue', 'light_blue', 'light_blue', 'blue', 'blue', 'blue',
    'purple', 'purple', 'red', 'green', 'green', 'white', 'white'
]
ecolors = {
    0: 'light_pink',
    1: 'light_pink',
    2: 'light_pink',
    3: 'pink',
    4: 'pink',
    5: 'pink',
    6: 'light_blue',
    7: 'light_blue',
    8: 'light_blue',
    9: 'blue',
    10: 'blue',
    11: 'blue',
    12: 'purple',
    13: 'light_green',
    14: 'light_green',
    15: 'purple'
}

# NOTE(review): machine-specific dataset location -- adjust before running.
root = "/yzbdata/MeshTrack/Data/HMR/Human/Subject/"
image_root = osp.join(root, "datapre_all")

# NOTE(review): this overrides the --in_filename CLI flag parsed above,
# making the flag a no-op -- confirm whether that is intended.
in_filename = "../data/h36m_valid_pred_3d4118.h5"
in_filename_ssadv = "../data/h36m_valid_pred_3dssadv.h5"

print("Read from", in_filename)
f = h5py.File(in_filename, "r")
imagenames = [name.decode() for name in f['imagename'][:]]
print(f.keys())
# Per-sample predicted scales; used to place the skeleton in depth below.
scales_pre = np.array(f['scale_pre'])
poses2d = np.array(f['joint_2d_gt'])
poses3d = np.array(f['joint_3d_pre'])
poses3d_gt = np.array(f['joint_3d_gt'])
# Root-center the GT 3D poses (joint 13 is the root).
poses3d_gt = poses3d_gt - poses3d_gt[:, 13:14]
f.close()

f = h5py.File(in_filename_ssadv, "r")
poses3d_ssadv = np.array(f['joint_3d_pre'])
f.close()

t = trange(0, len(imagenames))
processed_video_names = []

def plot_skeleton_2d(all_frames, joints_2d):
    # Draw the 2D skeleton in place on every frame (BGR line colors).
    out_frames = []  # unused, kept as-is
    # Line width scales with image size, with a floor of 4.
    radius = max(4, (np.mean(all_frames[0].shape[:2]) * 0.01).astype(int))
    for idx in range(len(all_frames)):
        for pair in pairs:
            i, j = pair
            pt1, pt2 = joints_2d[idx, i], joints_2d[idx, j]
            x11, y11 = pt1
            x22, y22 = pt2
            if pair in pairs_left:
                color = (205, 0, 0)
            elif pair in pairs_right:
                color = (0, 205, 0)
            else:
                color = (0, 165, 255)
            cv2.line(all_frames[idx], (int(x11), int(y11)), (int(x22), int(y22)), color, radius-2)

def get_xxyys(names):
    # Per-frame (xmin, xmax, ymin, ymax) person boxes from the H36M
    # part-label .mat files.  Unused in this variant (crop path disabled).
    xxyys = []
    # should be subject, action, camera
    splits = names[0].split('/')
    video_name = '/'.join(splits[:-1])
    part_label_path = osp.join(root, splits[0], 'MySegmentsMat', 'PartLabels',
            splits[1] + ("cam" + splits[2]).replace('cam0', '.54138969').replace('cam2','.58860488').replace('cam1', '.55011271').replace('cam3', '.60457274') + ".mat")
    f = h5py.File(part_label_path, "r")
    for idx, name in enumerate(names):
        # assumes part labels align with the 30-frame downsampling -- TODO confirm
        partmask = f[f['Feat'][idx*30, 0]][()].T
        yp, xp = np.where(partmask != 0)
        xmin, xmax = np.min(xp), np.max(xp) + 1
        ymin, ymax = np.min(yp), np.max(yp) + 1
        xxyys.append((xmin, xmax, ymin, ymax))
    f.close()
    return xxyys

def crop_image(all_frames, xxyys, scale_factor=0.25):
    # Square-crop each frame around its box, downscale, flip vertically and
    # convert BGR -> RGB in [0, 1].  Unused in this variant.
    out_frames = []
    for frame, xxyy in zip(all_frames, xxyys):
        h, w = frame.shape[:2]
        xmin, xmax, ymin, ymax = xxyy
        xc, yc = (xmin + xmax) / 2, (ymin + ymax) / 2
        l = max(xmax - xmin, ymax - ymin)
        xmin, xmax = max(0, xc - l/2), min(w, xc + l / 2)
        ymin, ymax = max(0, yc - l/2), min(h, yc + l / 2)
        xmin, xmax = int(xmin), int(xmax)
        ymin, ymax = int(ymin), int(ymax)
        frame = frame[ymin:ymax, xmin:xmax, :].copy()
        frame = cv2.resize(frame, (int(scale_factor * w), int(scale_factor * h)))
        frame = frame[::-1, :, ::-1] / 255
        out_frames.append(frame)
    return out_frames

for imageid in t:
    name = imagenames[imageid]
    splits = name.split('/')
    # subject/action/camera identifies one video
    video_name = '/'.join(splits[:3])
    if len(processed_video_names) == seq_num:
        print("Finished! Rendered {} sequences, saved to {}".format(seq_num, save_dir))
        break
    if video_name in processed_video_names:
        continue
    else:
        processed_video_names.append(video_name)
    print(video_name)
    recs = [(idx, name) for idx, name in enumerate(imagenames) if video_name in name]
    # downsample: keep every 30th frame
    recs = recs[::30]
    # sort by frame number (last path component)
    recs = sorted(recs, key=lambda x: int(x[1].split('/')[-1]))
    names_in_video = [rec[1] for rec in recs]
    indices_in_video = [rec[0] for rec in recs]
    path_format = osp.join(image_root, splits[0], splits[1].replace(' ', '_'), "cam" + splits[2], "{:06d}.jpg")
    scales_in_video = scales_pre[indices_in_video]
    poses3d_in_video = poses3d[indices_in_video]
    poses2d_in_video = poses2d[indices_in_video]
    poses3d_ssadv_in_video = poses3d_ssadv[indices_in_video]
    poses3d_gt_in_video = poses3d_gt[indices_in_video]
    # frame numbers in the h5 are 0-based; image files are 1-based
    all_frames = [cv2.imread(path_format.format(int(name.split('/')[-1])+1)) for name in names_in_video]
    print("Ploting 2d skeleton...")
    plot_skeleton_2d(all_frames, poses2d_in_video)
    scale_factor = 0.20
    # use original images
    # Downscale, flip vertically and convert BGR -> RGB in [0, 1] for plot_surface.
    all_frames = [cv2.resize(frame, (int(scale_factor * frame.shape[1]), int(scale_factor * frame.shape[0])))[::-1, :, ::-1] / 255 for frame in all_frames]
    print("Generating gifs...")

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(elev=10., azim=45.)
    lines_3d, lines_3d_gt = [], []
    lines_3d_ssadv = []
    radius = 0.75
    initialized = False
    num_render = len(names_in_video)
    print(num_render, " frames to plot")
    # Depth multiplier: skeleton sits at z = constant_c * predicted scale.
    constant_c = 2.0

    def update_video(frame_idx):
        # Draw (or update) the predicted 3D skeleton for one frame, with the
        # camera image at the z=0 plane, and save a PNG.  GT/ssadv drawing is
        # disabled; their poses are still fetched below but not drawn.
        global initialized, lines_3d, lines_3d_gt, lines_3d_ssadv
        print("{}/{} ".format(frame_idx, num_render), end='\r')
        scale = scales_in_video[frame_idx].item()
        pose2d = poses2d_in_video[frame_idx]
        pose3d = poses3d_in_video[frame_idx]
        # move pose 3d to 0, 0, c * s
        pose3d = pose3d + np.array([0, 0, constant_c * scale]).reshape(1, -1)
        pose3d_ssadv = poses3d_ssadv_in_video[frame_idx]
        pose3d_gt = poses3d_gt_in_video[frame_idx]

        # Paint the camera image on a plane just behind z=0,
        # centered on the (shifted) predicted root joint.
        r = 0.95
        xroot, yroot, zroot = pose3d[13, 0], -pose3d[13, 1], pose3d[13, 2]
        xx = np.linspace(-r * radius + xroot, r * radius + xroot, all_frames[frame_idx].shape[1])
        yy = np.linspace(-r * radius + yroot, r * radius + yroot, all_frames[frame_idx].shape[0])
        xx, yy = np.meshgrid(xx, yy)
        # set the surface's z to 0
        zz = np.ones_like(xx) * (-0.2)
        ax.plot_surface(zz, xx, yy, rstride=1, cstride=1, facecolors=all_frames[frame_idx], shade=False)

        if not initialized:
            # First call: create one Line3D per skeleton edge.
            for idx, pair in enumerate(pairs):
                i, j = pair
                if pair in pairs_left:
                    color = "blue"
                elif pair in pairs_right:
                    color = "green"
                else:
                    color = "darkorange"
                pt1, pt2 = pose3d[i], pose3d[j]
                x11, y11, z11 = pt1
                x22, y22, z22 = pt2
                # Axes permuted (z, x, -y) so depth runs along matplotlib's x.
                lines_3d.append(ax.plot([z11, z22], [x11, x22], [-y11, -y22], c=color, linewidth=3, label="pre"))
            initialized = True
        else:
            # Subsequent calls: move the existing artists instead of re-plotting.
            for idx, pair in enumerate(pairs):
                i, j = pair
                pt1, pt2 = pose3d[i], pose3d[j]
                x11, y11, z11 = pt1
                x22, y22, z22 = pt2
                lines_3d[idx][0].set_xdata([z11, z22])
                lines_3d[idx][0].set_ydata([x11, x22])
                lines_3d[idx][0].set_3d_properties([-y11, -y22])

        ax.set_ylim3d([-radius+xroot, radius+xroot])
        ax.set_zlim3d([-radius+yroot, radius+yroot])
        # min should cover 0
        ax.set_xlim3d([-0.2, constant_c + radius])
        ax.get_xaxis().set_ticklabels([])
        ax.get_yaxis().set_ticklabels([])
        ax.set_zticklabels([])

        # Fully transparent panes/axis lines.
        # NOTE(review): `w_xaxis` etc. are deprecated/removed in matplotlib >= 3.8.
        white = (1.0, 1.0, 1.0, 0.0)
        ax.w_xaxis.set_pane_color(white)
        ax.w_yaxis.set_pane_color(white)

        ax.w_xaxis.line.set_color(white)
        ax.w_yaxis.line.set_color(white)
        ax.w_zaxis.line.set_color(white)

        r = 0.95
        ax.set_xlabel('Z', fontsize=13)
        ax.set_ylabel("X", fontsize=13)
        ax.set_zlabel("Y", fontsize=13)
        plt.savefig(osp.join(save_dir, f"{video_name.replace('/', '_')}_{frame_idx}.png"))

    # Save individual PNGs first, then the animated GIF.
    for idx in range(len(names_in_video)):
        update_video(idx)
    ani = animation.FuncAnimation(fig, update_video, range(len(names_in_video)), interval=20)
    save_name = name.replace('/', '_')
    ani.save(osp.join(save_dir, f"{save_name}.gif"), writer='imagemagick', fps=20)
    t.set_postfix(index=int(imageid))
#!/usr/bin/env python
# coding=utf-8
"""Trim TRAIN.LOSS_WEIGHTS to four entries in every .yaml config under cwd.

Walks the current directory tree, and for each YAML file whose
TRAIN.LOSS_WEIGHTS list has exactly five entries, drops the fifth entry
and rewrites the file in place.
"""

import os
import os.path as osp

import yaml

for _root, _dirs, _files in os.walk("./"):
    for _file in _files:
        if not _file.endswith(".yaml"):
            continue
        filepath = osp.join(_root, _file)
        with open(filepath, "r") as f:
            # safe_load: yaml.load() without an explicit Loader is deprecated
            # (and Loader is mandatory in PyYAML >= 6); safe_load also avoids
            # executing arbitrary object tags from the config files.
            data = yaml.safe_load(f)

        loss_weights = data['TRAIN']['LOSS_WEIGHTS']
        if len(loss_weights) == 5:
            # Keep only the first four weights.
            data['TRAIN']['LOSS_WEIGHTS'] = loss_weights[:4]

        with open(filepath, "w") as f:
            yaml.dump(data, f)
#!/usr/bin/env python3
# coding=utf-8
"""Sanity-check 3D joint predictions by perspective reprojection.

Loads ground-truth 2D joints plus predicted and ground-truth 3D joints
from an HDF5 file, normalizes and root-centers the 2D joints, shifts the
3D skeletons to a canonical depth, projects them back to the image plane
by perspective division, and prints the mean 2D reprojection error for
both the ground-truth and the predicted 3D joints.
"""

import h5py
import numpy as np
import pickle as pkl

# Permutation from the Video3D joint order to this project's joint order.
# NOTE(review): unused in this script (unlike validate_project_pre.py) --
# presumably the mesh file already stores joints in our order; confirm.
v3d_to_ours = [3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0, 7, 9, 10]

# filepath = "../data/h36m_valid_pred_3d.h5"
filepath = "../../unsupervised_mesh/data/h36m_valid_pred_3d_mesh.h5"
f = h5py.File(filepath, "r")
joints_2d_gt = np.array(f['joint_2d_gt'])
joints_3d_pre = np.array(f['joint_3d_pre'])
joints_3d_gt = np.array(f['joint_3d_gt'])
f.close()

factor_path = "../data/h36m_test_factor_3d.pkl"
f = open(factor_path, "rb")
factors = pkl.load(f)
f.close()
# NOTE(review): the per-sample factors loaded above are immediately replaced
# by one global constant -- confirm this override is intentional.
factors = 0.680019

# joints_2d_gt[:, :, 0] = (joints_2d_gt[:, :, 0] - 514.0435) / 500.0
# joints_2d_gt[:, :, 1] = (joints_2d_gt[:, :, 1] - 506.7003) / 500.0
# Map pixel coordinates into [-1, 1] (assumes a 224x224 crop -- confirm).
joints_2d_gt = (joints_2d_gt - 112.0) / 112.0
# Center on joint 13 (the root), then store a tiny epsilon there so the
# root is not exactly zero.
root2d = joints_2d_gt[:, 13:14].copy()
joints_2d_gt = joints_2d_gt - root2d
joints_2d_gt[:, 13:14] = 1e-5

# Rescale so the mean distance from the last joint to the root is 1/10.
factor_2d = 1 / 10 / np.linalg.norm(joints_2d_gt[:, -1] - joints_2d_gt[:, 13], axis=1).mean()
# scale the 2d joints
# joints_2d_gt = joints_2d_gt * factor_2d * factors[:, 0:1, 0:1]
joints_2d_gt = joints_2d_gt * factor_2d

# then we project the 3d joints
# minus the root and shift to (0, 0, 10)
joints_3d_gt = joints_3d_gt - joints_3d_gt[:, 13:14].copy()
joints_3d_gt = joints_3d_gt / factors
shift = np.array([0, 0, 10]).reshape(1, 1, 3)
root3d_gt = joints_3d_gt[:, 13:14].copy()
joints_3d_gt = joints_3d_gt - root3d_gt + shift

# project the 3d joints
# N * J * 2
# Pinhole projection: divide x, y by depth.
project_gt_2d = joints_3d_gt[..., :2] / joints_3d_gt[..., 2:]
# Per-sample bounding boxes of the observed (1) and projected (2) 2D joints.
x1_min, x1_max = joints_2d_gt[..., 0:1].min(axis=1, keepdims=True), joints_2d_gt[..., 0:1].max(axis=1, keepdims=True)
y1_min, y1_max = joints_2d_gt[..., 1:].min(axis=1, keepdims=True), joints_2d_gt[..., 1:].max(axis=1, keepdims=True)
x2_min, x2_max = project_gt_2d[..., 0:1].min(axis=1, keepdims=True), project_gt_2d[..., 0:1].max(axis=1, keepdims=True)
y2_min, y2_max = project_gt_2d[..., 1:].min(axis=1, keepdims=True), project_gt_2d[..., 1:].max(axis=1, keepdims=True)
# scales: ratio of projected to observed box size, averaged over x and y;
# scale_mids: blend of that ratio with the mean observed box extent.
scales = ((x2_max - x2_min) / (x1_max - x1_min) + (y2_max - y2_min) / (y1_max - y1_min)) / 2
heights, widths = y1_max - y1_min, x1_max - x1_min
scale_mids = (scales + (heights + widths) / 2) / 2
print("Mean/Std of scale mid: {:.3f}/{:.3f}".format(scale_mids.mean(), scale_mids.std()))

# with open("../data/h36m_valid_scales.pkl", "wb") as f:
#     pkl.dump({"scale": scales.reshape(-1), "scale_mid": scale_mids.reshape(-1)}, f)

# Mean Euclidean reprojection error of the GT 3D joints.
err_gt = np.linalg.norm(project_gt_2d - joints_2d_gt, axis=-1).mean()
print("Projection GT error is: {:.4f}".format(err_gt))

# first descale, minus the root, and shift
joints_3d_pre = joints_3d_pre / factors
root3d_pre = joints_3d_pre[:, 13:14].copy()
joints_3d_pre = joints_3d_pre - root3d_pre + shift
project_pre_2d = joints_3d_pre[..., :2] / joints_3d_pre[..., 2:]
# Mean Euclidean reprojection error of the predicted 3D joints.
err_pre = np.linalg.norm(project_pre_2d - joints_2d_gt, axis=-1).mean()
print("Projection PRE error is: {:.4f}".format(err_pre))
#!/usr/bin/env python3
# coding=utf-8
"""Reproject ground-truth 3D joints onto *predicted* 2D joints.

Same pipeline as validate_project.py, but the 2D joints here are detector
predictions in full-image pixel coordinates, and the per-sample
scale/scale_mid statistics are written to a pickle for later use.
"""

import h5py
import numpy as np
import pickle as pkl

# Permutation from the Video3D joint order to this project's joint order.
v3d_to_ours = [3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0, 7, 9, 10]

filepath = "../data/h36m_valid_pred3.h5"
f = h5py.File(filepath, "r")
joints_2d_pre = np.array(f['joint_2d_pre'])[:, v3d_to_ours]
joints_3d_gt = np.array(f['joint_3d_gt'])[:, v3d_to_ours]
f.close()

# Per-sample 3D scale factors.
factor_path = "../data/h36m_test_factor_3d.pkl"
f = open(factor_path, "rb")
factors = pkl.load(f)
f.close()

# Normalize pixel coordinates.
# NOTE(review): 514.0435/506.7003 look like a principal point and 500.0 a
# focal-length-like scale for H36M -- confirm against the camera intrinsics.
joints_2d_pre[:, :, 0] = (joints_2d_pre[:, :, 0] - 514.0435) / 500.0
joints_2d_pre[:, :, 1] = (joints_2d_pre[:, :, 1] - 506.7003) / 500.0
# Center on joint 13 (the root), keeping a tiny epsilon there instead of zero.
root2d = joints_2d_pre[:, 13:14].copy()
joints_2d_pre = joints_2d_pre - root2d
joints_2d_pre[:, 13:14] = 1e-5

# Rescale so the mean distance from the last joint to the root is 1/10.
factor_2d = 1 / 10 / np.linalg.norm(joints_2d_pre[:, -1] - joints_2d_pre[:, 13], axis=1).mean()
# scale the 2d joints
# joints_2d_pre = joints_2d_pre * factor_2d * factors[:, 0:1, 0:1]
joints_2d_pre = joints_2d_pre * factor_2d

# then we project the 3d joints
# minus the root and shift to (0, 0, 10)
joints_3d_gt = joints_3d_gt - joints_3d_gt[:, 13:14].copy()
joints_3d_gt = joints_3d_gt / factors
shift = np.array([0, 0, 10]).reshape(1, 1, 3)
root3d_gt = joints_3d_gt[:, 13:14].copy()
joints_3d_gt = joints_3d_gt - root3d_gt + shift

# project the 3d joints
# N * J * 2
# Pinhole projection: divide x, y by depth.
project_gt_2d = joints_3d_gt[..., :2] / joints_3d_gt[..., 2:]
# Per-sample bounding boxes of the observed and the projected 2D joints.
x1_min, x1_max = joints_2d_pre[..., 0:1].min(axis=1, keepdims=True), joints_2d_pre[..., 0:1].max(axis=1, keepdims=True)
y1_min, y1_max = joints_2d_pre[..., 1:].min(axis=1, keepdims=True), joints_2d_pre[..., 1:].max(axis=1, keepdims=True)
x2_min, x2_max = project_gt_2d[..., 0:1].min(axis=1, keepdims=True), project_gt_2d[..., 0:1].max(axis=1, keepdims=True)
y2_min, y2_max = project_gt_2d[..., 1:].min(axis=1, keepdims=True), project_gt_2d[..., 1:].max(axis=1, keepdims=True)
ws = x1_max - x1_min
hs = y1_max - y1_min
hws = (hs + ws) / 2
# scales: ratio of projected to observed box size, averaged over x and y;
# scale_mids: blend of that ratio with the mean observed box extent.
scales = ((x2_max - x2_min) / (x1_max - x1_min) + (y2_max - y2_min) / (y1_max - y1_min)) / 2
scale_mids = (scales + hws) / 2
print("Mean/Std of scale mid: {:.3f}/{:.3f}".format(scale_mids.mean(), scale_mids.std()))

# Persist per-sample statistics for downstream consumers.
with open("../data/h36m_valid_scales_pre.pkl", "wb") as f:
    pkl.dump({"scale": scales.reshape(-1), "scale_mid": scale_mids.reshape(-1)}, f)

# Mean Euclidean reprojection error of the GT 3D joints vs. predicted 2D.
err_gt = np.linalg.norm(project_gt_2d - joints_2d_pre, axis=-1).mean()
print("Projection GT error is: {:.4f}".format(err_gt))

# first descale, minus the root, and shift
# joints_3d_pre = joints_3d_pre / factors
# root3d_pre = joints_3d_pre[:, 13:14].copy()
# joints_3d_pre = joints_3d_pre - root3d_pre + shift
# project_pre_2d = joints_3d_pre[..., :2] / joints_3d_pre[..., 2:]
# err_pre = np.linalg.norm(project_pre_2d - joints_2d_pre, axis=-1).mean()
# print("Projection PRE error is: {:.4f}".format(err_pre))
+ +
+ + +## Introduction + +These benchmarks are maintained by [InsightFace](https://insightface.ai) + + +
+ +
+ + + +## Supported Benchmarks +- [MFR-Ongoing](mfr) (Ongoing version of iccv21-mfr) +- [MFR21 (ICCVW'2021)](iccv21-mfr) +- [LFR19 (ICCVW'2019)](iccv19-lfr) + + + + + + diff --git a/insightface/challenges/cvpr23-fas-wild/README.md b/insightface/challenges/cvpr23-fas-wild/README.md new file mode 100644 index 0000000000000000000000000000000000000000..49d30ff7f6a04237cabb4b949d525ff5328d140a --- /dev/null +++ b/insightface/challenges/cvpr23-fas-wild/README.md @@ -0,0 +1,172 @@ +# 4th Face Anti-spoofing Workshop and Challenge@CVPR2023, Wild Track + +
+ +
+ +## Updates + +**``2023-05-03``**: We have launched the ongoing version of this challenge. You can start submitting your test results at this [new link](https://codalab.lisn.upsaclay.fr/competitions/12933). If you have not applied for the dataset yet, you need to send an application email to both of ``insightface.challenge@gmail.com`` and ``wangdong@moredian.com``. + + + +**``2023-02-15``**: The annotation format in readme is fixed: + ``` + - e.g: Train/spoof/2D-Display-Phone/000001/000001.txt + 192 148 (bbox left top) + 234 203 (bbox right bottom) + 216 171 (landmark left eye) + 230 168 (landmark right eye) + 231 180 (landmark nose) + 218 190 (landmark left mouth ) + 229 188 (landmark right mouth ) + ``` + +## Challenge + +We host the WILD track of Face Anti-spoofing Workshop and Challenge@CVPR2023 here. The challenge will officially start together with [4th Face Anti-spoofing Workshop](https://sites.google.com/view/face-anti-spoofing-challenge/welcome/challengecvpr2023). + +[Registration](https://codalab.lisn.upsaclay.fr/competitions/10670) is now open on codalab. + +Our competition encompasses over 800K spoof photos and over 500K live photos. In the spoof photos, there are three major categories and 17 subcategories. + +### Rules and Regulations + +1) Any extra data or pretrained model trained from extra data cannot be used in this challenge. + +2) Only one DL model can be used, we can not accept the fusion results from many DL models. The computational cost of a single DL model should be **less than 5G FLOPs**. (FLOPs can be calculated using `ptflops` python library) + +3) The top-3 winners are required to submit the code for the entire method, ensuring reproducibility of the results and compliance with all contest rules, otherwise the score will be disqualified. 
+ +### Timeline + +| Phase | Start Date | End Date | Intro | +|-------|------------|---------------------|---------------------------------------------------------------------| +| Dev | 2023-02-13 | 2023-03-15 | evaluate the accuracy on dev set | +| Test | 2023-03-15 | 2023-03-28 23:59:59 | evaluate the accuracy on test set, using the threshold from dev set | + + +### Rewards + + +| Rank | Prize | +|-----------|---------| +| 1st place | $ 3,000 | +| 2nd place | $ 1,500 | +| 3rd place | $ 500 | + +Sponsors: ``Moredian Technology`` + +## Evaluation + +### Evaluation Criteria + +For the performance evaluation, we selected the recently standardized ISO/IEC 30107-3 metrics: Attack Presentation Classification Error Rate (APCER), Normal/Bona Fide Presentation Classification Error Rate (NPCER/BPCER) and Average Classification Error Rate (ACER) as the evaluation metric, in which APCER and BPCER/NPCER are used to measure the error rate of fake or live samples, respectively. The ACER value is used as the final evaluation criterion. + + +### Submission Format + +**Phase1**: training dataset is used to train the model (Label: live=1, fake=0). Then the trained model is used to predict the sample scores in dev.txt. Participants can directly submit the predicted score file in codalab system. Note that the order of the samples in dev.txt cannot be changed. The final submitted file contains a total of 140,058 lines. Each line in the file contains two parts separated by a space. The first part is the path of each image in dev.txt and must contain the set name(``dev/``), and the second part is the prediction score given by the model (representing the probability that the sample belongs to the live face, which must be in the range of [0.0, 1.0]). Such as: +``` +dev/000001.jpg 0.15361 #Note: line 1- the first row of dev.txt + +...... 
+ +dev/140058.jpg 0.23394 #Note: line 140,058 the last row of dev.txt +``` +The predicted file should be a ``.txt`` file and compressed into a ZIP file (do not add any folder in the ZIP). + + +**Phase2**: In order to submit results at one time, participants need to combine the dev and test predictions into one file before result submission via codalab system. Note that the order of the samples cannot be changed and the dev sample list needs to be written before the test samples. + +The final submission file contains a total of 895,237 lines. Each line in the file contains two parts separated by a space. Such as: +``` +dev/000001.jpg 0.15361 #Note: line 1- the first row of dev.txt + +...... + +dev/140058.jpg 0.23394 #Note: line 140,058 the last row of dev.txt +test/000001.jpg 0.15361 #Note: line 140,059 the first row of test.txt + +...... + +test/755179.jpg 0.23394 #Note: line 895,237 the last row of test.txt +``` +The predicted file should be a ``.txt`` file and compressed into a ZIP file (do not add any folder in the ZIP). + +## Dataset + +### Rules + +1. The dataset and its subsets can only be used for academic research purposes. +2. The user is not allowed to use the dataset or its subsets for any type of commercial purpose. +3. Any form of usage of the dataset in defamatory, pornographic, or any other unlawful manner, or violation of any applicable regulations or laws is forbidden. We are not responsible for any consequences caused by the above behaviors. +4. The User is not allowed to distribute, broadcast, or reproduce the dataset or its subsets in any way without official permission. +5. The user is not allowed to share, transfer, sell or resell the dataset or its subsets to any third party for any purpose. HOWEVER, providing the dataset access to user’s research associates, colleagues or team member is allowed if user’s research associates, colleagues or team member agree to be bound by these usage rules. +6. 
All images in this dataset can be used for academic research purposes, BUT, only the approved images of the dataset can be exhibited on user’s publications(including but not limited to research paper, presentations for conferences or educational purposes). The approved images have special marks and are listed in a appendix. +7. We reserve the right to interpret and amend these rules. +8. please cite us if the InsightFace Wild Anti-Spoofing Dataset or its subset is used in your research: +``` +@misc{wang2023wild, + title={Wild Face Anti-Spoofing Challenge 2023: Benchmark and Results}, + author={Dong Wang and Jia Guo and Qiqi Shao and Haochi He and Zhian Chen and Chuanbao Xiao and Ajian Liu and Sergio Escalera and Hugo Jair Escalante and Lei Zhen and Jun Wan and Jiankang Deng}, + year={2023}, + eprint={2304.05753}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + + + +### Download + +All users can obtain and use this dataset and its subsets only after signing the [Agreement](https://github.com/nttstar/insightface-resources/raw/master/files/License%20Agreement%20for%20InsightFace%20Wild%20Anti-Spoofing%20Dataset.pdf) and sending it to the official e-mail ``insightface.challenge#gmail.com``. + + +### Dataset Annotations + +Please refer to the following table for detailed information on the number of labeled data and examples in the dataset: + +#### Spoof Images, Training Set + +Training Subset, live/spoof labels and categorization information are given: + +
+ +
+ +#### Spoof Images, Dev and Test Sets + +Dev and Test Subsets, where dev set is used to select the threshold. + +
+ +
+ +#### Live Images + +There're 205,146 live images in training set, and 51,299/273,126 images in dev and test sets respectively. + + +## Baselines + +### Dev + +| Backbone | Input Crop | FLOPs | APCER | BPCER | ACER | +|----------|------------|-------|--------|--------|--------| +| ResNet18 | 224x224 | 1.8G | 4.244% | 4.245% | 4.245% | + + +### Test + +| Backbone | Input Crop | FLOPs | APCER | BPCER | ACER | +|----------|------------|-------|--------|--------|--------| +| ResNet18 | 224x224 | 1.8G | 6.145% | 8.874% | 7.509% | + + +## Feedback + +1) If you have any questions regarding the challenge, kindly open an issue on insightface github. (recommended) +2) Or you can send an e-mail to ``insightface.challenge#gmail.com`` + diff --git a/insightface/challenges/frvt-impl/README.md b/insightface/challenges/frvt-impl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6c3ea5c6af3a870673f525361e5a159b2702a57 --- /dev/null +++ b/insightface/challenges/frvt-impl/README.md @@ -0,0 +1,21 @@ +## FRVT-Implementation + + +
+ +
+ + +## Introduction + +We provide a [FRVT-1:1](https://pages.nist.gov/frvt/html/frvt11.html) implementation example here. One can easily build FRVT-1:1 submission by simply putting insightface trained ONNX models into the codebase. + + + +Coming soon. + + + + + + diff --git a/insightface/challenges/iccv19-lfr/README.md b/insightface/challenges/iccv19-lfr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..12aa5354959447e690654c90e5f55a4eafefdcff --- /dev/null +++ b/insightface/challenges/iccv19-lfr/README.md @@ -0,0 +1,111 @@ +[The Lightweight Face Recognition Challenge & Workshop](https://ibug.doc.ic.ac.uk/resources/lightweight-face-recognition-challenge-workshop/) will be held in conjunction with the International Conference on Computer Vision (ICCV) 2019, Seoul Korea. + +Please strictly follow the rules. For example, please use the same [method](https://github.com/deepinsight/insightface/blob/master/common/flops_counter.py) for the FLOPs calculation regardless of your training framework is insightface or not. + +[Test Server](http://www.insightface-challenge.com/overview) + +**Sponsors:** + +The Lightweight Face Recognition Challenge has been supported by + +EPSRC project FACER2VM (EP/N007743/1) + +Huawei (5000$) + +DeepGlint (3000$) + +iQIYI (3000$) + +Kingsoft Cloud (3000$) + +Pensees (3000$) + +Dynamic funding pool: (17000$) + +Cash sponsors and gift donations are welcome. + +Contact: +insightface.challenge@gmail.com + +**Discussion Group** + +*For Chinese:* + +![wechat](https://insightface.ai/assets/img/github/lfr19_wechat1.jpg) + +*For English:* + +(in #lfr2019 channel) +https://join.slack.com/t/insightface/shared_invite/enQtNjU0NDk2MjYyMTMzLTIzNDEwNmIxMjU5OGYzYzFhMjlkNjlhMTBkNWFiNjU4MTVhNTgzYjQ5ZTZiMGM3MzUyNzQ3OTBhZTg3MzM5M2I + + +**NEWS** + +``2019.06.21`` We updated the groundtruth of Glint test dataset. + +``2019.06.04`` We will clean the groundtruth on deepglint testset. 
+ +``2019.05.21`` Baseline models and training logs available. + +``2019.05.16`` The four tracks (deepglint-light, deepglint-large, iQIYI-light, iQIYI-large) will equally share the dynamic funding pool (14000$). From each track, the top 3 players will share the funding pool for 50%, 30% and 20% respectively. + +================== + +**How To Start:** + +**Training:** + +1. Download ms1m-retinaface from [baiducloud](https://pan.baidu.com/s/14z7qbi0K8aAYDcgT4ArnWg)(code:4ouw) or [onedrive](https://1drv.ms/u/s!AswpsDO2toNKrjhJhMRoxr-HlECx?e=VSXTmv) and unzip it to `$INSIGHTFACE_ROOT/datasets/` +2. Go into `$INSIGHTFACE_ROOT/recognition/` +3. Refer to the `retina` dataset configuration section in `sample_config.py` and copy it as your own configuration file `config.py`. +4. Start training with `CUDA_VISIBLE_DEVICES='0,1,2,3' python -u train.py --dataset retina --network [your-network] --loss arcface`. It will output the accuracy of lfw, cfp_fp and agedb_30 every 2000 batches by default. +5. Putting the training dataset on SSD hard disk will achieve better training efficiency. + +------------------ + +**Testing:** + +1. Download testdata-image from [baiducloud](https://pan.baidu.com/s/1UKUYsRfVTSzj1tfU3BVFrw) or [dropbox](https://www.dropbox.com/s/r5y6xt754m36rh8/iccv19-challenge-data-v1.zip?dl=0). These face images are all pre-processed and aligned. +2. To download testdata-video from iQIYI, please visit . You need to download iQIYI-VID-FACE.z01, iQIYI-VID-FACE.z02 and iQIYI-VID-FACE.zip after registration. These face frames are also pre-processed and aligned. + 1. Unzip: ``zip iQIYI_VID_FACE.zip -s=0 --out iQIYI_VID_FACE_ALL.zip; unzip iQIYI_VID_FACE_ALL.zip`` + 2. We can get a directory named ``iQIYI_VID_FACE`` after decompression. Then, we have to move ``video_filelist.txt`` in testdata-image package to ``iQIYI_VID_FACE/filelist.txt``, to indicate the order of videos in our submission feature file. +3. 
To generate image feature submission file: check ``gen_image_feature.py``
+4. To generate video feature submission file: check ``gen_video_feature.py``
+5. Submit binary feature to the right track of the test server.
+
+You can also check the verification performance during training time on the LFW, CFP_FP and AgeDB_30 datasets.
+
+------------------
+
+**Evaluation:**
+
+Final ranking is determined by the TAR under the 1:1 protocol only, for all valid submissions.
+
+For the image test set, we evaluate the TAR under FAR@e-8, while we choose the TAR under FAR@e-4 for the video test set.
+
+------------------
+
+**Baseline:**
+
+1. Network y2 (a deeper mobilefacenet): 933M FLOPs. TAR_image: 0.64691, TAR_video: 0.47191
+2. Network r100fc (ResNet100FC-IR): 24G FLOPs. TAR_image: 0.80312, TAR_video: 0.64894
+
+Baseline models download link: [baidu cloud](https://pan.baidu.com/s/1Em0ZFnefSoTsZoTd-9m8Nw) [dropbox](https://www.dropbox.com/s/yqaziktiv38ehrv/iccv19-baseline-models.zip?dl=0)
+
+Training logs: [baidu cloud](https://pan.baidu.com/s/12rsp-oMzsjTeU6nugEvA9g) [dropbox](https://www.dropbox.com/s/4ufb9g7n76rfav5/iccv-baseline-log.zip?dl=0)
+
+------------------
+
+**Discussion:**
+
+[https://github.com/deepinsight/insightface/issues/632](https://github.com/deepinsight/insightface/issues/632)
+
+------------------
+
+**Candidate solutions:**
+
+1. Manually design or automatically search different networks/losses.
+2. Use slightly deeper or wider mobile-level networks.
+3. [OctConv](https://arxiv.org/abs/1904.05049), to reduce FLOPs.
+4. [HRNet](https://arxiv.org/abs/1904.04514), for large FLOPs track.
+and so on diff --git a/insightface/challenges/iccv19-lfr/gen_image_feature.py b/insightface/challenges/iccv19-lfr/gen_image_feature.py new file mode 100644 index 0000000000000000000000000000000000000000..c555227b910db2216dfbe5a3ca55aa6177720653 --- /dev/null +++ b/insightface/challenges/iccv19-lfr/gen_image_feature.py @@ -0,0 +1,157 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from datetime import datetime +import os.path +from easydict import EasyDict as edict +import time +import json +import sys +import numpy as np +import importlib +import itertools +import argparse +import struct +import cv2 +import sklearn +from sklearn.preprocessing import normalize +import mxnet as mx +from mxnet import ndarray as nd + +image_shape = None +net = None +data_size = 1862120 +emb_size = 0 +use_flip = True + + +def do_flip(data): + for idx in range(data.shape[0]): + data[idx, :, :] = np.fliplr(data[idx, :, :]) + + +def get_feature(buffer): + global emb_size + if use_flip: + input_blob = np.zeros( + (len(buffer) * 2, 3, image_shape[1], image_shape[2])) + else: + input_blob = np.zeros((len(buffer), 3, image_shape[1], image_shape[2])) + idx = 0 + for item in buffer: + img = cv2.imread(item)[:, :, ::-1] #to rgb + img = np.transpose(img, (2, 0, 1)) + attempts = [0, 1] if use_flip else [0] + for flipid in attempts: + _img = np.copy(img) + if flipid == 1: + do_flip(_img) + input_blob[idx] = _img + idx += 1 + data = mx.nd.array(input_blob) + db = mx.io.DataBatch(data=(data, )) + net.model.forward(db, is_train=False) + _embedding = net.model.get_outputs()[0].asnumpy() + if emb_size == 0: + emb_size = _embedding.shape[1] + print('set emb_size to ', emb_size) + embedding = np.zeros((len(buffer), emb_size), dtype=np.float32) + if use_flip: + embedding1 = _embedding[0::2] + embedding2 = _embedding[1::2] + embedding = embedding1 + embedding2 + else: + embedding = _embedding + embedding = 
sklearn.preprocessing.normalize(embedding) + return embedding + + +def write_bin(path, m): + rows, cols = m.shape + with open(path, 'wb') as f: + f.write(struct.pack('4i', rows, cols, cols * 4, 5)) + f.write(m.data) + + +def main(args): + global image_shape + global net + + print(args) + ctx = [] + cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip() + if len(cvd) > 0: + for i in range(len(cvd.split(','))): + ctx.append(mx.gpu(i)) + if len(ctx) == 0: + ctx = [mx.cpu()] + print('use cpu') + else: + print('gpu num:', len(ctx)) + image_shape = [int(x) for x in args.image_size.split(',')] + vec = args.model.split(',') + assert len(vec) > 1 + prefix = vec[0] + epoch = int(vec[1]) + print('loading', prefix, epoch) + net = edict() + net.ctx = ctx + net.sym, net.arg_params, net.aux_params = mx.model.load_checkpoint( + prefix, epoch) + #net.arg_params, net.aux_params = ch_dev(net.arg_params, net.aux_params, net.ctx) + all_layers = net.sym.get_internals() + net.sym = all_layers['fc1_output'] + net.model = mx.mod.Module(symbol=net.sym, + context=net.ctx, + label_names=None) + net.model.bind(data_shapes=[('data', (args.batch_size, 3, image_shape[1], + image_shape[2]))]) + net.model.set_params(net.arg_params, net.aux_params) + + features_all = None + + i = 0 + fstart = 0 + buffer = [] + for line in open(os.path.join(args.input, 'filelist.txt'), 'r'): + if i % 1000 == 0: + print("processing ", i) + i += 1 + line = line.strip() + image_path = os.path.join(args.input, line) + buffer.append(image_path) + if len(buffer) == args.batch_size: + embedding = get_feature(buffer) + buffer = [] + fend = fstart + embedding.shape[0] + if features_all is None: + features_all = np.zeros((data_size, emb_size), + dtype=np.float32) + #print('writing', fstart, fend) + features_all[fstart:fend, :] = embedding + fstart = fend + if len(buffer) > 0: + embedding = get_feature(buffer) + fend = fstart + embedding.shape[0] + print('writing', fstart, fend) + features_all[fstart:fend, :] = embedding + 
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

from datetime import datetime
import os.path
from easydict import EasyDict as edict
import time
import json
import glob
import sys
import numpy as np
import importlib
import itertools
import argparse
import struct
import cv2
import sklearn
from sklearn.preprocessing import normalize
import mxnet as mx
from mxnet import ndarray as nd

# Globals filled in by main().
image_shape = None   # [C, H, W] parsed from --image_size
net = None           # loaded MXNet model
data_size = 203848   # number of videos in the test set (rows of the output)
emb_size = 0         # embedding dimension, discovered on first forward pass
use_flip = False     # when True, also embed the flipped frame and sum
ctx_num = 0          # device count; batches are padded to a multiple of it


def do_flip(data):
    """Flip a CHW image horizontally, channel by channel, in place."""
    for idx in range(data.shape[0]):
        data[idx, :, :] = np.fliplr(data[idx, :, :])


def get_feature(buffer):
    """Return L2-normalized embeddings, one row per frame path in `buffer`."""
    global emb_size
    input_count = len(buffer)
    if use_flip:
        input_count *= 2
    network_count = input_count
    if input_count % ctx_num != 0:
        # Pad the batch so it splits evenly across devices; the padded rows
        # are dropped from the output below.
        network_count = (input_count // ctx_num + 1) * ctx_num

    input_blob = np.zeros((network_count, 3, image_shape[1], image_shape[2]),
                          dtype=np.float32)
    idx = 0
    for item in buffer:
        img = cv2.imread(item)[:, :, ::-1]  #to rgb
        img = np.transpose(img, (2, 0, 1))  # HWC -> CHW
        attempts = [0, 1] if use_flip else [0]
        for flipid in attempts:
            _img = np.copy(img)
            if flipid == 1:
                do_flip(_img)
            input_blob[idx] = _img
            idx += 1
    data = mx.nd.array(input_blob)
    db = mx.io.DataBatch(data=(data, ))
    net.model.forward(db, is_train=False)
    _embedding = net.model.get_outputs()[0].asnumpy()
    # Drop the zero-padded rows added for device alignment.
    _embedding = _embedding[0:input_count]
    if emb_size == 0:
        emb_size = _embedding.shape[1]
        print('set emb_size to ', emb_size)
    embedding = np.zeros((len(buffer), emb_size), dtype=np.float32)
    if use_flip:
        # Even rows are originals, odd rows flipped copies; sum the pairs.
        embedding1 = _embedding[0::2]
        embedding2 = _embedding[1::2]
        embedding = embedding1 + embedding2
    else:
        embedding = _embedding
    embedding = sklearn.preprocessing.normalize(embedding)
    return embedding


def write_bin(path, m):
    """Serialize matrix `m` as the challenge binary format.

    Header: four int32 values (rows, cols, row size in bytes, tag 5),
    followed by the raw row-major float32 data.
    """
    rows, cols = m.shape
    with open(path, 'wb') as f:
        f.write(struct.pack('4i', rows, cols, cols * 4, 5))
        f.write(m.data)


def main(args):
    """Embed every video listed in <input>/filelist.txt.

    Frames are queued across video boundaries and embedded in fixed-size
    batches; aggr_nums records the frame count of each pending video so the
    embedding stream can be sliced back per video. Each video's frame
    embeddings are summed and L2-normalized into one output row.
    """
    global image_shape
    global net
    global ctx_num

    print(args)
    # Build the device list from CUDA_VISIBLE_DEVICES; fall back to CPU.
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    ctx_num = len(ctx)
    image_shape = [int(x) for x in args.image_size.split(',')]
    # --model is "<checkpoint-prefix>,<epoch>".
    vec = args.model.split(',')
    assert len(vec) > 1
    prefix = vec[0]
    epoch = int(vec[1])
    print('loading', prefix, epoch)
    net = edict()
    net.ctx = ctx
    net.sym, net.arg_params, net.aux_params = mx.model.load_checkpoint(
        prefix, epoch)
    #net.arg_params, net.aux_params = ch_dev(net.arg_params, net.aux_params, net.ctx)
    all_layers = net.sym.get_internals()
    # Use the embedding layer output, not the classification head.
    net.sym = all_layers['fc1_output']
    net.model = mx.mod.Module(symbol=net.sym,
                              context=net.ctx,
                              label_names=None)
    net.model.bind(data_shapes=[('data', (args.batch_size, 3, image_shape[1],
                                          image_shape[2]))])
    net.model.set_params(net.arg_params, net.aux_params)

    features_all = None

    i = 0
    filelist = os.path.join(args.input, 'filelist.txt')
    #print(filelist)
    buffer_images = []        # frame paths waiting to be embedded
    buffer_embedding = np.zeros((0, 0), dtype=np.float32)  # embedded, unassigned
    aggr_nums = []            # frame counts of videos not yet written out
    row_idx = 0               # next output row in features_all
    for line in open(filelist, 'r'):
        if i % 1000 == 0:
            print("processing ", i)
        i += 1
        #print('stat', i, len(buffer_images), buffer_embedding.shape, aggr_nums, row_idx)
        videoname = line.strip().split()[0]
        images = glob.glob("%s/%s/*.jpg" % (args.input, videoname))
        assert len(images) > 0
        image_features = []
        for image_path in images:
            buffer_images.append(image_path)
        aggr_nums.append(len(images))
        # Drain full batches, then flush every video whose frames are all
        # embedded.
        while len(buffer_images) >= args.batch_size:
            embedding = get_feature(buffer_images[0:args.batch_size])
            buffer_images = buffer_images[args.batch_size:]
            if buffer_embedding.shape[1] == 0:
                buffer_embedding = embedding.copy()
            else:
                buffer_embedding = np.concatenate(
                    (buffer_embedding, embedding), axis=0)
            buffer_idx = 0
            acount = 0
            for anum in aggr_nums:
                if buffer_embedding.shape[0] >= anum + buffer_idx:
                    # All frames of this video are available: sum + normalize.
                    image_features = buffer_embedding[buffer_idx:buffer_idx + anum]
                    video_feature = np.sum(image_features, axis=0, keepdims=True)
                    video_feature = sklearn.preprocessing.normalize(video_feature)
                    if features_all is None:
                        features_all = np.zeros(
                            (data_size, video_feature.shape[1]), dtype=np.float32)
                    #print('write to', row_idx, anum, buffer_embedding.shape)
                    features_all[row_idx] = video_feature.flatten()
                    row_idx += 1
                    buffer_idx += anum
                    acount += 1
                else:
                    break
            # Keep only videos/embeddings that were not flushed yet.
            aggr_nums = aggr_nums[acount:]
            buffer_embedding = buffer_embedding[buffer_idx:]

    if len(buffer_images) > 0:
        # Final partial batch (fewer than batch_size frames remain, since the
        # while-loop above drained all full batches).
        embedding = get_feature(buffer_images)
        buffer_images = buffer_images[args.batch_size:]
        buffer_embedding = np.concatenate((buffer_embedding, embedding),
                                          axis=0)
        buffer_idx = 0
        acount = 0
        for anum in aggr_nums:
            assert buffer_embedding.shape[0] >= anum + buffer_idx
            image_features = buffer_embedding[buffer_idx:buffer_idx + anum]
            video_feature = np.sum(image_features, axis=0, keepdims=True)
            video_feature = sklearn.preprocessing.normalize(video_feature)
            #print('last write to', row_idx, anum, buffer_embedding.shape)
            features_all[row_idx] = video_feature.flatten()
            row_idx += 1
            buffer_idx += anum
            acount += 1

        aggr_nums = aggr_nums[acount:]
        buffer_embedding = buffer_embedding[buffer_idx:]
    # Invariants: every queued video was written and every embedding consumed.
    assert len(aggr_nums) == 0
    assert buffer_embedding.shape[0] == 0

    write_bin(args.output, features_all)
    print(row_idx, features_all.shape)
    #os.system("bypy upload %s"%args.output)


def parse_arguments(argv):
    """Parse command-line flags; see each flag's default for its format."""
    parser = argparse.ArgumentParser()

    parser.add_argument('--batch_size', type=int, help='', default=32)
    parser.add_argument('--image_size', type=str, help='', default='3,112,112')
    parser.add_argument('--input',
                        type=str,
                        help='',
                        default='./testdata-video')
    parser.add_argument('--output', type=str, help='', default='')
    parser.add_argument('--model', type=str, help='', default='')
    return parser.parse_args(argv)


if __name__ == '__main__':
    main(parse_arguments(sys.argv[1:]))
For example, inserting some data-related OPs to onnx graph to enable automatic flip-test is not allowed(or similar ideas). We will check it after submission closed, to ensure fairness. + +**``2021-06-17``** Participants are now ordered in terms of highest scores across two datasets: **TAR@Mask** and **TAR@MR-All**, by the formula of ``0.25 * TAR@Mask + 0.75 * TAR@MR-All``. + + +## Introduction + +The Masked Face Recognition Challenge & Workshop(MFR) will be held in conjunction with the International Conference on Computer Vision (ICCV) 2021. + +[Workshop-Homepage](https://ibug.doc.ic.ac.uk/resources/masked-face-recognition-challenge-workshop-iccv-21/). + +There're InsightFace track here and [Webface260M](https://www.face-benchmark.org/challenge.html) track(with larger training set) in this workshop. + +[**Challenge Leaderboard**](https://insightface.ai/mfr21) + +Submission server link: [http://iccv21-mfr.com/](http://iccv21-mfr.com/) + +An alternative submission server for Non-Chinese users: [http://124.156.136.55/](http://124.156.136.55/) + + +### Discussion group: + +WeChat: + +mfr_group + +QQ Group: 711302608, *answer: github* + +Online issue discussion: [https://github.com/deepinsight/insightface/issues/1564](https://github.com/deepinsight/insightface/issues/1564) + +## Testsets for insightface track + +In this challenge, we will evaluate the accuracy of following testsets: + + * Accuracy between masked and non-masked faces. + * Accuracy among children(2~16 years old). + * Accuracy of globalised multi-racial benchmarks. + +We ensure that there's no overlap between these testsets and public available training datasets, as they are not collected from online celebrities. + +Our test datasets mainly comes from [IFRT](../ifrt). + +### ``Mask test-set:`` + +Mask testset contains 6,964 identities, 6,964 masked images and 13,928 non-masked images. There are totally 13,928 positive pairs and 96,983,824 negative pairs. + +
+ Click to check the sample images(here we manually blur it to protect privacy) + ifrtsample +
+ +### ``Children test-set:`` + +Children testset contains 14,344 identities and 157,280 images. There are totally 1,773,428 positive pairs and 24,735,067,692 negative pairs. + +
+ Click to check the sample images(here we manually blur it to protect privacy) + ifrtsample +
+ +### ``Multi-racial test-set (MR in short):`` + +The globalised multi-racial testset contains 242,143 identities and 1,624,305 images. + +| Race-Set | Identities | Images | Positive Pairs | Negative Pairs | +| ------- | ---------- | ----------- | ----------- | ----------- | +| African | 43,874 | 298,010 | 870,091 | 88,808,791,999 | +| Caucasian | 103,293 | 697,245 | 2,024,609 | 486,147,868,171 | +| Indian | 35,086 | 237,080 | 688,259 | 56,206,001,061 | +| Asian | 59,890 | 391,970 | 1,106,078 | 153,638,982,852 | +| **ALL** | **242,143** | **1,624,305** | **4,689,037** | **2,638,360,419,683** | + +
+ Click to check the sample images(here we manually blur it to protect privacy) + ifrtsample +
+ +## Evaluation Metric + +For ``Mask`` set, TAR is measured on mask-to-nonmask 1:1 protocal, with FAR less than 0.0001(e-4). + +For ``Children`` set, TAR is measured on all-to-all 1:1 protocal, with FAR less than 0.0001(e-4). + +For other sets, TAR is measured on all-to-all 1:1 protocal, with FAR less than 0.000001(e-6). + +Participants are ordered in terms of highest scores across two datasets: **TAR@Mask** and **TAR@MR-All**, by the formula of ``0.25 * TAR@Mask + 0.75 * TAR@MR-All``. + + +## Baselines + + +| Backbone | Dataset | Method | Mask | Children | African | Caucasian | South Asian | East Asian | MR-All | size(mb) | infer(ms) | link | +|------------|------------|------------|--------|----------|---------|-----------|-------------|------------|--------|----------|-----------|-----------| +| R100 | Casia | ArcFace | 26.623 | 30.359 | 39.666 | 53.933 | 47.807 | 21.572 | 42.735 | 248.904 | 7.073 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUJpk8zC61HVN7Kg?e=zE9JDd) | +| R100 | MS1MV2 | ArcFace | 65.767 | 60.496 | 79.117 | 87.176 | 85.501 | 55.807 | 80.725 | 248.904 | 7.028 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUTlYEHJCHg3UYM-?e=ihxMpS) | +| R18 | MS1MV3 | ArcFace | 47.853 | 41.047 | 62.613 | 75.125 | 70.213 | 43.859 | 68.326 | 91.658 | 1.856 | [download](https://1drv.ms/u/s!AswpsDO2toNKrTxlT6w1Jo02yzSh?e=KDhFAA) | +| R34 | MS1MV3 | ArcFace | 58.723 | 55.834 | 71.644 | 83.291 | 80.084 | 53.712 | 77.365 | 130.245 | 3.054 | [download](https://1drv.ms/u/s!AswpsDO2toNKrT2O5pgyVtwnjeMq?e=16S8LI) | +| R50 | MS1MV3 | ArcFace | 63.850 | 60.457 | 75.488 | 86.115 | 84.305 | 57.352 | 80.533 | 166.305 | 4.262 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUUWd5i3a5OlFpM_?e=ExBDBN) | +| R100 | MS1MV3 | ArcFace | 69.091 | 66.864 | 81.083 | 89.040 | 88.082 | 62.193 | 84.312 | 248.590 | 7.031 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUPwyqWvNXUlNd3P?e=pTLw9A) | +| R18 | Glint360K | ArcFace | 53.317 | 48.113 | 68.230 | 80.575 | 75.852 | 47.831 | 72.074 | 91.658 | 
2.013 | [download](https://1drv.ms/u/s!AswpsDO2toNKrT5ey4lCqFzlpzDd?e=VWP28J) | +| R34 | Glint360K | ArcFace | 65.106 | 65.454 | 79.907 | 88.620 | 86.815 | 60.604 | 83.015 | 130.245 | 3.044 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUBcgGkiuUS11Hsd?e=ISGDnP) | +| R50 | Glint360K | ArcFace | 70.233 | 69.952 | 85.272 | 91.617 | 90.541 | 66.813 | 87.077 | 166.305 | 4.340 | [download](https://1drv.ms/u/s!AswpsDO2toNKrT8jbvHxjqCY0d08?e=igfdrd) | +| R100 | Glint360K | ArcFace | 75.567 | 75.202 | 89.488 | 94.285 | 93.434 | 72.528 | 90.659 | 248.590 | 7.038 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUFgLEIj-mnkb51b?e=vWqy2q) | +| - | *Private* |
insightface-000 of frvt | 97.760 | 93.358 | 98.850 | 99.372 | 99.058 | 87.694 | 97.481 | - | - | - | + + +(MS1M-V2 means MS1M-ArcFace, MS1M-V3 means MS1M-RetinaFace). + +Inference time was evaluated on Tesla V100 GPU, using onnxruntime-gpu==1.6. + + +## Rules + +1. We have two sub-tracks, determined by the size of training dataset and inference time limitation. + * Sub-Track A: Use MS1M-V3 as training set, download: [ref-link](https://github.com/deepinsight/insightface/blob/master/recognition/_datasets_/README.md#ms1m-retinaface), feature length must <= 512, and the inference time must <= 10ms on Tesla V100 GPU. + * Sub-Track B: Use Glint360K as training set, download: [ref-link](https://github.com/deepinsight/insightface/blob/master/recognition/_datasets_/README.md#deepglint-181k-ids675m-images-8), feature length must <= 1024, and the inference time must <= 20ms on Tesla V100 GPU. +2. Training set and testing set are both aligned to 112x112, re-alignment is prohibited. +3. Mask data-augmentation is allowed, such as [this](../../recognition/_tools_). The applied mask augmentation tool should be reproducible. +4. External dataset and pretrained models are both prohibited. +5. Participants submit onnx model, then get scores by our online evaluation. Test images are invisible. +6. Matching score is measured by cosine similarity. +7. Model size must <= 1GB. +8. The input shape of submission model should equal to 3x112x112 (RGB order). +9. Online evaluation server uses onnxruntime-gpu==1.6, cuda==10.2, cudnn==8.0.5. +10. Any float-16 model weights is prohibited, as it will lead to incorrect model size estimiation. +11. Please use ``onnx_helper.py`` to check whether the model is valid. +12. Participants are now ordered in terms of highest scores across two datasets: **TAR@Mask** and **TAR@MR-All**, by the formula of ``0.25 * TAR@Mask + 0.75 * TAR@MR-All``. +13. 
Top-ranked participants should provide their solutions and codes to ensure their validity after submission closed. + + +## Tutorial + +1. ArcFace-PyTorch (with Partial-FC), [code](../../recognition/arcface_torch), [tutorial-cn](tutorial_pytorch_cn.md) +2. OneFlow, [code](../../recognition/oneflow_face) +3. MXNet, [code](../../recognition/arcface_mxnet) + +## Submission Guide + +1. Participants must package the onnx model for submission using ``zip xxx.zip model.onnx``. +2. Each participant can submit three times a day at most. +3. Please sign-up with the real organization name. You can hide the organization name in our system if you like. +4. You can decide which submission to be displayed on the leaderboard by clicking 'Set Public' button. +5. Please click 'sign-in' on submission server if find you're not logged in. + +Server link: [http://iccv21-mfr.com/](http://iccv21-mfr.com/) + +## Timelines + +* 1 June - Release of the training data, baseline solutions and testing leader-board +* 1 October - Stop leader-board submission (11:59 PM Pacific Time) +* 7 October - Winners notification + + +## Sponsors + +(in alphabetical order) + + +**[DeepGlint](http://www.deepglint.com/)** + +**[Kiwi Tech](http://www.kiwiar.com)** + +**[OneFlow](https://www.oneflow.org)** + + **[Zoloz](https://www.zoloz.com)** + + +## Bonus Share + +| | Sub-Track A | Sub-Track B | +| --------- | ------- | ------- | +| 1st place | 30% | 30% | +| 2nd place | 15% | 15% | +| 3rd place | 5% | 5% | + diff --git a/insightface/challenges/iccv21-mfr/dataset_mask.py b/insightface/challenges/iccv21-mfr/dataset_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..b221c95c52603d653e98ba992828ec3325db58b1 --- /dev/null +++ b/insightface/challenges/iccv21-mfr/dataset_mask.py @@ -0,0 +1,208 @@ +import numbers +import os +import queue as Queue +import threading + +import mxnet as mx +import numpy as np +import torch +from torch.utils.data import DataLoader, Dataset +from torchvision import 
transforms +import cv2 +import albumentations as A +from albumentations.pytorch import ToTensorV2 +from insightface.app import MaskAugmentation + + +class BackgroundGenerator(threading.Thread): + def __init__(self, generator, local_rank, max_prefetch=6): + super(BackgroundGenerator, self).__init__() + self.queue = Queue.Queue(max_prefetch) + self.generator = generator + self.local_rank = local_rank + self.daemon = True + self.start() + + def run(self): + torch.cuda.set_device(self.local_rank) + for item in self.generator: + self.queue.put(item) + self.queue.put(None) + + def next(self): + next_item = self.queue.get() + if next_item is None: + raise StopIteration + return next_item + + def __next__(self): + return self.next() + + def __iter__(self): + return self + + +class DataLoaderX(DataLoader): + def __init__(self, local_rank, **kwargs): + super(DataLoaderX, self).__init__(**kwargs) + self.stream = torch.cuda.Stream(local_rank) + self.local_rank = local_rank + + def __iter__(self): + self.iter = super(DataLoaderX, self).__iter__() + self.iter = BackgroundGenerator(self.iter, self.local_rank) + self.preload() + return self + + def preload(self): + self.batch = next(self.iter, None) + if self.batch is None: + return None + with torch.cuda.stream(self.stream): + for k in range(len(self.batch)): + self.batch[k] = self.batch[k].to(device=self.local_rank, + non_blocking=True) + + def __next__(self): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + if batch is None: + raise StopIteration + self.preload() + return batch + + +class MXFaceDataset(Dataset): + def __init__(self, root_dir, local_rank, aug_modes="brightness=0.1+mask=0.1"): + super(MXFaceDataset, self).__init__() + default_aug_probs = { + 'brightness' : 0.2, + 'blur': 0.1, + 'mask': 0.1, + } + + aug_mode_list = aug_modes.lower().split('+') + aug_mode_map = {} + for aug_mode_str in aug_mode_list: + _aug = aug_mode_str.split('=') + aug_key = _aug[0] + if len(_aug)>1: + aug_prob = 
float(_aug[1]) + else: + aug_prob = default_aug_probs[aug_key] + aug_mode_map[aug_key] = aug_prob + + transform_list = [] + self.mask_aug = False + self.mask_prob = 0.0 + key = 'mask' + if key in aug_mode_map: + self.mask_aug = True + self.mask_prob = aug_mode_map[key] + transform_list.append( + MaskAugmentation(mask_names=['mask_white', 'mask_blue', 'mask_black', 'mask_green'], mask_probs=[0.4, 0.4, 0.1, 0.1], h_low=0.33, h_high=0.4, p=self.mask_prob) + ) + if local_rank==0: + print('data_transform_list:', transform_list) + print('mask:', self.mask_aug, self.mask_prob) + key = 'brightness' + if key in aug_mode_map: + prob = aug_mode_map[key] + transform_list.append( + A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.05, p=prob) + ) + key = 'blur' + if key in aug_mode_map: + prob = aug_mode_map[key] + transform_list.append( + A.ImageCompression(quality_lower=30, quality_upper=80, p=prob) + ) + transform_list.append( + A.MedianBlur(blur_limit=(1,7), p=prob) + ) + transform_list.append( + A.MotionBlur(blur_limit=(5,12), p=prob) + ) + transform_list += \ + [ + A.HorizontalFlip(p=0.5), + A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ToTensorV2(), + ] + #here, the input for A transform is rgb cv2 img + self.transform = A.Compose( + transform_list + ) + self.root_dir = root_dir + self.local_rank = local_rank + path_imgrec = os.path.join(root_dir, 'train.rec') + path_imgidx = os.path.join(root_dir, 'train.idx') + self.imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + s = self.imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + #print(header) + #print(len(self.imgrec.keys)) + if header.flag > 0: + if len(header.label)==2: + self.imgidx = np.array(range(1, int(header.label[0]))) + else: + self.imgidx = np.array(list(self.imgrec.keys)) + else: + self.imgidx = np.array(list(self.imgrec.keys)) + #print('imgidx len:', len(self.imgidx)) + + def __getitem__(self, index): + idx = self.imgidx[index] + s = 
self.imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + hlabel = header.label + #print('hlabel:', hlabel.__class__) + sample = mx.image.imdecode(img).asnumpy() + if not isinstance(hlabel, numbers.Number): + idlabel = hlabel[0] + else: + idlabel = hlabel + label = torch.tensor(idlabel, dtype=torch.long) + if self.transform is not None: + sample = self.transform(image=sample, hlabel=hlabel)['image'] + return sample, label + + def __len__(self): + return len(self.imgidx) + +if __name__ == "__main__": + import argparse, cv2, copy + parser = argparse.ArgumentParser(description='dataset test') + parser.add_argument('--dataset', type=str, help='dataset path') + parser.add_argument('--samples', type=int, default=256, help='') + parser.add_argument('--cols', type=int, default=16, help='') + args = parser.parse_args() + assert args.samples%args.cols==0 + assert args.cols%2==0 + samples = args.samples + cols = args.cols + rows = args.samples // args.cols + dataset = MXFaceDataset(root_dir=args.dataset, local_rank=0, aug_modes='mask=1.0') + dataset.transform = A.Compose([t for t in dataset.transform if not isinstance(t, (A.Normalize, ToTensorV2))]) + dataset_0 = copy.deepcopy(dataset) + #dataset_0.transform = None + dataset_1 = copy.deepcopy(dataset) + #dataset_1.transform = A.Compose( + # [ + # A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.05, p=1.0), + # A.ImageCompression(quality_lower=30, quality_upper=80, p=1.0), + # A.MedianBlur(blur_limit=(1,7), p=1.0), + # A.MotionBlur(blur_limit=(5,12), p=1.0), + # A.Affine(scale=(0.92, 1.08), translate_percent=(-0.06, 0.06), rotate=(-6, 6), shear=None, interpolation=cv2.INTER_LINEAR, p=1.0), + # ] + #) + fig = np.zeros( (112*rows, 112*cols, 3), dtype=np.uint8 ) + for idx in range(samples): + if idx%2==0: + image, _ = dataset_0[idx//2] + else: + image, _ = dataset_1[idx//2] + row_idx = idx // cols + col_idx = idx % cols + fig[row_idx*112:(row_idx+1)*112, col_idx*112:(col_idx+1)*112,:] = image[:,:,::-1] 
# to bgr + cv2.imwrite("./datasets.png", fig) diff --git a/insightface/challenges/iccv21-mfr/mxnet_to_ort.py b/insightface/challenges/iccv21-mfr/mxnet_to_ort.py new file mode 100644 index 0000000000000000000000000000000000000000..16f9f2f6aecfd929eb08d82bafdead7f42b78a57 --- /dev/null +++ b/insightface/challenges/iccv21-mfr/mxnet_to_ort.py @@ -0,0 +1,115 @@ +import sys +import os +import argparse +import onnx +import mxnet as mx +from onnx import helper +from onnx import TensorProto +from onnx import numpy_helper + +print('mxnet version:', mx.__version__) +print('onnx version:', onnx.__version__) + +assert mx.__version__ >= '1.8', 'mxnet version should >= 1.8' +assert onnx.__version__ >= '1.2.1', 'onnx version should >= 1.2.1' + +import numpy as np +from mxnet.contrib import onnx as onnx_mxnet + +def create_map(graph_member_list): + member_map={} + for n in graph_member_list: + member_map[n.name]=n + return member_map + + +parser = argparse.ArgumentParser(description='convert arcface models to onnx') +# general +parser.add_argument('params', default='./r100a/model-0000.params', help='mxnet params to load.') +parser.add_argument('output', default='./r100a.onnx', help='path to write onnx model.') +parser.add_argument('--eps', default=1.0e-8, type=float, help='eps for weights.') +parser.add_argument('--input-shape', default='3,112,112', help='input shape.') +args = parser.parse_args() +input_shape = (1,) + tuple( [int(x) for x in args.input_shape.split(',')] ) + +params_file = args.params +pos = params_file.rfind('-') +prefix = params_file[:pos] +epoch = int(params_file[pos+1:pos+5]) +sym_file = prefix + "-symbol.json" +assert os.path.exists(sym_file) +assert os.path.exists(params_file) + +sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) +eps = args.eps + +arg = {} +aux = {} +invalid = 0 +ac = 0 +for k in arg_params: + v = arg_params[k] + nv = v.asnumpy() + #print(k, nv.dtype) + nv = nv.astype(np.float32) + ac += nv.size + invalid += 
np.count_nonzero(np.abs(nv) max_model_size_mb: + return "max model size exceed, given %.3f-MB"%self.model_size_mb + + input_mean = None + input_std = None + if track=='cfat': + pn_file = osp.join(self.model_path, 'pixel_norm.txt') + if osp.exists(pn_file): + lines = open(pn_file,'r').readlines() + if len(lines)!=2: + return "pixel_norm.txt should contain 2 lines" + input_mean = float(lines[0]) + input_std = float(lines[1]) + if input_mean is not None or input_std is not None: + if input_mean is None or input_std is None: + return "please set input_mean and input_std simultaneously" + else: + find_sub = False + find_mul = False + for nid, node in enumerate(graph.node[:8]): + print(nid, node.name) + if node.name.startswith('Sub') or node.name.startswith('_minus'): + find_sub = True + if node.name.startswith('Mul') or node.name.startswith('_mul') or node.name.startswith('Div'): + find_mul = True + if find_sub and find_mul: + print("find sub and mul") + #mxnet arcface model + input_mean = 0.0 + input_std = 1.0 + else: + input_mean = 127.5 + input_std = 127.5 + self.input_mean = input_mean + self.input_std = input_std + for initn in graph.initializer: + weight_array = numpy_helper.to_array(initn) + dt = weight_array.dtype + if dt.itemsize<4: + return 'invalid weight type - (%s:%s)' % (initn.name, dt.name) + if test_img is None: + test_img = get_image('Tom_Hanks_54745') + test_img = cv2.resize(test_img, self.image_size) + else: + test_img = cv2.resize(test_img, self.image_size) + feat, cost = self.benchmark(test_img) + batch_result = self.check_batch(test_img) + batch_result_sum = float(np.sum(batch_result)) + if batch_result_sum in [float('inf'), -float('inf')] or batch_result_sum != batch_result_sum: + print(batch_result) + print(batch_result_sum) + return "batch result output contains NaN!" 
+ + if len(feat.shape) < 2: + return "the shape of the feature must be two, but get {}".format(str(feat.shape)) + + if feat.shape[1] > max_feat_dim: + return "max feat dim exceed, given %d"%feat.shape[1] + self.feat_dim = feat.shape[1] + cost_ms = cost*1000 + if cost_ms>max_time_cost: + return "max time cost exceed, given %.4f"%cost_ms + self.cost_ms = cost_ms + print('check stat:, model-size-mb: %.4f, feat-dim: %d, time-cost-ms: %.4f, input-mean: %.3f, input-std: %.3f'%(self.model_size_mb, self.feat_dim, self.cost_ms, self.input_mean, self.input_std)) + return None + + def check_batch(self, img): + if not isinstance(img, list): + imgs = [img, ] * 32 + if self.crop is not None: + nimgs = [] + for img in imgs: + nimg = img[self.crop[1]:self.crop[3], self.crop[0]:self.crop[2], :] + if nimg.shape[0] != self.image_size[1] or nimg.shape[1] != self.image_size[0]: + nimg = cv2.resize(nimg, self.image_size) + nimgs.append(nimg) + imgs = nimgs + blob = cv2.dnn.blobFromImages( + images=imgs, scalefactor=1.0 / self.input_std, size=self.image_size, + mean=(self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] + return net_out + + + def meta_info(self): + return {'model-size-mb':self.model_size_mb, 'feature-dim':self.feat_dim, 'infer': self.cost_ms} + + + def forward(self, imgs): + if not isinstance(imgs, list): + imgs = [imgs] + input_size = self.image_size + if self.crop is not None: + nimgs = [] + for img in imgs: + nimg = img[self.crop[1]:self.crop[3],self.crop[0]:self.crop[2],:] + if nimg.shape[0]!=input_size[1] or nimg.shape[1]!=input_size[0]: + nimg = cv2.resize(nimg, input_size) + nimgs.append(nimg) + imgs = nimgs + blob = cv2.dnn.blobFromImages(imgs, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_out = self.session.run(self.output_names, {self.input_name : blob})[0] + return net_out + + def benchmark(self, img): + input_size = 
self.image_size + if self.crop is not None: + nimg = img[self.crop[1]:self.crop[3],self.crop[0]:self.crop[2],:] + if nimg.shape[0]!=input_size[1] or nimg.shape[1]!=input_size[0]: + nimg = cv2.resize(nimg, input_size) + img = nimg + blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + costs = [] + for _ in range(50): + ta = datetime.datetime.now() + net_out = self.session.run(self.output_names, {self.input_name : blob})[0] + tb = datetime.datetime.now() + cost = (tb-ta).total_seconds() + costs.append(cost) + costs = sorted(costs) + cost = costs[5] + return net_out, cost + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='') + # general + parser.add_argument('workdir', help='submitted work dir', type=str) + parser.add_argument('--track', help='track name, for different challenge', type=str, default='cfat') + args = parser.parse_args() + handler = ArcFaceORT(args.workdir) + err = handler.check(args.track) + print('err:', err) diff --git a/insightface/challenges/iccv21-mfr/tutorial_pytorch_cn.md b/insightface/challenges/iccv21-mfr/tutorial_pytorch_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..39f5b9acbe56ae7ee602b5a2333116e52b9b827c --- /dev/null +++ b/insightface/challenges/iccv21-mfr/tutorial_pytorch_cn.md @@ -0,0 +1,167 @@ +## pytorch 训练样例 + +[训练样例地址]() + +### 下载数据集 + +* 下载 MS1MV3 [Link](https://github.com/deepinsight/insightface/tree/master/challenges/iccv19-lfr) +* 下载 Glint360K [Link](https://github.com/deepinsight/insightface/tree/master/recognition/partial_fc#4-download) + +### 服务器提交地址 + +http://iccv21-mfr.com/ + +### 安装依赖 + +1. 
安装 pytorch 1.7.1 + +假设你已经安装好了GPU驱动和CUDA,根据你的CUDA版本,来选择你要安装的pytorch命令。 +查看CUDA版本的命令为: `/usr/local/cuda/bin/nvcc -V`。 + +Linux and Windows +```shell +# CUDA 11.0 +pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html + +# CUDA 10.2 +pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 + +# CUDA 10.1 +pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html + +# CUDA 9.2 +pip install torch==1.7.1+cu92 torchvision==0.8.2+cu92 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html + +``` + +你也可以安装pytorch的其他版本,例如1.6.0或者更高的版本。 + +2. 安装其他依赖 + +```shell +pip install -r requirement.txt +``` + +### 运行 +根据你的服务器,选择你要运行的命令。 + +* 一台服务器,四张GPU运行 + +```shell +python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py +``` + +* 一台服务器,八张GPU运行 + +```shell +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py +``` + +* 多台服务器,每台服务器8张GPU + +1. 节点0 +```shell +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr="ip1" --master_port=1234 train.py +``` + +2. 节点1 +```shell +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr="ip1" --master_port=1234 train.py +``` + + +### 提交 + +1. 提交onnx模型 + +竞赛要求模型转换为`onnx`模型提交,arcface_torch工程在保存模型时,会自动转换成为onnx,其地址为`${cfg.output}/backbone.onnx`。 + +模型checkpoint介绍: +```shell +├── backbone.onnx # 需要提交的模型 +├── backbone.pth # pytorch 保存的模型 +├── rank_0_softmax_weight_mom.pt # 模型并行原因,每张卡保存softmax独有的参数 +├── rank_0_softmax_weight.pt +├── rank_1_softmax_weight_mom.pt +├── rank_1_softmax_weight.pt +├── ... ... +└── training.log # 训练日志 +``` + +2. 
检查onnx模型是否规范 + +提交模型前检查一下提交的模型是否规范,并测试模型的推理时间 + + +测试命令: +```shell +python onnx_helper_sample.py --model_root ms1mv3_arcface_r50/ +``` + +也可以先测试一下onnx模型在公开测试集IJBC上的性能: +https://github.com/deepinsight/insightface/blob/master/recognition/arcface_torch/onnx_ijbc.py + +测试命令: +```shell +CUDA_VISIBLE_DEVICES=0 python onnx_ijbc.py --model-root ms1mv3_arcface_r50 --image-path IJB_release/IJBC --result-dir ms1mv3_arcface_r50 +``` + +3. 模型大小参考 + +推理时间是在`Tesla V100 GPU`中测试, 其中 onnxruntime-gpu==1.6。 + +| 模型名称 | 大小/MB | 推理时间/ms | +| ------- | ---------- | ----------- | +| R50 | 166 | 4.262 | +| R100 | 248 | 7.031 | +| R200 | 476 | 13.48 | + +### 提示与技巧 + +1. 训练加速-混合精度训练 + +当时使用图灵架构的GPU时候,强烈建议开启混合精度训练模型,在`config.py`中,将`config.fp16`设置为True,可以节省大量显存和提升训练速度,例如: + +训练设置: +MS1MV3(SSD) + 4*V100 + R100 + BatchSize 4*128 + +- 开启混合精度训练前 +```python3 +# training log +Training: 2021-05-12 00:00:42,110-Speed 884.42 samples/sec Loss 47.2532 Epoch: 0 Global Step: 100 +Training: 2021-05-12 00:01:10,979-Speed 886.77 samples/sec Loss 47.3550 Epoch: 0 Global Step: 150 +Training: 2021-05-12 00:01:43,936-Speed 776.80 samples/sec Loss 47.0214 Epoch: 0 Global Step: 200 +Training: 2021-05-12 00:02:16,064-Speed 796.83 samples/sec Loss 46.7781 Epoch: 0 Global Step: 250 +Training: 2021-05-12 00:02:45,018-Speed 884.18 samples/sec Loss 46.3187 Epoch: 0 Global Step: 300 +# gpustat -i +[0] Tesla V100-SXM2-32GB | 67 C, 99 % | 17844 / 32510 MB +[1] Tesla V100-SXM2-32GB | 64 C, 98 % | 17844 / 32510 MB +[2] Tesla V100-SXM2-32GB | 65 C, 93 % | 17916 / 32510 MB +[3] Tesla V100-SXM2-32GB | 72 C, 82 % | 17910 / 32510 MB +``` + +- 开启混合精度训练后 + +```python3 +# training log +Training: 2021-05-12 00:04:27,869-Speed 1604.59 samples/sec Loss 47.6050 Epoch: 0 Global Step: 100 +Training: 2021-05-12 00:04:43,681-Speed 1619.08 samples/sec Loss 47.5865 Epoch: 0 Global Step: 150 +Training: 2021-05-12 00:04:59,460-Speed 1622.39 samples/sec Loss 47.2380 Epoch: 0 Global Step: 200 +Training: 2021-05-12 00:05:15,271-Speed 1619.25 
samples/sec Loss 46.9030 Epoch: 0 Global Step: 250 +Training: 2021-05-12 00:05:31,065-Speed 1620.86 samples/sec Loss 46.4425 Epoch: 0 Global Step: 300 +# gpustat -i +[0] Tesla V100-SXM2-32GB | 64 C, 96 % | 10664 / 32510 M +[1] Tesla V100-SXM2-32GB | 63 C, 96 % | 10630 / 32510 MB +[2] Tesla V100-SXM2-32GB | 63 C, 79 % | 10736 / 32510 MB +[3] Tesla V100-SXM2-32GB | 70 C, 86 % | 10736 / 32510 MB +``` + +2. 训练加速-将数据挂载到内存盘来提升训练速度 +使用如下的命令: +```shell +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=40G tmpfs /train_tmp +``` + +让后将训练集拷贝到目录`/train_tmp`下,然后开始训练。 diff --git a/insightface/challenges/iccv21-mfr/tutorial_pytorch_mask_aug.md b/insightface/challenges/iccv21-mfr/tutorial_pytorch_mask_aug.md new file mode 100644 index 0000000000000000000000000000000000000000..9bf2ba8761b75a50925583f12d63ef70c88c5030 --- /dev/null +++ b/insightface/challenges/iccv21-mfr/tutorial_pytorch_mask_aug.md @@ -0,0 +1,34 @@ +# A tutorial on how to enable mask augmentation on arcface_torch training. + +The python package insightface==0.3.2 provides utilities to enable mask augmentation within one line: + +``` +transform_list.append( + MaskAugmentation( + mask_names=['mask_white', 'mask_blue', 'mask_black', 'mask_green'], + mask_probs=[0.4, 0.4, 0.1, 0.1], h_low=0.33, h_high=0.4, p=self.mask_prob) + ) +``` + +### Prepare + +1. Download antelope model pack by `bash> insightface-cli model.download antelope` which will be located at `~/.insightface/models/antelope` +2. Generate BFM.mat and BFM_UV.mat following [here](https://github.com/deepinsight/insightface/tree/master/recognition/tools#data-prepare), for license concern. +3. Generate new mask-rec dataset by `bash> insightface-cli rec.addmaskparam /data/ms1m-retinaface-t1 /data/ms1m-retinaface-t1mask` which generates and writes the mask params of each image into the record. 
+ + +### Add Mask Renderer Augmentation +just by following code: +``` +from insightface.app import MaskAugmentation +self.transform_list.append( + MaskAugmentation( + mask_names=['mask_white', 'mask_blue', 'mask_black', 'mask_green'], + mask_probs=[0.4, 0.4, 0.1, 0.1], + h_low=0.33, h_high=0.4, p=0.1) +) +``` + +Please check [dataset_mask.py](https://github.com/deepinsight/insightface/blob/master/challenges/iccv21-mfr/dataset_mask.py) for detail. + +You can override the original dataset.py with this file to simply enable mask augmentation. diff --git a/insightface/challenges/mfr/README.md b/insightface/challenges/mfr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db9bfd34d4a723a5686a44c70383c5dd0ac8e7cb --- /dev/null +++ b/insightface/challenges/mfr/README.md @@ -0,0 +1,135 @@ +# MFR Ongoing + +This is the ongoing version of [ICCV-2021 Masked Face Recognition Challenge & Workshop(MFR)](https://ibug.doc.ic.ac.uk/resources/masked-face-recognition-challenge-workshop-iccv-21/). We also extend it to involve some public available and popular benchmarks such as IJBC, LFW, CFPFP and AgeDB. + + +(:bulb: :bulb: Once you find the name **IFRT** which is *InsightFace Recognition Test* in short anywhere, it is the same as MFR-Ongoing.) + + +For detail, please check our ICCV 2021 workshop [paper](https://openaccess.thecvf.com/content/ICCV2021W/MFR/papers/Deng_Masked_Face_Recognition_Challenge_The_InsightFace_Track_Report_ICCVW_2021_paper.pdf). + +More information about the workshop challenge can be found [here](../iccv21-mfr), for reference. + +**MFR** testset consists of non-celebrities so we can ensure that it has very few overlap with public available face recognition training set, such as MS1M and CASIA as they mostly collected from online celebrities. As the result, we can evaluate the FAIR performance for different algorithms. 
+ +In recent changes, we also add public available popular benchmarks such as IJBC, LFW, CFPFP, AgeDB into **MFR-Ongoing**. + + +Current submission server link: [http://iccv21-mfr.com/](http://iccv21-mfr.com/) + +For any question, please send email to `insightface.challenge AT gmail.com` + +## Testsets + +In MFR-Ongoing, we will evaluate the accuracy of following testsets: + + * **Accuracy between masked and non-masked faces.** + * **Accuracy among children(2~16 years old).** + * **Accuracy of globalised multi-racial benchmarks.** + + +We ensure that there's no overlap between the above testsets and public available training datasets, as they are not collected from online celebrities. + +We also evaluate below public available popular benchmarks: + + * **IJBC under FAR<=e-5 and FAR<=e-4.** + * **Some 1:1 verification testsets, such as LFW, CFPFP, AgeDB-30.** + + +### ``Mask test-set:`` + +Mask testset contains 6,964 identities, 6,964 masked images and 13,928 non-masked images. There are totally 13,928 positive pairs and 96,983,824 negative pairs. + +
+ Click to check the sample images(here we manually blur it to protect privacy) + ifrtsample +
+ +### ``Children test-set:`` + +Children testset contains 14,344 identities and 157,280 images. There are totally 1,773,428 positive pairs and 24,735,067,692 negative pairs. + +
+ Click to check the sample images(here we manually blur it to protect privacy) + ifrtsample +
+ +### ``Multi-racial test-set (MR in short):`` + +The globalised multi-racial testset contains 242,143 identities and 1,624,305 images. + +| Race-Set | Identities | Images | Positive Pairs | Negative Pairs | +| ------- | ---------- | ----------- | ----------- | ----------- | +| African | 43,874 | 298,010 | 870,091 | 88,808,791,999 | +| Caucasian | 103,293 | 697,245 | 2,024,609 | 486,147,868,171 | +| Indian | 35,086 | 237,080 | 688,259 | 56,206,001,061 | +| Asian | 59,890 | 391,970 | 1,106,078 | 153,638,982,852 | +| **ALL** | **242,143** | **1,624,305** | **4,689,037** | **2,638,360,419,683** | + +
+ Click to check the sample images(here we manually blur it to protect privacy) + ifrtsample +
+ +## Evaluation Metric + +For ``Mask`` set, TAR is measured on mask-to-nonmask 1:1 protocal, with FAR less than 0.0001(e-4). + +For ``Children`` set, TAR is measured on all-to-all 1:1 protocal, with FAR less than 0.0001(e-4). + +For multi-racial sets, TAR is measured on all-to-all 1:1 protocal, with FAR less than 0.000001(e-6). + +For IJBC and verification test-set, we use the most common test protocal. + +Participants are ordered in terms of highest scores across two datasets: **TAR@Mask** and **TAR@MR-All**, by the formula of ``0.25 * TAR@Mask + 0.75 * TAR@MR-All``. + + + + +## Baselines + +**``2021.04.25``** We made a clean on East Asian subset, by removing children images. + +**``2021.04.27``** Add onnx download links. + +| Backbone | Dataset | Method | Mask | Children | African | Caucasian | South Asian | East Asian | All | size(mb) | infer(ms) | link | +|------------|------------|------------|--------|----------|---------|-----------|-------------|------------|--------|----------|-----------|-----------| +| R100 | Casia | ArcFace | 26.623 | 30.359 | 39.666 | 53.933 | 47.807 | 21.572 | 42.735 | 248.904 | 7.073 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUJpk8zC61HVN7Kg?e=zE9JDd) | +| R100 | MS1MV2 | ArcFace | 65.767 | 60.496 | 79.117 | 87.176 | 85.501 | 55.807 | 80.725 | 248.904 | 7.028 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUTlYEHJCHg3UYM-?e=ihxMpS) | +| R18 | MS1MV3 | ArcFace | 47.853 | 41.047 | 62.613 | 75.125 | 70.213 | 43.859 | 68.326 | 91.658 | 1.856 | [download](https://1drv.ms/u/s!AswpsDO2toNKrTxlT6w1Jo02yzSh?e=KDhFAA) | +| R34 | MS1MV3 | ArcFace | 58.723 | 55.834 | 71.644 | 83.291 | 80.084 | 53.712 | 77.365 | 130.245 | 3.054 | [download](https://1drv.ms/u/s!AswpsDO2toNKrT2O5pgyVtwnjeMq?e=16S8LI) | +| R50 | MS1MV3 | ArcFace | 63.850 | 60.457 | 75.488 | 86.115 | 84.305 | 57.352 | 80.533 | 166.305 | 4.262 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUUWd5i3a5OlFpM_?e=ExBDBN) | +| R100 | MS1MV3 | ArcFace | 69.091 | 66.864 | 81.083 | 89.040 | 
88.082 | 62.193 | 84.312 | 248.590 | 7.031 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUPwyqWvNXUlNd3P?e=pTLw9A) | +| R18 | Glint360K | ArcFace | 53.317 | 48.113 | 68.230 | 80.575 | 75.852 | 47.831 | 72.074 | 91.658 | 2.013 | [download](https://1drv.ms/u/s!AswpsDO2toNKrT5ey4lCqFzlpzDd?e=VWP28J) | +| R34 | Glint360K | ArcFace | 65.106 | 65.454 | 79.907 | 88.620 | 86.815 | 60.604 | 83.015 | 130.245 | 3.044 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUBcgGkiuUS11Hsd?e=ISGDnP) | +| R50 | Glint360K | ArcFace | 70.233 | 69.952 | 85.272 | 91.617 | 90.541 | 66.813 | 87.077 | 166.305 | 4.340 | [download](https://1drv.ms/u/s!AswpsDO2toNKrT8jbvHxjqCY0d08?e=igfdrd) | +| R100 | Glint360K | ArcFace | 75.567 | 75.202 | 89.488 | 94.285 | 93.434 | 72.528 | 90.659 | 248.590 | 7.038 | [download](https://1drv.ms/u/s!AswpsDO2toNKrUFgLEIj-mnkb51b?e=vWqy2q) | +| - | *Private* |
insightface-000 of frvt | 97.760 | 93.358 | 98.850 | 99.372 | 99.058 | 87.694 | 97.481 | - | - | - | + + +(MS1M-V2 means MS1M-ArcFace, MS1M-V3 means MS1M-RetinaFace). + +Inference time in above table was evaluated on Tesla V100 GPU, using onnxruntime-gpu==1.6. + +## Rules + +1. We have two tracks, academic and unconstrained. +2. Please **DO NOT** register the account with messy or random characters(for both username and organization). +3. **For academic submissions, we recommend to set the username as the name of your proposed paper or method. Organization hiding is not allowed(or the score will be banned) for this track but you can set the submission as private. You can also create multiple accounts, one account for one method.** +4. Right now we only support 112x112 input, so make sure that the submission model accepts the correct input shape(['*',3,112,112]), in RGB order. Add an interpolate operator into the first layer of the submission model if you need a different input resolution. +5. Participants submit onnx model, then get scores by our online evaluation. +6. Matching score is measured by cosine similarity. +7. **Online evaluation server uses onnxruntime-gpu==1.8, cuda==11.1, cudnn==8.0.5, GPU is RTX3090.** +8. Any float-16 model weights are prohibited, as it will lead to incorrect model size estimation. +9. Please use ``onnx_helper.py`` to check whether the model is valid. +10. Leaderboard is now ordered in terms of highest scores across two datasets: **TAR@Mask** and **TAR@MR-All**, by the formula of ``0.25 * TAR@Mask + 0.75 * TAR@MR-All``. + + + +## Submission Guide + +1. Participants must package the onnx model for submission using ``zip xxx.zip model.onnx``. +2. Each participant can submit three times a day at most. +3. Please sign-up with the real organization name. You can hide the organization name in our system if you like(not allowed for academic track). +4. 
You can decide which submission to be displayed on the leaderboard by clicking 'Set Public' button. +5. Please click 'sign-in' on submission server if find you're not logged in. diff --git a/insightface/detection/README.md b/insightface/detection/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3139f812660603e612747c1af4d28b7e402ac807 --- /dev/null +++ b/insightface/detection/README.md @@ -0,0 +1,42 @@ +## Face Detection + + +
+ +
+ + +## Introduction + +These are the face detection methods of [InsightFace](https://insightface.ai) + + +
+ +
+ + +### Datasets + + Please refer to [datasets](_datasets_) page for the details of face detection datasets used for training and evaluation. + +### Evaluation + + Please refer to [evaluation](_evaluation_) page for the details of face recognition evaluation. + + +## Methods + + +Supported methods: + +- [x] [RetinaFace (CVPR'2020)](retinaface) +- [x] [SCRFD (Arxiv'2021)](scrfd) +- [x] [blazeface_paddle](blazeface_paddle) + + +## Contributing + +We appreciate all contributions to improve the face detection model zoo of InsightFace. + + diff --git a/insightface/detection/_datasets_/README.md b/insightface/detection/_datasets_/README.md new file mode 100644 index 0000000000000000000000000000000000000000..618227009d54f25c981afaeec007cf30fcdf8e04 --- /dev/null +++ b/insightface/detection/_datasets_/README.md @@ -0,0 +1,31 @@ +# Face Detection Datasets + +(Updating) + +## Training Datasets + +### WiderFace + +http://shuoyang1213.me/WIDERFACE/ + + + +## Test Datasets + +### WiderFace + +http://shuoyang1213.me/WIDERFACE/ + +### FDDB + +http://vis-www.cs.umass.edu/fddb/ + +### AFW + + +### PASCAL FACE + + +### MALF + +http://www.cbsr.ia.ac.cn/faceevaluation/ diff --git a/insightface/detection/blazeface_paddle/README.md b/insightface/detection/blazeface_paddle/README.md new file mode 120000 index 0000000000000000000000000000000000000000..13c4f964bb9063f28d6e08dfb8c6b828a81d2536 --- /dev/null +++ b/insightface/detection/blazeface_paddle/README.md @@ -0,0 +1 @@ +README_en.md \ No newline at end of file diff --git a/insightface/detection/blazeface_paddle/README_cn.md b/insightface/detection/blazeface_paddle/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..0762058e58ca17356418930ef7fe3643dce0442f --- /dev/null +++ b/insightface/detection/blazeface_paddle/README_cn.md @@ -0,0 +1,355 @@ +简体中文 | [English](README_en.md) + +# 人脸检测模型 + +* [1. 简介](#简介) +* [2. 模型库](#模型库) +* [3. 安装](#安装) +* [4. 数据准备](#数据准备) +* [5. 参数配置](#参数配置) +* [6. 
训练与评估](#训练与评估) + * [6.1 训练](#训练) + * [6.2 在WIDER-FACE数据集上评估](#评估) + * [6.3 推理部署](#推理部署) + * [6.4 推理速度提升](#推理速度提升) + * [6.5 人脸检测demo](#人脸检测demo) +* [7. 参考文献](#参考文献) + + + +## 1. 简介 + +`Arcface-Paddle`是基于PaddlePaddle实现的,开源深度人脸检测、识别工具。`Arcface-Paddle`目前提供了三个预训练模型,包括用于人脸检测的 `BlazeFace`、用于人脸识别的 `ArcFace` 和 `MobileFace`。 + +- 本部分内容为人脸检测部分,基于PaddleDetection进行开发。 +- 人脸识别相关内容可以参考:[人脸识别](../../recognition/arcface_paddle/README_cn.md)。 +- 基于PaddleInference的Whl包预测部署内容可以参考:[Whl包预测部署](https://github.com/littletomatodonkey/insight-face-paddle)。 + + + + +## 2. 模型库 + +### WIDER-FACE数据集上的mAP + +| 网络结构 | 输入尺寸 | 图片个数/GPU | epoch数量 | Easy/Medium/Hard Set | CPU预测时延 | GPU 预测时延 | 模型大小(MB) | 预训练模型地址 | inference模型地址 | 配置文件 | +|:------------:|:--------:|:----:|:-------:|:-------:|:-------:|:---------:|:----------:|:---------:|:---------:|:--------:| +| BlazeFace-FPN-SSH | 640 | 8 | 1000 | 0.9187 / 0.8979 / 0.8168 | 31.7ms | 5.6ms | 0.646 |[下载链接](https://paddledet.bj.bcebos.com/models/blazeface_fpn_ssh_1000e.pdparams) | [下载链接](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/blazeface_fpn_ssh_1000e_v1.0_infer.tar) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.1/configs/face_detection/blazeface_fpn_ssh_1000e.yml) | +| RetinaFace | 480x640 | - | - | - / - / 0.8250 | 182.0ms | 17.4ms | 1.680 | - | - | - | + + +**注意:** +- 我们使用多尺度评估策略得到`Easy/Medium/Hard Set`里的mAP。具体细节请参考[在WIDER-FACE数据集上评估](#评估)。 +- 测量速度时我们使用640*640的分辨,在 Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz cpu,CPU线程数设置为5,更多细节请参考[推理速度提升](#推理速度提升)。 +- `RetinaFace`的速度测试代码参考自:[../retinaface/README.md](../retinaface/README.md). +- 测试环境为 + - CPU: Intel(R) Xeon(R) Gold 6184 CPU @ 2.40GHz + - GPU: a single NVIDIA Tesla V100 + + + + +## 3. 安装 + +请参考[安装教程](../../recognition/arcface_paddle/install_ch.md)安装PaddlePaddle以及PaddleDetection。 + + + +## 4. 
数据准备 +我们使用[WIDER-FACE数据集](http://shuoyang1213.me/WIDERFACE/)进行训练和模型测试,官方网站提供了详细的数据介绍。 +- WIDER-Face数据源: +使用如下目录结构加载`wider_face`类型的数据集: + + ``` + dataset/wider_face/ + ├── wider_face_split + │ ├── wider_face_train_bbx_gt.txt + │ ├── wider_face_val_bbx_gt.txt + ├── WIDER_train + │ ├── images + │ │ ├── 0--Parade + │ │ │ ├── 0_Parade_marchingband_1_100.jpg + │ │ │ ├── 0_Parade_marchingband_1_381.jpg + │ │ │ │ ... + │ │ ├── 10--People_Marching + │ │ │ ... + ├── WIDER_val + │ ├── images + │ │ ├── 0--Parade + │ │ │ ├── 0_Parade_marchingband_1_1004.jpg + │ │ │ ├── 0_Parade_marchingband_1_1045.jpg + │ │ │ │ ... + │ │ ├── 10--People_Marching + │ │ │ ... + ``` + +- 手动下载数据集: +要下载WIDER-FACE数据集,请运行以下命令: +``` +cd dataset/wider_face && ./download_wider_face.sh +``` + + + +## 5. 参数配置 + +我们使用 `configs/face_detection/blazeface_fpn_ssh_1000e.yml`配置进行训练,配置文件摘要如下: + +```yaml + +_BASE_: [ + '../datasets/wider_face.yml', + '../runtime.yml', + '_base_/optimizer_1000e.yml', + '_base_/blazeface_fpn.yml', + '_base_/face_reader.yml', +] +weights: output/blazeface_fpn_ssh_1000e/model_final +multi_scale_eval: True + +``` + +`blazeface_fpn_ssh_1000e.yml` 配置需要依赖其他的配置文件,在该例子中需要依赖: + +``` +wider_face.yml:主要说明了训练数据和验证数据的路径 + +runtime.yml:主要说明了公共的运行参数,比如是否使用GPU、每多少个epoch存储checkpoint等 + +optimizer_1000e.yml:主要说明了学习率和优化器的配置 + +blazeface_fpn.yml:主要说明模型和主干网络的情况 + +face_reader.yml:主要说明数据读取器配置,如batch size,并发加载子进程数等,同时包含读取后预处理操作,如resize、数据增强等等 +``` + +根据实际情况,修改上述文件,比如数据集路径、batch size等。 + +基础模型的配置可以参考`configs/face_detection/_base_/blazeface.yml`; +改进模型增加FPN和SSH的neck结构,配置文件可以参考`configs/face_detection/_base_/blazeface_fpn.yml`,可以根据需求配置FPN和SSH,具体如下: +```yaml +BlazeNet: + blaze_filters: [[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]] + double_blaze_filters: [[48, 24, 96, 2], [96, 24, 96], [96, 24, 96], + [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]] + act: hard_swish # 配置backbone中BlazeBlock的激活函数,基础模型为relu,增加FPN和SSH时需使用hard_swish + +BlazeNeck: + neck_type : fpn_ssh # 可选only_fpn、only_ssh和fpn_ssh + in_channel: 
[96,96] +``` + + + +## 6. 训练与评估 + + + +### 6.1 训练 +首先,下载预训练模型文件: +```bash +wget https://paddledet.bj.bcebos.com/models/pretrained/blazenet_pretrain.pdparams +``` +PaddleDetection提供了单卡/多卡训练模式,满足用户多种训练需求 +* GPU单卡训练 +```bash +export CUDA_VISIBLE_DEVICES=0 #windows和Mac下不需要执行该命令 +python tools/train.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml -o pretrain_weight=blazenet_pretrain +``` + +* GPU多卡训练 +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 #windows和Mac下不需要执行该命令 +python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml -o pretrain_weight=blazenet_pretrain +``` +* 模型恢复训练 + + 在日常训练过程中,有的用户由于一些原因导致训练中断,用户可以使用-r的命令恢复训练 + +```bash +export CUDA_VISIBLE_DEVICES=0 #windows和Mac下不需要执行该命令 +python tools/train.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml -r output/blazeface_fan_ssh_1000e/100 + ``` +* 训练策略 + +`BlazeFace`训练是以每卡`batch_size=32`在4卡GPU上进行训练(总`batch_size`是128),学习率为0.002,并且训练1000epoch。 + + +**注意:** 人脸检测模型目前不支持边训练边评估。 + + + +### 6.2 在WIDER-FACE数据集上评估 +- 步骤一:评估并生成结果文件: +```shell +python -u tools/eval.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml \ + -o weights=output/blazeface_fpn_ssh_1000e/model_final \ + multi_scale_eval=True BBoxPostProcess.nms.score_threshold=0.1 +``` +设置`multi_scale_eval=True`进行多尺度评估,评估完成后,将在`output/pred`中生成txt格式的测试结果。 + +- 步骤二:下载官方评估脚本和Ground Truth文件: +``` +wget http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/support/eval_script/eval_tools.zip +unzip eval_tools.zip && rm -f eval_tools.zip +``` + +- 步骤三:开始评估 + +方法一:python评估。 + +```bash +git clone https://github.com/wondervictor/WiderFace-Evaluation.git +cd WiderFace-Evaluation +# 编译 +python3 setup.py build_ext --inplace +# 开始评估 +python3 evaluation.py -p /path/to/PaddleDetection/output/pred -g /path/to/eval_tools/ground_truth +``` + +方法二:MatLab评估。 + +```bash +# 在`eval_tools/wider_eval.m`中修改保存结果路径和绘制曲线的名称: +pred_dir = './pred'; +legend_name = 'Paddle-BlazeFace'; + +`wider_eval.m` 是评估模块的主要执行程序。运行命令如下: +matlab 
-nodesktop -nosplash -nojvm -r "run wider_eval.m;quit;" +``` + + +### 6.3 推理部署 + +在模型训练过程中保存的模型文件是包含前向预测和反向传播的过程,在实际的工业部署则不需要反向传播,因此需要将模型进行导成部署需要的模型格式。 +在PaddleDetection中提供了 `tools/export_model.py`脚本来导出模型 + +```bash +python tools/export_model.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml --output_dir=./inference_model \ + -o weights=output/blazeface_fpn_ssh_1000e/best_model BBoxPostProcess.nms.score_threshold=0.1 +``` + +预测模型会导出到`inference_model/blazeface_fpn_ssh_1000e`目录下,分别为`infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`,`model.pdmodel` 如果不指定文件夹,模型则会导出在`output_inference` + +* 这里将nms后处理`score_threshold`修改为0.1,因为mAP基本没有影响的情况下,GPU预测速度能够大幅提升。更多关于模型导出的文档,请参考[模型导出文档](https://github.com/PaddlePaddle/PaddleDetection/deploy/EXPORT_MODEL.md) + + PaddleDetection提供了PaddleInference、PaddleServing、PaddleLite多种部署形式,支持服务端、移动端、嵌入式等多种平台,提供了完善的Python和C++部署方案。 +* 在这里,我们以Python为例,说明如何使用PaddleInference进行模型部署 + +```bash +python deploy/python/infer.py --model_dir=./inference_model/blazeface_fpn_ssh_1000e --image_file=demo/road554.png --use_gpu=True +``` +* 同时`infer.py`提供了丰富的接口,用户进行接入视频文件、摄像头进行预测,更多内容请参考[Python端预测部署](https://github.com/PaddlePaddle/PaddleDetection/deploy/python.md) + +* 更多关于预测部署的文档,请参考[预测部署文档](https://github.com/PaddlePaddle/PaddleDetection/deploy/README.md) 。 + + + +### 6.4 推理速度提升 +如果想要复现我们提供的速度指标,请修改预测模型配置文件`./inference_model/blazeface_fpn_ssh_1000e/infer_cfg.yml`中的输入尺寸,如下所示: +```yaml +mode: fluid +draw_threshold: 0.5 +metric: WiderFace +arch: Face +min_subgraph_size: 3 +Preprocess: +- is_scale: false + mean: + - 123 + - 117 + - 104 + std: + - 127.502231 + - 127.502231 + - 127.502231 + type: NormalizeImage +- interp: 1 + keep_ratio: false + target_size: + - 640 + - 640 + type: Resize +- type: Permute +label_list: +- face +``` +如果希望模型在cpu环境下更快推理,可安装[paddlepaddle_gpu-0.0.0](https://paddle-wheel.bj.bcebos.com/develop-cpu-mkl/paddlepaddle-0.0.0-cp37-cp37m-linux_x86_64.whl) (mkldnn的依赖)可开启mkldnn加速推理。 + +```bash +# 使用GPU测速: +python deploy/python/infer.py 
--model_dir=./inference_model/blazeface_fpn_ssh_1000e --image_dir=./path/images --run_benchmark=True --use_gpu=True + +# 使用cpu测速: +# 下载paddle whl包 +wget https://paddle-wheel.bj.bcebos.com/develop-cpu-mkl/paddlepaddle-0.0.0-cp37-cp37m-linux_x86_64.whl +# 安装paddlepaddle_gpu-0.0.0 +pip install paddlepaddle-0.0.0-cp37-cp37m-linux_x86_64.whl +# 推理 +python deploy/python/infer.py --model_dir=./inference_model/blazeface_fpn_ssh_1000e --image_dir=./path/images --enable_mkldnn=True --run_benchmark=True --cpu_threads=5 +``` + + + +### 6.5 人脸检测demo + +本节介绍基于提供的BlazeFace模型进行人脸检测。 + +先下载待检测图像与字体文件。 + +```bash +# 下载用于人脸检测的示例图像 +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/demo/friends/query/friends1.jpg +# 下载字体,用于可视化 +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/SourceHanSansCN-Medium.otf +``` + +示例图像如下所示。 + +
+ +
+ + +检测的示例命令如下。 + +```shell +python3.7 test_blazeface.py --input=friends1.jpg --output="./output" +``` + +最终可视化结果保存在`output`目录下,可视化结果如下所示。 + +
+ +
+ + +更多关于参数解释,索引库构建、人脸识别、whl包预测部署的内容可以参考:[Whl包预测部署](https://github.com/littletomatodonkey/insight-face-paddle)。 + + + +## 7. 参考文献 + +``` +@misc{long2020ppyolo, +title={PP-YOLO: An Effective and Efficient Implementation of Object Detector}, +author={Xiang Long and Kaipeng Deng and Guanzhong Wang and Yang Zhang and Qingqing Dang and Yuan Gao and Hui Shen and Jianguo Ren and Shumin Han and Errui Ding and Shilei Wen}, +year={2020}, +eprint={2007.12099}, +archivePrefix={arXiv}, +primaryClass={cs.CV} +} +@misc{ppdet2019, +title={PaddleDetection, Object detection and instance segmentation toolkit based on PaddlePaddle.}, +author={PaddlePaddle Authors}, +howpublished = {\url{https://github.com/PaddlePaddle/PaddleDetection}}, +year={2019} +} +@article{bazarevsky2019blazeface, +title={BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs}, +author={Valentin Bazarevsky and Yury Kartynnik and Andrey Vakunov and Karthik Raveendran and Matthias Grundmann}, +year={2019}, +eprint={1907.05047}, + archivePrefix={arXiv} +} +``` diff --git a/insightface/detection/blazeface_paddle/README_en.md b/insightface/detection/blazeface_paddle/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..24fd17863fab2907465960087c6c905f3dfc78ec --- /dev/null +++ b/insightface/detection/blazeface_paddle/README_en.md @@ -0,0 +1,354 @@ +[简体中文](README_cn.md) | English + +# FaceDetection + +* [1. Introduction](#Introduction) +* [2. Model Zoo](#Model_Zoo) +* [3. Installation](#Installation) +* [4. Data Pipline](#Data_Pipline) +* [5. Configuration File](#Configuration_File) +* [6. Training and Inference](#Training_and_Inference) + * [6.1 Training](#Training) + * [6.2 Evaluate on the WIDER FACE](#Evaluation) + * [6.3 Inference deployment](#Inference_deployment) + * [6.4 Improvement of inference speed](#Increase_in_inference_speed) + * [6.4 Face detection demo](#Face_detection_demo) +* [7. Citations](#Citations) + + + +## 1. 
Introduction + +`Arcface-Paddle` is an open source deep face detection and recognition toolkit, powered by PaddlePaddle. `Arcface-Paddle` provides three related pretrained models now, include `BlazeFace` for face detection, `ArcFace` and `MobileFace` for face recognition. + +- This tutorial is mainly about face detection based on `PaddleDetection`. +- For face recognition task, please refer to: [Face recognition tuturial](../../recognition/arcface_paddle/README_en.md). +- For Whl package inference using PaddleInference, please refer to [whl package inference](https://github.com/littletomatodonkey/insight-face-paddle). + + + +## 2. Model Zoo + +### mAP in WIDER FACE + +| Model | input size | images/GPU | epochs | Easy/Medium/Hard Set | CPU time cost | GPU time cost| Model Size(MB) | Pretrained model | Inference model | Config | +|:------------:|:--------:|:----:|:-------:|:-------:|:---------:|:---------:|:----------:|:---------:|:--------:|:--------:| +| BlazeFace-FPN-SSH | 640×640 | 8 | 1000 | 0.9187 / 0.8979 / 0.8168 | 31.7ms | 5.6ms | 0.646 |[download link](https://paddledet.bj.bcebos.com/models/blazeface_fpn_ssh_1000e.pdparams) | [download link](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/blazeface_fpn_ssh_1000e_v1.0_infer.tar) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.1/configs/face_detection/blazeface_fpn_ssh_1000e.yml) | +| RetinaFace | 480x640 | - | - | - / - / 0.8250 | 182.0ms | 17.4ms | 1.680 | - | - | - | + + +**NOTE:** +- Get mAP in `Easy/Medium/Hard Set` by multi-scale evaluation. For details can refer to [Evaluation](#Evaluate-on-the-WIDER-FACE). +- Measuring the speed, we use the resolution of `640×640`, in Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz environment, cpu-threads are set as 5. For more details, you can refer to [Improvement of inference speed](#Increase_in_inference_speed). +- Benchmark code for `RetinaFace` is from: [../retinaface/README.md](../retinaface/README.md). 
+- The benchmark environment is + - CPU: Intel(R) Xeon(R) Gold 6184 CPU @ 2.40GHz + - GPU: a single NVIDIA Tesla V100 + + + +## 3. Installation + +Please refer to [installation tutorial](../../recognition/arcface_paddle/install_en.md) to install PaddlePaddle and PaddleDetection. + + + + +## 4. Data Pipline +We use the [WIDER FACE dataset](http://shuoyang1213.me/WIDERFACE/) to carry out the training +and testing of the model, the official website gives detailed data introduction. +- WIDER Face data source: +Loads `wider_face` type dataset with directory structures like this: + + ``` + dataset/wider_face/ + ├── wider_face_split + │ ├── wider_face_train_bbx_gt.txt + │ ├── wider_face_val_bbx_gt.txt + ├── WIDER_train + │ ├── images + │ │ ├── 0--Parade + │ │ │ ├── 0_Parade_marchingband_1_100.jpg + │ │ │ ├── 0_Parade_marchingband_1_381.jpg + │ │ │ │ ... + │ │ ├── 10--People_Marching + │ │ │ ... + ├── WIDER_val + │ ├── images + │ │ ├── 0--Parade + │ │ │ ├── 0_Parade_marchingband_1_1004.jpg + │ │ │ ├── 0_Parade_marchingband_1_1045.jpg + │ │ │ │ ... + │ │ ├── 10--People_Marching + │ │ │ ... + ``` + +- Download dataset manually: +To download the WIDER FACE dataset, run the following commands: +``` +cd dataset/wider_face && ./download_wider_face.sh +``` + + + +## 5. Configuration file + +We use the `configs/face_detection/blazeface_fpn_ssh_1000e.yml` configuration for training. 
The summary of the configuration file is as follows: + +```yaml +_BASE_: [ + '../datasets/wider_face.yml', + '../runtime.yml', + '_base_/optimizer_1000e.yml', + '_base_/blazeface_fpn.yml', + '_base_/face_reader.yml', +] +weights: output/blazeface_fpn_ssh_1000e/model_final +multi_scale_eval: True +``` + +`blazeface_fpn_ssh_1000e.yml` The configuration needs to rely on other configuration files, in this example it needs to rely on: + +``` +wider_face.yml:Mainly explains the path of training data and verification data + +runtime.yml:Mainly describes the common operating parameters, such as whether to use GPU, how many epochs to store checkpoints, etc. + +optimizer_1000e.yml:Mainly explains the configuration of learning rate and optimizer + +blazeface_fpn.yml:Mainly explain the situation of the model and the backbone network + +face_reader.yml:It mainly describes the configuration of the data reader, such as batch size, the number of concurrent loading subprocesses, etc., and also includes post-reading preprocessing operations, such as resize, data enhancement, etc. +``` + +According to the actual situation, modify the above files, such as the data set path, batch size, etc. + +For the configuration of the base model, please refer to `configs/face_detection/_base_/blazeface.yml`. +The improved model adds the neck structure of FPN and SSH. For the configuration file, please refer to `configs/face_detection/_base_/blazeface_fpn.yml`. You can configure FPN and SSH if needed, which is as follows: + +```yaml +BlazeNet: + blaze_filters: [[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]] + double_blaze_filters: [[48, 24, 96, 2], [96, 24, 96], [96, 24, 96], + [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]] + act: hard_swish # Configure the activation function of BlazeBlock in backbone, the basic model is relu, hard_swish is required when adding FPN and SSH + +BlazeNeck: + neck_type : fpn_ssh # Optional only_fpn, only_ssh and fpn_ssh + in_channel: [96,96] +``` + + + +## 6. 
Training_and_Inference + + + +### 6.1 Training +Firstly, download the pretrained model. +```bash +wget https://paddledet.bj.bcebos.com/models/pretrained/blazenet_pretrain.pdparams +``` +PaddleDetection provides a single-GPU/multi-GPU training mode to meet the various training needs of users. +* single-GPU training +```bash +export CUDA_VISIBLE_DEVICES=0 # Do not need to execute this command under windows and Mac +python tools/train.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml -o pretrain_weight=blazenet_pretrain +``` + +* multi-GPU training +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 # Do not need to execute this command under windows and Mac +python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml -o pretrain_weight=blazenet_pretrain +``` +* Resume training from Checkpoint + + In the daily training process, if the training was be interrupted, using the -r command to resume training: + +```bash +export CUDA_VISIBLE_DEVICES=0 # Do not need to execute this command under windows and Mac +python tools/train.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml -r output/blazeface_fan_ssh_1000e/100 + ``` +* Training hyperparameters + +`BlazeFace` training is based on each GPU `batch_size=32` training on 4 GPUs (total `batch_size` is 128), the learning rate is 0.002, and the total training epoch is set as 1000. + + +**NOTE:** Not support evaluation during train. + + + +### 6.2 Evaluate on the WIDER FACE +- Evaluate and generate results files: +```shell +python -u tools/eval.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml \ + -o weights=output/blazeface_fpn_ssh_1000e/model_final \ + multi_scale_eval=True BBoxPostProcess.nms.score_threshold=0.1 +``` +Set `multi_scale_eval=True` for multi-scale evaluation,after the evaluation is completed, the test result in txt format will be generated in `output/pred`. 
+ +- Download the official evaluation script to evaluate the AP metrics: + +```bash +wget http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/support/eval_script/eval_tools.zip +unzip eval_tools.zip && rm -f eval_tools.zip +``` + +- Start evaluation: + +Method One: Python evaluation: + +```bash +git clone https://github.com/wondervictor/WiderFace-Evaluation.git +cd WiderFace-Evaluation +# Compile +python3 setup.py build_ext --inplace +# Start evaluation +python3 evaluation.py -p /path/to/PaddleDetection/output/pred -g /path/to/eval_tools/ground_truth +``` + +Method Two: MatLab evaluation: + +```bash +# Modify the result path and the name of the curve to be drawn in `eval_tools/wider_eval.m`: +pred_dir = './pred'; +legend_name = 'Paddle-BlazeFace'; + +`wider_eval.m` is the main execution program of the evaluation module. The run command is as follows: +matlab -nodesktop -nosplash -nojvm -r "run wider_eval.m;quit;" +``` + + +### 6.3 Inference deployment + +The model file saved in the model training process includes forward prediction and back propagation. In actual industrial deployment, back propagation is not required. Therefore, the model needs to be exported into the model format required for deployment. +The `tools/export_model.py` script is provided in PaddleDetection to export the model: + +```bash +python tools/export_model.py -c configs/face_detection/blazeface_fpn_ssh_1000e.yml --output_dir=./inference_model \ + -o weights=output/blazeface_fpn_ssh_1000e/best_model BBoxPostProcess.nms.score_threshold=0.1 +``` +The inference model will be exported to the `inference_model/blazeface_fpn_ssh_1000e` directory, which are `infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel` If no folder is specified, the model will be exported In `output_inference`. + +* `score_threshold` for nms is modified as 0.1 for inference, because it takes great speed performance improvement while has little effect on mAP. 
For more documentation about model export, please refer to: [export doc](https://github.com/PaddlePaddle/PaddleDetection/deploy/EXPORT_MODEL.md) + + PaddleDetection provides multiple deployment forms of PaddleInference, PaddleServing, and PaddleLite, supports multiple platforms such as server, mobile, and embedded, and provides a complete deployment plan for Python and C++. +* Here, we take Python as an example to illustrate how to use PaddleInference for model deployment: +```bash +python deploy/python/infer.py --model_dir=./inference_model/blazeface_fpn_ssh_1000e --image_file=demo/road554.png --use_gpu=True +``` +* `infer.py` provides a rich interface for users to access video files and cameras for prediction. For more information, please refer to: [Python deployment](https://github.com/PaddlePaddle/PaddleDetection/deploy/python.md). + +* For more documentation on deployment, please refer to: [deploy doc](https://github.com/PaddlePaddle/PaddleDetection/deploy/README.md). + + + +### 6.4 Improvement of inference speed + +If you want to reproduce our speed indicators, you need to modify the input size of inference model in the `./inference_model/blazeface_fpn_ssh_1000e/infer_cfg.yml` configuration file. As follows: +```yaml +mode: fluid +draw_threshold: 0.5 +metric: WiderFace +arch: Face +min_subgraph_size: 3 +Preprocess: +- is_scale: false + mean: + - 123 + - 117 + - 104 + std: + - 127.502231 + - 127.502231 + - 127.502231 + type: NormalizeImage +- interp: 1 + keep_ratio: false + target_size: + - 640 + - 640 + type: Resize +- type: Permute +label_list: +- face +``` + +If you want the model to be inferred faster in the CPU environment, install [paddlepaddle_gpu-0.0.0](https://paddle-wheel.bj.bcebos.com/develop-cpu-mkl/paddlepaddle-0.0.0-cp37-cp37m-linux_x86_64.whl) (dependency of mkldnn) and enable_mkldnn is set to True, when predicting acceleration. 
+ +```bash +# use GPU: +python deploy/python/infer.py --model_dir=./inference_model/blazeface_fpn_ssh_1000e --image_dir=./path/images --run_benchmark=True --use_gpu=True + +# inference with mkldnn use CPU +# downdoad whl package +wget https://paddle-wheel.bj.bcebos.com/develop-cpu-mkl/paddlepaddle-0.0.0-cp37-cp37m-linux_x86_64.whl +#install paddlepaddle_gpu-0.0.0 +pip install paddlepaddle-0.0.0-cp37-cp37m-linux_x86_64.whl +python deploy/python/infer.py --model_dir=./inference_model/blazeface_fpn_ssh_1000e --image_dir=./path/images --enable_mkldnn=True --run_benchmark=True --cpu_threads=5 +``` + + + +### 6.5 Face detection demo + +This part talks about how to detect faces using BlazeFace model. + +Firstly, use the following commands to download the demo image and font file for visualization. + + +```bash +# Demo image +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/demo/friends/query/friends1.jpg +# Font file for visualization +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/SourceHanSansCN-Medium.otf +``` + +The demo image is shown as follows. + +
+ +
+ + +Use the following command to run the face detection process. + +```shell +python3.7 test_blazeface.py --input=friends1.jpg --output="./output" +``` + +The final result is save in folder `output/`, which is shown as follows. + +
+ +
+ + +For more details about parameter explanations, face recognition, index gallery construction and whl package inference, please refer to [Whl package inference tutorial](https://github.com/littletomatodonkey/insight-face-paddle). + + +## 7. Citations + +``` +@misc{long2020ppyolo, +title={PP-YOLO: An Effective and Efficient Implementation of Object Detector}, +author={Xiang Long and Kaipeng Deng and Guanzhong Wang and Yang Zhang and Qingqing Dang and Yuan Gao and Hui Shen and Jianguo Ren and Shumin Han and Errui Ding and Shilei Wen}, +year={2020}, +eprint={2007.12099}, +archivePrefix={arXiv}, +primaryClass={cs.CV} +} +@misc{ppdet2019, +title={PaddleDetection, Object detection and instance segmentation toolkit based on PaddlePaddle.}, +author={PaddlePaddle Authors}, +howpublished = {\url{https://github.com/PaddlePaddle/PaddleDetection}}, +year={2019} +} +@article{bazarevsky2019blazeface, +title={BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs}, +author={Valentin Bazarevsky and Yury Kartynnik and Andrey Vakunov and Karthik Raveendran and Matthias Grundmann}, +year={2019}, +eprint={1907.05047}, + archivePrefix={arXiv} +} +``` diff --git a/insightface/detection/blazeface_paddle/test_blazeface.py b/insightface/detection/blazeface_paddle/test_blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..fa8a8f103be3e18c5de6e9d699b4bfdbb891d29a --- /dev/null +++ b/insightface/detection/blazeface_paddle/test_blazeface.py @@ -0,0 +1,593 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import requests +import logging +import imghdr +import pickle +import tarfile +from functools import partial + +import cv2 +import numpy as np +from sklearn.metrics.pairwise import cosine_similarity +from tqdm import tqdm +from prettytable import PrettyTable +from PIL import Image, ImageDraw, ImageFont +import paddle +from paddle.inference import Config +from paddle.inference import create_predictor + +__all__ = ["parser"] +BASE_INFERENCE_MODEL_DIR = os.path.expanduser("~/.insightface/ppmodels/") +BASE_DOWNLOAD_URL = "https://paddle-model-ecology.bj.bcebos.com/model/insight-face/{}.tar" + + +def parser(add_help=True): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser(add_help=add_help) + + parser.add_argument( + "--det_model", + type=str, + default="BlazeFace", + help="The detection model.") + parser.add_argument( + "--use_gpu", + type=str2bool, + default=True, + help="Whether use GPU to predict. Default by True.") + parser.add_argument( + "--enable_mkldnn", + type=str2bool, + default=True, + help="Whether use MKLDNN to predict, valid only when --use_gpu is False. Default by False." + ) + parser.add_argument( + "--cpu_threads", + type=int, + default=1, + help="The num of threads with CPU, valid only when --use_gpu is False. Default by 1." 
+ ) + parser.add_argument( + "--input", + type=str, + help="The path or directory of image(s) or video to be predicted.") + parser.add_argument( + "--output", type=str, default="./output/", help="The directory of prediction result.") + parser.add_argument( + "--det_thresh", + type=float, + default=0.8, + help="The threshold of detection postprocess. Default by 0.8.") + return parser + + +def print_config(args): + args = vars(args) + table = PrettyTable(['Param', 'Value']) + for param in args: + table.add_row([param, args[param]]) + width = len(str(table).split("\n")[0]) + print("{}".format("-" * width)) + print("PaddleFace".center(width)) + print(table) + print("Powered by PaddlePaddle!".rjust(width)) + print("{}".format("-" * width)) + + +def download_with_progressbar(url, save_path): + """Download from url with progressbar. + """ + if os.path.isfile(save_path): + os.remove(save_path) + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get("content-length", 0)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + with open(save_path, "wb") as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes or not os.path.isfile( + save_path): + raise Exception( + f"Something went wrong while downloading model/image from {url}") + + +def check_model_file(model): + """Check the model files exist and download and untar when no exist. 
+ """ + model_map = { + "ArcFace": "arcface_iresnet50_v1.0_infer", + "BlazeFace": "blazeface_fpn_ssh_1000e_v1.0_infer", + "MobileFace": "mobileface_v1.0_infer" + } + + if os.path.isdir(model): + model_file_path = os.path.join(model, "inference.pdmodel") + params_file_path = os.path.join(model, "inference.pdiparams") + if not os.path.exists(model_file_path) or not os.path.exists( + params_file_path): + raise Exception( + f"The specifed model directory error. The drectory must include 'inference.pdmodel' and 'inference.pdiparams'." + ) + + elif model in model_map: + storage_directory = partial(os.path.join, BASE_INFERENCE_MODEL_DIR, + model) + url = BASE_DOWNLOAD_URL.format(model_map[model]) + + tar_file_name_list = [ + "inference.pdiparams", "inference.pdiparams.info", + "inference.pdmodel" + ] + model_file_path = storage_directory("inference.pdmodel") + params_file_path = storage_directory("inference.pdiparams") + if not os.path.exists(model_file_path) or not os.path.exists( + params_file_path): + tmp_path = storage_directory(url.split("/")[-1]) + logging.info(f"Download {url} to {tmp_path}") + os.makedirs(storage_directory(), exist_ok=True) + download_with_progressbar(url, tmp_path) + with tarfile.open(tmp_path, "r") as tarObj: + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if tar_file_name in member.name: + filename = tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open(storage_directory(filename), "wb") as f: + f.write(file.read()) + os.remove(tmp_path) + if not os.path.exists(model_file_path) or not os.path.exists( + params_file_path): + raise Exception( + f"Something went wrong while downloading and unzip the model[{model}] files!" + ) + else: + raise Exception( + f"The specifed model name error. Support 'BlazeFace' for detection. And support local directory that include model files ('inference.pdmodel' and 'inference.pdiparams')." 
+ ) + + return model_file_path, params_file_path + + +def normalize_image(img, scale=None, mean=None, std=None, order='chw'): + if isinstance(scale, str): + scale = eval(scale) + scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + mean = np.array(mean).reshape(shape).astype('float32') + std = np.array(std).reshape(shape).astype('float32') + + if isinstance(img, Image.Image): + img = np.array(img) + + assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage" + return (img.astype('float32') * scale - mean) / std + + +def to_CHW_image(img): + if isinstance(img, Image.Image): + img = np.array(img) + return img.transpose((2, 0, 1)) + + +class ColorMap(object): + def __init__(self, num): + super().__init__() + self.get_color_map_list(num) + self.color_map = {} + self.ptr = 0 + + def __getitem__(self, key): + return self.color_map[key] + + def update(self, keys): + for key in keys: + if key not in self.color_map: + i = self.ptr % len(self.color_list) + self.color_map[key] = self.color_list[i] + self.ptr += 1 + + def get_color_map_list(self, num_classes): + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + self.color_list = [ + color_map[i:i + 3] for i in range(0, len(color_map), 3) + ] + + +class ImageReader(object): + def __init__(self, inputs): + super().__init__() + self.idx = 0 + if isinstance(inputs, np.ndarray): + self.image_list = [inputs] + else: + imgtype_list = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff'} + self.image_list = [] + if os.path.isfile(inputs): + if imghdr.what(inputs) not in imgtype_list: + raise Exception( 
+ f"Error type of input path, only support: {imgtype_list}" + ) + self.image_list.append(inputs) + elif os.path.isdir(inputs): + tmp_file_list = os.listdir(inputs) + warn_tag = False + for file_name in tmp_file_list: + file_path = os.path.join(inputs, file_name) + if not os.path.isfile(file_path): + warn_tag = True + continue + if imghdr.what(file_path) in imgtype_list: + self.image_list.append(file_path) + else: + warn_tag = True + if warn_tag: + logging.warning( + f"The directory of input contine directory or not supported file type, only support: {imgtype_list}" + ) + else: + raise Exception( + f"The file of input path not exist! Please check input: {inputs}" + ) + + def __iter__(self): + return self + + def __next__(self): + if self.idx >= len(self.image_list): + raise StopIteration + + data = self.image_list[self.idx] + if isinstance(data, np.ndarray): + self.idx += 1 + return data, "tmp.png" + path = data + _, file_name = os.path.split(path) + img = cv2.imread(path) + if img is None: + logging.warning(f"Error in reading image: {path}! Ignored.") + self.idx += 1 + return self.__next__() + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + self.idx += 1 + return img, file_name + + def __len__(self): + return len(self.image_list) + + +class VideoReader(object): + def __init__(self, inputs): + super().__init__() + videotype_list = {"mp4"} + if os.path.splitext(inputs)[-1][1:] not in videotype_list: + raise Exception( + f"The input file is not supported, only support: {videotype_list}" + ) + if not os.path.isfile(inputs): + raise Exception( + f"The file of input path not exist! 
Please check input: {inputs}" + ) + self.capture = cv2.VideoCapture(inputs) + self.file_name = os.path.split(inputs)[-1] + + def get_info(self): + info = {} + width = int(self.capture.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(self.capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fourcc = cv2.VideoWriter_fourcc(* 'mp4v') + info["file_name"] = self.file_name + info["fps"] = 30 + info["shape"] = (width, height) + info["fourcc"] = cv2.VideoWriter_fourcc(* 'mp4v') + return info + + def __iter__(self): + return self + + def __next__(self): + ret, frame = self.capture.read() + if not ret: + raise StopIteration + return frame, self.file_name + + +class ImageWriter(object): + def __init__(self, output_dir): + super().__init__() + if output_dir is None: + raise Exception( + "Please specify the directory of saving prediction results by --output." + ) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + self.output_dir = output_dir + + def write(self, image, file_name): + path = os.path.join(self.output_dir, file_name) + cv2.imwrite(path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)) + + +class VideoWriter(object): + def __init__(self, output_dir, video_info): + super().__init__() + if output_dir is None: + raise Exception( + "Please specify the directory of saving prediction results by --output." 
+ ) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + output_path = os.path.join(output_dir, video_info["file_name"]) + fourcc = cv2.VideoWriter_fourcc(* 'mp4v') + self.writer = cv2.VideoWriter(output_path, video_info["fourcc"], + video_info["fps"], video_info["shape"]) + + def write(self, frame, file_name): + self.writer.write(frame) + + def __del__(self): + if hasattr(self, "writer"): + self.writer.release() + + +class BasePredictor(object): + def __init__(self, predictor_config): + super().__init__() + self.predictor_config = predictor_config + self.predictor, self.input_names, self.output_names = self.load_predictor( + predictor_config["model_file"], predictor_config["params_file"]) + + def load_predictor(self, model_file, params_file): + config = Config(model_file, params_file) + if self.predictor_config["use_gpu"]: + config.enable_use_gpu(200, 0) + config.switch_ir_optim(True) + else: + config.disable_gpu() + config.set_cpu_math_library_num_threads(self.predictor_config[ + "cpu_threads"]) + + if self.predictor_config["enable_mkldnn"]: + try: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + except Exception as e: + logging.error( + "The current environment does not support `mkldnn`, so disable mkldnn." 
+ ) + config.disable_glog_info() + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + predictor = create_predictor(config) + input_names = predictor.get_input_names() + output_names = predictor.get_output_names() + return predictor, input_names, output_names + + def preprocess(self): + raise NotImplementedError + + def postprocess(self): + raise NotImplementedError + + def predict(self, img): + raise NotImplementedError + + +class Detector(BasePredictor): + def __init__(self, det_config, predictor_config): + super().__init__(predictor_config) + self.det_config = det_config + self.target_size = self.det_config["target_size"] + self.thresh = self.det_config["thresh"] + + def preprocess(self, img): + resize_h, resize_w = self.target_size + img_shape = img.shape + img_scale_x = resize_w / img_shape[1] + img_scale_y = resize_h / img_shape[0] + img = cv2.resize( + img, None, None, fx=img_scale_x, fy=img_scale_y, interpolation=1) + img = normalize_image( + img, + scale=1. 
/ 255., + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + order='hwc') + img_info = {} + img_info["im_shape"] = np.array( + img.shape[:2], dtype=np.float32)[np.newaxis, :] + img_info["scale_factor"] = np.array( + [img_scale_y, img_scale_x], dtype=np.float32)[np.newaxis, :] + + img = img.transpose((2, 0, 1)).copy() + img_info["image"] = img[np.newaxis, :, :, :] + return img_info + + def postprocess(self, np_boxes): + expect_boxes = (np_boxes[:, 1] > self.thresh) & (np_boxes[:, 0] > -1) + return np_boxes[expect_boxes, :] + + def predict(self, img): + inputs = self.preprocess(img) + for input_name in self.input_names: + input_tensor = self.predictor.get_input_handle(input_name) + input_tensor.copy_from_cpu(inputs[input_name]) + self.predictor.run() + output_tensor = self.predictor.get_output_handle(self.output_names[0]) + np_boxes = output_tensor.copy_to_cpu() + # boxes_num = self.detector.get_output_handle(self.detector_output_names[1]) + # np_boxes_num = boxes_num.copy_to_cpu() + box_list = self.postprocess(np_boxes) + return box_list + +class FaceDetector(object): + def __init__(self, args, print_info=True): + super().__init__() + if print_info: + print_config(args) + + self.font_path = os.path.join( + os.path.abspath(os.path.dirname(__file__)), + "SourceHanSansCN-Medium.otf") + self.args = args + + predictor_config = { + "use_gpu": args.use_gpu, + "enable_mkldnn": args.enable_mkldnn, + "cpu_threads": args.cpu_threads + } + + model_file_path, params_file_path = check_model_file( + args.det_model) + det_config = {"thresh": args.det_thresh, "target_size": [640, 640]} + predictor_config["model_file"] = model_file_path + predictor_config["params_file"] = params_file_path + self.det_predictor = Detector(det_config, predictor_config) + self.color_map = ColorMap(100) + + def preprocess(self, img): + img = img.astype(np.float32, copy=False) + return img + + def draw(self, img, box_list, labels): + self.color_map.update(labels) + im = Image.fromarray(img) + draw 
= ImageDraw.Draw(im) + + for i, dt in enumerate(box_list): + bbox, score = dt[2:], dt[1] + label = labels[i] + color = tuple(self.color_map[label]) + + xmin, ymin, xmax, ymax = bbox + + font_size = max(int((xmax - xmin) // 6), 10) + font = ImageFont.truetype(self.font_path, font_size) + + text = "{} {:.4f}".format(label, score) + th = sum(font.getmetrics()) + tw = font.getsize(text)[0] + start_y = max(0, ymin - th) + + draw.rectangle( + [(xmin, start_y), (xmin + tw + 1, start_y + th)], fill=color) + draw.text( + (xmin + 1, start_y), + text, + fill=(255, 255, 255), + font=font, + anchor="la") + draw.rectangle( + [(xmin, ymin), (xmax, ymax)], width=2, outline=color) + return np.array(im) + + def predict_np_img(self, img): + input_img = self.preprocess(img) + box_list = None + np_feature = None + if hasattr(self, "det_predictor"): + box_list = self.det_predictor.predict(input_img) + return box_list, np_feature + + def init_reader_writer(self, input_data): + if isinstance(input_data, np.ndarray): + self.input_reader = ImageReader(input_data) + if hasattr(self, "det_predictor"): + self.output_writer = ImageWriter(self.args.output) + elif isinstance(input_data, str): + if input_data.endswith('mp4'): + self.input_reader = VideoReader(input_data) + info = self.input_reader.get_info() + self.output_writer = VideoWriter(self.args.output, info) + else: + self.input_reader = ImageReader(input_data) + if hasattr(self, "det_predictor"): + self.output_writer = ImageWriter(self.args.output) + else: + raise Exception( + f"The input data error. Only support path of image or video(.mp4) and dirctory that include images." + ) + + def predict(self, input_data, print_info=False): + """Predict input_data. + + Args: + input_data (str | NumPy.array): The path of image, or the derectory including images, or the image data in NumPy.array format. + print_info (bool, optional): Wheather to print the prediction results. Defaults to False. 
+ + Yields: + dict: { + "box_list": The prediction results of detection. + "features": The output of recognition. + "labels": The results of retrieval. + } + """ + self.init_reader_writer(input_data) + for img, file_name in self.input_reader: + if img is None: + logging.warning(f"Error in reading img {file_name}! Ignored.") + continue + box_list, np_feature = self.predict_np_img(img) + labels = ["face"] * len(box_list) + if box_list is not None: + result = self.draw(img, box_list, labels=labels) + self.output_writer.write(result, file_name) + if print_info: + logging.info(f"File: {file_name}, predict label(s): {labels}") + yield { + "box_list": box_list, + "features": np_feature, + "labels": labels + } + logging.info(f"Predict complete!") + + +# for CLI +def main(args=None): + logging.basicConfig(level=logging.INFO) + + args = parser().parse_args() + predictor = FaceDetector(args) + res = predictor.predict(args.input, print_info=True) + for _ in res: + pass + + +if __name__ == "__main__": + main() diff --git a/insightface/detection/retinaface/Makefile b/insightface/detection/retinaface/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..66a3ed047a49124b921548dbc337946202fedbf7 --- /dev/null +++ b/insightface/detection/retinaface/Makefile @@ -0,0 +1,6 @@ +all: + cd rcnn/cython/; python setup.py build_ext --inplace; rm -rf build; cd ../../ + cd rcnn/pycocotools/; python setup.py build_ext --inplace; rm -rf build; cd ../../ +clean: + cd rcnn/cython/; rm *.so *.c *.cpp; cd ../../ + cd rcnn/pycocotools/; rm *.so; cd ../../ diff --git a/insightface/detection/retinaface/README.md b/insightface/detection/retinaface/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6665d3b12fe6c5d500de9272c4ac564347c46da7 --- /dev/null +++ b/insightface/detection/retinaface/README.md @@ -0,0 +1,86 @@ +# RetinaFace Face Detector + +## Introduction + +RetinaFace is a practical single-stage 
[SOTA](http://shuoyang1213.me/WIDERFACE/WiderFace_Results.html) face detector which is initially introduced in [arXiv technical report](https://arxiv.org/abs/1905.00641) and then accepted by [CVPR 2020](https://openaccess.thecvf.com/content_CVPR_2020/html/Deng_RetinaFace_Single-Shot_Multi-Level_Face_Localisation_in_the_Wild_CVPR_2020_paper.html). + +![demoimg1](https://insightface.ai/assets/img/github/11513D05.jpg) + +![demoimg2](https://insightface.ai/assets/img/github/widerfacevaltest.png) + +## Data + +1. Download our annotations (face bounding boxes & five facial landmarks) from [baidu cloud](https://pan.baidu.com/s/1Laby0EctfuJGgGMgRRgykA) or [gdrive](https://drive.google.com/file/d/1BbXxIiY-F74SumCNG6iwmJJ5K3heoemT/view?usp=sharing) + +2. Download the [WIDERFACE](http://shuoyang1213.me/WIDERFACE/WiderFace_Results.html) dataset. + +3. Organise the dataset directory under ``insightface/RetinaFace/`` as follows: + +```Shell + data/retinaface/ + train/ + images/ + label.txt + val/ + images/ + label.txt + test/ + images/ + label.txt +``` + +## Install + +1. Install MXNet with GPU support. +2. Install Deformable Convolution V2 operator from [Deformable-ConvNets](https://github.com/msracver/Deformable-ConvNets) if you use the DCN based backbone. +3. Type ``make`` to build cxx tools. + +## Training + +Please check ``train.py`` for training. + +1. Copy ``rcnn/sample_config.py`` to ``rcnn/config.py`` +2. Download ImageNet pretrained models and put them into ``model/``(these models are not for detection testing/inferencing but training and parameters initialization). + + ImageNet ResNet50 ([baidu cloud](https://pan.baidu.com/s/1WAkU9ZA_j-OmzO-sdk9whA) and [googledrive](https://drive.google.com/file/d/1ibQOCG4eJyTrlKAJdnioQ3tyGlnbSHjy/view?usp=sharing)). + + ImageNet ResNet152 ([baidu cloud](https://pan.baidu.com/s/1nzQ6CzmdKFzg8bM8ChZFQg) and [googledrive](https://drive.google.com/file/d/1FEjeiIB4u-XBYdASgkyx78pFybrlKUA4/view?usp=sharing)). + +3. 
Start training with ``CUDA_VISIBLE_DEVICES='0,1,2,3' python -u train.py --prefix ./model/retina --network resnet``.
+Before training, you can check the ``resnet`` network configuration (e.g. pretrained model path, anchor setting and learning rate policy etc..) in ``rcnn/config.py``.
+4. We have two predefined network settings named ``resnet``(for medium and large models) and ``mnet``(for lightweight models).
+
+## Testing
+
+Please check ``test.py`` for testing.
+
+## RetinaFace Pretrained Models
+
+Pretrained Model: RetinaFace-R50 ([baidu cloud](https://pan.baidu.com/s/1C6nKq122gJxRhb37vK0_LQ) or [googledrive](https://drive.google.com/file/d/1_DKgGxQWqlTqe78pw0KavId9BIMNUWfu/view?usp=sharing)) is a medium size model with ResNet50 backbone.
+It can output face bounding boxes and five facial landmarks in a single forward pass.
+
+WiderFace validation mAP: Easy 96.5, Medium 95.6, Hard 90.4.
+
+To avoid conflict with the WiderFace Challenge (ICCV 2019), we postpone the release time of our best model.
+
+## Third-party
+
+[yangfly](https://github.com/yangfly): RetinaFace-MobileNet0.25 ([baidu cloud](https://pan.baidu.com/s/1P1ypO7VYUbNAezdvLm2m9w):nzof).
+WiderFace validation mAP: Hard 82.5. 
(model size: 1.68Mb) + +[clancylian](https://github.com/clancylian/retinaface): C++ version + +RetinaFace in [modelscope](https://modelscope.cn/models/damo/cv_resnet50_face-detection_retinaface/summary) + +## References + +``` +@inproceedings{Deng2020CVPR, +title = {RetinaFace: Single-Shot Multi-Level Face Localisation in the Wild}, +author = {Deng, Jiankang and Guo, Jia and Ververas, Evangelos and Kotsia, Irene and Zafeiriou, Stefanos}, +booktitle = {CVPR}, +year = {2020} +} +``` + + diff --git a/insightface/detection/retinaface/rcnn/PY_OP/__init__.py b/insightface/detection/retinaface/rcnn/PY_OP/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/detection/retinaface/rcnn/PY_OP/cascade_refine.py b/insightface/detection/retinaface/rcnn/PY_OP/cascade_refine.py new file mode 100644 index 0000000000000000000000000000000000000000..e3c6556fab6f8ea13e4a214d88a04503d7c51540 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/PY_OP/cascade_refine.py @@ -0,0 +1,518 @@ +from __future__ import print_function +import sys +import mxnet as mx +import numpy as np +import datetime +from distutils.util import strtobool +from ..config import config, generate_config +from ..processing.generate_anchor import generate_anchors, anchors_plane +from ..processing.bbox_transform import bbox_overlaps, bbox_transform, landmark_transform + +STAT = {0: 0} +STEP = 28800 + + +class CascadeRefineOperator(mx.operator.CustomOp): + def __init__(self, stride=0, network='', dataset='', prefix=''): + super(CascadeRefineOperator, self).__init__() + self.stride = int(stride) + self.prefix = prefix + generate_config(network, dataset) + self.mode = config.TRAIN.OHEM_MODE #0 for random 10:245, 1 for 10:246, 2 for 10:30, mode 1 for default + stride = self.stride + sstride = str(stride) + base_size = config.RPN_ANCHOR_CFG[sstride]['BASE_SIZE'] + allowed_border = config.RPN_ANCHOR_CFG[sstride]['ALLOWED_BORDER'] 
+ ratios = config.RPN_ANCHOR_CFG[sstride]['RATIOS'] + scales = config.RPN_ANCHOR_CFG[sstride]['SCALES'] + base_anchors = generate_anchors(base_size=base_size, + ratios=list(ratios), + scales=np.array(scales, + dtype=np.float32), + stride=stride, + dense_anchor=config.DENSE_ANCHOR) + num_anchors = base_anchors.shape[0] + feat_height, feat_width = config.SCALES[0][ + 0] // self.stride, config.SCALES[0][0] // self.stride + feat_stride = self.stride + + A = num_anchors + K = feat_height * feat_width + self.A = A + + all_anchors = anchors_plane(feat_height, feat_width, feat_stride, + base_anchors) + all_anchors = all_anchors.reshape((K * A, 4)) + self.ori_anchors = all_anchors + self.nbatch = 0 + global STAT + for k in config.RPN_FEAT_STRIDE: + STAT[k] = [0, 0, 0] + + def apply_bbox_pred(self, bbox_pred, ind=None): + box_deltas = bbox_pred + box_deltas[:, 0::4] = box_deltas[:, 0::4] * config.TRAIN.BBOX_STDS[0] + box_deltas[:, 1::4] = box_deltas[:, 1::4] * config.TRAIN.BBOX_STDS[1] + box_deltas[:, 2::4] = box_deltas[:, 2::4] * config.TRAIN.BBOX_STDS[2] + box_deltas[:, 3::4] = box_deltas[:, 3::4] * config.TRAIN.BBOX_STDS[3] + if ind is None: + boxes = self.ori_anchors + else: + boxes = self.ori_anchors[ind] + #print('in apply',self.stride, box_deltas.shape, boxes.shape) + + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) + ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) + + dx = box_deltas[:, 0:1] + dy = box_deltas[:, 1:2] + dw = box_deltas[:, 2:3] + dh = box_deltas[:, 3:4] + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(box_deltas.shape) + # x1 + pred_boxes[:, 0:1] = pred_ctr_x - 0.5 * (pred_w - 1.0) + # y1 + pred_boxes[:, 1:2] = pred_ctr_y - 0.5 * (pred_h - 1.0) + # x2 + pred_boxes[:, 2:3] = 
pred_ctr_x + 0.5 * (pred_w - 1.0) + # y2 + pred_boxes[:, 3:4] = pred_ctr_y + 0.5 * (pred_h - 1.0) + return pred_boxes + + def assign_anchor_fpn(self, + gt_label, + anchors, + landmark=False, + prefix='face'): + IOU = config.TRAIN.CASCADE_OVERLAP + + gt_boxes = gt_label['gt_boxes'] + #_label = gt_label['gt_label'] + # clean up boxes + #nonneg = np.where(_label[:] != -1)[0] + #gt_boxes = gt_boxes[nonneg] + if landmark: + gt_landmarks = gt_label['gt_landmarks'] + #gt_landmarks = gt_landmarks[nonneg] + assert gt_boxes.shape[0] == gt_landmarks.shape[0] + #scales = np.array(scales, dtype=np.float32) + feat_strides = config.RPN_FEAT_STRIDE + bbox_pred_len = 4 + landmark_pred_len = 10 + num_anchors = anchors.shape[0] + A = self.A + total_anchors = num_anchors + feat_height, feat_width = config.SCALES[0][ + 0] // self.stride, config.SCALES[0][0] // self.stride + + #print('total_anchors', anchors.shape[0], len(inds_inside), file=sys.stderr) + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((num_anchors, ), dtype=np.float32) + labels.fill(-1) + #print('BB', anchors.shape, len(inds_inside)) + #print('gt_boxes', gt_boxes.shape, file=sys.stderr) + #tb = datetime.datetime.now() + #self._times[0] += (tb-ta).total_seconds() + #ta = datetime.datetime.now() + + if gt_boxes.size > 0: + # overlap between the anchors and the gt boxes + # overlaps (ex, gt) + overlaps = bbox_overlaps(anchors.astype(np.float), + gt_boxes.astype(np.float)) + argmax_overlaps = overlaps.argmax(axis=1) + #print('AAA', argmax_overlaps.shape) + max_overlaps = overlaps[np.arange(num_anchors), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, + np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + if not config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < IOU[0]] = 0 + + # fg label: for each gt, anchor 
with highest overlap + if config.TRAIN.RPN_FORCE_POSITIVE: + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IoU + labels[max_overlaps >= IOU[1]] = 1 + + if config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels last so that negative labels can clobber positives + labels[max_overlaps < IOU[0]] = 0 + else: + labels[:] = 0 + fg_inds = np.where(labels == 1)[0] + #print('fg count', len(fg_inds)) + + # subsample positive labels if we have too many + if config.TRAIN.RPN_ENABLE_OHEM == 0: + fg_inds = np.where(labels == 1)[0] + num_fg = int(config.TRAIN.RPN_FG_FRACTION * + config.TRAIN.RPN_BATCH_SIZE) + if len(fg_inds) > num_fg: + disable_inds = npr.choice(fg_inds, + size=(len(fg_inds) - num_fg), + replace=False) + if DEBUG: + disable_inds = fg_inds[:(len(fg_inds) - num_fg)] + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = config.TRAIN.RPN_BATCH_SIZE - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice(bg_inds, + size=(len(bg_inds) - num_bg), + replace=False) + if DEBUG: + disable_inds = bg_inds[:(len(bg_inds) - num_bg)] + labels[disable_inds] = -1 + + #fg_inds = np.where(labels == 1)[0] + #num_fg = len(fg_inds) + #num_bg = num_fg*int(1.0/config.TRAIN.RPN_FG_FRACTION-1) + + #bg_inds = np.where(labels == 0)[0] + #if len(bg_inds) > num_bg: + # disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False) + # if DEBUG: + # disable_inds = bg_inds[:(len(bg_inds) - num_bg)] + # labels[disable_inds] = -1 + else: + fg_inds = np.where(labels == 1)[0] + num_fg = len(fg_inds) + bg_inds = np.where(labels == 0)[0] + num_bg = len(bg_inds) + + #print('anchor stat', num_fg, num_bg) + + bbox_targets = np.zeros((num_anchors, bbox_pred_len), dtype=np.float32) + if gt_boxes.size > 0: + #print('GT', gt_boxes.shape, gt_boxes[argmax_overlaps, :4].shape) + bbox_targets[:, :] = bbox_transform(anchors, + gt_boxes[argmax_overlaps, :]) + #bbox_targets[:,4] = 
gt_blur + #tb = datetime.datetime.now() + #self._times[1] += (tb-ta).total_seconds() + #ta = datetime.datetime.now() + + bbox_weights = np.zeros((num_anchors, bbox_pred_len), dtype=np.float32) + #bbox_weights[labels == 1, :] = np.array(config.TRAIN.RPN_BBOX_WEIGHTS) + bbox_weights[labels == 1, 0:4] = 1.0 + if bbox_pred_len > 4: + bbox_weights[labels == 1, 4:bbox_pred_len] = 0.1 + + if landmark: + landmark_targets = np.zeros((num_anchors, landmark_pred_len), + dtype=np.float32) + landmark_weights = np.zeros((num_anchors, landmark_pred_len), + dtype=np.float32) + #landmark_weights[labels == 1, :] = np.array(config.TRAIN.RPN_LANDMARK_WEIGHTS) + if landmark_pred_len == 10: + landmark_weights[labels == 1, :] = 1.0 + elif landmark_pred_len == 15: + v = [1.0, 1.0, 0.1] * 5 + assert len(v) == 15 + landmark_weights[labels == 1, :] = np.array(v) + else: + assert False + #TODO here + if gt_landmarks.size > 0: + #print('AAA',argmax_overlaps) + a_landmarks = gt_landmarks[argmax_overlaps, :, :] + landmark_targets[:] = landmark_transform(anchors, a_landmarks) + invalid = np.where(a_landmarks[:, 0, 2] < 0.0)[0] + #assert len(invalid)==0 + #landmark_weights[invalid, :] = np.array(config.TRAIN.RPN_INVALID_LANDMARK_WEIGHTS) + landmark_weights[invalid, :] = 0.0 + #tb = datetime.datetime.now() + #self._times[2] += (tb-ta).total_seconds() + #ta = datetime.datetime.now() + bbox_targets[:, + 0::4] = bbox_targets[:, 0::4] / config.TRAIN.BBOX_STDS[0] + bbox_targets[:, + 1::4] = bbox_targets[:, 1::4] / config.TRAIN.BBOX_STDS[1] + bbox_targets[:, + 2::4] = bbox_targets[:, 2::4] / config.TRAIN.BBOX_STDS[2] + bbox_targets[:, + 3::4] = bbox_targets[:, 3::4] / config.TRAIN.BBOX_STDS[3] + + #print('CC', anchors.shape, len(inds_inside)) + label = {} + _label = labels.reshape( + (1, feat_height, feat_width, A)).transpose(0, 3, 1, 2) + _label = _label.reshape((1, A * feat_height * feat_width)) + bbox_target = bbox_targets.reshape( + (1, feat_height * feat_width, + A * bbox_pred_len)).transpose(0, 2, 
1) + bbox_weight = bbox_weights.reshape( + (1, feat_height * feat_width, A * bbox_pred_len)).transpose( + (0, 2, 1)) + label['%s_label' % prefix] = _label[0] + label['%s_bbox_target' % prefix] = bbox_target[0] + label['%s_bbox_weight' % prefix] = bbox_weight[0] + if landmark: + landmark_target = landmark_target.reshape( + (1, feat_height * feat_width, + A * landmark_pred_len)).transpose(0, 2, 1) + landmark_target /= config.TRAIN.LANDMARK_STD + landmark_weight = landmark_weight.reshape( + (1, feat_height * feat_width, + A * landmark_pred_len)).transpose((0, 2, 1)) + label['%s_landmark_target' % prefix] = landmark_target[0] + label['%s_landmark_weight' % prefix] = landmark_weight[0] + + return label + + def forward(self, is_train, req, in_data, out_data, aux): + self.nbatch += 1 + ta = datetime.datetime.now() + global STAT + A = config.NUM_ANCHORS + + cls_label_t0 = in_data[0].asnumpy() #BS, AHW + cls_score_t0 = in_data[1].asnumpy() #BS, C, AHW + cls_score = in_data[2].asnumpy() #BS, C, AHW + #labels_raw = in_data[1].asnumpy() #BS, ANCHORS + bbox_pred_t0 = in_data[3].asnumpy() #BS, AC, HW + bbox_target_t0 = in_data[4].asnumpy() #BS, AC, HW + cls_label_raw = in_data[5].asnumpy() #BS, AHW + gt_boxes = in_data[6].asnumpy() #BS, N, C=4+1 + #imgs = in_data[7].asnumpy().astype(np.uint8) + + batch_size = cls_score.shape[0] + num_anchors = cls_score.shape[2] + #print('in cas', cls_score.shape, bbox_target.shape) + + labels_out = np.zeros(shape=(batch_size, num_anchors), + dtype=np.float32) + bbox_target_out = np.zeros(shape=bbox_target_t0.shape, + dtype=np.float32) + anchor_weight = np.zeros((batch_size, num_anchors, 1), + dtype=np.float32) + valid_count = np.zeros((batch_size, 1), dtype=np.float32) + + bbox_pred_t0 = bbox_pred_t0.transpose((0, 2, 1)) + bbox_pred_t0 = bbox_pred_t0.reshape( + (bbox_pred_t0.shape[0], -1, 4)) #BS, H*W*A, C + bbox_target_t0 = bbox_target_t0.transpose((0, 2, 1)) + bbox_target_t0 = bbox_target_t0.reshape( + (bbox_target_t0.shape[0], -1, 4)) + + 
#print('anchor_weight', anchor_weight.shape) + + #assert labels.shape[0]==1 + #assert cls_score.shape[0]==1 + #assert bbox_weight.shape[0]==1 + #print('shape', cls_score.shape, labels.shape, file=sys.stderr) + #print('bbox_weight 0', bbox_weight.shape, file=sys.stderr) + #bbox_weight = np.zeros( (labels_raw.shape[0], labels_raw.shape[1], 4), dtype=np.float32) + _stat = [0, 0, 0] + SEL_TOPK = config.TRAIN.RPN_BATCH_SIZE + FAST = False + for ibatch in range(batch_size): + #bgr = imgs[ibatch].transpose( (1,2,0) )[:,:,::-1] + + if not FAST: + _gt_boxes = gt_boxes[ibatch] #N, 4+1 + _gtind = len(np.where(_gt_boxes[:, 4] >= 0)[0]) + #print('gt num', _gtind) + _gt_boxes = _gt_boxes[0:_gtind, :] + + #anchors_t1 = self.ori_anchors.copy() + #_cls_label_raw = cls_label_raw[ibatch] #AHW + #_cls_label_raw = _cls_label_raw.reshape( (A, -1) ).transpose( (1,0) ).reshape( (-1,) ) #HWA + #fg_ind_raw = np.where(_cls_label_raw>0)[0] + #_bbox_target_t0 = bbox_target_t0[ibatch][fg_ind_raw] + #_bbox_pred_t0 = bbox_pred_t0[ibatch][fg_ind_raw] + #anchors_t1_pos = self.apply_bbox_pred(_bbox_pred_t0, ind=fg_ind_raw) + #anchors_t1[fg_ind_raw,:] = anchors_t1_pos + + anchors_t1 = self.apply_bbox_pred(bbox_pred_t0[ibatch]) + assert anchors_t1.shape[0] == self.ori_anchors.shape[0] + + #for i in range(_gt_boxes.shape[0]): + # box = _gt_boxes[i].astype(np.int) + # print('%d: gt%d'%(self.nbatch, i), box) + # #color = (0,0,255) + # #cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), color, 2) + #for i in range(anchors_t1.shape[0]): + # box1 = self.ori_anchors[i].astype(np.int) + # box2 = anchors_t1[i].astype(np.int) + # print('%d %d: anchorscompare %d'%(self.nbatch, self.stride, i), box1, box2) + #color = (255,255,0) + #cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), color, 2) + #filename = "./debug/%d_%d_%d.jpg"%(self.nbatch, ibatch, stride) + #cv2.imwrite(filename, img) + #print(filename) + #gt_label = {'gt_boxes': gt_anchors, 'gt_label' : labels_raw[ibatch]} + gt_label = {'gt_boxes': 
_gt_boxes} + new_label_dict = self.assign_anchor_fpn(gt_label, + anchors_t1, + False, + prefix=self.prefix) + labels = new_label_dict['%s_label' % self.prefix] #AHW + new_bbox_target = new_label_dict['%s_bbox_target' % + self.prefix] #AC,HW + #print('assign ret', labels.shape, new_bbox_target.shape) + _anchor_weight = np.zeros((num_anchors, 1), dtype=np.float32) + fg_score = cls_score[ibatch, 1, :] - cls_score[ibatch, 0, :] + fg_inds = np.where(labels > 0)[0] + num_fg = int(config.TRAIN.RPN_FG_FRACTION * + config.TRAIN.RPN_BATCH_SIZE) + origin_num_fg = len(fg_inds) + #continue + #print('cas fg', len(fg_inds), num_fg, file=sys.stderr) + if len(fg_inds) > num_fg: + if self.mode == 0: + disable_inds = np.random.choice(fg_inds, + size=(len(fg_inds) - + num_fg), + replace=False) + labels[disable_inds] = -1 + else: + pos_ohem_scores = fg_score[fg_inds] + order_pos_ohem_scores = pos_ohem_scores.ravel( + ).argsort() + sampled_inds = fg_inds[order_pos_ohem_scores[:num_fg]] + labels[fg_inds] = -1 + labels[sampled_inds] = 1 + + n_fg = np.sum(labels > 0) + fg_inds = np.where(labels > 0)[0] + num_bg = config.TRAIN.RPN_BATCH_SIZE - n_fg + if self.mode == 2: + num_bg = max( + 48, n_fg * int(1.0 / config.TRAIN.RPN_FG_FRACTION - 1)) + + bg_inds = np.where(labels == 0)[0] + origin_num_bg = len(bg_inds) + if num_bg == 0: + labels[bg_inds] = -1 + elif len(bg_inds) > num_bg: + # sort ohem scores + + if self.mode == 0: + disable_inds = np.random.choice(bg_inds, + size=(len(bg_inds) - + num_bg), + replace=False) + labels[disable_inds] = -1 + else: + neg_ohem_scores = fg_score[bg_inds] + order_neg_ohem_scores = neg_ohem_scores.ravel( + ).argsort()[::-1] + sampled_inds = bg_inds[order_neg_ohem_scores[:num_bg]] + #print('sampled_inds_bg', sampled_inds, file=sys.stderr) + labels[bg_inds] = -1 + labels[sampled_inds] = 0 + + if n_fg > 0: + order0_labels = labels.reshape((1, A, -1)).transpose( + (0, 2, 1)).reshape((-1, )) + bbox_fg_inds = np.where(order0_labels > 0)[0] + #print('bbox_fg_inds, 
order0 ', bbox_fg_inds, file=sys.stderr) + _anchor_weight[bbox_fg_inds, :] = 1.0 + anchor_weight[ibatch] = _anchor_weight + valid_count[ibatch][0] = n_fg + labels_out[ibatch] = labels + #print('labels_out', self.stride, ibatch, labels) + bbox_target_out[ibatch] = new_bbox_target + #print('cascade stat', self.stride, ibatch, len(labels), len(np.where(labels==1)[0]), len(np.where(labels==0)[0])) + else: #FAST MODE + fg_score_t0 = cls_score_t0[ibatch, 1, :] - cls_score_t0[ibatch, + 0, :] + sort_idx_t0 = np.argsort( + fg_score_t0.flatten())[::-1][0:SEL_TOPK] + _bbox_pred_t0 = bbox_pred_t0[ibatch][sort_idx_t0] + _bbox_target_t0 = bbox_target_t0[ibatch][sort_idx_t0] + #print('SEL fg score:', fg_score_t0[sort_idx[-1]], fg_score_t0[sort_idx[0]]) + anchors_t0 = self.apply_bbox_pred(_bbox_pred_t0) + gt_anchors = self.apply_bbox_pred(_bbox_target_t0) + #gt_label = {'gt_boxes': gt_anchors, 'gt_label' : labels_raw[ibatch]} + gt_label = {'gt_boxes': gt_anchors} + new_label_dict = self.assign_anchor_fpn(gt_label, + anchors_t0, + False, + prefix=self.prefix) + labels = new_label_dict['%s_label' % self.prefix] + new_bbox_target = new_label_dict['%s_bbox_target' % + self.prefix] + #print('assign ret', labels.shape, new_bbox_target.shape) + _anchor_weight = np.zeros((num_anchors, 1), dtype=np.float32) + fg_score = cls_score[ibatch, 1, :] - cls_score[ibatch, 0, :] + fg_inds = np.where(labels > 0)[0] + _labels = np.empty(shape=labels.shape, dtype=np.float32) + _labels.fill(-1) + _labels[sort_idx_idx] = labels + + anchor_weight[ibatch] = _anchor_weight + valid_count[ibatch][0] = len(fg_inds) + labels_out[ibatch] = _labels + #print('labels_out', self.stride, ibatch, labels) + bbox_target_out[ibatch] = new_bbox_target + + #print('cascade pos stat', self.stride, batch_size, np.sum(valid_count)) + for ind, val in enumerate( + [labels_out, bbox_target_out, anchor_weight, valid_count]): + val = mx.nd.array(val) + self.assign(out_data[ind], req[ind], val) + tb = datetime.datetime.now() + 
#print('cascade forward cost', self.stride, (tb-ta).total_seconds()) + + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + for i in range(len(in_grad)): + self.assign(in_grad[i], req[i], 0) + + +@mx.operator.register('cascade_refine') +class CascadeRefineProp(mx.operator.CustomOpProp): + def __init__(self, stride=0, network='', dataset='', prefix=''): + super(CascadeRefineProp, self).__init__(need_top_grad=False) + self.stride = stride + self.network = network + self.dataset = dataset + self.prefix = prefix + + def list_arguments(self): + #return ['cls_label_t0', 'cls_pred_t0', 'cls_pred', 'bbox_pred_t0', 'bbox_label_t0', 'cls_label_raw', 'cas_gt_boxes', 'cas_img'] + return [ + 'cls_label_t0', 'cls_pred_t0', 'cls_pred', 'bbox_pred_t0', + 'bbox_label_t0', 'cls_label_raw', 'cas_gt_boxes' + ] + + def list_outputs(self): + return [ + 'cls_label_out', 'bbox_label_out', 'anchor_weight_out', + 'pos_count_out' + ] + + def infer_shape(self, in_shape): + cls_pred_shape = in_shape[1] + bs = cls_pred_shape[0] + num_anchors = cls_pred_shape[2] + #print('in_rpn_ohem', in_shape[0], in_shape[1], in_shape[2], file=sys.stderr) + #print('in_rpn_ohem', labels_shape, anchor_weight_shape) + cls_label_shape = [bs, num_anchors] + + return in_shape, \ + [cls_label_shape, in_shape[4], [bs,num_anchors,1], [bs,1]] + + def create_operator(self, ctx, shapes, dtypes): + return CascadeRefineOperator(self.stride, self.network, self.dataset, + self.prefix) + + def declare_backward_dependency(self, out_grad, in_data, out_data): + return [] diff --git a/insightface/detection/retinaface/rcnn/PY_OP/rpn_fpn_ohem3.py b/insightface/detection/retinaface/rcnn/PY_OP/rpn_fpn_ohem3.py new file mode 100644 index 0000000000000000000000000000000000000000..b8f7d462ec9aec245852338b392fb4d8afd3311c --- /dev/null +++ b/insightface/detection/retinaface/rcnn/PY_OP/rpn_fpn_ohem3.py @@ -0,0 +1,175 @@ +from __future__ import print_function +import sys +import mxnet as mx +import numpy as np +from 
distutils.util import strtobool +from ..config import config, generate_config + +STAT = {0: 0} +STEP = 28800 + + +class RPNFPNOHEM3Operator(mx.operator.CustomOp): + def __init__(self, stride=0, network='', dataset='', prefix=''): + super(RPNFPNOHEM3Operator, self).__init__() + self.stride = int(stride) + self.prefix = prefix + generate_config(network, dataset) + self.mode = config.TRAIN.OHEM_MODE #0 for random 10:245, 1 for 10:246, 2 for 10:30, mode 1 for default + global STAT + for k in config.RPN_FEAT_STRIDE: + STAT[k] = [0, 0, 0] + + def forward(self, is_train, req, in_data, out_data, aux): + global STAT + + cls_score = in_data[0].asnumpy() #BS, 2, ANCHORS + labels_raw = in_data[1].asnumpy() # BS, ANCHORS + + A = config.NUM_ANCHORS + anchor_weight = np.zeros((labels_raw.shape[0], labels_raw.shape[1], 1), + dtype=np.float32) + valid_count = np.zeros((labels_raw.shape[0], 1), dtype=np.float32) + #print('anchor_weight', anchor_weight.shape) + + #assert labels.shape[0]==1 + #assert cls_score.shape[0]==1 + #assert bbox_weight.shape[0]==1 + #print('shape', cls_score.shape, labels.shape, file=sys.stderr) + #print('bbox_weight 0', bbox_weight.shape, file=sys.stderr) + #bbox_weight = np.zeros( (labels_raw.shape[0], labels_raw.shape[1], 4), dtype=np.float32) + _stat = [0, 0, 0] + for ibatch in range(labels_raw.shape[0]): + _anchor_weight = np.zeros((labels_raw.shape[1], 1), + dtype=np.float32) + labels = labels_raw[ibatch] + fg_score = cls_score[ibatch, 1, :] - cls_score[ibatch, 0, :] + + fg_inds = np.where(labels > 0)[0] + num_fg = int(config.TRAIN.RPN_FG_FRACTION * + config.TRAIN.RPN_BATCH_SIZE) + origin_num_fg = len(fg_inds) + #print(len(fg_inds), num_fg, file=sys.stderr) + if len(fg_inds) > num_fg: + if self.mode == 0: + disable_inds = np.random.choice(fg_inds, + size=(len(fg_inds) - + num_fg), + replace=False) + labels[disable_inds] = -1 + else: + pos_ohem_scores = fg_score[fg_inds] + order_pos_ohem_scores = pos_ohem_scores.ravel().argsort() + sampled_inds = 
fg_inds[order_pos_ohem_scores[:num_fg]] + labels[fg_inds] = -1 + labels[sampled_inds] = 1 + + n_fg = np.sum(labels > 0) + fg_inds = np.where(labels > 0)[0] + num_bg = config.TRAIN.RPN_BATCH_SIZE - n_fg + if self.mode == 2: + num_bg = max( + 48, n_fg * int(1.0 / config.TRAIN.RPN_FG_FRACTION - 1)) + + bg_inds = np.where(labels == 0)[0] + origin_num_bg = len(bg_inds) + if num_bg == 0: + labels[bg_inds] = -1 + elif len(bg_inds) > num_bg: + # sort ohem scores + + if self.mode == 0: + disable_inds = np.random.choice(bg_inds, + size=(len(bg_inds) - + num_bg), + replace=False) + labels[disable_inds] = -1 + else: + neg_ohem_scores = fg_score[bg_inds] + order_neg_ohem_scores = neg_ohem_scores.ravel().argsort( + )[::-1] + sampled_inds = bg_inds[order_neg_ohem_scores[:num_bg]] + #print('sampled_inds_bg', sampled_inds, file=sys.stderr) + labels[bg_inds] = -1 + labels[sampled_inds] = 0 + + if n_fg > 0: + order0_labels = labels.reshape((1, A, -1)).transpose( + (0, 2, 1)).reshape((-1, )) + bbox_fg_inds = np.where(order0_labels > 0)[0] + #print('bbox_fg_inds, order0 ', bbox_fg_inds, file=sys.stderr) + _anchor_weight[bbox_fg_inds, :] = 1.0 + anchor_weight[ibatch] = _anchor_weight + valid_count[ibatch][0] = n_fg + + #if self.prefix=='face': + # #print('fg-bg', self.stride, n_fg, num_bg) + # STAT[0]+=1 + # STAT[self.stride][0] += config.TRAIN.RPN_BATCH_SIZE + # STAT[self.stride][1] += n_fg + # STAT[self.stride][2] += np.sum(fg_score[fg_inds]>=0) + # #_stat[0] += config.TRAIN.RPN_BATCH_SIZE + # #_stat[1] += n_fg + # #_stat[2] += np.sum(fg_score[fg_inds]>=0) + # #print('stride num_fg', self.stride, n_fg, file=sys.stderr) + # #ACC[self.stride] += np.sum(fg_score[fg_inds]>=0) + # #x = float(labels_raw.shape[0]*len(config.RPN_FEAT_STRIDE)) + # x = 1.0 + # if STAT[0]%STEP==0: + # _str = ['STAT'] + # STAT[0] = 0 + # for k in config.RPN_FEAT_STRIDE: + # acc = float(STAT[k][2])/STAT[k][1] + # acc0 = float(STAT[k][1])/STAT[k][0] + # #_str.append("%d: all-fg(%d, %d, %.4f), fg-fgcorrect(%d, %d, 
%.4f)"%(k,STAT[k][0], STAT[k][1], acc0, STAT[k][1], STAT[k][2], acc)) + # _str.append("%d: (%d, %d, %.4f)"%(k, STAT[k][1], STAT[k][2], acc)) + # STAT[k] = [0,0,0] + # _str = ' | '.join(_str) + # print(_str, file=sys.stderr) + #if self.stride==4 and num_fg>0: + # print('_stat_', self.stride, num_fg, num_bg, file=sys.stderr) + + #labels_ohem = mx.nd.array(labels_raw) + #anchor_weight = mx.nd.array(anchor_weight) + #print('valid_count', self.stride, np.sum(valid_count)) + #print('_stat', _stat, valid_count) + + for ind, val in enumerate([labels_raw, anchor_weight, valid_count]): + val = mx.nd.array(val) + self.assign(out_data[ind], req[ind], val) + + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + for i in range(len(in_grad)): + self.assign(in_grad[i], req[i], 0) + + +@mx.operator.register('rpn_fpn_ohem3') +class RPNFPNOHEM3Prop(mx.operator.CustomOpProp): + def __init__(self, stride=0, network='', dataset='', prefix=''): + super(RPNFPNOHEM3Prop, self).__init__(need_top_grad=False) + self.stride = stride + self.network = network + self.dataset = dataset + self.prefix = prefix + + def list_arguments(self): + return ['cls_score', 'labels'] + + def list_outputs(self): + return ['labels_ohem', 'anchor_weight', 'valid_count'] + + def infer_shape(self, in_shape): + labels_shape = in_shape[1] + #print('in_rpn_ohem', in_shape[0], in_shape[1], in_shape[2], file=sys.stderr) + anchor_weight_shape = [labels_shape[0], labels_shape[1], 1] + #print('in_rpn_ohem', labels_shape, anchor_weight_shape) + + return in_shape, \ + [labels_shape, anchor_weight_shape, [labels_shape[0], 1]] + + def create_operator(self, ctx, shapes, dtypes): + return RPNFPNOHEM3Operator(self.stride, self.network, self.dataset, + self.prefix) + + def declare_backward_dependency(self, out_grad, in_data, out_data): + return [] diff --git a/insightface/detection/retinaface/rcnn/__init__.py b/insightface/detection/retinaface/rcnn/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/detection/retinaface/rcnn/core/__init__.py b/insightface/detection/retinaface/rcnn/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/detection/retinaface/rcnn/core/callback.py b/insightface/detection/retinaface/rcnn/core/callback.py new file mode 100644 index 0000000000000000000000000000000000000000..729a3920ed9f08a79db315b7823a9c8213b4a72d --- /dev/null +++ b/insightface/detection/retinaface/rcnn/core/callback.py @@ -0,0 +1,16 @@ +import mxnet as mx + + +def do_checkpoint(prefix, means, stds): + def _callback(iter_no, sym, arg, aux): + if 'bbox_pred_weight' in arg: + arg['bbox_pred_weight_test'] = (arg['bbox_pred_weight'].T * + mx.nd.array(stds)).T + arg['bbox_pred_bias_test'] = arg['bbox_pred_bias'] * mx.nd.array( + stds) + mx.nd.array(means) + mx.model.save_checkpoint(prefix, iter_no + 1, sym, arg, aux) + if 'bbox_pred_weight' in arg: + arg.pop('bbox_pred_weight_test') + arg.pop('bbox_pred_bias_test') + + return _callback diff --git a/insightface/detection/retinaface/rcnn/core/loader.py b/insightface/detection/retinaface/rcnn/core/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..1d34d2eb3149333dcbbbd7058b5a52e8878c6450 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/core/loader.py @@ -0,0 +1,549 @@ +from __future__ import print_function +import sys +import mxnet as mx +import numpy as np +import random +import datetime +import multiprocessing +import cv2 +from mxnet.executor_manager import _split_input_slice + +from rcnn.config import config +from rcnn.io.image import tensor_vstack +from rcnn.io.rpn import get_rpn_testbatch, get_rpn_batch, assign_anchor_fpn, get_crop_batch, AA + + +class CropLoader(mx.io.DataIter): + def __init__(self, + feat_sym, + roidb, + batch_size=1, + shuffle=False, + ctx=None, + 
work_load_list=None, + aspect_grouping=False): + """ + This Iter will provide roi data to Fast R-CNN network + :param feat_sym: to infer shape of assign_output + :param roidb: must be preprocessed + :param batch_size: must divide BATCH_SIZE(128) + :param shuffle: bool + :param ctx: list of contexts + :param work_load_list: list of work load + :param aspect_grouping: group images with similar aspects + :return: AnchorLoader + """ + super(CropLoader, self).__init__() + + # save parameters as properties + self.feat_sym = feat_sym + self.roidb = roidb + self.batch_size = batch_size + self.shuffle = shuffle + self.ctx = ctx + if self.ctx is None: + self.ctx = [mx.cpu()] + self.work_load_list = work_load_list + #self.feat_stride = feat_stride + #self.anchor_scales = anchor_scales + #self.anchor_ratios = anchor_ratios + #self.allowed_border = allowed_border + self.aspect_grouping = aspect_grouping + self.feat_stride = config.RPN_FEAT_STRIDE + + # infer properties from roidb + self.size = len(roidb) + self.index = np.arange(self.size) + + # decide data and label names + #self.data_name = ['data'] + #self.label_name = [] + #self.label_name.append('label') + #self.label_name.append('bbox_target') + #self.label_name.append('bbox_weight') + + self.data_name = ['data'] + #self.label_name = ['label', 'bbox_target', 'bbox_weight'] + self.label_name = [] + prefixes = ['face'] + if config.HEAD_BOX: + prefixes.append('head') + names = [] + for prefix in prefixes: + names += [ + prefix + '_label', prefix + '_bbox_target', + prefix + '_bbox_weight' + ] + if prefix == 'face' and config.FACE_LANDMARK: + names += [ + prefix + '_landmark_target', prefix + '_landmark_weight' + ] + #names = ['label', 'bbox_weight'] + for stride in self.feat_stride: + for n in names: + k = "%s_stride%d" % (n, stride) + self.label_name.append(k) + if config.CASCADE > 0: + self.label_name.append('gt_boxes') + + # status variable for synchronization between get_data and get_label + self.cur = 0 + self.batch = 
None + self.data = None + self.label = None + # infer shape + feat_shape_list = [] + _data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]), + max([v[1] for v in config.SCALES])))] + _data_shape = dict(_data_shape) + for i in range(len(self.feat_stride)): + _, feat_shape, _ = self.feat_sym[i].infer_shape(**_data_shape) + feat_shape = [int(i) for i in feat_shape[0]] + feat_shape_list.append(feat_shape) + self.aa = AA(feat_shape_list) + + self._debug = False + self._debug_id = 0 + self._times = [0.0, 0.0, 0.0, 0.0] + + # get first batch to fill in provide_data and provide_label + self.reset() + self.get_batch() + + @property + def provide_data(self): + return [(k, v.shape) for k, v in zip(self.data_name, self.data)] + + @property + def provide_label(self): + return [(k, v.shape) for k, v in zip(self.label_name, self.label)] + + def reset(self): + self.cur = 0 + if self.shuffle: + np.random.shuffle(self.index) + + def iter_next(self): + return self.cur + self.batch_size <= self.size + + def next(self): + if self.iter_next(): + self.get_batch() + self.cur += self.batch_size + return mx.io.DataBatch(data=self.data, + label=self.label, + pad=self.getpad(), + index=self.getindex(), + provide_data=self.provide_data, + provide_label=self.provide_label) + else: + raise StopIteration + + def getindex(self): + return self.cur / self.batch_size + + def getpad(self): + if self.cur + self.batch_size > self.size: + return self.cur + self.batch_size - self.size + else: + return 0 + + def infer_shape(self, max_data_shape=None, max_label_shape=None): + """ Return maximum data and label shape for single gpu """ + if max_data_shape is None: + max_data_shape = [] + if max_label_shape is None: + max_label_shape = [] + max_shapes = dict(max_data_shape + max_label_shape) + input_batch_size = max_shapes['data'][0] + dummy_boxes = np.zeros((0, 5)) + dummy_info = [[max_shapes['data'][2], max_shapes['data'][3], 1.0]] + dummy_label = {'gt_boxes': dummy_boxes} + dummy_blur = 
np.zeros((0, )) + dummy_label['gt_blur'] = dummy_blur + + label_dict = {} + if config.HEAD_BOX: + head_label_dict = self.aa.assign_anchor_fpn(dummy_label, + dummy_info, + False, + prefix='head') + label_dict.update(head_label_dict) + + if config.FACE_LANDMARK: + dummy_landmarks = np.zeros((0, 5, 3)) + dummy_label['gt_landmarks'] = dummy_landmarks + face_label_dict = self.aa.assign_anchor_fpn(dummy_label, + dummy_info, + config.FACE_LANDMARK, + prefix='face') + label_dict.update(face_label_dict) + if config.CASCADE > 0: + label_dict['gt_boxes'] = np.zeros( + (0, config.TRAIN.MAX_BBOX_PER_IMAGE, 5), dtype=np.float32) + + label_list = [] + for k in self.label_name: + label_list.append(label_dict[k]) + label_shape = [(k, tuple([input_batch_size] + list(v.shape[1:]))) + for k, v in zip(self.label_name, label_list)] + return max_data_shape, label_shape + + def get_batch(self): + # slice roidb + cur_from = self.cur + cur_to = min(cur_from + self.batch_size, self.size) + assert cur_to == cur_from + self.batch_size + roidb = [self.roidb[self.index[i]] for i in range(cur_from, cur_to)] + + # decide multi device slice + work_load_list = self.work_load_list + ctx = self.ctx + if work_load_list is None: + work_load_list = [1] * len(ctx) + assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \ + "Invalid settings for work load. 
" + slices = _split_input_slice(self.batch_size, work_load_list) + + # get testing data for multigpu + data_list = [] + label_list = [] + for islice in slices: + iroidb = [roidb[i] for i in range(islice.start, islice.stop)] + data, label = get_crop_batch(iroidb) + data_list += data + label_list += label + #data_list.append(data) + #label_list.append(label) + + # pad data first and then assign anchor (read label) + #data_tensor = tensor_vstack([batch['data'] for batch in data_list]) + #for i_card in range(len(data_list)): + # data_list[i_card]['data'] = data_tensor[ + # i_card * config.TRAIN.BATCH_IMAGES:(1 + i_card) * config.TRAIN.BATCH_IMAGES] + + #iiddxx = 0 + select_stride = 0 + if config.RANDOM_FEAT_STRIDE: + select_stride = random.choice(config.RPN_FEAT_STRIDE) + + for data, label in zip(data_list, label_list): + data_shape = {k: v.shape for k, v in data.items()} + del data_shape['im_info'] + feat_shape_list = [] + for s in range(len(self.feat_stride)): + _, feat_shape, _ = self.feat_sym[s].infer_shape(**data_shape) + feat_shape = [int(i) for i in feat_shape[0]] + feat_shape_list.append(feat_shape) + im_info = data['im_info'] + gt_boxes = label['gt_boxes'] + gt_label = {'gt_boxes': gt_boxes} + if config.USE_BLUR: + gt_blur = label['gt_blur'] + gt_label['gt_blur'] = gt_blur + if self._debug: + img = data['data'].copy()[0].transpose( + (1, 2, 0))[:, :, ::-1].copy() + print('DEBUG SHAPE', data['data'].shape, + label['gt_boxes'].shape) + + box = label['gt_boxes'].copy()[0][0:4].astype(np.int) + cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), + (0, 255, 0), 2) + filename = './debugout/%d.png' % (self._debug_id) + print('debug write', filename) + cv2.imwrite(filename, img) + self._debug_id += 1 + #print('DEBUG', img.shape, bbox.shape) + label_dict = {} + if config.HEAD_BOX: + head_label_dict = self.aa.assign_anchor_fpn( + gt_label, + im_info, + False, + prefix='head', + select_stride=select_stride) + label_dict.update(head_label_dict) + if 
config.FACE_LANDMARK: + gt_landmarks = label['gt_landmarks'] + gt_label['gt_landmarks'] = gt_landmarks + #ta = datetime.datetime.now() + #face_label_dict = assign_anchor_fpn(feat_shape_list, gt_label, im_info, config.FACE_LANDMARK, prefix='face', select_stride = select_stride) + face_label_dict = self.aa.assign_anchor_fpn( + gt_label, + im_info, + config.FACE_LANDMARK, + prefix='face', + select_stride=select_stride) + #tb = datetime.datetime.now() + #self._times[0] += (tb-ta).total_seconds() + label_dict.update(face_label_dict) + #for k in label_dict: + # print(k, label_dict[k].shape) + + if config.CASCADE > 0: + pad_gt_boxes = np.empty( + (1, config.TRAIN.MAX_BBOX_PER_IMAGE, 5), dtype=np.float32) + pad_gt_boxes.fill(-1) + pad_gt_boxes[0, 0:gt_boxes.shape[0], :] = gt_boxes + label_dict['gt_boxes'] = pad_gt_boxes + #print('im_info', im_info.shape) + #print(gt_boxes.shape) + for k in self.label_name: + label[k] = label_dict[k] + + all_data = dict() + for key in self.data_name: + all_data[key] = tensor_vstack([batch[key] for batch in data_list]) + + all_label = dict() + for key in self.label_name: + pad = 0 if key.startswith('bbox_') else -1 + #print('label vstack', key, pad, len(label_list), file=sys.stderr) + all_label[key] = tensor_vstack( + [batch[key] for batch in label_list], pad=pad) + + self.data = [mx.nd.array(all_data[key]) for key in self.data_name] + self.label = [mx.nd.array(all_label[key]) for key in self.label_name] + #for _label in self.label: + # print('LABEL SHAPE', _label.shape) + #print(self._times) + + +class CropLoader2(mx.io.DataIter): + def __init__(self, + feat_sym, + roidb, + batch_size=1, + shuffle=False, + ctx=None, + work_load_list=None, + aspect_grouping=False): + """ + This Iter will provide roi data to Fast R-CNN network + :param feat_sym: to infer shape of assign_output + :param roidb: must be preprocessed + :param batch_size: must divide BATCH_SIZE(128) + :param shuffle: bool + :param ctx: list of contexts + :param work_load_list: 
list of work load + :param aspect_grouping: group images with similar aspects + :return: AnchorLoader + """ + super(CropLoader2, self).__init__() + + # save parameters as properties + self.feat_sym = feat_sym + self.roidb = roidb + self.batch_size = batch_size + self.shuffle = shuffle + self.ctx = ctx + if self.ctx is None: + self.ctx = [mx.cpu()] + self.work_load_list = work_load_list + #self.feat_stride = feat_stride + #self.anchor_scales = anchor_scales + #self.anchor_ratios = anchor_ratios + #self.allowed_border = allowed_border + self.aspect_grouping = aspect_grouping + self.feat_stride = config.RPN_FEAT_STRIDE + + # infer properties from roidb + self.size = len(roidb) + + # decide data and label names + #self.data_name = ['data'] + #self.label_name = [] + #self.label_name.append('label') + #self.label_name.append('bbox_target') + #self.label_name.append('bbox_weight') + + self.data_name = ['data'] + #self.label_name = ['label', 'bbox_target', 'bbox_weight'] + self.label_name = [] + prefixes = ['face'] + if config.HEAD_BOX: + prefixes.append('head') + names = [] + for prefix in prefixes: + names += [ + prefix + '_label', prefix + '_bbox_target', + prefix + '_bbox_weight' + ] + if prefix == 'face' and config.FACE_LANDMARK: + names += [ + prefix + '_landmark_target', prefix + '_landmark_weight' + ] + #names = ['label', 'bbox_weight'] + for stride in self.feat_stride: + for n in names: + k = "%s_stride%d" % (n, stride) + self.label_name.append(k) + # status variable for synchronization between get_data and get_label + self.cur = 0 + self.batch = None + self.data = None + self.label = None + + # get first batch to fill in provide_data and provide_label + self.reset() + self.q_in = [ + multiprocessing.Queue(1024) for i in range(config.NUM_CPU) + ] + #self.q_in = multiprocessing.Queue(1024) + self.q_out = multiprocessing.Queue(1024) + self.start() + self.get_batch() + + @property + def provide_data(self): + return [(k, v.shape) for k, v in zip(self.data_name, 
self.data)] + + @property + def provide_label(self): + return [(k, v.shape) for k, v in zip(self.label_name, self.label)] + + def reset(self): + pass + + @staticmethod + def input_worker(q_in, roidb, batch_size): + index = np.arange(len(roidb)) + np.random.shuffle(index) + cur_from = 0 + while True: + cur_to = cur_from + batch_size + if cur_to > len(roidb): + np.random.shuffle(index) + cur_from = 0 + continue + _roidb = [roidb[index[i]] for i in range(cur_from, cur_to)] + istart = index[cur_from] + q_in[istart % len(q_in)].put(_roidb) + cur_from = cur_to + + @staticmethod + def gen_worker(q_in, q_out): + while True: + deq = q_in.get() + if deq is None: + break + _roidb = deq + data, label = get_crop_batch(_roidb) + print('generated') + q_out.put((data, label)) + + def start(self): + input_process = multiprocessing.Process( + target=CropLoader2.input_worker, + args=(self.q_in, self.roidb, self.batch_size)) + #gen_process = multiprocessing.Process(target=gen_worker, args=(q_in, q_out)) + gen_process = [multiprocessing.Process(target=CropLoader2.gen_worker, args=(self.q_in[i], self.q_out)) \ + for i in range(config.NUM_CPU)] + input_process.start() + for p in gen_process: + p.start() + + def next(self): + self.get_batch() + return mx.io.DataBatch(data=self.data, + label=self.label, + provide_data=self.provide_data, + provide_label=self.provide_label) + + def infer_shape(self, max_data_shape=None, max_label_shape=None): + """ Return maximum data and label shape for single gpu """ + if max_data_shape is None: + max_data_shape = [] + if max_label_shape is None: + max_label_shape = [] + max_shapes = dict(max_data_shape + max_label_shape) + input_batch_size = max_shapes['data'][0] + dummy_boxes = np.zeros((0, 5)) + dummy_info = [[max_shapes['data'][2], max_shapes['data'][3], 1.0]] + dummy_label = {'gt_boxes': dummy_boxes} + + # infer shape + feat_shape_list = [] + for i in range(len(self.feat_stride)): + _, feat_shape, _ = self.feat_sym[i].infer_shape(**max_shapes) + 
feat_shape = [int(i) for i in feat_shape[0]] + feat_shape_list.append(feat_shape) + + label_dict = {} + if config.HEAD_BOX: + head_label_dict = assign_anchor_fpn(feat_shape_list, + dummy_label, + dummy_info, + False, + prefix='head') + label_dict.update(head_label_dict) + + if config.FACE_LANDMARK: + dummy_landmarks = np.zeros((0, 11)) + dummy_label['gt_landmarks'] = dummy_landmarks + face_label_dict = assign_anchor_fpn(feat_shape_list, + dummy_label, + dummy_info, + config.FACE_LANDMARK, + prefix='face') + label_dict.update(face_label_dict) + + label_list = [] + for k in self.label_name: + label_list.append(label_dict[k]) + label_shape = [(k, tuple([input_batch_size] + list(v.shape[1:]))) + for k, v in zip(self.label_name, label_list)] + return max_data_shape, label_shape + + def get_batch(self): + deq = self.q_out.get() + print('q_out got') + data_list, label_list = deq + + for data, label in zip(data_list, label_list): + data_shape = {k: v.shape for k, v in data.items()} + del data_shape['im_info'] + feat_shape_list = [] + for s in range(len(self.feat_stride)): + _, feat_shape, _ = self.feat_sym[s].infer_shape(**data_shape) + feat_shape = [int(i) for i in feat_shape[0]] + feat_shape_list.append(feat_shape) + #for k in self.label_name: + # label[k] = [0 for i in range(config.TRAIN.BATCH_IMAGES)] + im_info = data['im_info'] + gt_boxes = label['gt_boxes'] + gt_label = {'gt_boxes': gt_boxes} + label_dict = {} + head_label_dict = assign_anchor_fpn(feat_shape_list, + gt_label, + im_info, + False, + prefix='head') + label_dict.update(head_label_dict) + if config.FACE_LANDMARK: + gt_landmarks = label['gt_landmarks'] + gt_label['gt_landmarks'] = gt_landmarks + face_label_dict = assign_anchor_fpn(feat_shape_list, + gt_label, + im_info, + config.FACE_LANDMARK, + prefix='face') + label_dict.update(face_label_dict) + #print('im_info', im_info.shape) + #print(gt_boxes.shape) + for k in self.label_name: + label[k] = label_dict[k] + + all_data = dict() + for key in 
self.data_name: + all_data[key] = tensor_vstack([batch[key] for batch in data_list]) + + all_label = dict() + for key in self.label_name: + pad = 0 if key.startswith('bbox_') else -1 + #print('label vstack', key, pad, len(label_list), file=sys.stderr) + all_label[key] = tensor_vstack( + [batch[key] for batch in label_list], pad=pad) + self.data = [mx.nd.array(all_data[key]) for key in self.data_name] + self.label = [mx.nd.array(all_label[key]) for key in self.label_name] diff --git a/insightface/detection/retinaface/rcnn/core/metric.py b/insightface/detection/retinaface/rcnn/core/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..afdc92522b045d4bd8d44cddb19a279a45f644b4 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/core/metric.py @@ -0,0 +1,166 @@ +from __future__ import print_function +import sys +import mxnet as mx +import numpy as np + +from rcnn.config import config + + +def get_rpn_names(): + pred = ['rpn_cls_prob', 'rpn_bbox_loss', 'rpn_label', 'rpn_bbox_weight'] + label = ['rpn_label', 'rpn_bbox_target', 'rpn_bbox_weight'] + return pred, label + + +class RPNAccMetric(mx.metric.EvalMetric): + def __init__(self, pred_idx=-1, label_idx=-1, name='RPNAcc'): + super(RPNAccMetric, self).__init__(name) + self.pred, self.label = get_rpn_names() + #self.name = 'RPNAcc' + self.name = [name, name + '_BG', name + '_FG'] + self.pred_idx = pred_idx + self.label_idx = label_idx + self.STAT = [0, 0, 0] + + def reset(self): + """Clear the internal statistics to initial state.""" + if isinstance(self.name, str): + self.num_inst = 0 + self.sum_metric = 0.0 + else: + #print('reset to ',len(self.name), self.name, file=sys.stderr) + self.num_inst = [0] * len(self.name) + self.sum_metric = [0.0] * len(self.name) + + def get(self): + if isinstance(self.name, str): + if self.num_inst == 0: + return (self.name, float('nan')) + else: + return (self.name, self.sum_metric / self.num_inst) + else: + names = ['%s' % (self.name[i]) for i in 
range(len(self.name))] + values = [x / y if y != 0 else float('nan') \ + for x, y in zip(self.sum_metric, self.num_inst)] + return (names, values) + + def update(self, labels, preds): + if self.pred_idx >= 0 and self.label_idx >= 0: + pred = preds[self.pred_idx] + label = preds[self.label_idx] + else: + pred = preds[self.pred.index('rpn_cls_prob')] + label = labels[self.label.index('rpn_label')] + #label = preds[self.pred.index('rpn_label')] + + num_images = pred.shape[0] + #print(pred.shape, label.shape, file=sys.stderr) + # pred (b, c, p) or (b, c, h, w) + pred_label = mx.ndarray.argmax_channel(pred).asnumpy().astype('int32') + #pred_label = pred_label.reshape((pred_label.shape[0], -1)) + pred_label = pred_label.reshape(-1, ) + # label (b, p) + label = label.asnumpy().astype('int32').reshape(-1, ) + #print(pred_label.shape, label.shape) + + # filter with keep_inds + keep_inds = np.where(label != -1)[0] + #print('in_metric acc', pred_label.shape, label.shape, len(keep_inds), file=sys.stderr) + #print(keep_inds, file=sys.stderr) + _pred_label = pred_label[keep_inds] + _label = label[keep_inds] + #print('in_metric2', pred_label.shape, label.shape, len(keep_inds), file=sys.stderr) + if isinstance(self.name, str): + self.sum_metric += np.sum(_pred_label.flat == _label.flat) + self.num_inst += len(_pred_label.flat) + else: + self.sum_metric[0] += np.sum(_pred_label.flat == _label.flat) + self.num_inst[0] += len(_pred_label.flat) + + keep_inds = np.where(label == 0)[0] + _pred_label = pred_label[keep_inds] + _label = label[keep_inds] + self.sum_metric[1] += np.sum(_pred_label.flat == _label.flat) + self.num_inst[1] += len(_pred_label.flat) + + keep_inds = np.where(label == 1)[0] + _pred_label = pred_label[keep_inds] + _label = label[keep_inds] + a = np.sum(_pred_label.flat == _label.flat) + b = len(_pred_label.flat) + self.sum_metric[2] += a + self.num_inst[2] += b + + #self.STAT[0]+=a + #self.STAT[1]+=b + #self.STAT[2]+=num_images + #if self.STAT[2]%400==0: + # 
print('FG_ACC', self.pred_idx, self.STAT[2], self.STAT[0], self.STAT[1], float(self.STAT[0])/self.STAT[1], file=sys.stderr) + # self.STAT = [0,0,0] + + +class RPNLogLossMetric(mx.metric.EvalMetric): + def __init__(self, pred_idx=-1, label_idx=-1): + super(RPNLogLossMetric, self).__init__('RPNLogLoss') + self.pred, self.label = get_rpn_names() + self.pred_idx = pred_idx + self.label_idx = label_idx + + def update(self, labels, preds): + if self.pred_idx >= 0 and self.label_idx >= 0: + pred = preds[self.pred_idx] + label = preds[self.label_idx] + else: + pred = preds[self.pred.index('rpn_cls_prob')] + label = labels[self.label.index('rpn_label')] + #label = preds[self.pred.index('rpn_label')] + + # label (b, p) + label = label.asnumpy().astype('int32').reshape((-1)) + # pred (b, c, p) or (b, c, h, w) --> (b, p, c) --> (b*p, c) + pred = pred.asnumpy().reshape( + (pred.shape[0], pred.shape[1], -1)).transpose((0, 2, 1)) + pred = pred.reshape((label.shape[0], -1)) + + # filter with keep_inds + keep_inds = np.where(label != -1)[0] + label = label[keep_inds] + cls = pred[keep_inds, label] + #print('in_metric log', label.shape, cls.shape, file=sys.stderr) + + cls += 1e-14 + cls_loss = -1 * np.log(cls) + cls_loss = np.sum(cls_loss) + self.sum_metric += cls_loss + self.num_inst += label.shape[0] + + +class RPNL1LossMetric(mx.metric.EvalMetric): + def __init__(self, loss_idx=-1, weight_idx=-1, name='RPNL1Loss'): + super(RPNL1LossMetric, self).__init__(name) + self.pred, self.label = get_rpn_names() + self.loss_idx = loss_idx + self.weight_idx = weight_idx + self.name = name + + def update(self, labels, preds): + if self.loss_idx >= 0 and self.weight_idx >= 0: + bbox_loss = preds[self.loss_idx].asnumpy() + bbox_weight = preds[self.weight_idx].asnumpy() + else: + bbox_loss = preds[self.pred.index('rpn_bbox_loss')].asnumpy() + bbox_weight = labels[self.label.index('rpn_bbox_weight')].asnumpy() + #bbox_weight = preds[self.pred.index('rpn_bbox_weight')].asnumpy() + + 
#print('in_metric', self.name, bbox_weight.shape, bbox_loss.shape) + + # calculate num_inst (average on those fg anchors) + if config.LR_MODE == 0: + num_inst = np.sum(bbox_weight > 0) / (bbox_weight.shape[1] / + config.NUM_ANCHORS) + else: + num_inst = 1 + #print('in_metric log', bbox_loss.shape, num_inst, file=sys.stderr) + + self.sum_metric += np.sum(bbox_loss) + self.num_inst += num_inst diff --git a/insightface/detection/retinaface/rcnn/core/module.py b/insightface/detection/retinaface/rcnn/core/module.py new file mode 100644 index 0000000000000000000000000000000000000000..731a4ff1b40c8a1d132b053a3aaede86ec8aa19f --- /dev/null +++ b/insightface/detection/retinaface/rcnn/core/module.py @@ -0,0 +1,259 @@ +"""A `MutableModule` implement the `BaseModule` API, and allows input shape +varying with training iterations. If shapes vary, executors will rebind, +using shared arrays from the initial module binded with maximum shape. +""" + +import logging + +from mxnet import context as ctx +from mxnet.initializer import Uniform +from mxnet.module.base_module import BaseModule +from mxnet.module.module import Module + + +class MutableModule(BaseModule): + """A mutable module is a module that supports variable input data. 
+ + Parameters + ---------- + symbol : Symbol + data_names : list of str + label_names : list of str + logger : Logger + context : Context or list of Context + work_load_list : list of number + max_data_shapes : list of (name, shape) tuple, designating inputs whose shape vary + max_label_shapes : list of (name, shape) tuple, designating inputs whose shape vary + fixed_param_prefix : list of str, indicating fixed parameters + """ + def __init__(self, + symbol, + data_names, + label_names, + logger=logging, + context=ctx.cpu(), + work_load_list=None, + max_data_shapes=None, + max_label_shapes=None, + fixed_param_prefix=None): + super(MutableModule, self).__init__(logger=logger) + self._symbol = symbol + self._data_names = data_names + self._label_names = label_names + self._context = context + self._work_load_list = work_load_list + + self._curr_module = None + self._max_data_shapes = max_data_shapes + self._max_label_shapes = max_label_shapes + self._fixed_param_prefix = fixed_param_prefix + + fixed_param_names = list() + if fixed_param_prefix is not None: + for name in self._symbol.list_arguments(): + for prefix in self._fixed_param_prefix: + if prefix in name: + fixed_param_names.append(name) + self._fixed_param_names = fixed_param_names + + def _reset_bind(self): + self.binded = False + self._curr_module = None + + @property + def data_names(self): + return self._data_names + + @property + def output_names(self): + return self._symbol.list_outputs() + + @property + def data_shapes(self): + assert self.binded + return self._curr_module.data_shapes + + @property + def label_shapes(self): + assert self.binded + return self._curr_module.label_shapes + + @property + def output_shapes(self): + assert self.binded + return self._curr_module.output_shapes + + def get_params(self): + assert self.binded and self.params_initialized + return self._curr_module.get_params() + + def init_params(self, + initializer=Uniform(0.01), + arg_params=None, + aux_params=None, + 
allow_missing=False, + force_init=False, + allow_extra=False): + if self.params_initialized and not force_init: + return + assert self.binded, 'call bind before initializing the parameters' + self._curr_module.init_params(initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + allow_missing=allow_missing, + force_init=force_init, + allow_extra=allow_extra) + self.params_initialized = True + + def bind(self, + data_shapes, + label_shapes=None, + for_training=True, + inputs_need_grad=False, + force_rebind=False, + shared_module=None): + # in case we already initialized params, keep it + if self.params_initialized: + arg_params, aux_params = self.get_params() + + # force rebinding is typically used when one want to switch from + # training to prediction phase. + if force_rebind: + self._reset_bind() + + if self.binded: + self.logger.warning('Already binded, ignoring bind()') + return + + assert shared_module is None, 'shared_module for MutableModule is not supported' + + self.for_training = for_training + self.inputs_need_grad = inputs_need_grad + self.binded = True + + max_shapes_dict = dict() + if self._max_data_shapes is not None: + max_shapes_dict.update(dict(self._max_data_shapes)) + if self._max_label_shapes is not None: + max_shapes_dict.update(dict(self._max_label_shapes)) + + max_data_shapes = list() + for name, shape in data_shapes: + if name in max_shapes_dict: + max_data_shapes.append((name, max_shapes_dict[name])) + else: + max_data_shapes.append((name, shape)) + + max_label_shapes = list() + if label_shapes is not None: + for name, shape in label_shapes: + if name in max_shapes_dict: + max_label_shapes.append((name, max_shapes_dict[name])) + else: + max_label_shapes.append((name, shape)) + + if len(max_label_shapes) == 0: + max_label_shapes = None + + module = Module(self._symbol, + self._data_names, + self._label_names, + logger=self.logger, + context=self._context, + work_load_list=self._work_load_list, + 
fixed_param_names=self._fixed_param_names) + module.bind(max_data_shapes, + max_label_shapes, + for_training, + inputs_need_grad, + force_rebind=False, + shared_module=None) + self._curr_module = module + + # copy back saved params, if already initialized + if self.params_initialized: + self.set_params(arg_params, aux_params) + + def init_optimizer(self, + kvstore='local', + optimizer='sgd', + optimizer_params=(('learning_rate', 0.01), ), + force_init=False): + assert self.binded and self.params_initialized + if self.optimizer_initialized and not force_init: + self.logger.warning('optimizer already initialized, ignoring.') + return + + self._curr_module.init_optimizer(kvstore, + optimizer, + optimizer_params, + force_init=force_init) + self.optimizer_initialized = True + + def forward(self, data_batch, is_train=None): + assert self.binded and self.params_initialized + + # get current_shapes + if self._curr_module.label_shapes is not None: + current_shapes = dict(self._curr_module.data_shapes + + self._curr_module.label_shapes) + else: + current_shapes = dict(self._curr_module.data_shapes) + + # get input_shapes + if data_batch.provide_label is not None: + input_shapes = dict(data_batch.provide_data + + data_batch.provide_label) + else: + input_shapes = dict(data_batch.provide_data) + + # decide if shape changed + shape_changed = False + for k, v in current_shapes.items(): + if v != input_shapes[k]: + shape_changed = True + + if shape_changed: + module = Module(self._symbol, + self._data_names, + self._label_names, + logger=self.logger, + context=self._context, + work_load_list=self._work_load_list, + fixed_param_names=self._fixed_param_names) + module.bind(data_batch.provide_data, + data_batch.provide_label, + self._curr_module.for_training, + self._curr_module.inputs_need_grad, + force_rebind=False, + shared_module=self._curr_module) + self._curr_module = module + + self._curr_module.forward(data_batch, is_train=is_train) + + def backward(self, out_grads=None): + 
assert self.binded and self.params_initialized + self._curr_module.backward(out_grads=out_grads) + + def update(self): + assert self.binded and self.params_initialized and self.optimizer_initialized + self._curr_module.update() + + def get_outputs(self, merge_multi_context=True): + assert self.binded and self.params_initialized + return self._curr_module.get_outputs( + merge_multi_context=merge_multi_context) + + def get_input_grads(self, merge_multi_context=True): + assert self.binded and self.params_initialized and self.inputs_need_grad + return self._curr_module.get_input_grads( + merge_multi_context=merge_multi_context) + + def update_metric(self, eval_metric, labels): + assert self.binded and self.params_initialized + self._curr_module.update_metric(eval_metric, labels) + + def install_monitor(self, mon): + """ Install monitor on all executors """ + assert self.binded + self._curr_module.install_monitor(mon) diff --git a/insightface/detection/retinaface/rcnn/core/module_bak.py b/insightface/detection/retinaface/rcnn/core/module_bak.py new file mode 100644 index 0000000000000000000000000000000000000000..2c819025100e4f0c25acf0ca7d9bd68de143483c --- /dev/null +++ b/insightface/detection/retinaface/rcnn/core/module_bak.py @@ -0,0 +1,260 @@ +"""A `MutableModule` implement the `BaseModule` API, and allows input shape +varying with training iterations. If shapes vary, executors will rebind, +using shared arrays from the initial module binded with maximum shape. +""" + +import logging + +from mxnet import context as ctx +from mxnet.initializer import Uniform +from mxnet.module.base_module import BaseModule +from mxnet.module.module import Module + + +class MutableModule(BaseModule): + """A mutable module is a module that supports variable input data. 
+ + Parameters + ---------- + symbol : Symbol + data_names : list of str + label_names : list of str + logger : Logger + context : Context or list of Context + work_load_list : list of number + max_data_shapes : list of (name, shape) tuple, designating inputs whose shape vary + max_label_shapes : list of (name, shape) tuple, designating inputs whose shape vary + fixed_param_prefix : list of str, indicating fixed parameters + """ + def __init__(self, + symbol, + data_names, + label_names, + logger=logging, + context=ctx.cpu(), + work_load_list=None, + max_data_shapes=None, + max_label_shapes=None, + fixed_param_prefix=None): + super(MutableModule, self).__init__(logger=logger) + self._symbol = symbol + self._data_names = data_names + self._label_names = label_names + self._context = context + self._work_load_list = work_load_list + + self._curr_module = None + self._max_data_shapes = max_data_shapes + self._max_label_shapes = max_label_shapes + self._fixed_param_prefix = fixed_param_prefix + + fixed_param_names = list() + if fixed_param_prefix is not None: + for name in self._symbol.list_arguments(): + for prefix in self._fixed_param_prefix: + if prefix in name: + fixed_param_names.append(name) + self._fixed_param_names = fixed_param_names + + def _reset_bind(self): + self.binded = False + self._curr_module = None + + @property + def data_names(self): + return self._data_names + + @property + def output_names(self): + return self._symbol.list_outputs() + + @property + def data_shapes(self): + assert self.binded + return self._curr_module.data_shapes + + @property + def label_shapes(self): + assert self.binded + return self._curr_module.label_shapes + + @property + def output_shapes(self): + assert self.binded + return self._curr_module.output_shapes + + def get_params(self): + assert self.binded and self.params_initialized + return self._curr_module.get_params() + + def init_params(self, + initializer=Uniform(0.01), + arg_params=None, + aux_params=None, + 
allow_missing=False, + force_init=False, + allow_extra=False): + if self.params_initialized and not force_init: + return + assert self.binded, 'call bind before initializing the parameters' + self._curr_module.init_params(initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + allow_missing=allow_missing, + force_init=force_init, + allow_extra=allow_extra) + self.params_initialized = True + + def bind(self, + data_shapes, + label_shapes=None, + for_training=True, + inputs_need_grad=False, + force_rebind=False, + shared_module=None, + grad_req='write'): + # in case we already initialized params, keep it + if self.params_initialized: + arg_params, aux_params = self.get_params() + + # force rebinding is typically used when one want to switch from + # training to prediction phase. + if force_rebind: + self._reset_bind() + + if self.binded: + self.logger.warning('Already binded, ignoring bind()') + return + + assert shared_module is None, 'shared_module for MutableModule is not supported' + + self.for_training = for_training + self.inputs_need_grad = inputs_need_grad + self.binded = True + + max_shapes_dict = dict() + if self._max_data_shapes is not None: + max_shapes_dict.update(dict(self._max_data_shapes)) + if self._max_label_shapes is not None: + max_shapes_dict.update(dict(self._max_label_shapes)) + + max_data_shapes = list() + for name, shape in data_shapes: + if name in max_shapes_dict: + max_data_shapes.append((name, max_shapes_dict[name])) + else: + max_data_shapes.append((name, shape)) + + max_label_shapes = list() + if label_shapes is not None: + for name, shape in label_shapes: + if name in max_shapes_dict: + max_label_shapes.append((name, max_shapes_dict[name])) + else: + max_label_shapes.append((name, shape)) + + if len(max_label_shapes) == 0: + max_label_shapes = None + + module = Module(self._symbol, + self._data_names, + self._label_names, + logger=self.logger, + context=self._context, + work_load_list=self._work_load_list, + 
fixed_param_names=self._fixed_param_names) + module.bind(max_data_shapes, + max_label_shapes, + for_training, + inputs_need_grad, + force_rebind=False, + shared_module=None) + self._curr_module = module + + # copy back saved params, if already initialized + if self.params_initialized: + self.set_params(arg_params, aux_params) + + def init_optimizer(self, + kvstore='local', + optimizer='sgd', + optimizer_params=(('learning_rate', 0.01), ), + force_init=False): + assert self.binded and self.params_initialized + if self.optimizer_initialized and not force_init: + self.logger.warning('optimizer already initialized, ignoring.') + return + + self._curr_module.init_optimizer(kvstore, + optimizer, + optimizer_params, + force_init=force_init) + self.optimizer_initialized = True + + def forward(self, data_batch, is_train=None): + assert self.binded and self.params_initialized + + # get current_shapes + if self._curr_module.label_shapes is not None: + current_shapes = dict(self._curr_module.data_shapes + + self._curr_module.label_shapes) + else: + current_shapes = dict(self._curr_module.data_shapes) + + # get input_shapes + if data_batch.provide_label is not None: + input_shapes = dict(data_batch.provide_data + + data_batch.provide_label) + else: + input_shapes = dict(data_batch.provide_data) + + # decide if shape changed + shape_changed = False + for k, v in current_shapes.items(): + if v != input_shapes[k]: + shape_changed = True + + if shape_changed: + module = Module(self._symbol, + self._data_names, + self._label_names, + logger=self.logger, + context=self._context, + work_load_list=self._work_load_list, + fixed_param_names=self._fixed_param_names) + module.bind(data_batch.provide_data, + data_batch.provide_label, + self._curr_module.for_training, + self._curr_module.inputs_need_grad, + force_rebind=False, + shared_module=self._curr_module) + self._curr_module = module + + self._curr_module.forward(data_batch, is_train=is_train) + + def backward(self, out_grads=None): + 
assert self.binded and self.params_initialized + self._curr_module.backward(out_grads=out_grads) + + def update(self): + assert self.binded and self.params_initialized and self.optimizer_initialized + self._curr_module.update() + + def get_outputs(self, merge_multi_context=True): + assert self.binded and self.params_initialized + return self._curr_module.get_outputs( + merge_multi_context=merge_multi_context) + + def get_input_grads(self, merge_multi_context=True): + assert self.binded and self.params_initialized and self.inputs_need_grad + return self._curr_module.get_input_grads( + merge_multi_context=merge_multi_context) + + def update_metric(self, eval_metric, labels): + assert self.binded and self.params_initialized + self._curr_module.update_metric(eval_metric, labels) + + def install_monitor(self, mon): + """ Install monitor on all executors """ + assert self.binded + self._curr_module.install_monitor(mon) diff --git a/insightface/detection/retinaface/rcnn/core/tester.py b/insightface/detection/retinaface/rcnn/core/tester.py new file mode 100644 index 0000000000000000000000000000000000000000..d9d5d32b1c7fdec0f1ad4da3d84f53f627b8bf8a --- /dev/null +++ b/insightface/detection/retinaface/rcnn/core/tester.py @@ -0,0 +1,527 @@ +from __future__ import print_function +try: + import cPickle as pickle +except ImportError: + import pickle +import os +import sys +import time +import mxnet as mx +import numpy as np +from builtins import range + +from mxnet.module import Module +from .module import MutableModule +from rcnn.logger import logger +from rcnn.config import config +from rcnn.io import image +from rcnn.processing.bbox_transform import bbox_pred, clip_boxes +from rcnn.processing.nms import py_nms_wrapper, cpu_nms_wrapper, gpu_nms_wrapper +from rcnn.processing.bbox_transform import bbox_overlaps + + +def IOU(Reframe, GTframe): + x1 = Reframe[0] + y1 = Reframe[1] + width1 = Reframe[2] - Reframe[0] + height1 = Reframe[3] - Reframe[1] + + x2 = GTframe[0] + y2 = 
GTframe[1] + width2 = GTframe[2] - GTframe[0] + height2 = GTframe[3] - GTframe[1] + + endx = max(x1 + width1, x2 + width2) + startx = min(x1, x2) + width = width1 + width2 - (endx - startx) + + endy = max(y1 + height1, y2 + height2) + starty = min(y1, y2) + height = height1 + height2 - (endy - starty) + + if width <= 0 or height <= 0: + ratio = 0 + else: + Area = width * height + Area1 = width1 * height1 + Area2 = width2 * height2 + ratio = Area * 1. / (Area1 + Area2 - Area) + return ratio + + +class Predictor(object): + def __init__(self, + symbol, + data_names, + label_names, + context=mx.cpu(), + max_data_shapes=None, + provide_data=None, + provide_label=None, + arg_params=None, + aux_params=None): + #self._mod = MutableModule(symbol, data_names, label_names, + # context=context, max_data_shapes=max_data_shapes) + self._mod = Module(symbol, data_names, label_names, context=context) + self._mod.bind(provide_data, provide_label, for_training=False) + self._mod.init_params(arg_params=arg_params, aux_params=aux_params) + + def predict(self, data_batch): + self._mod.forward(data_batch) + return dict(zip(self._mod.output_names, + self._mod.get_outputs())) #TODO + #return self._mod.get_outputs() + + +def im_proposal(predictor, data_batch, data_names, scale): + data_dict = dict(zip(data_names, data_batch.data)) + output = predictor.predict(data_batch) + + # drop the batch index + boxes = output['rois_output'].asnumpy()[:, 1:] + scores = output['rois_score'].asnumpy() + + # transform to original scale + boxes = boxes / scale + + return scores, boxes, data_dict + + +def _im_proposal(predictor, data_batch, data_names, scale): + data_dict = dict(zip(data_names, data_batch.data)) + output = predictor.predict(data_batch) + print('output', output) + + # drop the batch index + boxes = output['rois_output'].asnumpy()[:, 1:] + scores = output['rois_score'].asnumpy() + + # transform to original scale + boxes = boxes / scale + + return scores, boxes, data_dict + + +def 
generate_proposals(predictor, test_data, imdb, vis=False, thresh=0.): + """ + Generate detections results using RPN. + :param predictor: Predictor + :param test_data: data iterator, must be non-shuffled + :param imdb: image database + :param vis: controls visualization + :param thresh: thresh for valid detections + :return: list of detected boxes + """ + assert vis or not test_data.shuffle + data_names = [k[0] for k in test_data.provide_data] + + i = 0 + t = time.time() + imdb_boxes = list() + original_boxes = list() + for im_info, data_batch in test_data: + t1 = time.time() - t + t = time.time() + + scale = im_info[0, 2] + scores, boxes, data_dict = im_proposal(predictor, data_batch, + data_names, scale) + print(scores.shape, boxes.shape, file=sys.stderr) + t2 = time.time() - t + t = time.time() + + # assemble proposals + dets = np.hstack((boxes, scores)) + original_boxes.append(dets) + + # filter proposals + keep = np.where(dets[:, 4:] > thresh)[0] + dets = dets[keep, :] + imdb_boxes.append(dets) + + if vis: + vis_all_detection(data_dict['data'].asnumpy(), [dets], ['obj'], + scale) + + logger.info('generating %d/%d ' % (i + 1, imdb.num_images) + + 'proposal %d ' % (dets.shape[0]) + 'data %.4fs net %.4fs' % + (t1, t2)) + i += 1 + + assert len(imdb_boxes) == imdb.num_images, 'calculations not complete' + + # save results + rpn_folder = os.path.join(imdb.root_path, 'rpn_data') + if not os.path.exists(rpn_folder): + os.mkdir(rpn_folder) + + rpn_file = os.path.join(rpn_folder, imdb.name + '_rpn.pkl') + with open(rpn_file, 'wb') as f: + pickle.dump(imdb_boxes, f, pickle.HIGHEST_PROTOCOL) + + if thresh > 0: + full_rpn_file = os.path.join(rpn_folder, imdb.name + '_full_rpn.pkl') + with open(full_rpn_file, 'wb') as f: + pickle.dump(original_boxes, f, pickle.HIGHEST_PROTOCOL) + + logger.info('wrote rpn proposals to %s' % rpn_file) + return imdb_boxes + + +def test_proposals(predictor, test_data, imdb, roidb, vis=False): + """ + Test detections results using RPN. 
+ :param predictor: Predictor + :param test_data: data iterator, must be non-shuffled + :param imdb: image database + :param roidb: roidb + :param vis: controls visualization + :return: recall, mAP + """ + assert vis or not test_data.shuffle + data_names = [k[0] for k in test_data.provide_data] + + #bbox_file = os.path.join(rpn_folder, imdb.name + '_bbox.txt') + #bbox_f = open(bbox_file, 'w') + + i = 0 + t = time.time() + output_folder = os.path.join(imdb.root_path, 'output') + if not os.path.exists(output_folder): + os.mkdir(output_folder) + imdb_boxes = list() + original_boxes = list() + gt_overlaps = np.zeros(0) + overall = [0.0, 0.0] + gt_max = np.array((0.0, 0.0)) + num_pos = 0 + #apply scale, for SSH + #_, roidb = image.get_image(roidb) + for im_info, data_batch in test_data: + t1 = time.time() - t + t = time.time() + + oscale = im_info[0, 2] + #print('scale', scale, file=sys.stderr) + scale = 1.0 #fix scale=1.0 for SSH face detector + scores, boxes, data_dict = im_proposal(predictor, data_batch, + data_names, scale) + #print(scores.shape, boxes.shape, file=sys.stderr) + t2 = time.time() - t + t = time.time() + + # assemble proposals + dets = np.hstack((boxes, scores)) + original_boxes.append(dets) + + # filter proposals + keep = np.where(dets[:, 4:] > config.TEST.SCORE_THRESH)[0] + dets = dets[keep, :] + imdb_boxes.append(dets) + + logger.info('generating %d/%d ' % (i + 1, imdb.num_images) + + 'proposal %d ' % (dets.shape[0]) + 'data %.4fs net %.4fs' % + (t1, t2)) + + #if dets.shape[0]==0: + # continue + if vis: + vis_all_detection(data_dict['data'].asnumpy(), [dets], ['obj'], + scale) + boxes = dets + #max_gt_overlaps = roidb[i]['gt_overlaps'].max(axis=1) + #gt_inds = np.where((roidb[i]['gt_classes'] > 0) & (max_gt_overlaps == 1))[0] + #gt_boxes = roidb[i]['boxes'][gt_inds, :] + gt_boxes = roidb[i]['boxes'].copy( + ) * oscale # as roidb is the original one, need to scale GT for SSH + gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * (gt_boxes[:, 3] - + 
gt_boxes[:, 1] + 1) + num_pos += gt_boxes.shape[0] + + overlaps = bbox_overlaps(boxes.astype(np.float), + gt_boxes.astype(np.float)) + #print(im_info, gt_boxes.shape, boxes.shape, overlaps.shape, file=sys.stderr) + + _gt_overlaps = np.zeros((gt_boxes.shape[0])) + # choose whatever is smaller to iterate + + #for j in range(gt_boxes.shape[0]): + # print('gt %d,%d,%d,%d'% (gt_boxes[j][0], gt_boxes[j][1], gt_boxes[j][2]-gt_boxes[j][0], gt_boxes[j][3]-gt_boxes[j][1]), file=sys.stderr) + # gt_max = np.maximum( gt_max, np.array( (gt_boxes[j][2], gt_boxes[j][3]) ) ) + #print('gt max', gt_max, file=sys.stderr) + #for j in range(boxes.shape[0]): + # print('anchor_box %.2f,%.2f,%.2f,%.2f'% (boxes[j][0], boxes[j][1], boxes[j][2]-boxes[j][0], boxes[j][3]-boxes[j][1]), file=sys.stderr) + + #rounds = min(boxes.shape[0], gt_boxes.shape[0]) + #for j in range(rounds): + # # find which proposal maximally covers each gt box + # argmax_overlaps = overlaps.argmax(axis=0) + # print(j, 'argmax_overlaps', argmax_overlaps, file=sys.stderr) + # # get the IoU amount of coverage for each gt box + # max_overlaps = overlaps.max(axis=0) + # print(j, 'max_overlaps', max_overlaps, file=sys.stderr) + # # find which gt box is covered by most IoU + # gt_ind = max_overlaps.argmax() + # gt_ovr = max_overlaps.max() + # assert (gt_ovr >= 0), '%s\n%s\n%s' % (boxes, gt_boxes, overlaps) + # # find the proposal box that covers the best covered gt box + # box_ind = argmax_overlaps[gt_ind] + # print('max box', gt_ind, box_ind, (boxes[box_ind][0], boxes[box_ind][1], boxes[box_ind][2]-boxes[box_ind][0], boxes[box_ind][3]-boxes[box_ind][1], boxes[box_ind][4]), file=sys.stderr) + # # record the IoU coverage of this gt box + # _gt_overlaps[j] = overlaps[box_ind, gt_ind] + # assert (_gt_overlaps[j] == gt_ovr) + # # mark the proposal box and the gt box as used + # overlaps[box_ind, :] = -1 + # overlaps[:, gt_ind] = -1 + + if boxes.shape[0] > 0: + _gt_overlaps = overlaps.max(axis=0) + #print('max_overlaps', 
_gt_overlaps, file=sys.stderr) + for j in range(len(_gt_overlaps)): + if _gt_overlaps[j] > config.TEST.IOU_THRESH: + continue + print(j, + 'failed', + gt_boxes[j], + 'max_overlap:', + _gt_overlaps[j], + file=sys.stderr) + #_idx = np.where(overlaps[:,j]>0.4)[0] + #print(j, _idx, file=sys.stderr) + #print(overlaps[_idx,j], file=sys.stderr) + #for __idx in _idx: + # print(gt_boxes[j], boxes[__idx], overlaps[__idx,j], IOU(gt_boxes[j], boxes[__idx,0:4]), file=sys.stderr) + + # append recorded IoU coverage level + found = (_gt_overlaps > config.TEST.IOU_THRESH).sum() + _recall = found / float(gt_boxes.shape[0]) + print('recall', + _recall, + gt_boxes.shape[0], + boxes.shape[0], + gt_areas, + file=sys.stderr) + overall[0] += found + overall[1] += gt_boxes.shape[0] + #gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) + #_recall = (gt_overlaps >= threshold).sum() / float(num_pos) + _recall = float(overall[0]) / overall[1] + print('recall_all', _recall, file=sys.stderr) + + boxes[:, 0:4] /= oscale + _vec = roidb[i]['image'].split('/') + out_dir = os.path.join(output_folder, _vec[-2]) + if not os.path.exists(out_dir): + os.mkdir(out_dir) + out_file = os.path.join(out_dir, _vec[-1].replace('jpg', 'txt')) + with open(out_file, 'w') as f: + name = '/'.join(roidb[i]['image'].split('/')[-2:]) + f.write("%s\n" % (name)) + f.write("%d\n" % (boxes.shape[0])) + for b in range(boxes.shape[0]): + box = boxes[b] + f.write( + "%d %d %d %d %g \n" % + (box[0], box[1], box[2] - box[0], box[3] - box[1], box[4])) + i += 1 + + #bbox_f.close() + return + gt_overlaps = np.sort(gt_overlaps) + recalls = np.zeros_like(thresholds) + + # compute recall for each IoU threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) + ar = recalls.mean() + + # print results + print('average recall for {}: {:.3f}'.format(area_name, ar)) + for threshold, recall in zip(thresholds, recalls): + print('recall @{:.2f}: {:.3f}'.format(threshold, recall)) + + assert 
len(imdb_boxes) == imdb.num_images, 'calculations not complete' + + # save results + + rpn_file = os.path.join(rpn_folder, imdb.name + '_rpn.pkl') + with open(rpn_file, 'wb') as f: + pickle.dump(imdb_boxes, f, pickle.HIGHEST_PROTOCOL) + + logger.info('wrote rpn proposals to %s' % rpn_file) + return imdb_boxes + + +def im_detect(predictor, data_batch, data_names, scale): + output = predictor.predict(data_batch) + + data_dict = dict(zip(data_names, data_batch.data)) + if config.TEST.HAS_RPN: + rois = output['rois_output'].asnumpy()[:, 1:] + else: + rois = data_dict['rois'].asnumpy().reshape((-1, 5))[:, 1:] + im_shape = data_dict['data'].shape + + # save output + scores = output['cls_prob_reshape_output'].asnumpy()[0] + bbox_deltas = output['bbox_pred_reshape_output'].asnumpy()[0] + + # post processing + pred_boxes = bbox_pred(rois, bbox_deltas) + pred_boxes = clip_boxes(pred_boxes, im_shape[-2:]) + + # we used scaled image & roi to train, so it is necessary to transform them back + pred_boxes = pred_boxes / scale + + return scores, pred_boxes, data_dict + + +def pred_eval(predictor, test_data, imdb, vis=False, thresh=1e-3): + """ + wrapper for calculating offline validation for faster data analysis + in this example, all threshold are set by hand + :param predictor: Predictor + :param test_data: data iterator, must be non-shuffle + :param imdb: image database + :param vis: controls visualization + :param thresh: valid detection threshold + :return: + """ + assert vis or not test_data.shuffle + data_names = [k[0] for k in test_data.provide_data] + + nms = py_nms_wrapper(config.TEST.NMS) + + # limit detections to max_per_image over all classes + max_per_image = -1 + + num_images = imdb.num_images + # all detections are collected into: + # all_boxes[cls][image] = N x 5 array of detections in + # (x1, y1, x2, y2, score) + all_boxes = [[[] for _ in range(num_images)] + for _ in range(imdb.num_classes)] + + i = 0 + t = time.time() + for im_info, data_batch in test_data: + 
t1 = time.time() - t + t = time.time() + + scale = im_info[0, 2] + scores, boxes, data_dict = im_detect(predictor, data_batch, data_names, + scale) + + t2 = time.time() - t + t = time.time() + + for j in range(1, imdb.num_classes): + indexes = np.where(scores[:, j] > thresh)[0] + cls_scores = scores[indexes, j, np.newaxis] + cls_boxes = boxes[indexes, j * 4:(j + 1) * 4] + cls_dets = np.hstack((cls_boxes, cls_scores)) + keep = nms(cls_dets) + all_boxes[j][i] = cls_dets[keep, :] + + if max_per_image > 0: + image_scores = np.hstack( + [all_boxes[j][i][:, -1] for j in range(1, imdb.num_classes)]) + if len(image_scores) > max_per_image: + image_thresh = np.sort(image_scores)[-max_per_image] + for j in range(1, imdb.num_classes): + keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] + all_boxes[j][i] = all_boxes[j][i][keep, :] + + if vis: + boxes_this_image = [[]] + [ + all_boxes[j][i] for j in range(1, imdb.num_classes) + ] + vis_all_detection(data_dict['data'].asnumpy(), boxes_this_image, + imdb.classes, scale) + + t3 = time.time() - t + t = time.time() + logger.info('testing %d/%d data %.4fs net %.4fs post %.4fs' % + (i, imdb.num_images, t1, t2, t3)) + i += 1 + + det_file = os.path.join(imdb.cache_path, imdb.name + '_detections.pkl') + with open(det_file, 'wb') as f: + pickle.dump(all_boxes, f, protocol=pickle.HIGHEST_PROTOCOL) + + imdb.evaluate_detections(all_boxes) + + +def vis_all_detection(im_array, detections, class_names, scale): + """ + visualize all detections in one image + :param im_array: [b=1 c h w] in rgb + :param detections: [ numpy.ndarray([[x1 y1 x2 y2 score]]) for j in classes ] + :param class_names: list of names in imdb + :param scale: visualize the scaled image + :return: + """ + import matplotlib.pyplot as plt + import random + im = image.transform_inverse(im_array, config.PIXEL_MEANS) + plt.imshow(im) + for j, name in enumerate(class_names): + if name == '__background__': + continue + color = (random.random(), random.random(), 
random.random() + ) # generate a random color + dets = detections[j] + for det in dets: + bbox = det[:4] * scale + score = det[-1] + rect = plt.Rectangle((bbox[0], bbox[1]), + bbox[2] - bbox[0], + bbox[3] - bbox[1], + fill=False, + edgecolor=color, + linewidth=3.5) + plt.gca().add_patch(rect) + plt.gca().text(bbox[0], + bbox[1] - 2, + '{:s} {:.3f}'.format(name, score), + bbox=dict(facecolor=color, alpha=0.5), + fontsize=12, + color='white') + plt.show() + + +def draw_all_detection(im_array, detections, class_names, scale): + """ + visualize all detections in one image + :param im_array: [b=1 c h w] in rgb + :param detections: [ numpy.ndarray([[x1 y1 x2 y2 score]]) for j in classes ] + :param class_names: list of names in imdb + :param scale: visualize the scaled image + :return: + """ + import cv2 + import random + color_white = (255, 255, 255) + im = image.transform_inverse(im_array, config.PIXEL_MEANS) + # change to bgr + im = cv2.cvtColor(im, cv2.cv.CV_RGB2BGR) + for j, name in enumerate(class_names): + if name == '__background__': + continue + color = (random.randint(0, 256), random.randint(0, 256), + random.randint(0, 256)) # generate a random color + dets = detections[j] + for det in dets: + bbox = det[:4] * scale + score = det[-1] + bbox = map(int, bbox) + cv2.rectangle(im, (bbox[0], bbox[1]), (bbox[2], bbox[3]), + color=color, + thickness=2) + cv2.putText(im, + '%s %.3f' % (class_names[j], score), + (bbox[0], bbox[1] + 10), + color=color_white, + fontFace=cv2.FONT_HERSHEY_COMPLEX, + fontScale=0.5) + return im diff --git a/insightface/detection/retinaface/rcnn/cython/.gitignore b/insightface/detection/retinaface/rcnn/cython/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..15a165d427164752e6ca66d787cf8dbf21a87cd5 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/cython/.gitignore @@ -0,0 +1,3 @@ +*.c +*.cpp +*.so diff --git a/insightface/detection/retinaface/rcnn/cython/__init__.py 
b/insightface/detection/retinaface/rcnn/cython/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/detection/retinaface/rcnn/cython/anchors.pyx b/insightface/detection/retinaface/rcnn/cython/anchors.pyx new file mode 100755 index 0000000000000000000000000000000000000000..7005199125c8c82a59d662cdebcfe8c0117e3ffd --- /dev/null +++ b/insightface/detection/retinaface/rcnn/cython/anchors.pyx @@ -0,0 +1,35 @@ +cimport cython +import numpy as np +cimport numpy as np + +DTYPE = np.float32 +ctypedef np.float32_t DTYPE_t + +def anchors_cython(int height, int width, int stride, np.ndarray[DTYPE_t, ndim=2] base_anchors): + """ + Parameters + ---------- + height: height of plane + width: width of plane + stride: stride ot the original image + anchors_base: (A, 4) a base set of anchors + Returns + ------- + all_anchors: (height, width, A, 4) ndarray of anchors spreading over the plane + """ + cdef unsigned int A = base_anchors.shape[0] + cdef np.ndarray[DTYPE_t, ndim=4] all_anchors = np.zeros((height, width, A, 4), dtype=DTYPE) + cdef unsigned int iw, ih + cdef unsigned int k + cdef unsigned int sh + cdef unsigned int sw + for iw in range(width): + sw = iw * stride + for ih in range(height): + sh = ih * stride + for k in range(A): + all_anchors[ih, iw, k, 0] = base_anchors[k, 0] + sw + all_anchors[ih, iw, k, 1] = base_anchors[k, 1] + sh + all_anchors[ih, iw, k, 2] = base_anchors[k, 2] + sw + all_anchors[ih, iw, k, 3] = base_anchors[k, 3] + sh + return all_anchors \ No newline at end of file diff --git a/insightface/detection/retinaface/rcnn/cython/bbox.pyx b/insightface/detection/retinaface/rcnn/cython/bbox.pyx new file mode 100644 index 0000000000000000000000000000000000000000..0c49e120e5ab177e53c318709f23f06372367911 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/cython/bbox.pyx @@ -0,0 +1,55 @@ +# -------------------------------------------------------- +# Fast R-CNN +# 
Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Sergey Karayev +# -------------------------------------------------------- + +cimport cython +import numpy as np +cimport numpy as np + +DTYPE = np.float +ctypedef np.float_t DTYPE_t + +def bbox_overlaps_cython( + np.ndarray[DTYPE_t, ndim=2] boxes, + np.ndarray[DTYPE_t, ndim=2] query_boxes): + """ + Parameters + ---------- + boxes: (N, 4) ndarray of float + query_boxes: (K, 4) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + cdef unsigned int N = boxes.shape[0] + cdef unsigned int K = query_boxes.shape[0] + cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) + cdef DTYPE_t iw, ih, box_area + cdef DTYPE_t ua + cdef unsigned int k, n + for k in range(K): + box_area = ( + (query_boxes[k, 2] - query_boxes[k, 0] + 1) * + (query_boxes[k, 3] - query_boxes[k, 1] + 1) + ) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + 1 + ) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + 1 + ) + if ih > 0: + ua = float( + (boxes[n, 2] - boxes[n, 0] + 1) * + (boxes[n, 3] - boxes[n, 1] + 1) + + box_area - iw * ih + ) + overlaps[n, k] = iw * ih / ua + return overlaps diff --git a/insightface/detection/retinaface/rcnn/cython/cpu_nms.pyx b/insightface/detection/retinaface/rcnn/cython/cpu_nms.pyx new file mode 100644 index 0000000000000000000000000000000000000000..1d0bef3321d78fc73556906649ab61eaaea60d86 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/cython/cpu_nms.pyx @@ -0,0 +1,68 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np +cimport numpy as np + +cdef 
inline np.float32_t max(np.float32_t a, np.float32_t b): + return a if a >= b else b + +cdef inline np.float32_t min(np.float32_t a, np.float32_t b): + return a if a <= b else b + +def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): + cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] + cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] + cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] + cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] + cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] + + cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) + cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] + + cdef int ndets = dets.shape[0] + cdef np.ndarray[np.int_t, ndim=1] suppressed = \ + np.zeros((ndets), dtype=np.int) + + # nominal indices + cdef int _i, _j + # sorted indices + cdef int i, j + # temp variables for box i's (the box currently under consideration) + cdef np.float32_t ix1, iy1, ix2, iy2, iarea + # variables for computing overlap with box j (lower scoring box) + cdef np.float32_t xx1, yy1, xx2, yy2 + cdef np.float32_t w, h + cdef np.float32_t inter, ovr + + keep = [] + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + keep.append(i) + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0.0, xx2 - xx1 + 1) + h = max(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (iarea + areas[j] - inter) + if ovr >= thresh: + suppressed[j] = 1 + + return keep diff --git a/insightface/detection/retinaface/rcnn/cython/gpu_nms.hpp b/insightface/detection/retinaface/rcnn/cython/gpu_nms.hpp new file mode 100644 index 0000000000000000000000000000000000000000..68b6d42cd88b59496b22a9e77919abe529b09014 --- /dev/null +++ 
b/insightface/detection/retinaface/rcnn/cython/gpu_nms.hpp @@ -0,0 +1,2 @@ +void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, + int boxes_dim, float nms_overlap_thresh, int device_id); diff --git a/insightface/detection/retinaface/rcnn/cython/gpu_nms.pyx b/insightface/detection/retinaface/rcnn/cython/gpu_nms.pyx new file mode 100644 index 0000000000000000000000000000000000000000..59d84afe94e42de3c456b73580ed83358a2b30d8 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/cython/gpu_nms.pyx @@ -0,0 +1,31 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np +cimport numpy as np + +assert sizeof(int) == sizeof(np.int32_t) + +cdef extern from "gpu_nms.hpp": + void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) + +def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, + np.int32_t device_id=0): + cdef int boxes_num = dets.shape[0] + cdef int boxes_dim = dets.shape[1] + cdef int num_out + cdef np.ndarray[np.int32_t, ndim=1] \ + keep = np.zeros(boxes_num, dtype=np.int32) + cdef np.ndarray[np.float32_t, ndim=1] \ + scores = dets[:, 4] + cdef np.ndarray[np.int_t, ndim=1] \ + order = scores.argsort()[::-1] + cdef np.ndarray[np.float32_t, ndim=2] \ + sorted_dets = dets[order, :] + _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) + keep = keep[:num_out] + return list(order[keep]) diff --git a/insightface/detection/retinaface/rcnn/cython/nms_kernel.cu b/insightface/detection/retinaface/rcnn/cython/nms_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..038a59012f60ebdf1182ecb778eb3b01a69bc5ed --- /dev/null +++ b/insightface/detection/retinaface/rcnn/cython/nms_kernel.cu @@ -0,0 +1,144 @@ +// 
------------------------------------------------------------------ +// Faster R-CNN +// Copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Shaoqing Ren +// ------------------------------------------------------------------ + +#include "gpu_nms.hpp" +#include +#include + +#define CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + std::cout << cudaGetErrorString(error) << std::endl; \ + } \ + } while (0) + +#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) +int const threadsPerBlock = sizeof(unsigned long long) * 8; + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, unsigned long long *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x 
* 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +void _set_device(int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) { + return; + } + // The call to cudaSetDevice must come before any calls to Get, which + // may perform initialization using the GPU. + CUDA_CHECK(cudaSetDevice(device_id)); +} + +void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, + int boxes_dim, float nms_overlap_thresh, int device_id) { + _set_device(device_id); + + float* boxes_dev = NULL; + unsigned long long* mask_dev = NULL; + + const int col_blocks = DIVUP(boxes_num, threadsPerBlock); + + CUDA_CHECK(cudaMalloc(&boxes_dev, + boxes_num * boxes_dim * sizeof(float))); + CUDA_CHECK(cudaMemcpy(boxes_dev, + boxes_host, + boxes_num * boxes_dim * sizeof(float), + cudaMemcpyHostToDevice)); + + CUDA_CHECK(cudaMalloc(&mask_dev, + boxes_num * col_blocks * sizeof(unsigned long long))); + + dim3 blocks(DIVUP(boxes_num, threadsPerBlock), + DIVUP(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<>>(boxes_num, + nms_overlap_thresh, + boxes_dev, + mask_dev); + + std::vector mask_host(boxes_num * col_blocks); + CUDA_CHECK(cudaMemcpy(&mask_host[0], + mask_dev, + sizeof(unsigned long long) * boxes_num * col_blocks, + 
cudaMemcpyDeviceToHost)); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep_out[num_to_keep++] = i; + unsigned long long *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + *num_out = num_to_keep; + + CUDA_CHECK(cudaFree(boxes_dev)); + CUDA_CHECK(cudaFree(mask_dev)); +} diff --git a/insightface/detection/retinaface/rcnn/cython/setup.py b/insightface/detection/retinaface/rcnn/cython/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..1af1a1aed9a9ff0ac9450f21fd1136c310214d43 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/cython/setup.py @@ -0,0 +1,165 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import os +from os.path import join as pjoin +from setuptools import setup +from distutils.extension import Extension +from Cython.Distutils import build_ext +import numpy as np + + +def find_in_path(name, path): + "Find a file in a search path" + # Adapted fom + # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ + for dir in path.split(os.pathsep): + binpath = pjoin(dir, name) + if os.path.exists(binpath): + return os.path.abspath(binpath) + return None + + +def locate_cuda(): + """Locate the CUDA environment on the system + + Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' + and values giving the absolute path to each directory. + + Starts by looking for the CUDAHOME env variable. If not found, everything + is based on finding 'nvcc' in the PATH. 
+ """ + + # first check if the CUDAHOME env variable is in use + if 'CUDAHOME' in os.environ: + home = os.environ['CUDAHOME'] + nvcc = pjoin(home, 'bin', 'nvcc') + else: + # otherwise, search the PATH for NVCC + default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') + nvcc = find_in_path('nvcc', + os.environ['PATH'] + os.pathsep + default_path) + if nvcc is None: + raise EnvironmentError( + 'The nvcc binary could not be ' + 'located in your $PATH. Either add it to your path, or set $CUDAHOME' + ) + home = os.path.dirname(os.path.dirname(nvcc)) + + cudaconfig = { + 'home': home, + 'nvcc': nvcc, + 'include': pjoin(home, 'include'), + 'lib64': pjoin(home, 'lib64') + } + for k, v in cudaconfig.items(): + if not os.path.exists(v): + raise EnvironmentError( + 'The CUDA %s path could not be located in %s' % (k, v)) + + return cudaconfig + + +# Test if cuda could be foun +try: + CUDA = locate_cuda() +except EnvironmentError: + CUDA = None + +# Obtain the numpy include directory. This logic works across numpy versions. +try: + numpy_include = np.get_include() +except AttributeError: + numpy_include = np.get_numpy_include() + + +def customize_compiler_for_nvcc(self): + """inject deep into distutils to customize how the dispatch + to gcc/nvcc works. + + If you subclass UnixCCompiler, it's not trivial to get your subclass + injected in, and still have the right customizations (i.e. + distutils.sysconfig.customize_compiler) run on it. So instead of going + the OO route, I have this. Note, it's kindof like a wierd functional + subclassing going on.""" + + # tell the compiler it can processes .cu + self.src_extensions.append('.cu') + + # save references to the default compiler_so and _comple methods + default_compiler_so = self.compiler_so + super = self._compile + + # now redefine the _compile method. This gets executed for each + # object but distutils doesn't have the ability to change compilers + # based on source extension: we add it. 
+ def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if os.path.splitext(src)[1] == '.cu': + # use the cuda for .cu files + self.set_executable('compiler_so', CUDA['nvcc']) + # use only a subset of the extra_postargs, which are 1-1 translated + # from the extra_compile_args in the Extension class + postargs = extra_postargs['nvcc'] + else: + postargs = extra_postargs['gcc'] + + super(obj, src, ext, cc_args, postargs, pp_opts) + # reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + # inject our redefined _compile method into the class + self._compile = _compile + + +# run the customize_compiler +class custom_build_ext(build_ext): + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + +ext_modules = [ + Extension("bbox", ["bbox.pyx"], + extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, + include_dirs=[numpy_include]), + Extension("anchors", ["anchors.pyx"], + extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, + include_dirs=[numpy_include]), + Extension("cpu_nms", ["cpu_nms.pyx"], + extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, + include_dirs=[numpy_include]), +] + +if CUDA is not None: + ext_modules.append( + Extension( + 'gpu_nms', + ['nms_kernel.cu', 'gpu_nms.pyx'], + library_dirs=[CUDA['lib64']], + libraries=['cudart'], + language='c++', + runtime_library_dirs=[CUDA['lib64']], + # this syntax is specific to this build system + # we're only going to use certain compiler args with nvcc and not with + # gcc the implementation of this trick is in customize_compiler() below + extra_compile_args={ + 'gcc': ["-Wno-unused-function"], + 'nvcc': [ + '-arch=sm_35', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + include_dirs=[numpy_include, CUDA['include']])) +else: + print('Skipping GPU_NMS') + +setup( + name='frcnn_cython', + ext_modules=ext_modules, + # inject our 
custom trigger + cmdclass={'build_ext': custom_build_ext}, +) diff --git a/insightface/detection/retinaface/rcnn/dataset/__init__.py b/insightface/detection/retinaface/rcnn/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcee572aeb234733990eb49e4a3d54b458426b0f --- /dev/null +++ b/insightface/detection/retinaface/rcnn/dataset/__init__.py @@ -0,0 +1,2 @@ +from .imdb import IMDB +from .retinaface import retinaface diff --git a/insightface/detection/retinaface/rcnn/dataset/ds_utils.py b/insightface/detection/retinaface/rcnn/dataset/ds_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9432515eeb45040e0ccc87809773315f4aaf836b --- /dev/null +++ b/insightface/detection/retinaface/rcnn/dataset/ds_utils.py @@ -0,0 +1,16 @@ +import numpy as np + + +def unique_boxes(boxes, scale=1.0): + """ return indices of unique boxes """ + v = np.array([1, 1e3, 1e6, 1e9]) + hashes = np.round(boxes * scale).dot(v).astype(np.int) + _, index = np.unique(hashes, return_index=True) + return np.sort(index) + + +def filter_small_boxes(boxes, min_size): + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + keep = np.where((w >= min_size) & (h > min_size))[0] + return keep diff --git a/insightface/detection/retinaface/rcnn/dataset/imdb.py b/insightface/detection/retinaface/rcnn/dataset/imdb.py new file mode 100644 index 0000000000000000000000000000000000000000..3c19817a857bfb9a2a030f728851f47e125e04e9 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/dataset/imdb.py @@ -0,0 +1,351 @@ +""" +General image database +An image database creates a list of relative image path called image_set_index and +transform index to absolute image path. As to training, it is necessary that ground +truth and proposals are mixed together for training. 
+roidb +basic format [image_index] +['image', 'height', 'width', 'flipped', +'boxes', 'gt_classes', 'gt_overlaps', 'max_classes', 'max_overlaps', 'bbox_targets'] +""" + +from ..logger import logger +import os +try: + import cPickle as pickle +except ImportError: + import pickle +import numpy as np +from ..processing.bbox_transform import bbox_overlaps + + +class IMDB(object): + def __init__(self, name, image_set, root_path, dataset_path): + """ + basic information about an image database + :param name: name of image database will be used for any output + :param root_path: root path store cache and proposal data + :param dataset_path: dataset path store images and image lists + """ + self.name = name + '_' + image_set + self.image_set = image_set + self.root_path = root_path + self.data_path = dataset_path + + # abstract attributes + self.classes = [] + self.num_classes = 0 + self.image_set_index = [] + self.num_images = 0 + + self.config = {} + + def image_path_from_index(self, index): + raise NotImplementedError + + def gt_roidb(self): + raise NotImplementedError + + def evaluate_detections(self, detections): + raise NotImplementedError + + @property + def cache_path(self): + """ + make a directory to store all caches + :return: cache path + """ + cache_path = os.path.join(self.root_path, 'cache') + if not os.path.exists(cache_path): + os.mkdir(cache_path) + return cache_path + + def image_path_at(self, index): + """ + access image at index in image database + :param index: image index in image database + :return: image path + """ + return self.image_path_from_index(self.image_set_index[index]) + + def load_rpn_data(self, full=False): + if full: + rpn_file = os.path.join(self.root_path, 'rpn_data', + self.name + '_full_rpn.pkl') + else: + rpn_file = os.path.join(self.root_path, 'rpn_data', + self.name + '_rpn.pkl') + assert os.path.exists( + rpn_file), '%s rpn data not found at %s' % (self.name, rpn_file) + logger.info('%s loading rpn data from %s' % (self.name, 
rpn_file)) + with open(rpn_file, 'rb') as f: + box_list = pickle.load(f) + return box_list + + def load_rpn_roidb(self, gt_roidb): + """ + turn rpn detection boxes into roidb + :param gt_roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] + :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] + """ + box_list = self.load_rpn_data() + return self.create_roidb_from_box_list(box_list, gt_roidb) + + def rpn_roidb(self, gt_roidb, append_gt=False): + """ + get rpn roidb and ground truth roidb + :param gt_roidb: ground truth roidb + :param append_gt: append ground truth + :return: roidb of rpn + """ + if append_gt: + logger.info('%s appending ground truth annotations' % self.name) + rpn_roidb = self.load_rpn_roidb(gt_roidb) + roidb = IMDB.merge_roidbs(gt_roidb, rpn_roidb) + else: + roidb = self.load_rpn_roidb(gt_roidb) + return roidb + + def create_roidb_from_box_list(self, box_list, gt_roidb): + """ + given ground truth, prepare roidb + :param box_list: [image_index] ndarray of [box_index][x1, x2, y1, y2] + :param gt_roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] + :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] + """ + assert len( + box_list + ) == self.num_images, 'number of boxes matrix must match number of images' + roidb = [] + for i in range(self.num_images): + roi_rec = dict() + roi_rec['image'] = gt_roidb[i]['image'] + roi_rec['height'] = gt_roidb[i]['height'] + roi_rec['width'] = gt_roidb[i]['width'] + + boxes = box_list[i] + if boxes.shape[1] == 5: + boxes = boxes[:, :4] + num_boxes = boxes.shape[0] + overlaps = np.zeros((num_boxes, self.num_classes), + dtype=np.float32) + if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0: + gt_boxes = gt_roidb[i]['boxes'] + gt_classes = gt_roidb[i]['gt_classes'] + # n boxes and k gt_boxes => n * k overlap + gt_overlaps = bbox_overlaps(boxes.astype(np.float), + gt_boxes.astype(np.float)) + # for each box in n boxes, select 
only maximum overlap (must be greater than zero) + argmaxes = gt_overlaps.argmax(axis=1) + maxes = gt_overlaps.max(axis=1) + I = np.where(maxes > 0)[0] + overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] + + roi_rec.update({ + 'boxes': + boxes, + 'gt_classes': + np.zeros((num_boxes, ), dtype=np.int32), + 'gt_overlaps': + overlaps, + 'max_classes': + overlaps.argmax(axis=1), + 'max_overlaps': + overlaps.max(axis=1), + 'flipped': + False + }) + + # background roi => background class + zero_indexes = np.where(roi_rec['max_overlaps'] == 0)[0] + assert all(roi_rec['max_classes'][zero_indexes] == 0) + # foreground roi => foreground class + nonzero_indexes = np.where(roi_rec['max_overlaps'] > 0)[0] + assert all(roi_rec['max_classes'][nonzero_indexes] != 0) + + roidb.append(roi_rec) + + return roidb + + def append_flipped_images(self, roidb): + """ + append flipped images to an roidb + flip boxes coordinates, images will be actually flipped when loading into network + :param roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] + :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] + """ + logger.info('%s append flipped images to roidb' % self.name) + assert self.num_images == len(roidb) + for i in range(self.num_images): + roi_rec = roidb[i] + entry = { + 'image': roi_rec['image'], + 'stream': roi_rec['stream'], + 'height': roi_rec['height'], + 'width': roi_rec['width'], + #'boxes': boxes, + 'gt_classes': roidb[i]['gt_classes'], + 'gt_overlaps': roidb[i]['gt_overlaps'], + 'max_classes': roidb[i]['max_classes'], + 'max_overlaps': roidb[i]['max_overlaps'], + 'flipped': True + } + for k in roi_rec: + if not k.startswith('boxes'): + continue + boxes = roi_rec[k].copy() + oldx1 = boxes[:, 0].copy() + oldx2 = boxes[:, 2].copy() + boxes[:, 0] = roi_rec['width'] - oldx2 - 1 + boxes[:, 2] = roi_rec['width'] - oldx1 - 1 + assert (boxes[:, 2] >= boxes[:, 0]).all() + entry[k] = boxes + if 'landmarks' in roi_rec: + k = 'landmarks' + landmarks = 
roi_rec[k].copy() + landmarks[:, :, 0] *= -1 + landmarks[:, :, 0] += (roi_rec['width'] - 1) + #for a in range(0,10,2): + # oldx1 = landmarks[:, a].copy() + # landmarks[:,a] = roi_rec['width'] - oldx1 - 1 + order = [1, 0, 2, 4, 3] + flandmarks = landmarks.copy() + for idx, a in enumerate(order): + flandmarks[:, idx, :] = landmarks[:, a, :] + + entry[k] = flandmarks + if 'blur' in roi_rec: + entry['blur'] = roi_rec['blur'] + roidb.append(entry) + + self.image_set_index *= 2 + return roidb + + def evaluate_recall(self, roidb, candidate_boxes=None, thresholds=None): + """ + evaluate detection proposal recall metrics + record max overlap value for each gt box; return vector of overlap values + :param roidb: used to evaluate + :param candidate_boxes: if not given, use roidb's non-gt boxes + :param thresholds: array-like recall threshold + :return: None + ar: average recall, recalls: vector recalls at each IoU overlap threshold + thresholds: vector of IoU overlap threshold, gt_overlaps: vector of all ground-truth overlaps + """ + area_names = [ + 'all', '0-25', '25-50', '50-100', '100-200', '200-300', '300-inf' + ] + area_ranges = [[0**2, 1e5**2], [0**2, 25**2], [25**2, 50**2], + [50**2, 100**2], [100**2, 200**2], [200**2, 300**2], + [300**2, 1e5**2]] + area_counts = [] + for area_name, area_range in zip(area_names[1:], area_ranges[1:]): + area_count = 0 + for i in range(self.num_images): + if candidate_boxes is None: + # default is use the non-gt boxes from roidb + non_gt_inds = np.where(roidb[i]['gt_classes'] == 0)[0] + boxes = roidb[i]['boxes'][non_gt_inds, :] + else: + boxes = candidate_boxes[i] + boxes_areas = (boxes[:, 2] - boxes[:, 0] + + 1) * (boxes[:, 3] - boxes[:, 1] + 1) + valid_range_inds = np.where((boxes_areas >= area_range[0]) + & (boxes_areas < area_range[1]))[0] + area_count += len(valid_range_inds) + area_counts.append(area_count) + total_counts = float(sum(area_counts)) + for area_name, area_count in zip(area_names[1:], area_counts): + 
logger.info('percentage of %s is %f' % + (area_name, area_count / total_counts)) + logger.info('average number of proposal is %f' % + (total_counts / self.num_images)) + for area_name, area_range in zip(area_names, area_ranges): + gt_overlaps = np.zeros(0) + num_pos = 0 + for i in range(self.num_images): + # check for max_overlaps == 1 avoids including crowd annotations + max_gt_overlaps = roidb[i]['gt_overlaps'].max(axis=1) + gt_inds = np.where((roidb[i]['gt_classes'] > 0) + & (max_gt_overlaps == 1))[0] + gt_boxes = roidb[i]['boxes'][gt_inds, :] + gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0] + + 1) * (gt_boxes[:, 3] - gt_boxes[:, 1] + 1) + valid_gt_inds = np.where((gt_areas >= area_range[0]) + & (gt_areas < area_range[1]))[0] + gt_boxes = gt_boxes[valid_gt_inds, :] + num_pos += len(valid_gt_inds) + + if candidate_boxes is None: + # default is use the non-gt boxes from roidb + non_gt_inds = np.where(roidb[i]['gt_classes'] == 0)[0] + boxes = roidb[i]['boxes'][non_gt_inds, :] + else: + boxes = candidate_boxes[i] + if boxes.shape[0] == 0: + continue + + overlaps = bbox_overlaps(boxes.astype(np.float), + gt_boxes.astype(np.float)) + + _gt_overlaps = np.zeros((gt_boxes.shape[0])) + # choose whatever is smaller to iterate + rounds = min(boxes.shape[0], gt_boxes.shape[0]) + for j in range(rounds): + # find which proposal maximally covers each gt box + argmax_overlaps = overlaps.argmax(axis=0) + # get the IoU amount of coverage for each gt box + max_overlaps = overlaps.max(axis=0) + # find which gt box is covered by most IoU + gt_ind = max_overlaps.argmax() + gt_ovr = max_overlaps.max() + assert (gt_ovr >= + 0), '%s\n%s\n%s' % (boxes, gt_boxes, overlaps) + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the IoU coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert (_gt_overlaps[j] == gt_ovr) + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = 
-1 + # append recorded IoU coverage level + gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) + + gt_overlaps = np.sort(gt_overlaps) + if thresholds is None: + step = 0.05 + thresholds = np.arange(0.5, 0.95 + 1e-5, step) + recalls = np.zeros_like(thresholds) + + # compute recall for each IoU threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) + ar = recalls.mean() + + # print results + print('average recall for {}: {:.3f}, number:{}'.format( + area_name, ar, num_pos)) + for threshold, recall in zip(thresholds, recalls): + print('recall @{:.2f}: {:.3f}'.format(threshold, recall)) + + @staticmethod + def merge_roidbs(a, b): + """ + merge roidbs into one + :param a: roidb to be merged into + :param b: roidb to be merged + :return: merged imdb + """ + assert len(a) == len(b) + for i in range(len(a)): + a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) + a[i]['gt_classes'] = np.hstack( + (a[i]['gt_classes'], b[i]['gt_classes'])) + a[i]['gt_overlaps'] = np.vstack( + (a[i]['gt_overlaps'], b[i]['gt_overlaps'])) + a[i]['max_classes'] = np.hstack( + (a[i]['max_classes'], b[i]['max_classes'])) + a[i]['max_overlaps'] = np.hstack( + (a[i]['max_overlaps'], b[i]['max_overlaps'])) + return a diff --git a/insightface/detection/retinaface/rcnn/dataset/retinaface.py b/insightface/detection/retinaface/rcnn/dataset/retinaface.py new file mode 100644 index 0000000000000000000000000000000000000000..6e3b85689fb7048376571d07c3fe18657a05a1f4 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/dataset/retinaface.py @@ -0,0 +1,197 @@ +from __future__ import print_function +try: + import cPickle as pickle +except ImportError: + import pickle +import cv2 +import os +import numpy as np +import json +#from PIL import Image + +from ..logger import logger +from .imdb import IMDB +from .ds_utils import unique_boxes, filter_small_boxes +from ..config import config + + +class retinaface(IMDB): + def __init__(self, image_set, 
root_path, data_path): + super(retinaface, self).__init__('retinaface', image_set, root_path, + data_path) + #assert image_set=='train' + + split = image_set + self._split = image_set + self._image_set = image_set + + self.root_path = root_path + self.data_path = data_path + + self._dataset_path = self.data_path + self._imgs_path = os.path.join(self._dataset_path, image_set, 'images') + self._fp_bbox_map = {} + label_file = os.path.join(self._dataset_path, image_set, 'label.txt') + name = None + for line in open(label_file, 'r'): + line = line.strip() + if line.startswith('#'): + name = line[1:].strip() + self._fp_bbox_map[name] = [] + continue + assert name is not None + assert name in self._fp_bbox_map + self._fp_bbox_map[name].append(line) + print('origin image size', len(self._fp_bbox_map)) + + #self.num_images = len(self._image_paths) + #self._image_index = range(len(self._image_paths)) + self.classes = ['bg', 'face'] + self.num_classes = len(self.classes) + + def gt_roidb(self): + cache_file = os.path.join( + self.cache_path, + '{}_{}_gt_roidb.pkl'.format(self.name, self._split)) + if os.path.exists(cache_file): + with open(cache_file, 'rb') as fid: + roidb = pickle.load(fid) + print('{} gt roidb loaded from {}'.format(self.name, cache_file)) + self.num_images = len(roidb) + return roidb + + roidb = [] + max_num_boxes = 0 + nonattr_box_num = 0 + landmark_num = 0 + + pp = 0 + for fp in self._fp_bbox_map: + pp += 1 + if pp % 1000 == 0: + print('loading', pp) + if self._split == 'test': + image_path = os.path.join(self._imgs_path, fp) + roi = {'image': image_path} + roidb.append(roi) + continue + boxes = np.zeros([len(self._fp_bbox_map[fp]), 4], np.float) + landmarks = np.zeros([len(self._fp_bbox_map[fp]), 5, 3], np.float) + blur = np.zeros((len(self._fp_bbox_map[fp]), ), np.float) + boxes_mask = [] + + gt_classes = np.ones([len(self._fp_bbox_map[fp])], np.int32) + overlaps = np.zeros([len(self._fp_bbox_map[fp]), 2], np.float) + + imsize = 
cv2.imread(os.path.join(self._imgs_path, + fp)).shape[0:2][::-1] + ix = 0 + + for aline in self._fp_bbox_map[fp]: + #imsize = Image.open(os.path.join(self._imgs_path, fp)).size + values = [float(x) for x in aline.strip().split()] + bbox = [ + values[0], values[1], values[0] + values[2], + values[1] + values[3] + ] + + x1 = bbox[0] + y1 = bbox[1] + x2 = min(imsize[0], bbox[2]) + y2 = min(imsize[1], bbox[3]) + if x1 >= x2 or y1 >= y2: + continue + + if config.BBOX_MASK_THRESH > 0: + if ( + x2 - x1 + ) < config.BBOX_MASK_THRESH or y2 - y1 < config.BBOX_MASK_THRESH: + boxes_mask.append(np.array([x1, y1, x2, y2], np.float)) + continue + if ( + x2 - x1 + ) < config.TRAIN.MIN_BOX_SIZE or y2 - y1 < config.TRAIN.MIN_BOX_SIZE: + continue + + boxes[ix, :] = np.array([x1, y1, x2, y2], np.float) + if self._split == 'train': + landmark = np.array(values[4:19], + dtype=np.float32).reshape((5, 3)) + for li in range(5): + #print(landmark) + if landmark[li][0] == -1. and landmark[li][ + 1] == -1.: #missing landmark + assert landmark[li][2] == -1 + else: + assert landmark[li][2] >= 0 + if li == 0: + landmark_num += 1 + if landmark[li][2] == 0.0: #visible + landmark[li][2] = 1.0 + else: + landmark[li][2] = 0.0 + + landmarks[ix] = landmark + + blur[ix] = values[19] + #print(aline, blur[ix]) + if blur[ix] < 0.0: + blur[ix] = 0.3 + nonattr_box_num += 1 + + cls = int(1) + gt_classes[ix] = cls + overlaps[ix, cls] = 1.0 + ix += 1 + max_num_boxes = max(max_num_boxes, ix) + #overlaps = scipy.sparse.csr_matrix(overlaps) + if self._split == 'train' and ix == 0: + continue + boxes = boxes[:ix, :] + landmarks = landmarks[:ix, :, :] + blur = blur[:ix] + gt_classes = gt_classes[:ix] + overlaps = overlaps[:ix, :] + image_path = os.path.join(self._imgs_path, fp) + with open(image_path, 'rb') as fin: + stream = fin.read() + stream = np.fromstring(stream, dtype=np.uint8) + + roi = { + 'image': image_path, + 'stream': stream, + 'height': imsize[1], + 'width': imsize[0], + 'boxes': boxes, + 'landmarks': 
landmarks, + 'blur': blur, + 'gt_classes': gt_classes, + 'gt_overlaps': overlaps, + 'max_classes': overlaps.argmax(axis=1), + 'max_overlaps': overlaps.max(axis=1), + 'flipped': False, + } + if len(boxes_mask) > 0: + boxes_mask = np.array(boxes_mask) + roi['boxes_mask'] = boxes_mask + roidb.append(roi) + for roi in roidb: + roi['max_num_boxes'] = max_num_boxes + self.num_images = len(roidb) + print('roidb size', len(roidb)) + print('non attr box num', nonattr_box_num) + print('landmark num', landmark_num) + with open(cache_file, 'wb') as fid: + pickle.dump(roidb, fid, pickle.HIGHEST_PROTOCOL) + print('wrote gt roidb to {}'.format(cache_file)) + + return roidb + + def write_detections(self, all_boxes, output_dir='./output/'): + pass + + def evaluate_detections(self, + all_boxes, + output_dir='./output/', + method_name='insightdetection'): + pass diff --git a/insightface/detection/retinaface/rcnn/io/__init__.py b/insightface/detection/retinaface/rcnn/io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/detection/retinaface/rcnn/io/image.py b/insightface/detection/retinaface/rcnn/io/image.py new file mode 100644 index 0000000000000000000000000000000000000000..0296fb4de0eebdc22f1261a70eefa9fbe815ddfe --- /dev/null +++ b/insightface/detection/retinaface/rcnn/io/image.py @@ -0,0 +1,886 @@ +from __future__ import print_function +import numpy as np +import cv2 +import os +import math +import sys +import random +from ..config import config + + +def brightness_aug(src, x): + alpha = 1.0 + random.uniform(-x, x) + src *= alpha + return src + + +def contrast_aug(src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = np.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = (3.0 * (1.0 - alpha) / gray.size) * np.sum(gray) + src *= alpha + src += gray + return src + + +def saturation_aug(src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = np.array([[[0.299, 0.587, 0.114]]]) 
+ gray = src * coef + gray = np.sum(gray, axis=2, keepdims=True) + gray *= (1.0 - alpha) + src *= alpha + src += gray + return src + + +def color_aug(img, x): + if config.COLOR_MODE > 1: + augs = [brightness_aug, contrast_aug, saturation_aug] + random.shuffle(augs) + else: + augs = [brightness_aug] + for aug in augs: + #print(img.shape) + img = aug(img, x) + #print(img.shape) + return img + + +def get_image(roidb, scale=False): + """ + preprocess image and return processed roidb + :param roidb: a list of roidb + :return: list of img as in mxnet format + roidb add new item['im_info'] + 0 --- x (width, second dim of im) + | + y (height, first dim of im) + """ + num_images = len(roidb) + processed_ims = [] + processed_roidb = [] + for i in range(num_images): + roi_rec = roidb[i] + if 'stream' in roi_rec: + im = cv2.imdecode(roi_rec['stream'], cv2.IMREAD_COLOR) + else: + assert os.path.exists( + roi_rec['image']), '{} does not exist'.format(roi_rec['image']) + im = cv2.imread(roi_rec['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + new_rec = roi_rec.copy() + if scale: + scale_range = config.TRAIN.SCALE_RANGE + im_scale = np.random.uniform(scale_range[0], scale_range[1]) + im = cv2.resize(im, + None, + None, + fx=im_scale, + fy=im_scale, + interpolation=cv2.INTER_LINEAR) + elif not config.ORIGIN_SCALE: + scale_ind = random.randrange(len(config.SCALES)) + target_size = config.SCALES[scale_ind][0] + max_size = config.SCALES[scale_ind][1] + im, im_scale = resize(im, + target_size, + max_size, + stride=config.IMAGE_STRIDE) + else: + im_scale = 1.0 + im_tensor = transform(im, config.PIXEL_MEANS, config.PIXEL_STDS) + if 'boxes_mask' in roi_rec: + im = im.astype(np.float32) + boxes_mask = roi_rec['boxes_mask'].copy() * im_scale + boxes_mask = boxes_mask.astype(np.int) + for j in range(boxes_mask.shape[0]): + m = boxes_mask[j] + im_tensor[:, :, m[1]:m[3], m[0]:m[2]] = 0.0 + #print('find mask', m, file=sys.stderr) + processed_ims.append(im_tensor) + new_rec['boxes'] = 
roi_rec['boxes'].copy() * im_scale + if config.TRAIN.IMAGE_ALIGN > 0: + if im_tensor.shape[ + 2] % config.TRAIN.IMAGE_ALIGN != 0 or im_tensor.shape[ + 3] % config.TRAIN.IMAGE_ALIGN != 0: + new_height = math.ceil( + float(im_tensor.shape[2]) / + config.TRAIN.IMAGE_ALIGN) * config.TRAIN.IMAGE_ALIGN + new_width = math.ceil( + float(im_tensor.shape[3]) / + config.TRAIN.IMAGE_ALIGN) * config.TRAIN.IMAGE_ALIGN + new_im_tensor = np.zeros( + (1, 3, int(new_height), int(new_width))) + new_im_tensor[:, :, 0:im_tensor.shape[2], + 0:im_tensor.shape[3]] = im_tensor + print(im_tensor.shape, new_im_tensor.shape, file=sys.stderr) + im_tensor = new_im_tensor + #print('boxes', new_rec['boxes'], file=sys.stderr) + im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] + new_rec['im_info'] = im_info + processed_roidb.append(new_rec) + return processed_ims, processed_roidb + + +TMP_ID = -1 + + +#bakup method +def __get_crop_image(roidb): + """ + preprocess image and return processed roidb + :param roidb: a list of roidb + :return: list of img as in mxnet format + roidb add new item['im_info'] + 0 --- x (width, second dim of im) + | + y (height, first dim of im) + """ + #roidb and each roi_rec can not be changed as it will be reused in next epoch + num_images = len(roidb) + processed_ims = [] + processed_roidb = [] + for i in range(num_images): + roi_rec = roidb[i] + if 'stream' in roi_rec: + im = cv2.imdecode(roi_rec['stream'], cv2.IMREAD_COLOR) + else: + assert os.path.exists( + roi_rec['image']), '{} does not exist'.format(roi_rec['image']) + im = cv2.imread(roi_rec['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + if 'boxes_mask' in roi_rec: + #im = im.astype(np.float32) + boxes_mask = roi_rec['boxes_mask'].copy() + boxes_mask = boxes_mask.astype(np.int) + for j in range(boxes_mask.shape[0]): + m = boxes_mask[j] + im[m[1]:m[3], m[0]:m[2], :] = 0 + #print('find mask', m, file=sys.stderr) + new_rec = roi_rec.copy() + + #choose one gt randomly + SIZE = 
config.SCALES[0][0] + TARGET_BOX_SCALES = np.array([16, 32, 64, 128, 256, 512]) + assert roi_rec['boxes'].shape[0] > 0 + candidates = [] + for i in range(roi_rec['boxes'].shape[0]): + box = roi_rec['boxes'][i] + box_size = max(box[2] - box[0], box[3] - box[1]) + if box_size < config.TRAIN.MIN_BOX_SIZE: + continue + #if box[0]<0 or box[1]<0: + # continue + #if box[2]>im.shape[1] or box[3]>im.shape[0]: + # continue; + candidates.append(i) + assert len(candidates) > 0 + box_ind = random.choice(candidates) + box = roi_rec['boxes'][box_ind] + box_size = max(box[2] - box[0], box[3] - box[1]) + dist = np.abs(TARGET_BOX_SCALES - box_size) + nearest = np.argmin(dist) + target_ind = random.randrange(min(len(TARGET_BOX_SCALES), nearest + 2)) + target_box_size = TARGET_BOX_SCALES[target_ind] + im_scale = float(target_box_size) / box_size + #min_scale = float(SIZE)/np.min(im.shape[0:2]) + #if im_scale= im.shape[ + 1] or center[1] >= im.shape[0]: + continue + if box_size < config.TRAIN.MIN_BOX_SIZE: + continue + boxes_new.append(box) + classes_new.append(new_rec['gt_classes'][i]) + new_rec['boxes'] = np.array(boxes_new) + new_rec['gt_classes'] = np.array(classes_new) + #print('after', new_rec['boxes'].shape[0]) + #assert new_rec['boxes'].shape[0]>0 + DEBUG = True + if DEBUG: + global TMP_ID + if TMP_ID < 10: + tim = im.copy() + for i in range(new_rec['boxes'].shape[0]): + box = new_rec['boxes'][i].copy().astype(np.int) + cv2.rectangle(tim, (box[0], box[1]), (box[2], box[3]), + (255, 0, 0), 1) + filename = './trainimages/train%d.png' % TMP_ID + TMP_ID += 1 + cv2.imwrite(filename, tim) + + im_tensor = transform(im, config.PIXEL_MEANS, config.PIXEL_STDS, + config.PIXEL_SCALE) + + processed_ims.append(im_tensor) + #print('boxes', new_rec['boxes'], file=sys.stderr) + im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] + new_rec['im_info'] = im_info + processed_roidb.append(new_rec) + return processed_ims, processed_roidb + + +def expand_bboxes(bboxes, + image_width, + 
image_height, + expand_left=2., + expand_up=2., + expand_right=2., + expand_down=2.): + """ + Expand bboxes, expand 2 times by defalut. + """ + expand_boxes = [] + for bbox in bboxes: + xmin = bbox[0] + ymin = bbox[1] + xmax = bbox[2] + ymax = bbox[3] + w = xmax - xmin + h = ymax - ymin + ex_xmin = max(xmin - w / expand_left, 0.) + ex_ymin = max(ymin - h / expand_up, 0.) + ex_xmax = min(xmax + w / expand_right, image_width) + ex_ymax = min(ymax + h / expand_down, image_height) + expand_boxes.append([ex_xmin, ex_ymin, ex_xmax, ex_ymax]) + return expand_boxes + + +def get_crop_image1(roidb): + """ + preprocess image and return processed roidb + :param roidb: a list of roidb + :return: list of img as in mxnet format + roidb add new item['im_info'] + 0 --- x (width, second dim of im) + | + y (height, first dim of im) + """ + #roidb and each roi_rec can not be changed as it will be reused in next epoch + num_images = len(roidb) + processed_ims = [] + processed_roidb = [] + for i in range(num_images): + roi_rec = roidb[i] + if 'stream' in roi_rec: + im = cv2.imdecode(roi_rec['stream'], cv2.IMREAD_COLOR) + else: + assert os.path.exists( + roi_rec['image']), '{} does not exist'.format(roi_rec['image']) + im = cv2.imread(roi_rec['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + if 'boxes_mask' in roi_rec: + #im = im.astype(np.float32) + boxes_mask = roi_rec['boxes_mask'].copy() + boxes_mask = boxes_mask.astype(np.int) + for j in range(boxes_mask.shape[0]): + m = boxes_mask[j] + im[m[1]:m[3], m[0]:m[2], :] = 127 + #print('find mask', m, file=sys.stderr) + SIZE = config.SCALES[0][0] + PRE_SCALES = [0.3, 0.45, 0.6, 0.8, 1.0] + #PRE_SCALES = [0.3, 0.45, 0.6, 0.8, 1.0, 0.8, 1.0, 0.8, 1.0] + _scale = random.choice(PRE_SCALES) + #_scale = np.random.uniform(PRE_SCALES[0], PRE_SCALES[-1]) + size = int(np.min(im.shape[0:2]) * _scale) + #size = int(np.round(_scale*np.min(im.shape[0:2]))) + im_scale = float(SIZE) / size + #origin_im_scale = im_scale + #size = 
np.round(np.min(im.shape[0:2])*im_scale) + #im_scale *= (float(SIZE)/size) + origin_shape = im.shape + if _scale > 10.0: #avoid im.size= SIZE and im.shape[1] >= SIZE + #print('image size', origin_shape, _scale, SIZE, size, im_scale) + + new_rec = roi_rec.copy() + new_rec['boxes'] = roi_rec['boxes'].copy() * im_scale + if config.FACE_LANDMARK: + new_rec['landmarks'] = roi_rec['landmarks'].copy() + new_rec['landmarks'][:, :, 0:2] *= im_scale + retry = 0 + LIMIT = 25 + size = SIZE + while retry < LIMIT: + up, left = (np.random.randint(0, im.shape[0] - size + 1), + np.random.randint(0, im.shape[1] - size + 1)) + boxes_new = new_rec['boxes'].copy() + im_new = im[up:(up + size), left:(left + size), :] + #print('crop', up, left, size, im_scale) + boxes_new[:, 0] -= left + boxes_new[:, 2] -= left + boxes_new[:, 1] -= up + boxes_new[:, 3] -= up + if config.FACE_LANDMARK: + landmarks_new = new_rec['landmarks'].copy() + landmarks_new[:, :, 0] -= left + landmarks_new[:, :, 1] -= up + #for i in range(0,10,2): + # landmarks_new[:,i] -= left + #for i in range(1,10,2): + # landmarks_new[:,i] -= up + valid_landmarks = [] + #im_new = cv2.resize(im_new, (SIZE, SIZE), interpolation=cv2.INTER_LINEAR) + #boxes_new *= im_scale + #print(origin_shape, im_new.shape, im_scale) + valid = [] + valid_boxes = [] + for i in range(boxes_new.shape[0]): + box = boxes_new[i] + #center = np.array(([box[0], box[1]]+[box[2], box[3]]))/2 + centerx = (box[0] + box[2]) / 2 + centery = (box[1] + box[3]) / 2 + + #box[0] = max(0, box[0]) + #box[1] = max(0, box[1]) + #box[2] = min(im_new.shape[1], box[2]) + #box[3] = min(im_new.shape[0], box[3]) + box_size = max(box[2] - box[0], box[3] - box[1]) + + if centerx < 0 or centery < 0 or centerx >= im_new.shape[ + 1] or centery >= im_new.shape[0]: + continue + if box_size < config.TRAIN.MIN_BOX_SIZE: + continue + #filter by landmarks? 
TODO + valid.append(i) + valid_boxes.append(box) + if config.FACE_LANDMARK: + valid_landmarks.append(landmarks_new[i]) + if len(valid) > 0 or retry == LIMIT - 1: + im = im_new + new_rec['boxes'] = np.array(valid_boxes) + new_rec['gt_classes'] = new_rec['gt_classes'][valid] + if config.FACE_LANDMARK: + new_rec['landmarks'] = np.array(valid_landmarks) + if config.HEAD_BOX: + face_box = new_rec['boxes'] + head_box = expand_bboxes(face_box, + image_width=im.shape[1], + image_height=im.shape[0]) + new_rec['boxes_head'] = np.array(head_box) + break + + retry += 1 + + if config.COLOR_MODE > 0 and config.COLOR_JITTERING > 0.0: + im = im.astype(np.float32) + im = color_aug(im, config.COLOR_JITTERING) + + #assert np.all(new_rec['landmarks'][:,10]>0.0) + global TMP_ID + if TMP_ID >= 0 and TMP_ID < 10: + tim = im.copy().astype(np.uint8) + for i in range(new_rec['boxes'].shape[0]): + box = new_rec['boxes'][i].copy().astype(np.int) + cv2.rectangle(tim, (box[0], box[1]), (box[2], box[3]), + (255, 0, 0), 1) + print('draw box:', box) + if config.FACE_LANDMARK: + for i in range(new_rec['landmarks'].shape[0]): + landmark = new_rec['landmarks'][i].copy() + if landmark[0][2] < 0: + print('zero', landmark) + continue + landmark = landmark.astype(np.int) + print('draw landmark', landmark) + for k in range(5): + color = (0, 0, 255) + if k == 0 or k == 3: + color = (0, 255, 0) + pp = (landmark[k][0], landmark[k][1]) + cv2.circle(tim, (pp[0], pp[1]), 1, color, 2) + filename = './trainimages/train%d.png' % TMP_ID + print('write', filename) + cv2.imwrite(filename, tim) + TMP_ID += 1 + + im_tensor = transform(im, config.PIXEL_MEANS, config.PIXEL_STDS, + config.PIXEL_SCALE) + + processed_ims.append(im_tensor) + #print('boxes', new_rec['boxes'], file=sys.stderr) + im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] + new_rec['im_info'] = np.array(im_info, dtype=np.float32) + processed_roidb.append(new_rec) + return processed_ims, processed_roidb + + +def get_crop_image2(roidb): + """ + 
preprocess image and return processed roidb + :param roidb: a list of roidb + :return: list of img as in mxnet format + roidb add new item['im_info'] + 0 --- x (width, second dim of im) + | + y (height, first dim of im) + """ + #roidb and each roi_rec can not be changed as it will be reused in next epoch + num_images = len(roidb) + processed_ims = [] + processed_roidb = [] + for i in range(num_images): + roi_rec = roidb[i] + if 'stream' in roi_rec: + im = cv2.imdecode(roi_rec['stream'], cv2.IMREAD_COLOR) + else: + assert os.path.exists( + roi_rec['image']), '{} does not exist'.format(roi_rec['image']) + im = cv2.imread(roi_rec['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + if 'boxes_mask' in roi_rec: + #im = im.astype(np.float32) + boxes_mask = roi_rec['boxes_mask'].copy() + boxes_mask = boxes_mask.astype(np.int) + for j in range(boxes_mask.shape[0]): + m = boxes_mask[j] + im[m[1]:m[3], m[0]:m[2], :] = 0 + #print('find mask', m, file=sys.stderr) + SIZE = config.SCALES[0][0] + scale_array = np.array([16, 32, 64, 128, 256, 512], dtype=np.float32) + candidates = [] + for i in range(roi_rec['boxes'].shape[0]): + box = roi_rec['boxes'][i] + box_size = max(box[2] - box[0], box[3] - box[1]) + if box_size < config.TRAIN.MIN_BOX_SIZE: + continue + #if box[0]<0 or box[1]<0: + # continue + #if box[2]>im.shape[1] or box[3]>im.shape[0]: + # continue; + candidates.append(i) + assert len(candidates) > 0 + box_ind = random.choice(candidates) + box = roi_rec['boxes'][box_ind] + width = box[2] - box[0] + height = box[3] - box[1] + wid = width + hei = height + resize_width, resize_height = config.SCALES[0] + image_width = im.shape[0] + image_height = im.shape[1] + area = width * height + range_size = 0 + for scale_ind in range(0, len(scale_array) - 1): + if area > scale_array[scale_ind] ** 2 and area < \ + scale_array[scale_ind + 1] ** 2: + range_size = scale_ind + 1 + break + + if area > scale_array[len(scale_array) - 2]**2: + range_size = len(scale_array) - 2 + 
scale_choose = 0.0 + if range_size == 0: + rand_idx_size = 0 + else: + # np.random.randint range: [low, high) + rng_rand_size = np.random.randint(0, range_size + 1) + rand_idx_size = rng_rand_size % (range_size + 1) + + if rand_idx_size == range_size: + min_resize_val = scale_array[rand_idx_size] / 2.0 + max_resize_val = min(2.0 * scale_array[rand_idx_size], + 2 * math.sqrt(wid * hei)) + scale_choose = random.uniform(min_resize_val, max_resize_val) + else: + min_resize_val = scale_array[rand_idx_size] / 2.0 + max_resize_val = 2.0 * scale_array[rand_idx_size] + scale_choose = random.uniform(min_resize_val, max_resize_val) + + sample_bbox_size = wid * resize_width / scale_choose + + w_off_orig = 0.0 + h_off_orig = 0.0 + if sample_bbox_size < max(image_height, image_width): + if wid <= sample_bbox_size: + w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size, + xmin) + else: + w_off_orig = np.random.uniform(xmin, + xmin + wid - sample_bbox_size) + + if hei <= sample_bbox_size: + h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size, + ymin) + else: + h_off_orig = np.random.uniform(ymin, + ymin + hei - sample_bbox_size) + + else: + w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0) + h_off_orig = np.random.uniform(image_height - sample_bbox_size, + 0.0) + + w_off_orig = math.floor(w_off_orig) + h_off_orig = math.floor(h_off_orig) + + # Figure out top left coordinates. 
+ w_off = 0.0 + h_off = 0.0 + w_off = float(w_off_orig / image_width) + h_off = float(h_off_orig / image_height) + im_new = im[up:(up + size), left:(left + size), :] + + sampled_bbox = bbox(w_off, h_off, + w_off + float(sample_bbox_size / image_width), + h_off + float(sample_bbox_size / image_height)) + return sampled_bbox + + box_size = max(box[2] - box[0], box[3] - box[1]) + dist = np.abs(TARGET_BOX_SCALES - box_size) + nearest = np.argmin(dist) + target_ind = random.randrange(min(len(TARGET_BOX_SCALES), nearest + 2)) + target_box_size = TARGET_BOX_SCALES[target_ind] + im_scale = float(target_box_size) / box_size + PRE_SCALES = [0.3, 0.45, 0.6, 0.8, 1.0] + _scale = random.choice(PRE_SCALES) + #_scale = np.random.uniform(PRE_SCALES[0], PRE_SCALES[-1]) + size = int(np.round(_scale * np.min(im.shape[0:2]))) + im_scale = float(SIZE) / size + #origin_im_scale = im_scale + #size = np.round(np.min(im.shape[0:2])*im_scale) + #im_scale *= (float(SIZE)/size) + origin_shape = im.shape + if _scale > 10.0: #avoid im.size= SIZE and im.shape[1] >= SIZE + + new_rec = roi_rec.copy() + new_rec['boxes'] = roi_rec['boxes'].copy() * im_scale + if config.FACE_LANDMARK: + new_rec['landmarks'] = roi_rec['landmarks'].copy() * im_scale + retry = 0 + LIMIT = 25 + size = SIZE + while retry < LIMIT: + up, left = (np.random.randint(0, im.shape[0] - size + 1), + np.random.randint(0, im.shape[1] - size + 1)) + boxes_new = new_rec['boxes'].copy() + im_new = im[up:(up + size), left:(left + size), :] + #print('crop', up, left, size, im_scale) + boxes_new[:, 0] -= left + boxes_new[:, 2] -= left + boxes_new[:, 1] -= up + boxes_new[:, 3] -= up + if config.FACE_LANDMARK: + landmarks_new = new_rec['landmarks'].copy() + for i in range(0, 10, 2): + landmarks_new[:, i] -= left + for i in range(1, 10, 2): + landmarks_new[:, i] -= up + valid_landmarks = [] + #im_new = cv2.resize(im_new, (SIZE, SIZE), interpolation=cv2.INTER_LINEAR) + #boxes_new *= im_scale + #print(origin_shape, im_new.shape, im_scale) + 
valid = [] + valid_boxes = [] + for i in range(boxes_new.shape[0]): + box = boxes_new[i] + #center = np.array(([box[0], box[1]]+[box[2], box[3]]))/2 + centerx = (box[0] + box[2]) / 2 + centery = (box[1] + box[3]) / 2 + + #box[0] = max(0, box[0]) + #box[1] = max(0, box[1]) + #box[2] = min(im_new.shape[1], box[2]) + #box[3] = min(im_new.shape[0], box[3]) + box_size = max(box[2] - box[0], box[3] - box[1]) + + if centerx < 0 or centery < 0 or centerx >= im_new.shape[ + 1] or centery >= im_new.shape[0]: + continue + if box_size < config.TRAIN.MIN_BOX_SIZE: + continue + #filter by landmarks? TODO + valid.append(i) + valid_boxes.append(box) + if config.FACE_LANDMARK: + valid_landmarks.append(landmarks_new[i]) + if len(valid) > 0 or retry == LIMIT - 1: + im = im_new + new_rec['boxes'] = np.array(valid_boxes) + new_rec['gt_classes'] = new_rec['gt_classes'][valid] + if config.FACE_LANDMARK: + new_rec['landmarks'] = np.array(valid_landmarks) + break + + retry += 1 + + if config.COLOR_JITTERING > 0.0: + im = im.astype(np.float32) + im = color_aug(im, config.COLOR_JITTERING) + + #assert np.all(new_rec['landmarks'][:,10]>0.0) + global TMP_ID + if TMP_ID >= 0 and TMP_ID < 10: + tim = im.copy().astype(np.uint8) + for i in range(new_rec['boxes'].shape[0]): + box = new_rec['boxes'][i].copy().astype(np.int) + cv2.rectangle(tim, (box[0], box[1]), (box[2], box[3]), + (255, 0, 0), 1) + print('draw box:', box) + if config.FACE_LANDMARK: + for i in range(new_rec['landmarks'].shape[0]): + landmark = new_rec['landmarks'][i].copy() + if landmark[10] == 0.0: + print('zero', landmark) + continue + landmark = landmark.astype(np.int) + print('draw landmark', landmark) + for k in range(5): + color = (0, 0, 255) + if k == 0 or k == 3: + color = (0, 255, 0) + pp = (landmark[k * 2], landmark[1 + k * 2]) + cv2.circle(tim, (pp[0], pp[1]), 1, color, 2) + filename = './trainimages/train%d.png' % TMP_ID + print('write', filename) + cv2.imwrite(filename, tim) + TMP_ID += 1 + + im_tensor = transform(im, 
def do_mixup(im1, roidb1, im2, roidb2):
    """Mixup augmentation: average two images and merge their annotations.

    :param im1, im2: image tensors of identical shape
    :param roidb1, roidb2: roidb records providing 'boxes', 'landmarks',
        'gt_classes' and 'im_info' entries
    :return: (blended image, merged roidb dict)
    """
    blended = (im1 + im2) / 2.0
    merged = {}
    for key in ['boxes', 'landmarks', 'gt_classes', 'im_info']:
        v1 = roidb1[key]
        v2 = roidb2[key]
        if key == 'im_info':
            # image-level metadata: keep the first sample's record
            merged[key] = v1
        elif v1.shape[0] > 0 and v2.shape[0] > 0:
            merged[key] = np.concatenate((v1, v2), axis=0)
        else:
            # one of the samples has no instances: keep the first one's labels
            merged[key] = v1
    return blended, merged


def get_crop_image(roidb):
    """Crop training images and optionally apply mixup augmentation.

    :param roidb: list of roidb records
    :return: (list of image tensors, list of processed roidb records)
    """
    ims, roidbs = get_crop_image1(roidb)
    # Mixup needs at least two images to sample a distinct partner; without
    # the len(ims) >= 2 guard, np.random.randint(0, 0) raises ValueError
    # for a single-image batch.
    if config.MIXUP > 0.0 and len(ims) >= 2 and np.random.random() < config.MIXUP:
        for i in range(len(ims)):
            # draw j uniformly from all indices != i
            j = np.random.randint(0, len(ims) - 1)
            if j >= i:
                j += 1
            im, rec = do_mixup(ims[i], roidbs[i], ims[j], roidbs[j])
            ims[i] = im
            roidbs[i] = rec
    return ims, roidbs


def resize(im, target_size, max_size, stride=0, min_size=0):
    """
    only resize input image to target size and return scale
    :param im: BGR image input by opencv
    :param target_size: one dimensional size (the short side)
    :param max_size: one dimensional max size (the long side)
    :param stride: if given, pad the image to designated stride
    :param min_size: if given, lower bound on the scaled short side
    :return: (resized image, scale factor)
    """
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(target_size) / float(im_size_min)
    # prevent bigger axis from being more than max_size
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    if min_size > 0 and np.round(im_scale * im_size_min) < min_size:
        im_scale = float(min_size) / float(im_size_min)
    im = cv2.resize(im,
                    None,
                    None,
                    fx=im_scale,
                    fy=im_scale,
                    interpolation=cv2.INTER_LINEAR)
    if stride == 0:
        return im, im_scale
    # pad both spatial dims up to the next multiple of stride
    im_height = int(np.ceil(im.shape[0] / float(stride)) * stride)
    im_width = int(np.ceil(im.shape[1] / float(stride)) * stride)
    padded_im = np.zeros((im_height, im_width, im.shape[2]))
    padded_im[:im.shape[0], :im.shape[1], :] = im
    return padded_im, im_scale


def transform(im, pixel_means, pixel_stds, pixel_scale):
    """
    transform into mxnet tensor,
    scale, subtract pixel means/stds and convert BGR -> RGB channel order
    :param im: [height, width, channel] in BGR
    :param pixel_means: [B, G, R pixel means]
    :param pixel_stds: [B, G, R pixel stds]
    :param pixel_scale: divisor applied to raw pixel values
    :return: [batch, channel, height, width]
    """
    im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1]))
    for i in range(3):
        # channel i of the output takes BGR channel (2 - i) of the input
        im_tensor[0, i, :, :] = (im[:, :, 2 - i] / pixel_scale -
                                 pixel_means[2 - i]) / pixel_stds[2 - i]
    return im_tensor


def transform_inverse(im_tensor, pixel_means):
    """
    transform from mxnet im_tensor to ordinary RGB image
    im_tensor is limited to one image
    :param im_tensor: [batch, channel, height, width]
    :param pixel_means: [B, G, R pixel means]
    :return: im [height, width, channel(RGB)]
    """
    assert im_tensor.shape[0] == 1
    im_tensor = im_tensor.copy()
    # put channel back: NCHW -> NHWC
    channel_swap = (0, 2, 3, 1)
    im_tensor = im_tensor.transpose(channel_swap)
    im = im_tensor[0]
    assert im.shape[2] == 3
    # NOTE(review): only the means are added back (no stds / pixel_scale),
    # so this is an exact inverse of transform() only when stds == 1 and
    # pixel_scale == 1 — preserved from the original
    im += pixel_means[[2, 1, 0]]
    im = im.astype(np.uint8)
    return im


def tensor_vstack(tensor_list, pad=0):
    """
    vertically stack tensors, padding trailing dims to the per-dim maximum
    :param tensor_list: list of tensors sharing dtype, ndim and shape[0]
    :param pad: value to fill the padded region with
    :return: tensor with max shape
    """
    islice = tensor_list[0].shape[0]
    dimensions = [sum(t.shape[0] for t in tensor_list)]
    for dim in range(1, tensor_list[0].ndim):
        dimensions.append(max(t.shape[dim] for t in tensor_list))
    # np.full covers the former zeros/ones/full special cases uniformly
    all_tensor = np.full(tuple(dimensions), pad, dtype=tensor_list[0].dtype)
    for ind, tensor in enumerate(tensor_list):
        # build an index of slices so any ndim works (the old elif chain
        # raised 'unimplemented' for ndim > 5)
        index = (slice(ind * islice, (ind + 1) * islice), )
        index += tuple(slice(0, size) for size in tensor.shape[1:])
        all_tensor[index] = tensor
    return all_tensor


def get_rcnn_testbatch(roidb):
    """
    return a dict of testbatch
    :param roidb: ['image', 'flipped'] + ['boxes']
    :return: data, label
    """
    assert len(roidb) == 1, 'Single batch only'
    imgs, roidb = get_image(roidb)
    im_array = imgs[0]
    im_info = np.array([roidb[0]['im_info']], dtype=np.float32)

    im_rois = roidb[0]['boxes']
    rois = im_rois
    # batch index 0: single-image test batch
    batch_index = 0 * np.ones((rois.shape[0], 1))
    rois_array = np.hstack((batch_index, rois))[np.newaxis, :]

    data = {'data': im_array, 'rois': rois_array, 'im_info': im_info}
    label = {}

    return data, label


def get_rcnn_batch(roidb):
    """
    return a dict of multiple images
    :param roidb: a list of dict, whose length controls batch size
        ['images', 'flipped'] + ['gt_boxes', 'boxes', 'gt_overlap'] => ['bbox_targets']
    :return: data, label
    """
    num_images = len(roidb)
    imgs, roidb = get_image(roidb)
    im_array = tensor_vstack(imgs)

    assert config.TRAIN.BATCH_ROIS % config.TRAIN.BATCH_IMAGES == 0, \
        'BATCHIMAGES {} must divide BATCH_ROIS {}'.format(config.TRAIN.BATCH_IMAGES, config.TRAIN.BATCH_ROIS)
    rois_per_image = int(config.TRAIN.BATCH_ROIS / config.TRAIN.BATCH_IMAGES)
    fg_rois_per_image = int(round(config.TRAIN.FG_FRACTION * rois_per_image))

    rois_array = list()
    labels_array = list()
    bbox_targets_array = list()
    bbox_weights_array = list()

    for im_i in range(num_images):
        roi_rec = roidb[im_i]

        # infer num_classes from gt_overlaps
        num_classes = roi_rec['gt_overlaps'].shape[1]

        # label = class RoI has max overlap with
        rois = roi_rec['boxes']
        labels = roi_rec['max_classes']
        overlaps = roi_rec['max_overlaps']
        bbox_targets = roi_rec['bbox_targets']

        im_rois, labels, bbox_targets, bbox_weights = \
            sample_rois(rois, fg_rois_per_image, rois_per_image, num_classes,
                        labels, overlaps, bbox_targets)

        # project im_rois; do not round roi
        rois = im_rois
        batch_index = im_i * np.ones((rois.shape[0], 1))
        rois_array_this_image = np.hstack((batch_index, rois))
        rois_array.append(rois_array_this_image)

        # add labels
        labels_array.append(labels)
        bbox_targets_array.append(bbox_targets)
        bbox_weights_array.append(bbox_weights)

    rois_array = np.array(rois_array)
    labels_array = np.array(labels_array)
    bbox_targets_array = np.array(bbox_targets_array)
    bbox_weights_array = np.array(bbox_weights_array)

    data = {'data': im_array, 'rois': rois_array}
    label = {
        'label': labels_array,
        'bbox_target': bbox_targets_array,
        'bbox_weight': bbox_weights_array
    }

    return data, label


def sample_rois(rois,
                fg_rois_per_image,
                rois_per_image,
                num_classes,
                labels=None,
                overlaps=None,
                bbox_targets=None,
                gt_boxes=None):
    """
    generate random sample of ROIs comprising foreground and background examples
    :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index
    :param fg_rois_per_image: foreground roi number
    :param rois_per_image: total roi number
    :param num_classes: number of classes
    :param labels: maybe precomputed
    :param overlaps: maybe precomputed (max_overlaps)
    :param bbox_targets: maybe precomputed
    :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls)
    :return: (rois, labels, bbox_targets, bbox_weights)
    """
    if labels is None:
        # gt_boxes must be provided on this branch
        # np.float was removed in NumPy 1.24; the builtin float is the
        # same float64 alias
        overlaps = bbox_overlaps(rois[:, 1:].astype(float),
                                 gt_boxes[:, :4].astype(float))
        gt_assignment = overlaps.argmax(axis=1)
        overlaps = overlaps.max(axis=1)
        labels = gt_boxes[gt_assignment, 4]

    # foreground RoI with FG_THRESH overlap
    fg_indexes = np.where(overlaps >= config.TRAIN.FG_THRESH)[0]
    # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_indexes.size)
    # Sample foreground regions without replacement
    if len(fg_indexes) > fg_rois_per_this_image:
        fg_indexes = npr.choice(fg_indexes,
                                size=fg_rois_per_this_image,
                                replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI)
                          & (overlaps >= config.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
                                        bg_indexes.size)
    # Sample background regions without replacement
    if len(bg_indexes) > bg_rois_per_this_image:
        bg_indexes = npr.choice(bg_indexes,
                                size=bg_rois_per_this_image,
                                replace=False)

    # indexes selected
    keep_indexes = np.append(fg_indexes, bg_indexes)
    neg_idx = np.where(overlaps < config.TRAIN.FG_THRESH)[0]
    neg_rois = rois[neg_idx]
    # pad more to ensure a fixed minibatch size
    while keep_indexes.shape[0] < rois_per_image:
        if len(neg_rois) == 0:
            # nothing to pad with; break rather than loop forever
            break
        gap = np.minimum(len(neg_rois), rois_per_image - keep_indexes.shape[0])
        gap_indexes = npr.choice(range(len(neg_rois)), size=gap, replace=False)
        keep_indexes = np.append(keep_indexes, neg_idx[gap_indexes])

    # select labels
    labels = labels[keep_indexes]
    # set labels of bg_rois to be 0
    labels[fg_rois_per_this_image:] = 0
    rois = rois[keep_indexes]

    # load or compute bbox_target
    if bbox_targets is not None:
        bbox_target_data = bbox_targets[keep_indexes, :]
    else:
        targets = bbox_transform(rois[:, 1:],
                                 gt_boxes[gt_assignment[keep_indexes], :4])
        if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED:
            targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) /
                       np.array(config.TRAIN.BBOX_STDS))
        bbox_target_data = np.hstack((labels[:, np.newaxis], targets))

    bbox_targets, bbox_weights = \
        expand_bbox_regression_targets(bbox_target_data, num_classes)

    return rois, labels, bbox_targets, bbox_weights
def get_fpn_maskrcnn_batch(roidb):
    """
    return a dictionary that contains raw data for a Mask R-CNN FPN batch.
    :param roidb: list of roidb records (length = batch size)
    :return: (im_array, per-image dict of per-level sampled targets)
    """
    num_images = len(roidb)
    imgs, roidb = get_image(roidb, scale=config.TRAIN.SCALE)  #TODO
    im_array = tensor_vstack(imgs)

    assert config.TRAIN.BATCH_ROIS % config.TRAIN.BATCH_IMAGES == 0, \
        'BATCHIMAGES {} must divide BATCH_ROIS {}'.format(config.TRAIN.BATCH_IMAGES, config.TRAIN.BATCH_ROIS)
    # int(): under Python 3, '/' is true division and would otherwise feed a
    # float roi count into the samplers (get_rcnn_batch already casts)
    rois_per_image = int(config.TRAIN.BATCH_ROIS / config.TRAIN.BATCH_IMAGES)
    fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION *
                                 rois_per_image).astype(int)

    # NOTE(review): these per-stride accumulators are initialized but never
    # appended to below — preserved as-is from the original
    rois_on_imgs = dict()
    labels_on_imgs = dict()
    bbox_targets_on_imgs = dict()
    bbox_weights_on_imgs = dict()
    mask_targets_on_imgs = dict()
    mask_weights_on_imgs = dict()
    for s in config.RCNN_FEAT_STRIDE:
        rois_on_imgs.update({'stride%s' % s: list()})
        labels_on_imgs.update({'stride%s' % s: list()})
        bbox_targets_on_imgs.update({'stride%s' % s: list()})
        bbox_weights_on_imgs.update({'stride%s' % s: list()})
        mask_targets_on_imgs.update({'stride%s' % s: list()})
        mask_weights_on_imgs.update({'stride%s' % s: list()})

    # Sample rois
    level_related_data_on_imgs = {}
    for im_i in range(num_images):
        roi_rec = roidb[im_i]
        # infer num_classes from gt_overlaps
        num_classes = roi_rec['gt_overlaps'].shape[1]
        # label = class RoI has max overlap with
        rois = roi_rec['boxes']
        labels = roi_rec['max_classes']
        overlaps = roi_rec['max_overlaps']
        bbox_targets = roi_rec['bbox_targets']
        im_info = roi_rec['im_info']

        mask_targets = roi_rec['mask_targets']
        mask_labels = roi_rec['mask_labels']
        mask_inds = roi_rec['mask_inds']

        assign_levels = roi_rec['assign_levels']

        im_rois_on_levels, labels_on_levels, bbox_targets_on_levels, bbox_weights_on_levels, mask_targets_on_levels, mask_weights_on_levels = \
            sample_rois_fpn(rois, assign_levels, fg_rois_per_image, rois_per_image, num_classes,
                            labels, overlaps, bbox_targets, mask_targets=mask_targets, mask_labels=mask_labels, mask_inds=mask_inds, im_info=im_info)

        level_related_data_on_imgs.update({
            'img_%s' % im_i: {
                'rois_on_levels': im_rois_on_levels,
                'labels_on_levels': labels_on_levels,
                'bbox_targets_on_levels': bbox_targets_on_levels,
                'bbox_weights_on_levels': bbox_weights_on_levels,
                'mask_targets_on_levels': mask_targets_on_levels,
                'mask_weights_on_levels': mask_weights_on_levels,
            }
        })

    return im_array, level_related_data_on_imgs


def sample_rois(rois,
                fg_rois_per_image,
                rois_per_image,
                num_classes,
                labels=None,
                overlaps=None,
                bbox_targets=None,
                gt_boxes=None,
                mask_targets=None,
                mask_labels=None,
                mask_inds=None):
    """
    generate random sample of ROIs comprising foreground and background examples
    :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index
    :param fg_rois_per_image: foreground roi number
    :param rois_per_image: total roi number
    :param num_classes: number of classes
    :param labels: maybe precomputed
    :param overlaps: maybe precomputed (max_overlaps)
    :param bbox_targets: maybe precomputed
    :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls)
    :param mask_targets, mask_labels, mask_inds: optional mask supervision
    :return: (rois, labels, bbox_targets, bbox_weights[, mask_targets, mask_weights])
    """
    if labels is None:
        if len(gt_boxes) == 0:
            # no ground truth: everything becomes background
            gt_boxes = np.zeros((1, 5))
            gt_assignment = np.zeros((len(rois), ), dtype=np.int32)
            overlaps = np.zeros((len(rois), ))
            labels = np.zeros((len(rois), ))
        else:
            # np.float was removed in NumPy 1.24; builtin float is float64
            overlaps = bbox_overlaps(rois[:, 1:].astype(float),
                                     gt_boxes[:, :4].astype(float))
            gt_assignment = overlaps.argmax(axis=1)
            overlaps = overlaps.max(axis=1)
            labels = gt_boxes[gt_assignment, 4]

    num_rois = rois.shape[0]
    # foreground RoI with FG_THRESH overlap
    fg_indexes = np.where(overlaps >= config.TRAIN.FG_THRESH)[0]
    # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_indexes.size)
    # Sample foreground regions without replacement
    if len(fg_indexes) > fg_rois_per_this_image:
        fg_indexes = npr.choice(fg_indexes,
                                size=fg_rois_per_this_image,
                                replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI)
                          & (overlaps >= config.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
                                        bg_indexes.size)
    # Sample background regions without replacement
    if len(bg_indexes) > bg_rois_per_this_image:
        bg_indexes = npr.choice(bg_indexes,
                                size=bg_rois_per_this_image,
                                replace=False)

    # indexes selected
    keep_indexes = np.append(fg_indexes, bg_indexes)

    neg_idx = np.where(overlaps < config.TRAIN.FG_THRESH)[0]
    neg_rois = rois[neg_idx]

    # pad more to ensure a fixed minibatch size
    while keep_indexes.shape[0] < rois_per_image:
        if len(neg_rois) == 0:
            # nothing to pad with; break rather than loop forever
            break
        gap = np.minimum(len(neg_rois), rois_per_image - keep_indexes.shape[0])
        gap_indexes = npr.choice(range(len(neg_rois)), size=gap, replace=False)
        keep_indexes = np.append(keep_indexes, neg_idx[gap_indexes])

    # select labels
    labels = labels[keep_indexes]
    # set labels of bg_rois to be 0
    labels[fg_rois_per_this_image:] = 0
    rois = rois[keep_indexes]
    if mask_targets is not None:
        assert mask_labels is not None
        assert mask_inds is not None

        def _mask_umap(mask_targets, mask_labels, mask_inds):
            # scatter the sparse mask supervision into dense
            # [num_rois, num_classes, 28, 28] tensors
            _mask_targets = np.zeros((num_rois, num_classes, 28, 28),
                                     dtype=np.int8)
            _mask_weights = np.zeros((num_rois, num_classes, 28, 28),
                                     dtype=np.int8)
            _mask_targets[mask_inds, mask_labels] = mask_targets
            _mask_weights[mask_inds, mask_labels] = 1
            _mask_weights[:, 0] = 0  # set background mask weight to zeros
            return _mask_targets, _mask_weights

        mask_targets, mask_weights = _mask_umap(mask_targets, mask_labels,
                                                mask_inds)
        mask_targets = mask_targets[keep_indexes]
        mask_weights = mask_weights[keep_indexes]

    # load or compute bbox_target
    if bbox_targets is not None:
        bbox_target_data = bbox_targets[keep_indexes, :]
    else:
        targets = bbox_transform(rois[:, 1:],
                                 gt_boxes[gt_assignment[keep_indexes], :4])
        if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED:
            targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) /
                       np.array(config.TRAIN.BBOX_STDS))
        bbox_target_data = np.hstack((labels[:, np.newaxis], targets))

    bbox_targets, bbox_weights = \
        expand_bbox_regression_targets(bbox_target_data, num_classes)

    if mask_targets is not None:
        return rois, labels, bbox_targets, bbox_weights, mask_targets, mask_weights
    else:
        return rois, labels, bbox_targets, bbox_weights


def sample_rois_fpn(rois,
                    assign_levels,
                    fg_rois_per_image,
                    rois_per_image,
                    num_classes,
                    labels=None,
                    overlaps=None,
                    bbox_targets=None,
                    mask_targets=None,
                    mask_labels=None,
                    mask_inds=None,
                    gt_boxes=None,
                    im_info=None):
    """
    generate random sample of ROIs comprising foreground and background examples,
    then bucket the sampled rois by their assigned FPN level
    :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index
    :param assign_levels: [n] feature stride assigned to each roi
    :param fg_rois_per_image: foreground roi number
    :param rois_per_image: total roi number
    :param num_classes: number of classes
    :param labels: maybe precomputed
    :param overlaps: maybe precomputed (max_overlaps)
    :param bbox_targets: maybe precomputed
    :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls)
    :return: per-level dicts (rois, labels, bbox_targets, bbox_weights[, masks])
    """
    DEBUG = False
    if labels is None:
        if len(gt_boxes) == 0:
            # no ground truth: everything becomes background
            gt_boxes = np.zeros((1, 5))
            gt_assignment = np.zeros((len(rois), ), dtype=np.int32)
            overlaps = np.zeros((len(rois), ))
            labels = np.zeros((len(rois), ))
        else:
            # np.float was removed in NumPy 1.24; builtin float is float64
            overlaps = bbox_overlaps(rois[:, 1:].astype(float),
                                     gt_boxes[:, :4].astype(float))
            gt_assignment = overlaps.argmax(axis=1)
            overlaps = overlaps.max(axis=1)
            labels = gt_boxes[gt_assignment, 4]

    num_rois = rois.shape[0]
    # foreground RoI with FG_THRESH overlap
    fg_indexes = np.where(overlaps >= config.TRAIN.FG_THRESH)[0]
    # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_indexes.size)

    if DEBUG:
        # Python 2 'print x' statements were a SyntaxError under Python 3;
        # converted to print() calls
        print('fg total num:', len(fg_indexes))

    # Sample foreground regions without replacement
    if len(fg_indexes) > fg_rois_per_this_image:
        fg_indexes = npr.choice(fg_indexes,
                                size=fg_rois_per_this_image,
                                replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI)
                          & (overlaps >= config.TRAIN.BG_THRESH_LO))[0]
    if DEBUG:
        print('bg total num:', len(bg_indexes))
    # Compute number of background RoIs to take from this image (guarding against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
                                        bg_indexes.size)
    # Sample background regions without replacement
    if len(bg_indexes) > bg_rois_per_this_image:
        bg_indexes = npr.choice(bg_indexes,
                                size=bg_rois_per_this_image,
                                replace=False)
    if DEBUG:
        print('fg num:', len(fg_indexes))
        print('bg num:', len(bg_indexes))

    # bg rois statistics
    if DEBUG:
        bg_assign = assign_levels[bg_indexes]
        bg_rois_on_levels = dict()
        for i, s in enumerate(config.RCNN_FEAT_STRIDE):
            bg_rois_on_levels.update(
                {'stride%s' % s: len(np.where(bg_assign == s)[0])})
        print(bg_rois_on_levels)

    # indexes selected
    keep_indexes = np.append(fg_indexes, bg_indexes)

    neg_idx = np.where(overlaps < config.TRAIN.FG_THRESH)[0]
    neg_rois = rois[neg_idx]

    # pad more to ensure a fixed minibatch size
    while keep_indexes.shape[0] < rois_per_image:
        if len(neg_rois) == 0:
            # nothing to pad with; break rather than loop forever
            break
        gap = np.minimum(len(neg_rois), rois_per_image - keep_indexes.shape[0])
        gap_indexes = npr.choice(range(len(neg_rois)), size=gap, replace=False)
        keep_indexes = np.append(keep_indexes, neg_idx[gap_indexes])

    # select labels
    labels = labels[keep_indexes]
    # set labels of bg_rois to be 0
    labels[fg_rois_per_this_image:] = 0
    rois = rois[keep_indexes]
    assign_levels = assign_levels[keep_indexes]

    if mask_targets is not None:
        assert mask_labels is not None
        assert mask_inds is not None

        def _mask_umap(mask_targets, mask_labels, mask_inds):
            # scatter the sparse mask supervision into dense tensors
            _mask_targets = np.zeros((num_rois, num_classes, 28, 28),
                                     dtype=np.int8)
            _mask_weights = np.zeros((num_rois, num_classes, 1, 1),
                                     dtype=np.int8)
            _mask_targets[mask_inds, mask_labels] = mask_targets
            _mask_weights[mask_inds, mask_labels] = 1
            return _mask_targets, _mask_weights  # [num_rois, num_classes, 28, 28]

        mask_targets, mask_weights = _mask_umap(mask_targets, mask_labels,
                                                mask_inds)
        mask_targets = mask_targets[keep_indexes]
        mask_weights = mask_weights[keep_indexes]

    # load or compute bbox_target
    if bbox_targets is not None:
        bbox_target_data = bbox_targets[keep_indexes, :]
    else:
        targets = bbox_transform(rois[:, 1:],
                                 gt_boxes[gt_assignment[keep_indexes], :4])
        if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED:
            targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) /
                       np.array(config.TRAIN.BBOX_STDS))
        bbox_target_data = np.hstack((labels[:, np.newaxis], targets))

    bbox_targets, bbox_weights = \
        expand_bbox_regression_targets(bbox_target_data, num_classes)

    # Assign to levels
    rois_on_levels = dict()
    labels_on_levels = dict()
    bbox_targets_on_levels = dict()
    bbox_weights_on_levels = dict()
    if mask_targets is not None:
        mask_targets_on_levels = dict()
        mask_weights_on_levels = dict()
    for i, s in enumerate(config.RCNN_FEAT_STRIDE):
        index = np.where(assign_levels == s)
        _rois = rois[index]
        _labels = labels[index]
        _bbox_targets = bbox_targets[index]
        _bbox_weights = bbox_weights[index]
        if mask_targets is not None:
            _mask_targets = mask_targets[index]
            _mask_weights = mask_weights[index]

        rois_on_levels.update({'stride%s' % s: _rois})
        labels_on_levels.update({'stride%s' % s: _labels})
        bbox_targets_on_levels.update({'stride%s' % s: _bbox_targets})
        bbox_weights_on_levels.update({'stride%s' % s: _bbox_weights})
        if mask_targets is not None:
            mask_targets_on_levels.update({'stride%s' % s: _mask_targets})
            mask_weights_on_levels.update({'stride%s' % s: _mask_weights})

    if mask_targets is not None:
        return rois_on_levels, labels_on_levels, bbox_targets_on_levels, bbox_weights_on_levels, mask_targets_on_levels, mask_weights_on_levels
    else:
        return rois_on_levels, labels_on_levels, bbox_targets_on_levels, bbox_weights_on_levels


def get_rois(rois,
             rois_per_image,
             num_classes,
             labels=None,
             overlaps=None,
             bbox_targets=None,
             gt_boxes=None):
    """
    get top N ROIs, used in online hard example mining
    :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index
    :param rois_per_image: total roi number
    :param num_classes: number of classes
    :param labels: maybe precomputed
    :param overlaps: maybe precomputed (max_overlaps)
    :param bbox_targets: maybe precomputed
    :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls)
    :return: (rois, labels, bbox_targets, bbox_weights)
    """
    if labels is None:
        if len(gt_boxes) == 0:
            # dummy gt so bbox_overlaps stays well-defined
            gt_boxes = np.array([[1, 1, 1, 1, 0]])
        # np.float was removed in NumPy 1.24; builtin float is float64
        overlaps = bbox_overlaps(rois[:, 1:].astype(float),
                                 gt_boxes[:, :4].astype(float))
        gt_assignment = overlaps.argmax(axis=1)
        overlaps = overlaps.max(axis=1)
        labels = gt_boxes[gt_assignment, 4]

    # select indices
    keep_indexes = np.arange(rois.shape[0])
    if keep_indexes.shape[0] > rois_per_image:
        keep_indexes = npr.choice(keep_indexes,
                                  size=rois_per_image,
                                  replace=False)

    # if not enough, pad until rois_per_image is satisfied
    while keep_indexes.shape[0] < rois_per_image:
        if len(rois) == 0:
            # nothing to pad with; break rather than loop forever
            break
        gap = np.minimum(rois_per_image - keep_indexes.shape[0], len(rois))
        gap_indexes = npr.choice(range(len(rois)), size=gap, replace=False)
        keep_indexes = np.append(keep_indexes, gap_indexes)

    # suppress any bg defined by overlap
    bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI)
                          & (overlaps >= config.TRAIN.BG_THRESH_LO))[0]
    labels[bg_indexes] = 0

    labels = labels[keep_indexes]
    rois = rois[keep_indexes]

    # load or compute bbox_target
    if bbox_targets is not None:
        bbox_target_data = bbox_targets[keep_indexes, :]
    else:
        targets = bbox_transform(rois[:, 1:],
                                 gt_boxes[gt_assignment[keep_indexes], :4])
        if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED:
            targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) /
                       np.array(config.TRAIN.BBOX_STDS))
        bbox_target_data = np.hstack((labels[:, np.newaxis], targets))

    bbox_targets, bbox_weights = \
        expand_bbox_regression_targets(bbox_target_data, num_classes)

    return rois, labels, bbox_targets, bbox_weights
def get_rpn_testbatch(roidb):
    """
    return a dict of testbatch
    :param roidb: ['image', 'flipped']
    :return: data, label, im_info
    """
    assert len(roidb) == 1, 'Single batch only'
    imgs, roidb = get_image(roidb)
    im_array = imgs[0]
    im_info = np.array([roidb[0]['im_info']], dtype=np.float32)

    data = {'data': im_array, 'im_info': im_info}
    label = {}

    return data, label, im_info


def get_rpn_batch(roidb):
    """
    prototype for rpn batch: data, im_info, gt_boxes
    :param roidb: ['image', 'flipped'] + ['gt_boxes', 'boxes', 'gt_classes']
    :return: data, label
    """
    assert len(roidb) == 1, 'Single batch only'
    imgs, roidb = get_image(roidb)
    im_array = imgs[0]
    im_info = np.array([roidb[0]['im_info']], dtype=np.float32)

    # gt boxes: (x1, y1, x2, y2, cls)
    if roidb[0]['gt_classes'].size > 0:
        gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
        # size by the number of kept (non-background) instances; sizing by the
        # full boxes array breaks with a shape mismatch whenever some
        # gt_classes entries are 0
        gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
        gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :]
        gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
    else:
        gt_boxes = np.empty((0, 5), dtype=np.float32)

    data = {'data': im_array, 'im_info': im_info}
    label = {'gt_boxes': gt_boxes}

    return data, label


def get_crop_batch(roidb):
    """
    prototype for rpn batch: data, im_info, gt_boxes (cropped images)
    :param roidb: ['image', 'flipped'] + ['gt_boxes', 'boxes', 'gt_classes']
    :return: (list of data dicts, list of label dicts), one pair per crop
    """
    data_list = []
    label_list = []
    imgs, roidb = get_crop_image(roidb)
    assert len(imgs) == len(roidb)
    for i in range(len(imgs)):
        im_array = imgs[i]
        im_info = np.array([roidb[i]['im_info']], dtype=np.float32)

        # gt boxes: (x1, y1, x2, y2, cls)
        if roidb[i]['gt_classes'].size > 0:
            gt_inds = np.where(roidb[i]['gt_classes'] != 0)[0]
            # size by kept instances (see get_rpn_batch): the old
            # boxes.shape[0] sizing breaks when some gt_classes are 0
            gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
            gt_boxes[:, 0:4] = roidb[i]['boxes'][gt_inds, :]
            gt_boxes[:, 4] = roidb[i]['gt_classes'][gt_inds]
            if config.USE_BLUR:
                # NOTE(review): unlike boxes/landmarks, blur is not filtered
                # by gt_inds here — preserved from the original; verify
                # against how 'blur' is aligned upstream
                gt_blur = roidb[i]['blur']
            if config.FACE_LANDMARK:
                gt_landmarks = roidb[i]['landmarks'][gt_inds, :, :]
            if config.HEAD_BOX:
                gt_boxes_head = np.empty((len(gt_inds), 5), dtype=np.float32)
                gt_boxes_head[:, 0:4] = roidb[i]['boxes_head'][gt_inds, :]
                gt_boxes_head[:, 4] = roidb[i]['gt_classes'][gt_inds]
        else:
            gt_boxes = np.empty((0, 5), dtype=np.float32)
            if config.USE_BLUR:
                gt_blur = np.empty((0, ), dtype=np.float32)
            if config.FACE_LANDMARK:
                gt_landmarks = np.empty((0, 5, 3), dtype=np.float32)
            if config.HEAD_BOX:
                gt_boxes_head = np.empty((0, 5), dtype=np.float32)

        data = {'data': im_array, 'im_info': im_info}
        label = {'gt_boxes': gt_boxes}
        if config.USE_BLUR:
            label['gt_blur'] = gt_blur
        if config.FACE_LANDMARK:
            label['gt_landmarks'] = gt_landmarks
        if config.HEAD_BOX:
            label['gt_boxes_head'] = gt_boxes_head
        data_list.append(data)
        label_list.append(label)

    return data_list, label_list
:param feat_shape: infer output shape + :param gt_boxes: assign ground truth + :param im_info: filter out anchors overlapped with edges + :return: tuple + labels: of shape (batch_size, 1) <- (batch_size, num_anchors, feat_height, feat_width) + bbox_targets: of shape (batch_size, num_anchors * 4, feat_height, feat_width) + bbox_weights: mark the assigned anchors + """ + def _unmap(data, count, inds, fill=0): + """" unmap a subset inds of data into original data of size count """ + if len(data.shape) == 1: + ret = np.empty((count, ), dtype=np.float32) + ret.fill(fill) + ret[inds] = data + else: + ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) + ret.fill(fill) + ret[inds, :] = data + return ret + + global STAT + DEBUG = False + + im_info = im_info[0] + gt_boxes = gt_label['gt_boxes'] + # clean up boxes + nonneg = np.where(gt_boxes[:, 4] != -1)[0] + gt_boxes = gt_boxes[nonneg] + if config.USE_BLUR: + gt_blur = gt_label['gt_blur'] + gt_blur = gt_blur[nonneg] + if landmark: + gt_landmarks = gt_label['gt_landmarks'] + gt_landmarks = gt_landmarks[nonneg] + assert gt_boxes.shape[0] == gt_landmarks.shape[0] + #scales = np.array(scales, dtype=np.float32) + feat_strides = config.RPN_FEAT_STRIDE + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + gt_boxes[:, 4] = gt_blur + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + + anchors_list = [] + anchors_num_list = [] + inds_inside_list = [] + feat_infos = [] + A_list = [] + for i in range(len(feat_strides)): + stride = feat_strides[i] + sstride = str(stride) + base_size = config.RPN_ANCHOR_CFG[sstride]['BASE_SIZE'] + allowed_border = config.RPN_ANCHOR_CFG[sstride]['ALLOWED_BORDER'] + ratios = config.RPN_ANCHOR_CFG[sstride]['RATIOS'] + scales = config.RPN_ANCHOR_CFG[sstride]['SCALES'] + base_anchors = generate_anchors(base_size=base_size, + ratios=list(ratios), + scales=np.array(scales, + dtype=np.float32), + stride=stride, + dense_anchor=config.DENSE_ANCHOR) + num_anchors = 
base_anchors.shape[0] + feat_height, feat_width = feat_shape[i][-2:] + feat_stride = feat_strides[i] + feat_infos.append([feat_height, feat_width]) + + A = num_anchors + A_list.append(A) + K = feat_height * feat_width + + all_anchors = anchors_plane(feat_height, feat_width, feat_stride, + base_anchors) + all_anchors = all_anchors.reshape((K * A, 4)) + #print('anchor0', stride, all_anchors[0]) + + total_anchors = int(K * A) + anchors_num_list.append(total_anchors) + # only keep anchors inside the image + inds_inside = np.where( + (all_anchors[:, 0] >= -allowed_border) + & (all_anchors[:, 1] >= -allowed_border) + & (all_anchors[:, 2] < im_info[1] + allowed_border) + & (all_anchors[:, 3] < im_info[0] + allowed_border))[0] + if DEBUG: + print('total_anchors', total_anchors) + print('inds_inside', len(inds_inside)) + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + #print('AA', anchors.shape, len(inds_inside)) + + anchors_list.append(anchors) + inds_inside_list.append(inds_inside) + + # Concat anchors from each level + anchors = np.concatenate(anchors_list) + for i in range(1, len(inds_inside_list)): + inds_inside_list[i] = inds_inside_list[i] + sum(anchors_num_list[:i]) + inds_inside = np.concatenate(inds_inside_list) + total_anchors = sum(anchors_num_list) + #print('total_anchors', anchors.shape[0], len(inds_inside), file=sys.stderr) + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((len(inds_inside), ), dtype=np.float32) + labels.fill(-1) + #print('BB', anchors.shape, len(inds_inside)) + #print('gt_boxes', gt_boxes.shape, file=sys.stderr) + + if gt_boxes.size > 0: + # overlap between the anchors and the gt boxes + # overlaps (ex, gt) + overlaps = bbox_overlaps(anchors.astype(np.float), + gt_boxes.astype(np.float)) + argmax_overlaps = overlaps.argmax(axis=1) + #print('AAA', argmax_overlaps.shape) + max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + 
gt_max_overlaps = overlaps[gt_argmax_overlaps, + np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + if not config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + + # fg label: for each gt, anchor with highest overlap + if config.TRAIN.RPN_FORCE_POSITIVE: + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IoU + labels[max_overlaps >= config.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + + if config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels last so that negative labels can clobber positives + labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + else: + labels[:] = 0 + fg_inds = np.where(labels == 1)[0] + #print('fg count', len(fg_inds)) + + # subsample positive labels if we have too many + if config.TRAIN.RPN_ENABLE_OHEM == 0: + fg_inds = np.where(labels == 1)[0] + num_fg = int(config.TRAIN.RPN_FG_FRACTION * + config.TRAIN.RPN_BATCH_SIZE) + if len(fg_inds) > num_fg: + disable_inds = npr.choice(fg_inds, + size=(len(fg_inds) - num_fg), + replace=False) + if DEBUG: + disable_inds = fg_inds[:(len(fg_inds) - num_fg)] + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = config.TRAIN.RPN_BATCH_SIZE - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice(bg_inds, + size=(len(bg_inds) - num_bg), + replace=False) + if DEBUG: + disable_inds = bg_inds[:(len(bg_inds) - num_bg)] + labels[disable_inds] = -1 + + #fg_inds = np.where(labels == 1)[0] + #num_fg = len(fg_inds) + #num_bg = num_fg*int(1.0/config.TRAIN.RPN_FG_FRACTION-1) + + #bg_inds = np.where(labels == 0)[0] + #if len(bg_inds) > num_bg: + # disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False) + # if DEBUG: + # disable_inds = bg_inds[:(len(bg_inds) - num_bg)] + # labels[disable_inds] = -1 + else: + fg_inds = np.where(labels 
== 1)[0] + num_fg = len(fg_inds) + bg_inds = np.where(labels == 0)[0] + num_bg = len(bg_inds) + + #print('anchor stat', num_fg, num_bg) + + bbox_targets = np.zeros((len(inds_inside), bbox_pred_len), + dtype=np.float32) + if gt_boxes.size > 0: + #print('GT', gt_boxes.shape, gt_boxes[argmax_overlaps, :4].shape) + bbox_targets[:, :] = bbox_transform(anchors, + gt_boxes[argmax_overlaps, :]) + #bbox_targets[:,4] = gt_blur + + bbox_weights = np.zeros((len(inds_inside), bbox_pred_len), + dtype=np.float32) + #bbox_weights[labels == 1, :] = np.array(config.TRAIN.RPN_BBOX_WEIGHTS) + bbox_weights[labels == 1, 0:4] = 1.0 + if bbox_pred_len > 4: + bbox_weights[labels == 1, 4:bbox_pred_len] = 0.1 + + if landmark: + landmark_targets = np.zeros((len(inds_inside), landmark_pred_len), + dtype=np.float32) + #landmark_weights = np.zeros((len(inds_inside), 10), dtype=np.float32) + landmark_weights = np.zeros((len(inds_inside), landmark_pred_len), + dtype=np.float32) + #landmark_weights[labels == 1, :] = np.array(config.TRAIN.RPN_LANDMARK_WEIGHTS) + if landmark_pred_len == 10: + landmark_weights[labels == 1, :] = 1.0 + elif landmark_pred_len == 15: + v = [1.0, 1.0, 0.1] * 5 + assert len(v) == 15 + landmark_weights[labels == 1, :] = np.array(v) + else: + assert False + #TODO here + if gt_landmarks.size > 0: + #print('AAA',argmax_overlaps) + a_landmarks = gt_landmarks[argmax_overlaps, :, :] + landmark_targets[:] = landmark_transform(anchors, a_landmarks) + invalid = np.where(a_landmarks[:, 0, 2] < 0.0)[0] + #assert len(invalid)==0 + #landmark_weights[invalid, :] = np.array(config.TRAIN.RPN_INVALID_LANDMARK_WEIGHTS) + landmark_weights[invalid, :] = 0.0 + + #if DEBUG: + # _sums = bbox_targets[labels == 1, :].sum(axis=0) + # _squared_sums = (bbox_targets[labels == 1, :] ** 2).sum(axis=0) + # _counts = np.sum(labels == 1) + # means = _sums / (_counts + 1e-14) + # stds = np.sqrt(_squared_sums / _counts - means ** 2) + # print 'means', means + # print 'stdevs', stds + # map up to original set 
of anchors + #print(labels.shape, total_anchors, inds_inside.shape, inds_inside[0], inds_inside[-1]) + labels = _unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) + bbox_weights = _unmap(bbox_weights, total_anchors, inds_inside, fill=0) + if landmark: + landmark_targets = _unmap(landmark_targets, + total_anchors, + inds_inside, + fill=0) + landmark_weights = _unmap(landmark_weights, + total_anchors, + inds_inside, + fill=0) + #print('CC', anchors.shape, len(inds_inside)) + + #if DEBUG: + # if gt_boxes.size > 0: + # print 'rpn: max max_overlaps', np.max(max_overlaps) + # print 'rpn: num_positives', np.sum(labels == 1) + # print 'rpn: num_negatives', np.sum(labels == 0) + # _fg_sum = np.sum(labels == 1) + # _bg_sum = np.sum(labels == 0) + # _count = 1 + # print 'rpn: num_positive avg', _fg_sum / _count + # print 'rpn: num_negative avg', _bg_sum / _count + + # resahpe + label_list = list() + bbox_target_list = list() + bbox_weight_list = list() + if landmark: + landmark_target_list = list() + landmark_weight_list = list() + anchors_num_range = [0] + anchors_num_list + label = {} + for i in range(len(feat_strides)): + stride = feat_strides[i] + feat_height, feat_width = feat_infos[i] + A = A_list[i] + _label = labels[sum(anchors_num_range[:i + + 1]):sum(anchors_num_range[:i + + 1]) + + anchors_num_range[i + 1]] + if select_stride > 0 and stride != select_stride: + #print('set', stride, select_stride) + _label[:] = -1 + #print('_label', _label.shape, select_stride) + #_fg_inds = np.where(_label == 1)[0] + #n_fg = len(_fg_inds) + #STAT[0]+=1 + #STAT[stride]+=n_fg + #if STAT[0]%100==0: + # print('rpn_stat', STAT, file=sys.stderr) + bbox_target = bbox_targets[sum(anchors_num_range[:i + 1] + ):sum(anchors_num_range[:i + 1]) + + anchors_num_range[i + 1]] + bbox_weight = bbox_weights[sum(anchors_num_range[:i + 1] + ):sum(anchors_num_range[:i + 1]) + + anchors_num_range[i + 1]] + if landmark: + 
landmark_target = landmark_targets[ + sum(anchors_num_range[:i + 1]):sum(anchors_num_range[:i + 1]) + + anchors_num_range[i + 1]] + landmark_weight = landmark_weights[ + sum(anchors_num_range[:i + 1]):sum(anchors_num_range[:i + 1]) + + anchors_num_range[i + 1]] + + _label = _label.reshape( + (1, feat_height, feat_width, A)).transpose(0, 3, 1, 2) + _label = _label.reshape((1, A * feat_height * feat_width)) + bbox_target = bbox_target.reshape( + (1, feat_height * feat_width, + A * bbox_pred_len)).transpose(0, 2, 1) + bbox_weight = bbox_weight.reshape( + (1, feat_height * feat_width, A * bbox_pred_len)).transpose( + (0, 2, 1)) + label['%s_label_stride%d' % (prefix, stride)] = _label + label['%s_bbox_target_stride%d' % (prefix, stride)] = bbox_target + label['%s_bbox_weight_stride%d' % (prefix, stride)] = bbox_weight + if landmark: + landmark_target = landmark_target.reshape( + (1, feat_height * feat_width, + A * landmark_pred_len)).transpose(0, 2, 1) + landmark_weight = landmark_weight.reshape( + (1, feat_height * feat_width, + A * landmark_pred_len)).transpose((0, 2, 1)) + label['%s_landmark_target_stride%d' % + (prefix, stride)] = landmark_target + label['%s_landmark_weight_stride%d' % + (prefix, stride)] = landmark_weight + #print('in_rpn', stride,_label.shape, bbox_target.shape, bbox_weight.shape, file=sys.stderr) + label_list.append(_label) + #print('DD', _label.shape) + bbox_target_list.append(bbox_target) + bbox_weight_list.append(bbox_weight) + if landmark: + landmark_target_list.append(landmark_target) + landmark_weight_list.append(landmark_weight) + + label_concat = np.concatenate(label_list, axis=1) + bbox_target_concat = np.concatenate(bbox_target_list, axis=2) + bbox_weight_concat = np.concatenate(bbox_weight_list, axis=2) + #fg_inds = np.where(label_concat[0] == 1)[0] + #print('fg_inds_in_rpn2', fg_inds, file=sys.stderr) + + label.update({ + '%s_label' % prefix: label_concat, + '%s_bbox_target' % prefix: bbox_target_concat, + '%s_bbox_weight' % prefix: 
bbox_weight_concat + }) + if landmark: + landmark_target_concat = np.concatenate(landmark_target_list, axis=2) + landmark_weight_concat = np.concatenate(landmark_weight_list, axis=2) + label['%s_landmark_target' % prefix] = landmark_target_concat + label['%s_landmark_weight' % prefix] = landmark_weight_concat + return label + + +class AA: + def __init__(self, feat_shape): + self.feat_shape = feat_shape + feat_strides = config.RPN_FEAT_STRIDE + anchors_list = [] + anchors_num_list = [] + inds_inside_list = [] + feat_infos = [] + A_list = [] + DEBUG = False + for i in range(len(feat_strides)): + stride = feat_strides[i] + sstride = str(stride) + base_size = config.RPN_ANCHOR_CFG[sstride]['BASE_SIZE'] + allowed_border = config.RPN_ANCHOR_CFG[sstride]['ALLOWED_BORDER'] + ratios = config.RPN_ANCHOR_CFG[sstride]['RATIOS'] + scales = config.RPN_ANCHOR_CFG[sstride]['SCALES'] + base_anchors = generate_anchors(base_size=base_size, + ratios=list(ratios), + scales=np.array(scales, + dtype=np.float32), + stride=stride, + dense_anchor=config.DENSE_ANCHOR) + num_anchors = base_anchors.shape[0] + feat_height, feat_width = feat_shape[i][-2:] + feat_stride = feat_strides[i] + feat_infos.append([feat_height, feat_width]) + + A = num_anchors + A_list.append(A) + K = feat_height * feat_width + + all_anchors = anchors_plane(feat_height, feat_width, feat_stride, + base_anchors) + all_anchors = all_anchors.reshape((K * A, 4)) + #print('anchor0', stride, all_anchors[0]) + + total_anchors = int(K * A) + anchors_num_list.append(total_anchors) + # only keep anchors inside the image + inds_inside = np.where( + (all_anchors[:, 0] >= -allowed_border) + & (all_anchors[:, 1] >= -allowed_border) + & (all_anchors[:, 2] < config.SCALES[0][1] + allowed_border) & + (all_anchors[:, 3] < config.SCALES[0][1] + allowed_border))[0] + if DEBUG: + print('total_anchors', total_anchors) + print('inds_inside', len(inds_inside)) + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + #print('AA', 
anchors.shape, len(inds_inside)) + + anchors_list.append(anchors) + inds_inside_list.append(inds_inside) + anchors = np.concatenate(anchors_list) + for i in range(1, len(inds_inside_list)): + inds_inside_list[i] = inds_inside_list[i] + sum( + anchors_num_list[:i]) + inds_inside = np.concatenate(inds_inside_list) + #self.anchors_list = anchors_list + #self.inds_inside_list = inds_inside_list + self.anchors = anchors + self.inds_inside = inds_inside + self.anchors_num_list = anchors_num_list + self.feat_infos = feat_infos + self.A_list = A_list + self._times = [0.0, 0.0, 0.0, 0.0] + + @staticmethod + def _unmap(data, count, inds, fill=0): + """" unmap a subset inds of data into original data of size count """ + if len(data.shape) == 1: + ret = np.empty((count, ), dtype=np.float32) + ret.fill(fill) + ret[inds] = data + else: + ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) + ret.fill(fill) + ret[inds, :] = data + return ret + + def assign_anchor_fpn(self, + gt_label, + im_info, + landmark=False, + prefix='face', + select_stride=0): + + #ta = datetime.datetime.now() + im_info = im_info[0] + gt_boxes = gt_label['gt_boxes'] + # clean up boxes + nonneg = np.where(gt_boxes[:, 4] != -1)[0] + gt_boxes = gt_boxes[nonneg] + if config.USE_BLUR: + gt_blur = gt_label['gt_blur'] + gt_blur = gt_blur[nonneg] + if landmark: + gt_landmarks = gt_label['gt_landmarks'] + gt_landmarks = gt_landmarks[nonneg] + assert gt_boxes.shape[0] == gt_landmarks.shape[0] + #scales = np.array(scales, dtype=np.float32) + feat_strides = config.RPN_FEAT_STRIDE + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + gt_boxes[:, 4] = gt_blur + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + + #anchors_list = self.anchors_list + #inds_inside_list = self.inds_inside_list + anchors = self.anchors + inds_inside = self.inds_inside + anchors_num_list = self.anchors_num_list + feat_infos = self.feat_infos + A_list = self.A_list + + total_anchors = 
sum(anchors_num_list) + #print('total_anchors', anchors.shape[0], len(inds_inside), file=sys.stderr) + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((len(inds_inside), ), dtype=np.float32) + labels.fill(-1) + #print('BB', anchors.shape, len(inds_inside)) + #print('gt_boxes', gt_boxes.shape, file=sys.stderr) + #tb = datetime.datetime.now() + #self._times[0] += (tb-ta).total_seconds() + #ta = datetime.datetime.now() + + if gt_boxes.size > 0: + # overlap between the anchors and the gt boxes + # overlaps (ex, gt) + overlaps = bbox_overlaps(anchors.astype(np.float), + gt_boxes.astype(np.float)) + argmax_overlaps = overlaps.argmax(axis=1) + #print('AAA', argmax_overlaps.shape) + max_overlaps = overlaps[np.arange(len(inds_inside)), + argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, + np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + if not config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + + # fg label: for each gt, anchor with highest overlap + if config.TRAIN.RPN_FORCE_POSITIVE: + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IoU + labels[max_overlaps >= config.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + + if config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels last so that negative labels can clobber positives + labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + else: + labels[:] = 0 + fg_inds = np.where(labels == 1)[0] + #print('fg count', len(fg_inds)) + + # subsample positive labels if we have too many + if config.TRAIN.RPN_ENABLE_OHEM == 0: + fg_inds = np.where(labels == 1)[0] + num_fg = int(config.TRAIN.RPN_FG_FRACTION * + config.TRAIN.RPN_BATCH_SIZE) + if len(fg_inds) > num_fg: + disable_inds = npr.choice(fg_inds, + size=(len(fg_inds) - num_fg), + replace=False) + if DEBUG: + 
disable_inds = fg_inds[:(len(fg_inds) - num_fg)] + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = config.TRAIN.RPN_BATCH_SIZE - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice(bg_inds, + size=(len(bg_inds) - num_bg), + replace=False) + if DEBUG: + disable_inds = bg_inds[:(len(bg_inds) - num_bg)] + labels[disable_inds] = -1 + + #fg_inds = np.where(labels == 1)[0] + #num_fg = len(fg_inds) + #num_bg = num_fg*int(1.0/config.TRAIN.RPN_FG_FRACTION-1) + + #bg_inds = np.where(labels == 0)[0] + #if len(bg_inds) > num_bg: + # disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False) + # if DEBUG: + # disable_inds = bg_inds[:(len(bg_inds) - num_bg)] + # labels[disable_inds] = -1 + else: + fg_inds = np.where(labels == 1)[0] + num_fg = len(fg_inds) + bg_inds = np.where(labels == 0)[0] + num_bg = len(bg_inds) + + #print('anchor stat', num_fg, num_bg) + + bbox_targets = np.zeros((len(inds_inside), bbox_pred_len), + dtype=np.float32) + if gt_boxes.size > 0: + #print('GT', gt_boxes.shape, gt_boxes[argmax_overlaps, :4].shape) + bbox_targets[:, :] = bbox_transform(anchors, + gt_boxes[argmax_overlaps, :]) + #bbox_targets[:,4] = gt_blur + #tb = datetime.datetime.now() + #self._times[1] += (tb-ta).total_seconds() + #ta = datetime.datetime.now() + + bbox_weights = np.zeros((len(inds_inside), bbox_pred_len), + dtype=np.float32) + #bbox_weights[labels == 1, :] = np.array(config.TRAIN.RPN_BBOX_WEIGHTS) + bbox_weights[labels == 1, 0:4] = 1.0 + if bbox_pred_len > 4: + bbox_weights[labels == 1, 4:bbox_pred_len] = 0.1 + + if landmark: + landmark_targets = np.zeros((len(inds_inside), landmark_pred_len), + dtype=np.float32) + #landmark_weights = np.zeros((len(inds_inside), 10), dtype=np.float32) + landmark_weights = np.zeros((len(inds_inside), landmark_pred_len), + dtype=np.float32) + #landmark_weights[labels == 1, :] = np.array(config.TRAIN.RPN_LANDMARK_WEIGHTS) + if 
landmark_pred_len == 10: + landmark_weights[labels == 1, :] = 1.0 + elif landmark_pred_len == 15: + v = [1.0, 1.0, 0.1] * 5 + assert len(v) == 15 + landmark_weights[labels == 1, :] = np.array(v) + else: + assert False + #TODO here + if gt_landmarks.size > 0: + #print('AAA',argmax_overlaps) + a_landmarks = gt_landmarks[argmax_overlaps, :, :] + landmark_targets[:] = landmark_transform(anchors, a_landmarks) + invalid = np.where(a_landmarks[:, 0, 2] < 0.0)[0] + #assert len(invalid)==0 + #landmark_weights[invalid, :] = np.array(config.TRAIN.RPN_INVALID_LANDMARK_WEIGHTS) + landmark_weights[invalid, :] = 0.0 + #tb = datetime.datetime.now() + #self._times[2] += (tb-ta).total_seconds() + #ta = datetime.datetime.now() + + #if DEBUG: + # _sums = bbox_targets[labels == 1, :].sum(axis=0) + # _squared_sums = (bbox_targets[labels == 1, :] ** 2).sum(axis=0) + # _counts = np.sum(labels == 1) + # means = _sums / (_counts + 1e-14) + # stds = np.sqrt(_squared_sums / _counts - means ** 2) + # print 'means', means + # print 'stdevs', stds + # map up to original set of anchors + #print(labels.shape, total_anchors, inds_inside.shape, inds_inside[0], inds_inside[-1]) + labels = AA._unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = AA._unmap(bbox_targets, + total_anchors, + inds_inside, + fill=0) + bbox_weights = AA._unmap(bbox_weights, + total_anchors, + inds_inside, + fill=0) + if landmark: + landmark_targets = AA._unmap(landmark_targets, + total_anchors, + inds_inside, + fill=0) + landmark_weights = AA._unmap(landmark_weights, + total_anchors, + inds_inside, + fill=0) + #print('CC', anchors.shape, len(inds_inside)) + + bbox_targets[:, + 0::4] = bbox_targets[:, 0::4] / config.TRAIN.BBOX_STDS[0] + bbox_targets[:, + 1::4] = bbox_targets[:, 1::4] / config.TRAIN.BBOX_STDS[1] + bbox_targets[:, + 2::4] = bbox_targets[:, 2::4] / config.TRAIN.BBOX_STDS[2] + bbox_targets[:, + 3::4] = bbox_targets[:, 3::4] / config.TRAIN.BBOX_STDS[3] + landmark_targets /= config.TRAIN.LANDMARK_STD 
+ #print('applied STD') + + #if DEBUG: + # if gt_boxes.size > 0: + # print 'rpn: max max_overlaps', np.max(max_overlaps) + # print 'rpn: num_positives', np.sum(labels == 1) + # print 'rpn: num_negatives', np.sum(labels == 0) + # _fg_sum = np.sum(labels == 1) + # _bg_sum = np.sum(labels == 0) + # _count = 1 + # print 'rpn: num_positive avg', _fg_sum / _count + # print 'rpn: num_negative avg', _bg_sum / _count + + # resahpe + label_list = list() + bbox_target_list = list() + bbox_weight_list = list() + if landmark: + landmark_target_list = list() + landmark_weight_list = list() + anchors_num_range = [0] + anchors_num_list + label = {} + for i in range(len(feat_strides)): + stride = feat_strides[i] + feat_height, feat_width = feat_infos[i] + A = A_list[i] + _label = labels[sum(anchors_num_range[:i + 1] + ):sum(anchors_num_range[:i + 1]) + + anchors_num_range[i + 1]] + if select_stride > 0 and stride != select_stride: + #print('set', stride, select_stride) + _label[:] = -1 + #print('_label', _label.shape, select_stride) + #_fg_inds = np.where(_label == 1)[0] + #n_fg = len(_fg_inds) + #STAT[0]+=1 + #STAT[stride]+=n_fg + #if STAT[0]%100==0: + # print('rpn_stat', STAT, file=sys.stderr) + bbox_target = bbox_targets[sum(anchors_num_range[:i + 1] + ):sum(anchors_num_range[:i + 1]) + + anchors_num_range[i + 1]] + bbox_weight = bbox_weights[sum(anchors_num_range[:i + 1] + ):sum(anchors_num_range[:i + 1]) + + anchors_num_range[i + 1]] + if landmark: + landmark_target = landmark_targets[ + sum(anchors_num_range[:i + + 1]):sum(anchors_num_range[:i + 1]) + + anchors_num_range[i + 1]] + landmark_weight = landmark_weights[ + sum(anchors_num_range[:i + + 1]):sum(anchors_num_range[:i + 1]) + + anchors_num_range[i + 1]] + + _label = _label.reshape( + (1, feat_height, feat_width, A)).transpose(0, 3, 1, 2) + _label = _label.reshape((1, A * feat_height * feat_width)) + bbox_target = bbox_target.reshape( + (1, feat_height * feat_width, + A * bbox_pred_len)).transpose(0, 2, 1) + bbox_weight 
= bbox_weight.reshape( + (1, feat_height * feat_width, A * bbox_pred_len)).transpose( + (0, 2, 1)) + label['%s_label_stride%d' % (prefix, stride)] = _label + label['%s_bbox_target_stride%d' % (prefix, stride)] = bbox_target + label['%s_bbox_weight_stride%d' % (prefix, stride)] = bbox_weight + if landmark: + landmark_target = landmark_target.reshape( + (1, feat_height * feat_width, + A * landmark_pred_len)).transpose(0, 2, 1) + landmark_weight = landmark_weight.reshape( + (1, feat_height * feat_width, + A * landmark_pred_len)).transpose((0, 2, 1)) + label['%s_landmark_target_stride%d' % + (prefix, stride)] = landmark_target + label['%s_landmark_weight_stride%d' % + (prefix, stride)] = landmark_weight + #print('in_rpn', stride,_label.shape, bbox_target.shape, bbox_weight.shape, file=sys.stderr) + label_list.append(_label) + #print('DD', _label.shape) + bbox_target_list.append(bbox_target) + bbox_weight_list.append(bbox_weight) + if landmark: + landmark_target_list.append(landmark_target) + landmark_weight_list.append(landmark_weight) + + label_concat = np.concatenate(label_list, axis=1) + bbox_target_concat = np.concatenate(bbox_target_list, axis=2) + bbox_weight_concat = np.concatenate(bbox_weight_list, axis=2) + #fg_inds = np.where(label_concat[0] == 1)[0] + #print('fg_inds_in_rpn2', fg_inds, file=sys.stderr) + + label.update({ + '%s_label' % prefix: label_concat, + '%s_bbox_target' % prefix: bbox_target_concat, + '%s_bbox_weight' % prefix: bbox_weight_concat + }) + if landmark: + landmark_target_concat = np.concatenate(landmark_target_list, + axis=2) + landmark_weight_concat = np.concatenate(landmark_weight_list, + axis=2) + label['%s_landmark_target' % prefix] = landmark_target_concat + label['%s_landmark_weight' % prefix] = landmark_weight_concat + #tb = datetime.datetime.now() + #self._times[3] += (tb-ta).total_seconds() + #ta = datetime.datetime.now() + #print(self._times) + return label diff --git a/insightface/detection/retinaface/rcnn/logger.py 
b/insightface/detection/retinaface/rcnn/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..2806e1add180b4530956387e112ed07a566ce869 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/logger.py @@ -0,0 +1,6 @@ +import logging + +# set up logger +logging.basicConfig() +logger = logging.getLogger() +logger.setLevel(logging.INFO) diff --git a/insightface/detection/retinaface/rcnn/processing/__init__.py b/insightface/detection/retinaface/rcnn/processing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/detection/retinaface/rcnn/processing/assign_levels.py b/insightface/detection/retinaface/rcnn/processing/assign_levels.py new file mode 100755 index 0000000000000000000000000000000000000000..012d73d2134cc50aee3aba73641c520084538621 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/processing/assign_levels.py @@ -0,0 +1,36 @@ +from rcnn.config import config +import numpy as np + + +def compute_assign_targets(rois, threshold): + rois_area = np.sqrt( + (rois[:, 2] - rois[:, 0] + 1) * (rois[:, 3] - rois[:, 1] + 1)) + num_rois = np.shape(rois)[0] + assign_levels = np.zeros(num_rois, dtype=np.uint8) + for i, stride in enumerate(config.RCNN_FEAT_STRIDE): + thd = threshold[i] + idx = np.logical_and(thd[1] <= rois_area, rois_area < thd[0]) + assign_levels[idx] = stride + + assert 0 not in assign_levels, "All rois should assign to specify levels." + return assign_levels + + +def add_assign_targets(roidb): + """ + given roidb, add ['assign_level'] + :param roidb: roidb to be processed. 
must have gone through imdb.prepare_roidb + """ + print 'add assign targets' + assert len(roidb) > 0 + assert 'boxes' in roidb[0] + + area_threshold = [[np.inf, 448], [448, 224], [224, 112], [112, 0]] + + assert len(config.RCNN_FEAT_STRIDE) == len(area_threshold) + + num_images = len(roidb) + for im_i in range(num_images): + rois = roidb[im_i]['boxes'] + roidb[im_i]['assign_levels'] = compute_assign_targets( + rois, area_threshold) diff --git a/insightface/detection/retinaface/rcnn/processing/bbox_regression.py b/insightface/detection/retinaface/rcnn/processing/bbox_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..0eaf917a6f3a2282c3fabb929b270441329b5198 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/processing/bbox_regression.py @@ -0,0 +1,263 @@ +""" +This file has functions about generating bounding box regression targets +""" + +from ..pycocotools.mask import encode +import numpy as np + +from ..logger import logger +from .bbox_transform import bbox_overlaps, bbox_transform +from rcnn.config import config +import math +import cv2 +import PIL.Image as Image +import threading +import Queue + + +def compute_bbox_regression_targets(rois, overlaps, labels): + """ + given rois, overlaps, gt labels, compute bounding box regression targets + :param rois: roidb[i]['boxes'] k * 4 + :param overlaps: roidb[i]['max_overlaps'] k * 1 + :param labels: roidb[i]['max_classes'] k * 1 + :return: targets[i][class, dx, dy, dw, dh] k * 5 + """ + # Ensure ROIs are floats + rois = rois.astype(np.float, copy=False) + + # Sanity check + if len(rois) != len(overlaps): + logger.warning('bbox regression: len(rois) != len(overlaps)') + + # Indices of ground-truth ROIs + gt_inds = np.where(overlaps == 1)[0] + if len(gt_inds) == 0: + logger.warning('bbox regression: len(gt_inds) == 0') + + # Indices of examples for which we try to make predictions + ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0] + + # Get IoU overlap between 
each ex ROI and gt ROI + ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :]) + + # Find which gt ROI each ex ROI has max overlap with: + # this will be the ex ROI's gt target + gt_assignment = ex_gt_overlaps.argmax(axis=1) + gt_rois = rois[gt_inds[gt_assignment], :] + ex_rois = rois[ex_inds, :] + + targets = np.zeros((rois.shape[0], 5), dtype=np.float32) + targets[ex_inds, 0] = labels[ex_inds] + targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) + return targets + + +def add_bbox_regression_targets(roidb): + """ + given roidb, add ['bbox_targets'] and normalize bounding box regression targets + :param roidb: roidb to be processed. must have gone through imdb.prepare_roidb + :return: means, std variances of targets + """ + logger.info('bbox regression: add bounding box regression targets') + assert len(roidb) > 0 + assert 'max_classes' in roidb[0] + + num_images = len(roidb) + num_classes = roidb[0]['gt_overlaps'].shape[1] + for im_i in range(num_images): + rois = roidb[im_i]['boxes'] + max_overlaps = roidb[im_i]['max_overlaps'] + max_classes = roidb[im_i]['max_classes'] + roidb[im_i]['bbox_targets'] = compute_bbox_regression_targets( + rois, max_overlaps, max_classes) + + if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: + # use fixed / precomputed means and stds instead of empirical values + means = np.tile(np.array(config.TRAIN.BBOX_MEANS), (num_classes, 1)) + stds = np.tile(np.array(config.TRAIN.BBOX_STDS), (num_classes, 1)) + else: + # compute mean, std values + class_counts = np.zeros((num_classes, 1)) + 1e-14 + sums = np.zeros((num_classes, 4)) + squared_sums = np.zeros((num_classes, 4)) + for im_i in range(num_images): + targets = roidb[im_i]['bbox_targets'] + for cls in range(1, num_classes): + cls_indexes = np.where(targets[:, 0] == cls)[0] + if cls_indexes.size > 0: + class_counts[cls] += cls_indexes.size + sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0) + squared_sums[cls, :] += (targets[cls_indexes, + 1:]**2).sum(axis=0) + + means 
= sums / class_counts + # var(x) = E(x^2) - E(x)^2 + stds = np.sqrt(squared_sums / class_counts - means**2) + + # normalized targets + for im_i in range(num_images): + targets = roidb[im_i]['bbox_targets'] + for cls in range(1, num_classes): + cls_indexes = np.where(targets[:, 0] == cls)[0] + roidb[im_i]['bbox_targets'][cls_indexes, 1:] -= means[cls, :] + roidb[im_i]['bbox_targets'][cls_indexes, 1:] /= stds[cls, :] + + return means.ravel(), stds.ravel() + + +def expand_bbox_regression_targets(bbox_targets_data, num_classes): + """ + expand from 5 to 4 * num_classes; only the right class has non-zero bbox regression targets + :param bbox_targets_data: [k * 5] + :param num_classes: number of classes + :return: bbox target processed [k * 4 num_classes] + bbox_weights ! only foreground boxes have bbox regression computation! + """ + classes = bbox_targets_data[:, 0] + bbox_targets = np.zeros((classes.size, 4 * num_classes), dtype=np.float32) + bbox_weights = np.zeros(bbox_targets.shape, dtype=np.float32) + indexes = np.where(classes > 0)[0] + for index in indexes: + cls = classes[index] + start = int(4 * cls) + end = start + 4 + bbox_targets[index, start:end] = bbox_targets_data[index, 1:] + bbox_weights[index, start:end] = config.TRAIN.BBOX_WEIGHTS + return bbox_targets, bbox_weights + + +def compute_mask_and_label(ex_rois, ex_labels, seg, flipped): + # assert os.path.exists(seg_gt), 'Path does not exist: {}'.format(seg_gt) + # im = Image.open(seg_gt) + # pixel = list(im.getdata()) + # pixel = np.array(pixel).reshape([im.size[1], im.size[0]]) + im = Image.open(seg) + pixel = list(im.getdata()) + ins_seg = np.array(pixel).reshape([im.size[1], im.size[0]]) + if flipped: + ins_seg = ins_seg[:, ::-1] + rois = ex_rois + n_rois = ex_rois.shape[0] + label = ex_labels + class_id = config.CLASS_ID + mask_target = np.zeros((n_rois, 28, 28), dtype=np.int8) + mask_label = np.zeros((n_rois), dtype=np.int8) + for n in range(n_rois): + target = ins_seg[int(rois[n, 1]):int(rois[n, 
3]), + int(rois[n, 0]):int(rois[n, 2])] + ids = np.unique(target) + ins_id = 0 + max_count = 0 + for id in ids: + if math.floor(id / 1000) == class_id[int(label[int(n)])]: + px = np.where(ins_seg == int(id)) + x_min = np.min(px[1]) + y_min = np.min(px[0]) + x_max = np.max(px[1]) + y_max = np.max(px[0]) + x1 = max(rois[n, 0], x_min) + y1 = max(rois[n, 1], y_min) + x2 = min(rois[n, 2], x_max) + y2 = min(rois[n, 3], y_max) + iou = (x2 - x1) * (y2 - y1) + iou = iou / ((rois[n, 2] - rois[n, 0]) * + (rois[n, 3] - rois[n, 1]) + (x_max - x_min) * + (y_max - y_min) - iou) + if iou > max_count: + ins_id = id + max_count = iou + + if max_count == 0: + continue + # print max_count + mask = np.zeros(target.shape) + idx = np.where(target == ins_id) + mask[idx] = 1 + mask = cv2.resize(mask, (28, 28), interpolation=cv2.INTER_NEAREST) + + mask_target[n] = mask + mask_label[n] = label[int(n)] + return mask_target, mask_label + + +def compute_bbox_mask_targets_and_label(rois, overlaps, labels, seg, flipped): + """ + given rois, overlaps, gt labels, seg, compute bounding box mask targets + :param rois: roidb[i]['boxes'] k * 4 + :param overlaps: roidb[i]['max_overlaps'] k * 1 + :param labels: roidb[i]['max_classes'] k * 1 + :return: targets[i][class, dx, dy, dw, dh] k * 5 + """ + # Ensure ROIs are floats + rois = rois.astype(np.float, copy=False) + + # Sanity check + if len(rois) != len(overlaps): + print 'bbox regression: this should not happen' + + # Indices of ground-truth ROIs + gt_inds = np.where(overlaps == 1)[0] + if len(gt_inds) == 0: + print 'something wrong : zero ground truth rois' + # Indices of examples for which we try to make predictions + ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0] + + # Get IoU overlap between each ex ROI and gt ROI + ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :]) + + # Find which gt ROI each ex ROI has max overlap with: + # this will be the ex ROI's gt target + gt_assignment = 
ex_gt_overlaps.argmax(axis=1) + gt_rois = rois[gt_inds[gt_assignment], :] + ex_rois = rois[ex_inds, :] + + mask_targets, mask_label = compute_mask_and_label(ex_rois, labels[ex_inds], + seg, flipped) + return mask_targets, mask_label, ex_inds + + +def add_mask_targets(roidb): + """ + given roidb, add ['bbox_targets'] and normalize bounding box regression targets + :param roidb: roidb to be processed. must have gone through imdb.prepare_roidb + :return: means, std variances of targets + """ + print 'add bounding box mask targets' + assert len(roidb) > 0 + assert 'max_classes' in roidb[0] + + num_images = len(roidb) + + # Multi threads processing + im_quene = Queue.Queue(maxsize=0) + for im_i in range(num_images): + im_quene.put(im_i) + + def process(): + while not im_quene.empty(): + im_i = im_quene.get() + print "-----process img {}".format(im_i) + rois = roidb[im_i]['boxes'] + max_overlaps = roidb[im_i]['max_overlaps'] + max_classes = roidb[im_i]['max_classes'] + ins_seg = roidb[im_i]['ins_seg'] + flipped = roidb[im_i]['flipped'] + roidb[im_i]['mask_targets'], roidb[im_i]['mask_labels'], roidb[im_i]['mask_inds'] = \ + compute_bbox_mask_targets_and_label(rois, max_overlaps, max_classes, ins_seg, flipped) + + threads = [threading.Thread(target=process, args=()) for i in range(10)] + for t in threads: + t.start() + for t in threads: + t.join() + # Single thread + # for im_i in range(num_images): + # print "-----processing img {}".format(im_i) + # rois = roidb[im_i]['boxes'] + # max_overlaps = roidb[im_i]['max_overlaps'] + # max_classes = roidb[im_i]['max_classes'] + # ins_seg = roidb[im_i]['ins_seg'] + # # roidb[im_i]['mask_targets'] = compute_bbox_mask_targets(rois, max_overlaps, max_classes, ins_seg) + # roidb[im_i]['mask_targets'], roidb[im_i]['mask_labels'], roidb[im_i]['mask_inds'] = \ + # compute_bbox_mask_targets_and_label(rois, max_overlaps, max_classes, ins_seg) diff --git a/insightface/detection/retinaface/rcnn/processing/bbox_transform.py 
import numpy as np

# The compiled Cython kernel is an optional accelerator; fall back to the
# pure-Python implementation when the extension has not been built
# (same pattern as the gpu_nms fallback in nms.py).
try:
    from ..cython.bbox import bbox_overlaps_cython
except ImportError:
    bbox_overlaps_cython = None
#from rcnn.config import config


def bbox_overlaps(boxes, query_boxes):
    """
    determine overlaps between boxes and query_boxes
    :param boxes: n * 4 bounding boxes
    :param query_boxes: k * 4 bounding boxes
    :return: overlaps: n * k overlaps
    """
    if bbox_overlaps_cython is not None:
        return bbox_overlaps_cython(boxes, query_boxes)
    return bbox_overlaps_py(boxes, query_boxes)


def bbox_overlaps_py(boxes, query_boxes):
    """
    determine overlaps between boxes and query_boxes (pure Python reference)
    :param boxes: n * 4 bounding boxes
    :param query_boxes: k * 4 bounding boxes
    :return: overlaps: n * k overlaps
    """
    n_ = boxes.shape[0]
    k_ = query_boxes.shape[0]
    # np.float alias was removed in NumPy >= 1.20; use float64 explicitly
    overlaps = np.zeros((n_, k_), dtype=np.float64)
    for k in range(k_):
        query_box_area = (query_boxes[k, 2] - query_boxes[k, 0] +
                          1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        for n in range(n_):
            iw = min(boxes[n, 2], query_boxes[k, 2]) - max(
                boxes[n, 0], query_boxes[k, 0]) + 1
            if iw > 0:
                ih = min(boxes[n, 3], query_boxes[k, 3]) - max(
                    boxes[n, 1], query_boxes[k, 1]) + 1
                if ih > 0:
                    box_area = (boxes[n, 2] - boxes[n, 0] +
                                1) * (boxes[n, 3] - boxes[n, 1] + 1)
                    all_area = float(box_area + query_box_area - iw * ih)
                    overlaps[n, k] = iw * ih / all_area
    return overlaps


def clip_boxes(boxes, im_shape):
    """
    Clip boxes to image boundaries (in place).
    :param boxes: [N, 4 * num_classes]
    :param im_shape: tuple of 2 (height, width)
    :return: [N, 4 * num_classes]
    """
    # x1 >= 0
    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
    # y1 >= 0
    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
    # x2 < im_shape[1]
    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
    # y2 < im_shape[0]
    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
    return boxes


def nonlinear_transform(ex_rois, gt_rois):
    """
    compute bounding box regression targets from ex_rois to gt_rois
    :param ex_rois: [N, 4]
    :param gt_rois: [N, 4] (extra columns beyond 4 are ignored)
    :return: [N, 4] (dx, dy, dw, dh)
    """
    assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number'

    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
    ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0)
    ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0)

    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
    gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0)
    gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0)

    # epsilon guards against zero-width/height ex_rois
    targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14)
    targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14)
    targets_dw = np.log(gt_widths / ex_widths)
    targets_dh = np.log(gt_heights / ex_heights)

    targets = np.vstack(
        (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
    return targets


def landmark_transform(ex_rois, gt_rois):
    """
    Compute landmark regression targets relative to the example ROIs.
    :param ex_rois: [N, 4]
    :param gt_rois: [N, num_points, 3] (x, y, visibility); visibility is skipped
    :return: [N, 2 * num_points]
    """
    assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number'

    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
    ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0)
    ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0)

    targets = []
    for i in range(gt_rois.shape[1]):
        for j in range(gt_rois.shape[2]):
            #if not config.USE_OCCLUSION and j==2:
            #  continue
            if j == 2:
                continue
            if j == 0:  #w
                target = (gt_rois[:, i, j] - ex_ctr_x) / (ex_widths + 1e-14)
            elif j == 1:  #h
                target = (gt_rois[:, i, j] - ex_ctr_y) / (ex_heights + 1e-14)
            else:  #visibile
                target = gt_rois[:, i, j]
            targets.append(target)

    targets = np.vstack(targets).transpose()
    return targets


def nonlinear_pred(boxes, box_deltas):
    """
    Transform the set of class-agnostic boxes into class-specific boxes
    by applying the predicted offsets (box_deltas)
    :param boxes: !important [N 4]
    :param box_deltas: [N, 4 * num_classes]
    :return: [N 4 * num_classes]
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, box_deltas.shape[1]))

    # np.float alias was removed in NumPy >= 1.20; use float64 explicitly
    boxes = boxes.astype(np.float64, copy=False)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
    ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)

    dx = box_deltas[:, 0::4]
    dy = box_deltas[:, 1::4]
    dw = box_deltas[:, 2::4]
    dh = box_deltas[:, 3::4]

    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]

    pred_boxes = np.zeros(box_deltas.shape)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * (pred_w - 1.0)
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * (pred_h - 1.0)
    # x2
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * (pred_w - 1.0)
    # y2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * (pred_h - 1.0)

    return pred_boxes


def landmark_pred(boxes, landmark_deltas):
    """
    Apply predicted landmark offsets to boxes.
    :param boxes: [N, 4]
    :param landmark_deltas: [N, 2 * num_points], x offsets at even columns,
                            y offsets at odd columns
    :return: [N, 2 * num_points] absolute landmark coordinates
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, landmark_deltas.shape[1]))
    boxes = boxes.astype(np.float64, copy=False)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
    ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)
    preds = []
    for i in range(landmark_deltas.shape[1]):
        if i % 2 == 0:
            pred = (landmark_deltas[:, i] * widths + ctr_x)
        else:
            pred = (landmark_deltas[:, i] * heights + ctr_y)
        preds.append(pred)
    preds = np.vstack(preds).transpose()
    return preds


def iou_transform(ex_rois, gt_rois):
    """ return bbox targets, IoU loss uses gt_rois as gt """
    assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number'
    return gt_rois


def iou_pred(boxes, box_deltas):
    """
    Transform the set of class-agnostic boxes into class-specific boxes
    by applying the predicted offsets (box_deltas)
    :param boxes: !important [N 4]
    :param box_deltas: [N, 4 * num_classes]
    :return: [N 4 * num_classes]
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, box_deltas.shape[1]))

    boxes = boxes.astype(np.float64, copy=False)
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    dx1 = box_deltas[:, 0::4]
    dy1 = box_deltas[:, 1::4]
    dx2 = box_deltas[:, 2::4]
    dy2 = box_deltas[:, 3::4]

    pred_boxes = np.zeros(box_deltas.shape)
    # x1
    pred_boxes[:, 0::4] = dx1 + x1[:, np.newaxis]
    # y1
    pred_boxes[:, 1::4] = dy1 + y1[:, np.newaxis]
    # x2
    pred_boxes[:, 2::4] = dx2 + x2[:, np.newaxis]
    # y2
    pred_boxes[:, 3::4] = dy2 + y2[:, np.newaxis]

    return pred_boxes


# define bbox_transform and bbox_pred
bbox_transform = nonlinear_transform
bbox_pred = nonlinear_pred
"""
Generate base anchors on index 0
"""
import sys
import numpy as np
from builtins import range

# Optional compiled kernel; fall back gracefully when the Cython extension
# has not been built (same pattern as the gpu_nms fallback below).
try:
    from ..cython.anchors import anchors_cython
except ImportError:
    anchors_cython = None
#from ..config import config


def anchors_plane(feat_h, feat_w, stride, base_anchor):
    """Tile base_anchor over a feat_h x feat_w feature map with given stride."""
    if anchors_cython is None:
        # Explicit error instead of an opaque 'NoneType is not callable'.
        raise ImportError(
            'anchors_cython extension is not built; anchors_plane unavailable')
    return anchors_cython(feat_h, feat_w, stride, base_anchor)


def generate_anchors(base_size=16,
                     ratios=[0.5, 1, 2],
                     scales=2**np.arange(3, 6),
                     stride=16,
                     dense_anchor=False):
    """
    Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, 15, 15) window.
    :param base_size: side length of the reference window
    :param ratios: aspect ratios to enumerate
    :param scales: scale factors to enumerate
    :param stride: feature stride; only used when dense_anchor is True
    :param dense_anchor: if True, duplicate anchors shifted by stride / 2
    :return: [num_anchors, 4] array of (x1, y1, x2, y2)
    """
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    anchors = np.vstack([
        _scale_enum(ratio_anchors[i, :], scales)
        for i in range(ratio_anchors.shape[0])
    ])
    if dense_anchor:
        assert stride % 2 == 0
        anchors2 = anchors.copy()
        anchors2[:, :] += int(stride / 2)
        anchors = np.vstack((anchors, anchors2))
    return anchors


def generate_anchors_fpn(dense_anchor=False, cfg=None):
    """
    Generate anchor (reference) windows for every FPN stride by enumerating
    aspect ratios X scales wrt a reference (0, 0, 15, 15) window.
    :param dense_anchor: forwarded to generate_anchors
    :param cfg: mapping stride -> {'BASE_SIZE', 'RATIOS', 'SCALES'};
                defaults to config.RPN_ANCHOR_CFG
    :return: list of anchor arrays, one per stride, largest stride first
    """
    if cfg is None:
        from ..config import config
        cfg = config.RPN_ANCHOR_CFG
    # Strides sorted descending so output order matches FPN level order.
    RPN_FEAT_STRIDE = sorted((int(k) for k in cfg), reverse=True)
    anchors = []
    for k in RPN_FEAT_STRIDE:
        v = cfg[str(k)]
        bs = v['BASE_SIZE']
        __ratios = np.array(v['RATIOS'])
        __scales = np.array(v['SCALES'])
        r = generate_anchors(bs, __ratios, __scales, int(k), dense_anchor)
        anchors.append(r)

    return anchors


def _whctrs(anchor):
    """
    Return width, height, x center, and y center for an anchor (window).
    """
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    """
    Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows).
    """
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1)))
    return anchors


def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


def _scale_enum(anchor, scales):
    """
    Enumerate a set of anchors for each scale wrt an anchor.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


# ---- nms.py ---------------------------------------------------------------
# Both NMS kernels are optional compiled accelerators; fall back to the
# pure-Python nms() below when an extension is unavailable.
try:
    from ..cython.cpu_nms import cpu_nms
except ImportError:
    cpu_nms = None
try:
    from ..cython.gpu_nms import gpu_nms
except ImportError:
    gpu_nms = None


def py_nms_wrapper(thresh):
    """Return a closure running the pure-Python NMS at the given threshold."""
    def _nms(dets):
        return nms(dets, thresh)

    return _nms


def cpu_nms_wrapper(thresh):
    """Return the compiled CPU NMS closure, or the pure-Python one if the
    extension is not built."""
    def _nms(dets):
        return cpu_nms(dets, thresh)

    if cpu_nms is not None:
        return _nms
    return py_nms_wrapper(thresh)


def gpu_nms_wrapper(thresh, device_id):
    """Return the GPU NMS closure, falling back to the CPU wrapper when the
    GPU extension is not built."""
    def _nms(dets):
        return gpu_nms(dets, thresh, device_id)

    if gpu_nms is not None:
        return _nms
    return cpu_nms_wrapper(thresh)


def nms(dets, thresh):
    """
    greedily select boxes with high confidence and overlap with current maximum <= thresh
    rule out overlap >= thresh
    :param dets: [[x1, y1, x2, y2 score]]
    :param thresh: retain overlap < thresh
    :return: indexes to keep
    """
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        inds = np.where(ovr <= thresh)[0]
        order = order[inds + 1]

    return keep
b/insightface/detection/retinaface/rcnn/pycocotools/UPSTREAM_REV new file mode 100644 index 0000000000000000000000000000000000000000..9613b145b23779106bacd2a8e9bbe72dc39c6bbc --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/UPSTREAM_REV @@ -0,0 +1 @@ +https://github.com/pdollar/coco/commit/336d2a27c91e3c0663d2dcf0b13574674d30f88e diff --git a/insightface/detection/retinaface/rcnn/pycocotools/__init__.py b/insightface/detection/retinaface/rcnn/pycocotools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3f7d85bba884ea8f83fc6ab2a1e6ade80d98d4d9 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/insightface/detection/retinaface/rcnn/pycocotools/_mask.c b/insightface/detection/retinaface/rcnn/pycocotools/_mask.c new file mode 100644 index 0000000000000000000000000000000000000000..0706a2fe4545a464645726310ccac117d5fb041c --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/_mask.c @@ -0,0 +1,17234 @@ +/* Generated by Cython 0.28.5 */ + +/* BEGIN: Cython Metadata +{ + "distutils": { + "depends": [ + "/root/anaconda2/lib/python2.7/site-packages/numpy/core/include/numpy/arrayobject.h", + "/root/anaconda2/lib/python2.7/site-packages/numpy/core/include/numpy/ufuncobject.h", + "maskApi.h" + ], + "extra_compile_args": [ + "-Wno-cpp", + "-Wno-unused-function", + "-std=c99" + ], + "include_dirs": [ + "/root/anaconda2/lib/python2.7/site-packages/numpy/core/include" + ], + "language": "c", + "name": "_mask", + "sources": [ + "_mask.pyx", + "maskApi.c" + ] + }, + "module_name": "_mask" +} +END: Cython Metadata */ + +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000) + #error Cython requires Python 2.6+ or Python 3.3+. 
+#else +#define CYTHON_ABI "0_28_5" +#define CYTHON_FUTURE_DIVISION 0 +#include +#ifndef offsetof + #define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#define __PYX_COMMA , +#ifndef HAVE_LONG_LONG + #if PY_VERSION_HEX >= 0x02070000 + #define HAVE_LONG_LONG + #endif +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION + #define CYTHON_COMPILING_IN_PYPY 1 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #undef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 0 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #if PY_VERSION_HEX < 0x03050000 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #undef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #undef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 1 + #undef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 0 + #undef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 0 + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 +#elif 
defined(PYSTON_VERSION) + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 1 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 +#else + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 1 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #elif !defined(CYTHON_USE_PYTYPE_LOOKUP) + #define CYTHON_USE_PYTYPE_LOOKUP 1 + #endif + #if PY_MAJOR_VERSION < 3 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #elif !defined(CYTHON_USE_PYLONG_INTERNALS) + #define CYTHON_USE_PYLONG_INTERNALS 1 + #endif + #ifndef 
CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 1 + #endif + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #if PY_VERSION_HEX < 0x030300F0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #elif !defined(CYTHON_USE_UNICODE_WRITER) + #define CYTHON_USE_UNICODE_WRITER 1 + #endif + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #ifndef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 1 + #endif + #ifndef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 1 + #endif + #ifndef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT (0 && PY_VERSION_HEX >= 0x03050000) + #endif + #ifndef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1) + #endif +#endif +#if !defined(CYTHON_FAST_PYCCALL) +#define CYTHON_FAST_PYCCALL (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1) +#endif +#if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #undef SHIFT + #undef BASE + #undef MASK +#endif +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif +#ifndef __has_cpp_attribute + #define __has_cpp_attribute(x) 0 +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) 
+# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +#ifndef CYTHON_MAYBE_UNUSED_VAR +# if defined(__cplusplus) + template void CYTHON_MAYBE_UNUSED_VAR( const T& ) { } +# else +# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x) +# endif +#endif +#ifndef CYTHON_NCP_UNUSED +# if CYTHON_COMPILING_IN_CPYTHON +# define CYTHON_NCP_UNUSED +# else +# define CYTHON_NCP_UNUSED CYTHON_UNUSED +# endif +#endif +#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None) +#ifdef _MSC_VER + #ifndef _MSC_STDINT_H_ + #if _MSC_VER < 1300 + typedef unsigned char uint8_t; + typedef unsigned int uint32_t; + #else + typedef unsigned __int8 uint8_t; + typedef unsigned __int32 uint32_t; + #endif + #endif +#else + #include +#endif +#ifndef CYTHON_FALLTHROUGH + #if defined(__cplusplus) && __cplusplus >= 201103L + #if __has_cpp_attribute(fallthrough) + #define CYTHON_FALLTHROUGH [[fallthrough]] + #elif __has_cpp_attribute(clang::fallthrough) + #define CYTHON_FALLTHROUGH [[clang::fallthrough]] + #elif __has_cpp_attribute(gnu::fallthrough) + #define CYTHON_FALLTHROUGH [[gnu::fallthrough]] + #endif + #endif + #ifndef CYTHON_FALLTHROUGH + #if __has_attribute(fallthrough) + #define CYTHON_FALLTHROUGH __attribute__((fallthrough)) + #else + #define CYTHON_FALLTHROUGH + #endif + #endif + #if defined(__clang__ ) && defined(__apple_build_version__) + #if __apple_build_version__ < 7000000 + #undef CYTHON_FALLTHROUGH + #define CYTHON_FALLTHROUGH + #endif + #endif +#endif + +#ifndef CYTHON_INLINE + #if defined(__clang__) + #define CYTHON_INLINE __inline__ __attribute__ ((__unused__)) + #elif defined(__GNUC__) + #define CYTHON_INLINE __inline__ + #elif defined(_MSC_VER) + #define CYTHON_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_INLINE inline + #else + #define CYTHON_INLINE + #endif +#endif + +#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && 
!defined(Py_OptimizeFlag) + #define Py_OptimizeFlag 0 +#endif +#define __PYX_BUILD_PY_SSIZE_T "n" +#define CYTHON_FORMAT_SSIZE_T "z" +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyType_Type +#endif +#ifndef Py_TPFLAGS_CHECKTYPES + #define Py_TPFLAGS_CHECKTYPES 0 +#endif +#ifndef Py_TPFLAGS_HAVE_INDEX + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#ifndef Py_TPFLAGS_HAVE_NEWBUFFER + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#ifndef Py_TPFLAGS_HAVE_FINALIZE + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL) + #ifndef METH_FASTCALL + #define METH_FASTCALL 0x80 + #endif + typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs); + typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args, + Py_ssize_t nargs, PyObject *kwnames); +#else + #define __Pyx_PyCFunctionFast _PyCFunctionFast + #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords +#endif +#if CYTHON_FAST_PYCCALL +#define __Pyx_PyFastCFunction_Check(func)\ + ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS))))) +#else +#define __Pyx_PyFastCFunction_Check(func) 0 +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) + #define PyObject_Malloc(s) PyMem_Malloc(s) + #define PyObject_Free(p) PyMem_Free(p) + #define PyObject_Realloc(p) PyMem_Realloc(p) +#endif +#if 
CYTHON_COMPILING_IN_PYSTON + #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno) +#else + #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno) +#endif +#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#elif PY_VERSION_HEX >= 0x03060000 + #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet() +#elif PY_VERSION_HEX >= 0x03000000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#else + #define __Pyx_PyThreadState_Current _PyThreadState_Current +#endif +#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT) +#include "pythread.h" +#define Py_tss_NEEDS_INIT 0 +typedef int Py_tss_t; +static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) { + *key = PyThread_create_key(); + return 0; // PyThread_create_key reports success always +} +static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) { + Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t)); + *key = Py_tss_NEEDS_INIT; + return key; +} +static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) { + PyObject_Free(key); +} +static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) { + return *key != Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) { + PyThread_delete_key(*key); + *key = Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) { + return PyThread_set_key_value(*key, value); +} +static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { + return PyThread_get_key_value(*key); +} +#endif // TSS (Thread Specific Storage) API +#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized) +#define __Pyx_PyDict_NewPresized(n) ((n <= 8) ? 
PyDict_New() : _PyDict_NewPresized(n)) +#else +#define __Pyx_PyDict_NewPresized(n) PyDict_New() +#endif +#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS +#define __Pyx_PyDict_GetItemStr(dict, name) _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash) +#else +#define __Pyx_PyDict_GetItemStr(dict, name) PyDict_GetItem(dict, name) +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) PyUnicode_MAX_CHAR_VALUE(u) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) +#else + #define CYTHON_PEP393_ENABLED 0 + #define PyUnicode_1BYTE_KIND 1 + #define PyUnicode_2BYTE_KIND 2 + #define PyUnicode_4BYTE_KIND 4 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) ((sizeof(Py_UNICODE) == 2) ? 
65535 : 1114111) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) (((void)(k)), ((Py_UNICODE*)d)[i] = ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_SIZE(u)) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains) + #define PyUnicode_Contains(u, s) PySequence_Contains(u, s) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check) + #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format) + #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? 
PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII) + #define PyObject_ASCII(o) PyObject_Repr(o) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact + #define PyObject_Unicode PyObject_Str +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#if CYTHON_ASSUME_SAFE_MACROS + #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq) +#else + #define __Pyx_PySequence_SIZE(seq) PySequence_Size(seq) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject 
+#endif +#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY + #ifndef PyUnicode_InternFromString + #define PyUnicode_InternFromString(s) PyUnicode_FromString(s) + #endif +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t PyInt_AsLong +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : (Py_INCREF(func), func)) +#else + #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass) +#endif +#if CYTHON_USE_ASYNC_SLOTS + #if PY_VERSION_HEX >= 0x030500B1 + #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods + #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async) + #else + #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved)) + #endif +#else + #define __Pyx_PyType_AsAsync(obj) NULL +#endif +#ifndef __Pyx_PyAsyncMethodsStruct + typedef struct { + unaryfunc am_await; + unaryfunc am_aiter; + unaryfunc am_anext; + } __Pyx_PyAsyncMethodsStruct; +#endif + +#if defined(WIN32) || defined(MS_WINDOWS) + #define _USE_MATH_DEFINES +#endif +#include +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif +#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL) +#define __Pyx_truncl trunc +#else +#define __Pyx_truncl truncl +#endif + + +#define __PYX_ERR(f_index, lineno, Ln_error) \ +{ \ + __pyx_filename = __pyx_f[f_index]; __pyx_lineno = lineno; __pyx_clineno = __LINE__; goto Ln_error; \ +} + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#define __PYX_HAVE___mask +#define __PYX_HAVE_API___mask +/* Early includes */ +#include +#include +#include 
"numpy/arrayobject.h" +#include "numpy/ufuncobject.h" +#include +#include "maskApi.h" +#ifdef _OPENMP +#include +#endif /* _OPENMP */ + +#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS) +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; + +#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0 +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_uchar_cast(c) ((unsigned char)c) +#define __Pyx_long_cast(x) ((long)x) +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) (\ + (sizeof(type) < sizeof(Py_ssize_t)) ||\ + (sizeof(type) > sizeof(Py_ssize_t) &&\ + likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX) &&\ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\ + v == (type)PY_SSIZE_T_MIN))) ||\ + (sizeof(type) == sizeof(Py_ssize_t) &&\ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX))) ) +#if defined (__cplusplus) && __cplusplus >= 201103L + #include + #define __Pyx_sst_abs(value) std::abs(value) +#elif SIZEOF_INT >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) abs(value) +#elif SIZEOF_LONG >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) labs(value) +#elif defined (_MSC_VER) + #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value)) +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define __Pyx_sst_abs(value) llabs(value) +#elif defined (__GNUC__) + #define __Pyx_sst_abs(value) __builtin_llabs(value) +#else + #define __Pyx_sst_abs(value) ((value<0) ? 
-value : value) +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyBytes_AsWritableString(s) ((char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableSString(s) ((signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableUString(s) ((unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsString(s) ((const char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsSString(s) ((const signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsUString(s) ((const unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyObject_AsWritableString(s) ((char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsSString(s) ((const signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((const unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromCString(s) __Pyx_PyObject_FromString((const char*)s) +#define __Pyx_PyBytes_FromCString(s) __Pyx_PyBytes_FromString((const char*)s) +#define 
__Pyx_PyByteArray_FromCString(s) __Pyx_PyByteArray_FromString((const char*)s) +#define __Pyx_PyStr_FromCString(s) __Pyx_PyStr_FromString((const char*)s) +#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s) +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) { + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return (size_t)(u_end - u - 1); +} +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj) +#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None) +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b); +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x); +#define __Pyx_PySequence_Tuple(obj)\ + (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj)) +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +#if CYTHON_ASSUME_SAFE_MACROS +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION >= 3 +#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x)) +#else +#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x)) +#endif +#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? 
#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
/* Nonzero when sys.getdefaultencoding() is not plain "ascii".  The module
 * was compiled with c_string_encoding=ascii, so a default encoding that is
 * not an ASCII superset is rejected at module-init time. */
static int __Pyx_sys_getdefaultencoding_not_ascii;
/* Probe sys.getdefaultencoding(); returns 0 on success, -1 with a Python
 * error set on failure. */
static int __Pyx_init_sys_getdefaultencoding_params(void) {
    PyObject* sys;
    PyObject* default_encoding = NULL;
    PyObject* ascii_chars_u = NULL;
    PyObject* ascii_chars_b = NULL;
    const char* default_encoding_c;
    sys = PyImport_ImportModule("sys");
    if (!sys) goto bad;
    default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL);
    Py_DECREF(sys);
    if (!default_encoding) goto bad;
    default_encoding_c = PyBytes_AsString(default_encoding);
    if (!default_encoding_c) goto bad;
    if (strcmp(default_encoding_c, "ascii") == 0) {
        __Pyx_sys_getdefaultencoding_not_ascii = 0;
    } else {
        char ascii_chars[128];
        int c;
        for (c = 0; c < 128; c++) {
            ascii_chars[c] = c;
        }
        __Pyx_sys_getdefaultencoding_not_ascii = 1;
        /* Round-trip all 128 ASCII code points through the default encoding;
         * if any byte changes, the encoding is not an ASCII superset. */
        ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL);
        if (!ascii_chars_u) goto bad;
        ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL);
        if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) {
            PyErr_Format(
                PyExc_ValueError,
                "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.",
                default_encoding_c);
            goto bad;
        }
        Py_DECREF(ascii_chars_u);
        Py_DECREF(ascii_chars_b);
    }
    Py_DECREF(default_encoding);
    return 0;
bad:
    Py_XDECREF(default_encoding);
    Py_XDECREF(ascii_chars_u);
    Py_XDECREF(ascii_chars_b);
    return -1;
}
#endif
#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3
#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL)
#else
#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL)
#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
/* Heap-allocated copy of sys.getdefaultencoding(), cached for the lifetime
 * of the process (intentionally never freed). */
static char* __PYX_DEFAULT_STRING_ENCODING;
/* Cache sys.getdefaultencoding() into __PYX_DEFAULT_STRING_ENCODING;
 * returns 0 on success, -1 with a Python error set on failure. */
static int __Pyx_init_sys_getdefaultencoding_params(void) {
    PyObject* sys;
    PyObject* default_encoding = NULL;
    char* default_encoding_c;
    sys = PyImport_ImportModule("sys");
    if (!sys) goto bad;
    default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
    Py_DECREF(sys);
    if (!default_encoding) goto bad;
    default_encoding_c = PyBytes_AsString(default_encoding);
    if (!default_encoding_c) goto bad;
    /* BUG FIX: allocate room for the terminating NUL.  The original generated
     * code called malloc(strlen(default_encoding_c)), so the strcpy below
     * wrote one byte past the end of the buffer (heap overflow; fixed the
     * same way in later upstream Cython). */
    __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1);
    if (!__PYX_DEFAULT_STRING_ENCODING) goto bad;
    strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c);
    Py_DECREF(default_encoding);
    return 0;
bad:
    Py_XDECREF(default_encoding);
    return -1;
}
#endif
#endif
_Complex_I 1.0fj +#endif + + +static const char *__pyx_f[] = { + "_mask.pyx", + "stringsource", + "__init__.pxd", + "type.pxd", +}; +/* BufferFormatStructs.proto */ +#define IS_UNSIGNED(type) (((type) -1) > 0) +struct __Pyx_StructField_; +#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0) +typedef struct { + const char* name; + struct __Pyx_StructField_* fields; + size_t size; + size_t arraysize[8]; + int ndim; + char typegroup; + char is_unsigned; + int flags; +} __Pyx_TypeInfo; +typedef struct __Pyx_StructField_ { + __Pyx_TypeInfo* type; + const char* name; + size_t offset; +} __Pyx_StructField; +typedef struct { + __Pyx_StructField* field; + size_t parent_offset; +} __Pyx_BufFmt_StackElem; +typedef struct { + __Pyx_StructField root; + __Pyx_BufFmt_StackElem* head; + size_t fmt_offset; + size_t new_count, enc_count; + size_t struct_alignment; + int is_complex; + char enc_type; + char new_packmode; + char enc_packmode; + char is_valid_array; +} __Pyx_BufFmt_Context; + + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":730 + * # in Cython to enable them only on the right systems. 
+ * + * ctypedef npy_int8 int8_t # <<<<<<<<<<<<<< + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + */ +typedef npy_int8 __pyx_t_5numpy_int8_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":731 + * + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t # <<<<<<<<<<<<<< + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t + */ +typedef npy_int16 __pyx_t_5numpy_int16_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":732 + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t # <<<<<<<<<<<<<< + * ctypedef npy_int64 int64_t + * #ctypedef npy_int96 int96_t + */ +typedef npy_int32 __pyx_t_5numpy_int32_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":733 + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t # <<<<<<<<<<<<<< + * #ctypedef npy_int96 int96_t + * #ctypedef npy_int128 int128_t + */ +typedef npy_int64 __pyx_t_5numpy_int64_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":737 + * #ctypedef npy_int128 int128_t + * + * ctypedef npy_uint8 uint8_t # <<<<<<<<<<<<<< + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + */ +typedef npy_uint8 __pyx_t_5numpy_uint8_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":738 + * + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t # <<<<<<<<<<<<<< + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t + */ +typedef npy_uint16 __pyx_t_5numpy_uint16_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":739 + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t # <<<<<<<<<<<<<< + * ctypedef npy_uint64 
uint64_t + * #ctypedef npy_uint96 uint96_t + */ +typedef npy_uint32 __pyx_t_5numpy_uint32_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":740 + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t # <<<<<<<<<<<<<< + * #ctypedef npy_uint96 uint96_t + * #ctypedef npy_uint128 uint128_t + */ +typedef npy_uint64 __pyx_t_5numpy_uint64_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":744 + * #ctypedef npy_uint128 uint128_t + * + * ctypedef npy_float32 float32_t # <<<<<<<<<<<<<< + * ctypedef npy_float64 float64_t + * #ctypedef npy_float80 float80_t + */ +typedef npy_float32 __pyx_t_5numpy_float32_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":745 + * + * ctypedef npy_float32 float32_t + * ctypedef npy_float64 float64_t # <<<<<<<<<<<<<< + * #ctypedef npy_float80 float80_t + * #ctypedef npy_float128 float128_t + */ +typedef npy_float64 __pyx_t_5numpy_float64_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":754 + * # The int types are mapped a bit surprising -- + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong long_t + * ctypedef npy_longlong longlong_t + */ +typedef npy_long __pyx_t_5numpy_int_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":755 + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t + * ctypedef npy_longlong long_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong longlong_t + * + */ +typedef npy_longlong __pyx_t_5numpy_long_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":756 + * ctypedef npy_long int_t + * ctypedef npy_longlong long_t + * ctypedef 
npy_longlong longlong_t # <<<<<<<<<<<<<< + * + * ctypedef npy_ulong uint_t + */ +typedef npy_longlong __pyx_t_5numpy_longlong_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":758 + * ctypedef npy_longlong longlong_t + * + * ctypedef npy_ulong uint_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t + */ +typedef npy_ulong __pyx_t_5numpy_uint_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":759 + * + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulonglong_t + * + */ +typedef npy_ulonglong __pyx_t_5numpy_ulong_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":760 + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t # <<<<<<<<<<<<<< + * + * ctypedef npy_intp intp_t + */ +typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":762 + * ctypedef npy_ulonglong ulonglong_t + * + * ctypedef npy_intp intp_t # <<<<<<<<<<<<<< + * ctypedef npy_uintp uintp_t + * + */ +typedef npy_intp __pyx_t_5numpy_intp_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":763 + * + * ctypedef npy_intp intp_t + * ctypedef npy_uintp uintp_t # <<<<<<<<<<<<<< + * + * ctypedef npy_double float_t + */ +typedef npy_uintp __pyx_t_5numpy_uintp_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":765 + * ctypedef npy_uintp uintp_t + * + * ctypedef npy_double float_t # <<<<<<<<<<<<<< + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t + */ +typedef npy_double __pyx_t_5numpy_float_t; + +/* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":766 + * + * ctypedef npy_double float_t + * ctypedef npy_double double_t # <<<<<<<<<<<<<< + * ctypedef npy_longdouble longdouble_t + * + */ +typedef npy_double __pyx_t_5numpy_double_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":767 + * ctypedef npy_double float_t + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cfloat cfloat_t + */ +typedef npy_longdouble __pyx_t_5numpy_longdouble_t; +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< float > __pyx_t_float_complex; + #else + typedef float _Complex __pyx_t_float_complex; + #endif +#else + typedef struct { float real, imag; } __pyx_t_float_complex; +#endif +static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float); + +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< double > __pyx_t_double_complex; + #else + typedef double _Complex __pyx_t_double_complex; + #endif +#else + typedef struct { double real, imag; } __pyx_t_double_complex; +#endif +static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double); + + +/*--- Type declarations ---*/ +struct __pyx_obj_5_mask_RLEs; +struct __pyx_obj_5_mask_Masks; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":769 + * ctypedef npy_longdouble longdouble_t + * + * ctypedef npy_cfloat cfloat_t # <<<<<<<<<<<<<< + * ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t + */ +typedef npy_cfloat __pyx_t_5numpy_cfloat_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":770 + * + * ctypedef npy_cfloat cfloat_t + * ctypedef npy_cdouble cdouble_t # <<<<<<<<<<<<<< + * ctypedef 
npy_clongdouble clongdouble_t + * + */ +typedef npy_cdouble __pyx_t_5numpy_cdouble_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":771 + * ctypedef npy_cfloat cfloat_t + * ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cdouble complex_t + */ +typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t; + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":773 + * ctypedef npy_clongdouble clongdouble_t + * + * ctypedef npy_cdouble complex_t # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew1(a): + */ +typedef npy_cdouble __pyx_t_5numpy_complex_t; + +/* "_mask.pyx":56 + * # python class to wrap RLE array in C + * # the class handles the memory allocation and deallocation + * cdef class RLEs: # <<<<<<<<<<<<<< + * cdef RLE *_R + * cdef siz _n + */ +struct __pyx_obj_5_mask_RLEs { + PyObject_HEAD + RLE *_R; + siz _n; +}; + + +/* "_mask.pyx":77 + * # python class to wrap Mask array in C + * # the class handles the memory allocation and deallocation + * cdef class Masks: # <<<<<<<<<<<<<< + * cdef byte *_mask + * cdef siz _h + */ +struct __pyx_obj_5_mask_Masks { + PyObject_HEAD + byte *_mask; + siz _h; + siz _w; + siz _n; +}; + + +/* --- Runtime support code (head) --- */ +/* Refnanny.proto */ +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, PyObject*, int); + void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + 
#define __Pyx_RefNannySetupContext(name, acquire_gil)\ + if (acquire_gil) {\ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + PyGILState_Release(__pyx_gilstate_save);\ + } else {\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext()\ + __Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + #define __Pyx_XDECREF(r) Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define __Pyx_XGIVEREF(r) +#endif +#define __Pyx_XDECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_XDECREF(tmp);\ + } while (0) +#define __Pyx_DECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_DECREF(tmp);\ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; 
__Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +/* PyObjectGetAttrStr.proto */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) +#endif + +/* GetBuiltinName.proto */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name); + +/* RaiseDoubleKeywords.proto */ +static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name); + +/* ParseKeywords.proto */ +static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\ + PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\ + const char* function_name); + +/* RaiseArgTupleInvalid.proto */ +static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact, + Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); + +/* IncludeStringH.proto */ +#include + +/* BytesEquals.proto */ +static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals); + +/* UnicodeEquals.proto */ +static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals); + +/* StrEquals.proto */ +#if PY_MAJOR_VERSION >= 3 +#define __Pyx_PyString_Equals __Pyx_PyUnicode_Equals +#else +#define __Pyx_PyString_Equals __Pyx_PyBytes_Equals +#endif + +/* PyCFunctionFastCall.proto */ +#if CYTHON_FAST_PYCCALL +static CYTHON_INLINE PyObject *__Pyx_PyCFunction_FastCall(PyObject *func, PyObject **args, Py_ssize_t nargs); +#else +#define __Pyx_PyCFunction_FastCall(func, args, nargs) (assert(0), NULL) +#endif + +/* PyFunctionFastCall.proto */ +#if CYTHON_FAST_PYCALL +#define __Pyx_PyFunction_FastCall(func, args, nargs)\ + __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL) +#if 1 || PY_VERSION_HEX < 0x030600B1 +static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject 
*kwargs); +#else +#define __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs) _PyFunction_FastCallDict(func, args, nargs, kwargs) +#endif +#endif + +/* PyObjectCall.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); +#else +#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) +#endif + +/* PyObjectCallMethO.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg); +#endif + +/* PyObjectCallOneArg.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg); + +/* PyThreadStateGet.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyThreadState_declare PyThreadState *__pyx_tstate; +#define __Pyx_PyThreadState_assign __pyx_tstate = __Pyx_PyThreadState_Current; +#define __Pyx_PyErr_Occurred() __pyx_tstate->curexc_type +#else +#define __Pyx_PyThreadState_declare +#define __Pyx_PyThreadState_assign +#define __Pyx_PyErr_Occurred() PyErr_Occurred() +#endif + +/* PyErrFetchRestore.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL) +#define __Pyx_ErrRestoreWithState(type, value, tb) __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL)) 
+#else +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#endif +#else +#define __Pyx_PyErr_Clear() PyErr_Clear() +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#define __Pyx_ErrRestoreWithState(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestoreInState(tstate, type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchInState(tstate, type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) PyErr_Fetch(type, value, tb) +#endif + +/* RaiseException.proto */ +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause); + +/* ExtTypeTest.proto */ +static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type); + +/* ArgTypeTest.proto */ +#define __Pyx_ArgTypeTest(obj, type, none_allowed, name, exact)\ + ((likely((Py_TYPE(obj) == type) | (none_allowed && (obj == Py_None)))) ? 1 :\ + __Pyx__ArgTypeTest(obj, type, name, exact)) +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact); + +/* ListAppend.proto */ +#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS +static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) { + PyListObject* L = (PyListObject*) list; + Py_ssize_t len = Py_SIZE(list); + if (likely(L->allocated > len) & likely(len > (L->allocated >> 1))) { + Py_INCREF(x); + PyList_SET_ITEM(list, len, x); + Py_SIZE(list) = len+1; + return 0; + } + return PyList_Append(list, x); +} +#else +#define __Pyx_PyList_Append(L,x) PyList_Append(L,x) +#endif + +/* PyIntBinop.proto */ +#if !CYTHON_COMPILING_IN_PYPY +static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, long intval, int inplace); +#else +#define __Pyx_PyInt_AddObjC(op1, op2, intval, inplace)\ + (inplace ? 
PyNumber_InPlaceAdd(op1, op2) : PyNumber_Add(op1, op2)) +#endif + +/* PyIntBinop.proto */ +#if !CYTHON_COMPILING_IN_PYPY +static PyObject* __Pyx_PyInt_EqObjC(PyObject *op1, PyObject *op2, long intval, int inplace); +#else +#define __Pyx_PyInt_EqObjC(op1, op2, intval, inplace)\ + PyObject_RichCompare(op1, op2, Py_EQ) + #endif + +/* GetModuleGlobalName.proto */ +static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name); + +/* DictGetItem.proto */ +#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY +static PyObject *__Pyx_PyDict_GetItem(PyObject *d, PyObject* key); +#define __Pyx_PyObject_Dict_GetItem(obj, name)\ + (likely(PyDict_CheckExact(obj)) ?\ + __Pyx_PyDict_GetItem(obj, name) : PyObject_GetItem(obj, name)) +#else +#define __Pyx_PyDict_GetItem(d, key) PyObject_GetItem(d, key) +#define __Pyx_PyObject_Dict_GetItem(obj, name) PyObject_GetItem(obj, name) +#endif + +/* GetItemInt.proto */ +#define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) :\ + (is_list ? 
(PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL) :\ + __Pyx_GetItemInt_Generic(o, to_py_func(i)))) +#define __Pyx_GetItemInt_List(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_GetItemInt_List_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\ + (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL)) +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck); +#define __Pyx_GetItemInt_Tuple(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_GetItemInt_Tuple_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\ + (PyErr_SetString(PyExc_IndexError, "tuple index out of range"), (PyObject*)NULL)) +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck); +static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j); +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, + int is_list, int wraparound, int boundscheck); + +/* IsLittleEndian.proto */ +static CYTHON_INLINE int __Pyx_Is_Little_Endian(void); + +/* BufferFormatCheck.proto */ +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts); +static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type); + +/* BufferGetAndValidate.proto */ +#define __Pyx_GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)\ + ((obj == Py_None || obj == NULL) ?\ + (__Pyx_ZeroBuffer(buf), 0) :\ + __Pyx__GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)) +static int __Pyx__GetBufferAndValidate(Py_buffer* buf, PyObject* obj, + __Pyx_TypeInfo* dtype, int flags, int nd, int cast, __Pyx_BufFmt_StackElem* stack); +static void __Pyx_ZeroBuffer(Py_buffer* buf); +static CYTHON_INLINE 
void __Pyx_SafeReleaseBuffer(Py_buffer* info); +static Py_ssize_t __Pyx_minusones[] = { -1, -1, -1, -1, -1, -1, -1, -1 }; +static Py_ssize_t __Pyx_zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +/* ListCompAppend.proto */ +#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS +static CYTHON_INLINE int __Pyx_ListComp_Append(PyObject* list, PyObject* x) { + PyListObject* L = (PyListObject*) list; + Py_ssize_t len = Py_SIZE(list); + if (likely(L->allocated > len)) { + Py_INCREF(x); + PyList_SET_ITEM(list, len, x); + Py_SIZE(list) = len+1; + return 0; + } + return PyList_Append(list, x); +} +#else +#define __Pyx_ListComp_Append(L,x) PyList_Append(L,x) +#endif + +/* FetchCommonType.proto */ +static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type); + +/* CythonFunction.proto */ +#define __Pyx_CyFunction_USED 1 +#define __Pyx_CYFUNCTION_STATICMETHOD 0x01 +#define __Pyx_CYFUNCTION_CLASSMETHOD 0x02 +#define __Pyx_CYFUNCTION_CCLASS 0x04 +#define __Pyx_CyFunction_GetClosure(f)\ + (((__pyx_CyFunctionObject *) (f))->func_closure) +#define __Pyx_CyFunction_GetClassObj(f)\ + (((__pyx_CyFunctionObject *) (f))->func_classobj) +#define __Pyx_CyFunction_Defaults(type, f)\ + ((type *)(((__pyx_CyFunctionObject *) (f))->defaults)) +#define __Pyx_CyFunction_SetDefaultsGetter(f, g)\ + ((__pyx_CyFunctionObject *) (f))->defaults_getter = (g) +typedef struct { + PyCFunctionObject func; +#if PY_VERSION_HEX < 0x030500A0 + PyObject *func_weakreflist; +#endif + PyObject *func_dict; + PyObject *func_name; + PyObject *func_qualname; + PyObject *func_doc; + PyObject *func_globals; + PyObject *func_code; + PyObject *func_closure; + PyObject *func_classobj; + void *defaults; + int defaults_pyobjects; + int flags; + PyObject *defaults_tuple; + PyObject *defaults_kwdict; + PyObject *(*defaults_getter)(PyObject *); + PyObject *func_annotations; +} __pyx_CyFunctionObject; +static PyTypeObject *__pyx_CyFunctionType = 0; +#define __Pyx_CyFunction_NewEx(ml, flags, qualname, self, module, globals, 
code)\ + __Pyx_CyFunction_New(__pyx_CyFunctionType, ml, flags, qualname, self, module, globals, code) +static PyObject *__Pyx_CyFunction_New(PyTypeObject *, PyMethodDef *ml, + int flags, PyObject* qualname, + PyObject *self, + PyObject *module, PyObject *globals, + PyObject* code); +static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *m, + size_t size, + int pyobjects); +static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *m, + PyObject *tuple); +static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *m, + PyObject *dict); +static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *m, + PyObject *dict); +static int __pyx_CyFunction_init(void); + +/* BufferFallbackError.proto */ +static void __Pyx_RaiseBufferFallbackError(void); + +/* None.proto */ +static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t, Py_ssize_t); + +/* BufferIndexError.proto */ +static void __Pyx_RaiseBufferIndexError(int axis); + +#define __Pyx_BufPtrStrided1d(type, buf, i0, s0) (type)((char*)buf + i0 * s0) +/* PySequenceContains.proto */ +static CYTHON_INLINE int __Pyx_PySequence_ContainsTF(PyObject* item, PyObject* seq, int eq) { + int result = PySequence_Contains(seq, item); + return unlikely(result < 0) ? 
result : (result == (eq == Py_EQ)); +} + +/* RaiseTooManyValuesToUnpack.proto */ +static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected); + +/* RaiseNeedMoreValuesToUnpack.proto */ +static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index); + +/* RaiseNoneIterError.proto */ +static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void); + +/* SaveResetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_ExceptionSave(type, value, tb) __Pyx__ExceptionSave(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#define __Pyx_ExceptionReset(type, value, tb) __Pyx__ExceptionReset(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +#else +#define __Pyx_ExceptionSave(type, value, tb) PyErr_GetExcInfo(type, value, tb) +#define __Pyx_ExceptionReset(type, value, tb) PyErr_SetExcInfo(type, value, tb) +#endif + +/* PyErrExceptionMatches.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err) +static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err); +#else +#define __Pyx_PyErr_ExceptionMatches(err) PyErr_ExceptionMatches(err) +#endif + +/* GetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_GetException(type, value, tb) __Pyx__GetException(__pyx_tstate, type, value, tb) +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb); +#endif + +/* PyObject_GenericGetAttrNoDict.proto */ +#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* 
attr_name); +#else +#define __Pyx_PyObject_GenericGetAttrNoDict PyObject_GenericGetAttr +#endif + +/* PyObject_GenericGetAttr.proto */ +#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GenericGetAttr PyObject_GenericGetAttr +#endif + +/* SetupReduce.proto */ +static int __Pyx_setup_reduce(PyObject* type_obj); + +/* Import.proto */ +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); + +/* CLineInTraceback.proto */ +#ifdef CYTHON_CLINE_IN_TRACEBACK +#define __Pyx_CLineForTraceback(tstate, c_line) (((CYTHON_CLINE_IN_TRACEBACK)) ? c_line : 0) +#else +static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line); +#endif + +/* CodeObjectCache.proto */ +typedef struct { + PyCodeObject* code_object; + int code_line; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +/* AddTraceback.proto */ +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); + +/* BufferStructDeclare.proto */ +typedef struct { + Py_ssize_t shape, strides, suboffsets; +} __Pyx_Buf_DimInfo; +typedef struct { + size_t refcount; + Py_buffer pybuffer; +} __Pyx_Buffer; +typedef struct { + __Pyx_Buffer *rcbuffer; + char *data; + __Pyx_Buf_DimInfo diminfo[8]; +} __Pyx_LocalBuf_ND; + +#if PY_MAJOR_VERSION < 3 + static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags); + static void __Pyx_ReleaseBuffer(Py_buffer *view); +#else + #define __Pyx_GetBuffer PyObject_GetBuffer + 
#define __Pyx_ReleaseBuffer PyBuffer_Release +#endif + + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_siz(siz value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_Py_intptr_t(Py_intptr_t value); + +/* RealImag.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + #define __Pyx_CREAL(z) ((z).real()) + #define __Pyx_CIMAG(z) ((z).imag()) + #else + #define __Pyx_CREAL(z) (__real__(z)) + #define __Pyx_CIMAG(z) (__imag__(z)) + #endif +#else + #define __Pyx_CREAL(z) ((z).real) + #define __Pyx_CIMAG(z) ((z).imag) +#endif +#if defined(__cplusplus) && CYTHON_CCOMPLEX\ + && (defined(_WIN32) || defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )) || __cplusplus >= 201103) + #define __Pyx_SET_CREAL(z,x) ((z).real(x)) + #define __Pyx_SET_CIMAG(z,y) ((z).imag(y)) +#else + #define __Pyx_SET_CREAL(z,x) __Pyx_CREAL(z) = (x) + #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y) +#endif + +/* Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_float(a, b) ((a)==(b)) + #define __Pyx_c_sum_float(a, b) ((a)+(b)) + #define __Pyx_c_diff_float(a, b) ((a)-(b)) + #define __Pyx_c_prod_float(a, b) ((a)*(b)) + #define __Pyx_c_quot_float(a, b) ((a)/(b)) + #define __Pyx_c_neg_float(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_float(z) ((z)==(float)0) + #define __Pyx_c_conj_float(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_float(z) (::std::abs(z)) + #define __Pyx_c_pow_float(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_float(z) ((z)==0) + #define __Pyx_c_conj_float(z) (conjf(z)) + #if 1 + #define __Pyx_c_abs_float(z) (cabsf(z)) + #define __Pyx_c_pow_float(a, b) (cpowf(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex 
__Pyx_c_sum_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex); + #if 1 + static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex, __pyx_t_float_complex); + #endif +#endif + +/* Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_double(a, b) ((a)==(b)) + #define __Pyx_c_sum_double(a, b) ((a)+(b)) + #define __Pyx_c_diff_double(a, b) ((a)-(b)) + #define __Pyx_c_prod_double(a, b) ((a)*(b)) + #define __Pyx_c_quot_double(a, b) ((a)/(b)) + #define __Pyx_c_neg_double(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_double(z) ((z)==(double)0) + #define __Pyx_c_conj_double(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (::std::abs(z)) + #define __Pyx_c_pow_double(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_double(z) ((z)==0) + #define __Pyx_c_conj_double(z) (conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (cabs(z)) + #define __Pyx_c_pow_double(a, b) (cpow(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex 
__Pyx_c_prod_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex); + #if 1 + static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex, __pyx_t_double_complex); + #endif +#endif + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__NPY_TYPES(enum NPY_TYPES value); + +/* CIntFromPy.proto */ +static CYTHON_INLINE siz __Pyx_PyInt_As_siz(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +/* FastTypeChecks.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type) +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2); +#else +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type) +#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2)) +#endif +#define 
__Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception) + +/* CheckBinaryVersion.proto */ +static int __Pyx_check_binary_version(void); + +/* PyIdentifierFromString.proto */ +#if !defined(__Pyx_PyIdentifier_FromString) +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s) +#else + #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s) +#endif +#endif + +/* ModuleImport.proto */ +static PyObject *__Pyx_ImportModule(const char *name); + +/* TypeImport.proto */ +static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name, size_t size, int strict); + +/* InitStrings.proto */ +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); + + +/* Module declarations from 'cpython.buffer' */ + +/* Module declarations from 'libc.string' */ + +/* Module declarations from 'libc.stdio' */ + +/* Module declarations from '__builtin__' */ + +/* Module declarations from 'cpython.type' */ +static PyTypeObject *__pyx_ptype_7cpython_4type_type = 0; + +/* Module declarations from 'cpython' */ + +/* Module declarations from 'cpython.object' */ + +/* Module declarations from 'cpython.ref' */ + +/* Module declarations from 'cpython.mem' */ + +/* Module declarations from 'numpy' */ + +/* Module declarations from 'numpy' */ +static PyTypeObject *__pyx_ptype_5numpy_dtype = 0; +static PyTypeObject *__pyx_ptype_5numpy_flatiter = 0; +static PyTypeObject *__pyx_ptype_5numpy_broadcast = 0; +static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0; +static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0; +static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *, char *, char *, int *); /*proto*/ +static CYTHON_INLINE int __pyx_f_5numpy_import_array(void); /*proto*/ + +/* Module declarations from 'libc.stdlib' */ + +/* Module declarations from '_mask' */ +static PyTypeObject *__pyx_ptype_5_mask_RLEs = 0; +static PyTypeObject *__pyx_ptype_5_mask_Masks = 0; +static __Pyx_TypeInfo 
__Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t = { "uint8_t", NULL, sizeof(__pyx_t_5numpy_uint8_t), { 0 }, 0, IS_UNSIGNED(__pyx_t_5numpy_uint8_t) ? 'U' : 'I', IS_UNSIGNED(__pyx_t_5numpy_uint8_t), 0 }; +static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_double_t = { "double_t", NULL, sizeof(__pyx_t_5numpy_double_t), { 0 }, 0, 'R', 0, 0 }; +static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_uint32_t = { "uint32_t", NULL, sizeof(__pyx_t_5numpy_uint32_t), { 0 }, 0, IS_UNSIGNED(__pyx_t_5numpy_uint32_t) ? 'U' : 'I', IS_UNSIGNED(__pyx_t_5numpy_uint32_t), 0 }; +#define __Pyx_MODULE_NAME "_mask" +extern int __pyx_module_is_main__mask; +int __pyx_module_is_main__mask = 0; + +/* Implementation of '_mask' */ +static PyObject *__pyx_builtin_range; +static PyObject *__pyx_builtin_AttributeError; +static PyObject *__pyx_builtin_TypeError; +static PyObject *__pyx_builtin_enumerate; +static PyObject *__pyx_builtin_ValueError; +static PyObject *__pyx_builtin_RuntimeError; +static PyObject *__pyx_builtin_ImportError; +static const char __pyx_k_F[] = "F"; +static const char __pyx_k_N[] = "N"; +static const char __pyx_k_R[] = "R"; +static const char __pyx_k_a[] = "_a"; +static const char __pyx_k_h[] = "h"; +static const char __pyx_k_i[] = "i"; +static const char __pyx_k_j[] = "j"; +static const char __pyx_k_m[] = "m"; +static const char __pyx_k_n[] = "n"; +static const char __pyx_k_p[] = "p"; +static const char __pyx_k_w[] = "w"; +static const char __pyx_k_Rs[] = "Rs"; +static const char __pyx_k_bb[] = "bb"; +static const char __pyx_k_dt[] = "dt"; +static const char __pyx_k_gt[] = "gt"; +static const char __pyx_k_np[] = "np"; +static const char __pyx_k_a_2[] = "a"; +static const char __pyx_k_all[] = "all"; +static const char __pyx_k_iou[] = "_iou"; +static const char __pyx_k_len[] = "_len"; +static const char __pyx_k_obj[] = "obj"; +static const char __pyx_k_sys[] = "sys"; +static const char __pyx_k_area[] = "area"; +static const char __pyx_k_bb_2[] = "_bb"; +static const char 
__pyx_k_cnts[] = "cnts"; +static const char __pyx_k_data[] = "data"; +static const char __pyx_k_main[] = "__main__"; +static const char __pyx_k_mask[] = "_mask"; +static const char __pyx_k_name[] = "__name__"; +static const char __pyx_k_objs[] = "objs"; +static const char __pyx_k_poly[] = "poly"; +static const char __pyx_k_size[] = "size"; +static const char __pyx_k_test[] = "__test__"; +static const char __pyx_k_utf8[] = "utf8"; +static const char __pyx_k_array[] = "array"; +static const char __pyx_k_bbIou[] = "_bbIou"; +static const char __pyx_k_dtype[] = "dtype"; +static const char __pyx_k_iou_2[] = "iou"; +static const char __pyx_k_isbox[] = "isbox"; +static const char __pyx_k_isrle[] = "isrle"; +static const char __pyx_k_masks[] = "masks"; +static const char __pyx_k_merge[] = "merge"; +static const char __pyx_k_numpy[] = "numpy"; +static const char __pyx_k_order[] = "order"; +static const char __pyx_k_pyobj[] = "pyobj"; +static const char __pyx_k_range[] = "range"; +static const char __pyx_k_shape[] = "shape"; +static const char __pyx_k_uint8[] = "uint8"; +static const char __pyx_k_zeros[] = "zeros"; +static const char __pyx_k_astype[] = "astype"; +static const char __pyx_k_author[] = "__author__"; +static const char __pyx_k_counts[] = "counts"; +static const char __pyx_k_decode[] = "decode"; +static const char __pyx_k_double[] = "double"; +static const char __pyx_k_encode[] = "encode"; +static const char __pyx_k_frBbox[] = "frBbox"; +static const char __pyx_k_frPoly[] = "frPoly"; +static const char __pyx_k_import[] = "__import__"; +static const char __pyx_k_iouFun[] = "_iouFun"; +static const char __pyx_k_mask_2[] = "mask"; +static const char __pyx_k_reduce[] = "__reduce__"; +static const char __pyx_k_rleIou[] = "_rleIou"; +static const char __pyx_k_toBbox[] = "toBbox"; +static const char __pyx_k_ucRles[] = "ucRles"; +static const char __pyx_k_uint32[] = "uint32"; +static const char __pyx_k_iscrowd[] = "iscrowd"; +static const char __pyx_k_np_poly[] = 
"np_poly"; +static const char __pyx_k_preproc[] = "_preproc"; +static const char __pyx_k_reshape[] = "reshape"; +static const char __pyx_k_rleObjs[] = "rleObjs"; +static const char __pyx_k_tsungyi[] = "tsungyi"; +static const char __pyx_k_c_string[] = "c_string"; +static const char __pyx_k_frString[] = "_frString"; +static const char __pyx_k_getstate[] = "__getstate__"; +static const char __pyx_k_mask_pyx[] = "_mask.pyx"; +static const char __pyx_k_setstate[] = "__setstate__"; +static const char __pyx_k_toString[] = "_toString"; +static const char __pyx_k_TypeError[] = "TypeError"; +static const char __pyx_k_enumerate[] = "enumerate"; +static const char __pyx_k_intersect[] = "intersect"; +static const char __pyx_k_py_string[] = "py_string"; +static const char __pyx_k_pyiscrowd[] = "pyiscrowd"; +static const char __pyx_k_reduce_ex[] = "__reduce_ex__"; +static const char __pyx_k_ValueError[] = "ValueError"; +static const char __pyx_k_ImportError[] = "ImportError"; +static const char __pyx_k_frPyObjects[] = "frPyObjects"; +static const char __pyx_k_RuntimeError[] = "RuntimeError"; +static const char __pyx_k_version_info[] = "version_info"; +static const char __pyx_k_reduce_cython[] = "__reduce_cython__"; +static const char __pyx_k_AttributeError[] = "AttributeError"; +static const char __pyx_k_PYTHON_VERSION[] = "PYTHON_VERSION"; +static const char __pyx_k_iou_locals__len[] = "iou.._len"; +static const char __pyx_k_setstate_cython[] = "__setstate_cython__"; +static const char __pyx_k_frUncompressedRLE[] = "frUncompressedRLE"; +static const char __pyx_k_iou_locals__bbIou[] = "iou.._bbIou"; +static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback"; +static const char __pyx_k_iou_locals__rleIou[] = "iou.._rleIou"; +static const char __pyx_k_iou_locals__preproc[] = "iou.._preproc"; +static const char __pyx_k_input_data_type_not_allowed[] = "input data type not allowed."; +static const char __pyx_k_input_type_is_not_supported[] = "input type is not 
supported."; +static const char __pyx_k_ndarray_is_not_C_contiguous[] = "ndarray is not C contiguous"; +static const char __pyx_k_Python_version_must_be_2_or_3[] = "Python version must be 2 or 3"; +static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multiarray failed to import"; +static const char __pyx_k_numpy_ndarray_input_is_only_for[] = "numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension"; +static const char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)"; +static const char __pyx_k_unrecognized_type_The_following[] = "unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported."; +static const char __pyx_k_Format_string_allocated_too_shor[] = "Format string allocated too short, see comment in numpy.pxd"; +static const char __pyx_k_Non_native_byte_order_not_suppor[] = "Non-native byte order not supported"; +static const char __pyx_k_The_dt_and_gt_should_have_the_sa[] = "The dt and gt should have the same data type, either RLEs, list or np.ndarray"; +static const char __pyx_k_list_input_can_be_bounding_box_N[] = "list input can be bounding box (Nx4) or RLEs ([RLE])"; +static const char __pyx_k_ndarray_is_not_Fortran_contiguou[] = "ndarray is not Fortran contiguous"; +static const char __pyx_k_no_default___reduce___due_to_non[] = "no default __reduce__ due to non-trivial __cinit__"; +static const char __pyx_k_numpy_core_umath_failed_to_impor[] = "numpy.core.umath failed to import"; +static const char __pyx_k_Format_string_allocated_too_shor_2[] = "Format string allocated too short."; +static PyObject *__pyx_n_s_AttributeError; +static PyObject *__pyx_n_s_F; +static PyObject *__pyx_kp_u_Format_string_allocated_too_shor; +static PyObject *__pyx_kp_u_Format_string_allocated_too_shor_2; +static PyObject *__pyx_n_s_ImportError; +static PyObject *__pyx_n_s_N; +static PyObject *__pyx_kp_u_Non_native_byte_order_not_suppor; +static PyObject 
*__pyx_n_s_PYTHON_VERSION; +static PyObject *__pyx_kp_s_Python_version_must_be_2_or_3; +static PyObject *__pyx_n_s_R; +static PyObject *__pyx_n_s_Rs; +static PyObject *__pyx_n_s_RuntimeError; +static PyObject *__pyx_kp_s_The_dt_and_gt_should_have_the_sa; +static PyObject *__pyx_n_s_TypeError; +static PyObject *__pyx_n_s_ValueError; +static PyObject *__pyx_n_s_a; +static PyObject *__pyx_n_s_a_2; +static PyObject *__pyx_n_s_all; +static PyObject *__pyx_n_s_area; +static PyObject *__pyx_n_s_array; +static PyObject *__pyx_n_s_astype; +static PyObject *__pyx_n_s_author; +static PyObject *__pyx_n_s_bb; +static PyObject *__pyx_n_s_bbIou; +static PyObject *__pyx_n_s_bb_2; +static PyObject *__pyx_n_s_c_string; +static PyObject *__pyx_n_s_cline_in_traceback; +static PyObject *__pyx_n_s_cnts; +static PyObject *__pyx_n_s_counts; +static PyObject *__pyx_n_s_data; +static PyObject *__pyx_n_s_decode; +static PyObject *__pyx_n_s_double; +static PyObject *__pyx_n_s_dt; +static PyObject *__pyx_n_s_dtype; +static PyObject *__pyx_n_s_encode; +static PyObject *__pyx_n_s_enumerate; +static PyObject *__pyx_n_s_frBbox; +static PyObject *__pyx_n_s_frPoly; +static PyObject *__pyx_n_s_frPyObjects; +static PyObject *__pyx_n_s_frString; +static PyObject *__pyx_n_s_frUncompressedRLE; +static PyObject *__pyx_n_s_getstate; +static PyObject *__pyx_n_s_gt; +static PyObject *__pyx_n_s_h; +static PyObject *__pyx_n_s_i; +static PyObject *__pyx_n_s_import; +static PyObject *__pyx_kp_s_input_data_type_not_allowed; +static PyObject *__pyx_kp_s_input_type_is_not_supported; +static PyObject *__pyx_n_s_intersect; +static PyObject *__pyx_n_s_iou; +static PyObject *__pyx_n_s_iouFun; +static PyObject *__pyx_n_s_iou_2; +static PyObject *__pyx_n_s_iou_locals__bbIou; +static PyObject *__pyx_n_s_iou_locals__len; +static PyObject *__pyx_n_s_iou_locals__preproc; +static PyObject *__pyx_n_s_iou_locals__rleIou; +static PyObject *__pyx_n_s_isbox; +static PyObject *__pyx_n_s_iscrowd; +static PyObject *__pyx_n_s_isrle; 
+static PyObject *__pyx_n_s_j; +static PyObject *__pyx_n_s_len; +static PyObject *__pyx_kp_s_list_input_can_be_bounding_box_N; +static PyObject *__pyx_n_s_m; +static PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_s_mask; +static PyObject *__pyx_n_s_mask_2; +static PyObject *__pyx_kp_s_mask_pyx; +static PyObject *__pyx_n_s_masks; +static PyObject *__pyx_n_s_merge; +static PyObject *__pyx_n_s_n; +static PyObject *__pyx_n_s_name; +static PyObject *__pyx_kp_u_ndarray_is_not_C_contiguous; +static PyObject *__pyx_kp_u_ndarray_is_not_Fortran_contiguou; +static PyObject *__pyx_kp_s_no_default___reduce___due_to_non; +static PyObject *__pyx_n_s_np; +static PyObject *__pyx_n_s_np_poly; +static PyObject *__pyx_n_s_numpy; +static PyObject *__pyx_kp_s_numpy_core_multiarray_failed_to; +static PyObject *__pyx_kp_s_numpy_core_umath_failed_to_impor; +static PyObject *__pyx_kp_s_numpy_ndarray_input_is_only_for; +static PyObject *__pyx_n_s_obj; +static PyObject *__pyx_n_s_objs; +static PyObject *__pyx_n_s_order; +static PyObject *__pyx_n_s_p; +static PyObject *__pyx_n_s_poly; +static PyObject *__pyx_n_s_preproc; +static PyObject *__pyx_n_s_py_string; +static PyObject *__pyx_n_s_pyiscrowd; +static PyObject *__pyx_n_s_pyobj; +static PyObject *__pyx_n_s_range; +static PyObject *__pyx_n_s_reduce; +static PyObject *__pyx_n_s_reduce_cython; +static PyObject *__pyx_n_s_reduce_ex; +static PyObject *__pyx_n_s_reshape; +static PyObject *__pyx_n_s_rleIou; +static PyObject *__pyx_n_s_rleObjs; +static PyObject *__pyx_n_s_setstate; +static PyObject *__pyx_n_s_setstate_cython; +static PyObject *__pyx_n_s_shape; +static PyObject *__pyx_n_s_size; +static PyObject *__pyx_n_s_sys; +static PyObject *__pyx_n_s_test; +static PyObject *__pyx_n_s_toBbox; +static PyObject *__pyx_n_s_toString; +static PyObject *__pyx_n_s_tsungyi; +static PyObject *__pyx_n_s_ucRles; +static PyObject *__pyx_n_s_uint32; +static PyObject *__pyx_n_s_uint8; +static PyObject *__pyx_kp_u_unknown_dtype_code_in_numpy_pxd; +static 
PyObject *__pyx_kp_s_unrecognized_type_The_following; +static PyObject *__pyx_n_s_utf8; +static PyObject *__pyx_n_s_version_info; +static PyObject *__pyx_n_s_w; +static PyObject *__pyx_n_s_zeros; +static int __pyx_pf_5_mask_4RLEs___cinit__(struct __pyx_obj_5_mask_RLEs *__pyx_v_self, siz __pyx_v_n); /* proto */ +static void __pyx_pf_5_mask_4RLEs_2__dealloc__(struct __pyx_obj_5_mask_RLEs *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5_mask_4RLEs_4__getattr__(struct __pyx_obj_5_mask_RLEs *__pyx_v_self, PyObject *__pyx_v_key); /* proto */ +static PyObject *__pyx_pf_5_mask_4RLEs_6__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5_mask_RLEs *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5_mask_4RLEs_8__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5_mask_RLEs *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */ +static int __pyx_pf_5_mask_5Masks___cinit__(struct __pyx_obj_5_mask_Masks *__pyx_v_self, PyObject *__pyx_v_h, PyObject *__pyx_v_w, PyObject *__pyx_v_n); /* proto */ +static PyObject *__pyx_pf_5_mask_5Masks_2__array__(struct __pyx_obj_5_mask_Masks *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5_mask_5Masks_4__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5_mask_Masks *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_5_mask_5Masks_6__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5_mask_Masks *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state); /* proto */ +static PyObject *__pyx_pf_5_mask__toString(CYTHON_UNUSED PyObject *__pyx_self, struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs); /* proto */ +static PyObject *__pyx_pf_5_mask_2_frString(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /* proto */ +static PyObject *__pyx_pf_5_mask_4encode(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_mask); /* proto */ +static PyObject *__pyx_pf_5_mask_6decode(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /* proto */ +static PyObject *__pyx_pf_5_mask_8merge(CYTHON_UNUSED 
PyObject *__pyx_self, PyObject *__pyx_v_rleObjs, PyObject *__pyx_v_intersect); /* proto */ +static PyObject *__pyx_pf_5_mask_10area(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /* proto */ +static PyObject *__pyx_pf_5_mask_3iou__preproc(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_objs); /* proto */ +static PyObject *__pyx_pf_5_mask_3iou_2_rleIou(CYTHON_UNUSED PyObject *__pyx_self, struct __pyx_obj_5_mask_RLEs *__pyx_v_dt, struct __pyx_obj_5_mask_RLEs *__pyx_v_gt, PyArrayObject *__pyx_v_iscrowd, siz __pyx_v_m, siz __pyx_v_n, PyArrayObject *__pyx_v__iou); /* proto */ +static PyObject *__pyx_pf_5_mask_3iou_4_bbIou(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_dt, PyArrayObject *__pyx_v_gt, PyArrayObject *__pyx_v_iscrowd, siz __pyx_v_m, siz __pyx_v_n, PyArrayObject *__pyx_v__iou); /* proto */ +static PyObject *__pyx_pf_5_mask_3iou_6_len(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj); /* proto */ +static PyObject *__pyx_pf_5_mask_12iou(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_dt, PyObject *__pyx_v_gt, PyObject *__pyx_v_pyiscrowd); /* proto */ +static PyObject *__pyx_pf_5_mask_14toBbox(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /* proto */ +static PyObject *__pyx_pf_5_mask_16frBbox(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_bb, siz __pyx_v_h, siz __pyx_v_w); /* proto */ +static PyObject *__pyx_pf_5_mask_18frPoly(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_poly, siz __pyx_v_h, siz __pyx_v_w); /* proto */ +static PyObject *__pyx_pf_5_mask_20frUncompressedRLE(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_ucRles, CYTHON_UNUSED siz __pyx_v_h, CYTHON_UNUSED siz __pyx_v_w); /* proto */ +static PyObject *__pyx_pf_5_mask_22frPyObjects(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_pyobj, PyObject *__pyx_v_h, PyObject *__pyx_v_w); /* proto */ +static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int 
__pyx_v_flags); /* proto */ +static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info); /* proto */ +static PyObject *__pyx_tp_new_5_mask_RLEs(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_tp_new_5_mask_Masks(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_int_0; +static PyObject *__pyx_int_1; +static PyObject *__pyx_int_2; +static PyObject *__pyx_int_3; +static PyObject *__pyx_int_4; +static PyObject *__pyx_tuple_; +static PyObject *__pyx_tuple__2; +static PyObject *__pyx_tuple__3; +static PyObject *__pyx_tuple__4; +static PyObject *__pyx_tuple__5; +static PyObject *__pyx_tuple__6; +static PyObject *__pyx_tuple__7; +static PyObject *__pyx_tuple__8; +static PyObject *__pyx_tuple__9; +static PyObject *__pyx_tuple__10; +static PyObject *__pyx_tuple__11; +static PyObject *__pyx_tuple__13; +static PyObject *__pyx_tuple__15; +static PyObject *__pyx_tuple__17; +static PyObject *__pyx_tuple__19; +static PyObject *__pyx_tuple__20; +static PyObject *__pyx_tuple__21; +static PyObject *__pyx_tuple__22; +static PyObject *__pyx_tuple__23; +static PyObject *__pyx_tuple__24; +static PyObject *__pyx_tuple__25; +static PyObject *__pyx_tuple__26; +static PyObject *__pyx_tuple__27; +static PyObject *__pyx_tuple__28; +static PyObject *__pyx_tuple__29; +static PyObject *__pyx_tuple__30; +static PyObject *__pyx_tuple__31; +static PyObject *__pyx_tuple__32; +static PyObject *__pyx_tuple__34; +static PyObject *__pyx_tuple__36; +static PyObject *__pyx_tuple__38; +static PyObject *__pyx_tuple__40; +static PyObject *__pyx_tuple__42; +static PyObject *__pyx_tuple__44; +static PyObject *__pyx_tuple__46; +static PyObject *__pyx_tuple__48; +static PyObject *__pyx_tuple__50; +static PyObject *__pyx_tuple__52; +static PyObject *__pyx_tuple__54; +static PyObject *__pyx_codeobj__12; +static PyObject *__pyx_codeobj__14; +static PyObject *__pyx_codeobj__16; +static PyObject 
*__pyx_codeobj__18; +static PyObject *__pyx_codeobj__33; +static PyObject *__pyx_codeobj__35; +static PyObject *__pyx_codeobj__37; +static PyObject *__pyx_codeobj__39; +static PyObject *__pyx_codeobj__41; +static PyObject *__pyx_codeobj__43; +static PyObject *__pyx_codeobj__45; +static PyObject *__pyx_codeobj__47; +static PyObject *__pyx_codeobj__49; +static PyObject *__pyx_codeobj__51; +static PyObject *__pyx_codeobj__53; +static PyObject *__pyx_codeobj__55; +/* Late includes */ + +/* "_mask.pyx":60 + * cdef siz _n + * + * def __cinit__(self, siz n =0): # <<<<<<<<<<<<<< + * rlesInit(&self._R, n) + * self._n = n + */ + +/* Python wrapper */ +static int __pyx_pw_5_mask_4RLEs_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static int __pyx_pw_5_mask_4RLEs_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + siz __pyx_v_n; + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_n,0}; + PyObject* values[1] = {0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (kw_args > 0) { + PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_n); + if (value) { values[0] = value; kw_args--; } + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(0, 60, __pyx_L3_error) + } + } else { + switch (PyTuple_GET_SIZE(__pyx_args)) { + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + } + if (values[0]) { + __pyx_v_n = 
__Pyx_PyInt_As_siz(values[0]); if (unlikely((__pyx_v_n == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 60, __pyx_L3_error) + } else { + __pyx_v_n = ((siz)0); + } + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 0, 1, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 60, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.RLEs.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return -1; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5_mask_4RLEs___cinit__(((struct __pyx_obj_5_mask_RLEs *)__pyx_v_self), __pyx_v_n); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5_mask_4RLEs___cinit__(struct __pyx_obj_5_mask_RLEs *__pyx_v_self, siz __pyx_v_n) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__cinit__", 0); + + /* "_mask.pyx":61 + * + * def __cinit__(self, siz n =0): + * rlesInit(&self._R, n) # <<<<<<<<<<<<<< + * self._n = n + * + */ + rlesInit((&__pyx_v_self->_R), __pyx_v_n); + + /* "_mask.pyx":62 + * def __cinit__(self, siz n =0): + * rlesInit(&self._R, n) + * self._n = n # <<<<<<<<<<<<<< + * + * # free the RLE array here + */ + __pyx_v_self->_n = __pyx_v_n; + + /* "_mask.pyx":60 + * cdef siz _n + * + * def __cinit__(self, siz n =0): # <<<<<<<<<<<<<< + * rlesInit(&self._R, n) + * self._n = n + */ + + /* function exit code */ + __pyx_r = 0; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":65 + * + * # free the RLE array here + * def __dealloc__(self): # <<<<<<<<<<<<<< + * if self._R is not NULL: + * for i in range(self._n): + */ + +/* Python wrapper */ +static void __pyx_pw_5_mask_4RLEs_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_5_mask_4RLEs_3__dealloc__(PyObject *__pyx_v_self) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + 
__pyx_pf_5_mask_4RLEs_2__dealloc__(((struct __pyx_obj_5_mask_RLEs *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +static void __pyx_pf_5_mask_4RLEs_2__dealloc__(struct __pyx_obj_5_mask_RLEs *__pyx_v_self) { + siz __pyx_v_i; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + siz __pyx_t_2; + siz __pyx_t_3; + siz __pyx_t_4; + __Pyx_RefNannySetupContext("__dealloc__", 0); + + /* "_mask.pyx":66 + * # free the RLE array here + * def __dealloc__(self): + * if self._R is not NULL: # <<<<<<<<<<<<<< + * for i in range(self._n): + * free(self._R[i].cnts) + */ + __pyx_t_1 = ((__pyx_v_self->_R != NULL) != 0); + if (__pyx_t_1) { + + /* "_mask.pyx":67 + * def __dealloc__(self): + * if self._R is not NULL: + * for i in range(self._n): # <<<<<<<<<<<<<< + * free(self._R[i].cnts) + * free(self._R) + */ + __pyx_t_2 = __pyx_v_self->_n; + __pyx_t_3 = __pyx_t_2; + for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) { + __pyx_v_i = __pyx_t_4; + + /* "_mask.pyx":68 + * if self._R is not NULL: + * for i in range(self._n): + * free(self._R[i].cnts) # <<<<<<<<<<<<<< + * free(self._R) + * def __getattr__(self, key): + */ + free((__pyx_v_self->_R[__pyx_v_i]).cnts); + } + + /* "_mask.pyx":69 + * for i in range(self._n): + * free(self._R[i].cnts) + * free(self._R) # <<<<<<<<<<<<<< + * def __getattr__(self, key): + * if key == 'n': + */ + free(__pyx_v_self->_R); + + /* "_mask.pyx":66 + * # free the RLE array here + * def __dealloc__(self): + * if self._R is not NULL: # <<<<<<<<<<<<<< + * for i in range(self._n): + * free(self._R[i].cnts) + */ + } + + /* "_mask.pyx":65 + * + * # free the RLE array here + * def __dealloc__(self): # <<<<<<<<<<<<<< + * if self._R is not NULL: + * for i in range(self._n): + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "_mask.pyx":70 + * free(self._R[i].cnts) + * free(self._R) + * def __getattr__(self, key): # <<<<<<<<<<<<<< + * if key == 'n': + * return self._n + */ + +/* Python wrapper */ +static 
PyObject *__pyx_pw_5_mask_4RLEs_5__getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_key); /*proto*/ +static PyObject *__pyx_pw_5_mask_4RLEs_5__getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_key) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__getattr__ (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_4RLEs_4__getattr__(((struct __pyx_obj_5_mask_RLEs *)__pyx_v_self), ((PyObject *)__pyx_v_key)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_4RLEs_4__getattr__(struct __pyx_obj_5_mask_RLEs *__pyx_v_self, PyObject *__pyx_v_key) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + __Pyx_RefNannySetupContext("__getattr__", 0); + + /* "_mask.pyx":71 + * free(self._R) + * def __getattr__(self, key): + * if key == 'n': # <<<<<<<<<<<<<< + * return self._n + * raise AttributeError(key) + */ + __pyx_t_1 = (__Pyx_PyString_Equals(__pyx_v_key, __pyx_n_s_n, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 71, __pyx_L1_error) + if (__pyx_t_1) { + + /* "_mask.pyx":72 + * def __getattr__(self, key): + * if key == 'n': + * return self._n # <<<<<<<<<<<<<< + * raise AttributeError(key) + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_2 = __Pyx_PyInt_From_siz(__pyx_v_self->_n); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 72, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; + goto __pyx_L0; + + /* "_mask.pyx":71 + * free(self._R) + * def __getattr__(self, key): + * if key == 'n': # <<<<<<<<<<<<<< + * return self._n + * raise AttributeError(key) + */ + } + + /* "_mask.pyx":73 + * if key == 'n': + * return self._n + * raise AttributeError(key) # <<<<<<<<<<<<<< + * + * # python class to wrap Mask array in C + */ + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_AttributeError, __pyx_v_key); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 73, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + 
__Pyx_Raise(__pyx_t_2, 0, 0, 0); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __PYX_ERR(0, 73, __pyx_L1_error) + + /* "_mask.pyx":70 + * free(self._R[i].cnts) + * free(self._R) + * def __getattr__(self, key): # <<<<<<<<<<<<<< + * if key == 'n': + * return self._n + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_AddTraceback("_mask.RLEs.__getattr__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_4RLEs_7__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_5_mask_4RLEs_7__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_4RLEs_6__reduce_cython__(((struct __pyx_obj_5_mask_RLEs *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_4RLEs_6__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5_mask_RLEs *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("__reduce_cython__", 0); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 2, __pyx_L1_error) 
+ __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 2, __pyx_L1_error) + + /* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("_mask.RLEs.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_4RLEs_9__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state); /*proto*/ +static PyObject *__pyx_pw_5_mask_4RLEs_9__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_4RLEs_8__setstate_cython__(((struct __pyx_obj_5_mask_RLEs *)__pyx_v_self), ((PyObject *)__pyx_v___pyx_state)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_4RLEs_8__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5_mask_RLEs *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("__setstate_cython__", 0); + + /* "(tree fragment)":4 + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default 
__reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 4, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 4, __pyx_L1_error) + + /* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("_mask.RLEs.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":83 + * cdef siz _n + * + * def __cinit__(self, h, w, n): # <<<<<<<<<<<<<< + * self._mask = malloc(h*w*n* sizeof(byte)) + * self._h = h + */ + +/* Python wrapper */ +static int __pyx_pw_5_mask_5Masks_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static int __pyx_pw_5_mask_5Masks_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_h = 0; + PyObject *__pyx_v_w = 0; + PyObject *__pyx_v_n = 0; + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_h,&__pyx_n_s_w,&__pyx_n_s_n,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: 
break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 1); __PYX_ERR(0, 83, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_n)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 2); __PYX_ERR(0, 83, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(0, 83, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_h = values[0]; + __pyx_v_w = values[1]; + __pyx_v_n = values[2]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 83, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.Masks.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return -1; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5_mask_5Masks___cinit__(((struct __pyx_obj_5_mask_Masks *)__pyx_v_self), __pyx_v_h, __pyx_v_w, __pyx_v_n); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5_mask_5Masks___cinit__(struct __pyx_obj_5_mask_Masks *__pyx_v_self, PyObject *__pyx_v_h, PyObject *__pyx_v_w, PyObject *__pyx_v_n) { + int __pyx_r; + __Pyx_RefNannyDeclarations + 
PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + size_t __pyx_t_4; + siz __pyx_t_5; + __Pyx_RefNannySetupContext("__cinit__", 0); + + /* "_mask.pyx":84 + * + * def __cinit__(self, h, w, n): + * self._mask = malloc(h*w*n* sizeof(byte)) # <<<<<<<<<<<<<< + * self._h = h + * self._w = w + */ + __pyx_t_1 = PyNumber_Multiply(__pyx_v_h, __pyx_v_w); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 84, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = PyNumber_Multiply(__pyx_t_1, __pyx_v_n); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 84, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyInt_FromSize_t((sizeof(byte))); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 84, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = PyNumber_Multiply(__pyx_t_2, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 84, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_4 = __Pyx_PyInt_As_size_t(__pyx_t_3); if (unlikely((__pyx_t_4 == (size_t)-1) && PyErr_Occurred())) __PYX_ERR(0, 84, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_self->_mask = ((byte *)malloc(__pyx_t_4)); + + /* "_mask.pyx":85 + * def __cinit__(self, h, w, n): + * self._mask = malloc(h*w*n* sizeof(byte)) + * self._h = h # <<<<<<<<<<<<<< + * self._w = w + * self._n = n + */ + __pyx_t_5 = __Pyx_PyInt_As_siz(__pyx_v_h); if (unlikely((__pyx_t_5 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 85, __pyx_L1_error) + __pyx_v_self->_h = __pyx_t_5; + + /* "_mask.pyx":86 + * self._mask = malloc(h*w*n* sizeof(byte)) + * self._h = h + * self._w = w # <<<<<<<<<<<<<< + * self._n = n + * # def __dealloc__(self): + */ + __pyx_t_5 = __Pyx_PyInt_As_siz(__pyx_v_w); if (unlikely((__pyx_t_5 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 86, __pyx_L1_error) + __pyx_v_self->_w = __pyx_t_5; + + /* "_mask.pyx":87 + * self._h = h + * self._w = w + * self._n 
= n # <<<<<<<<<<<<<< + * # def __dealloc__(self): + * # the memory management of _mask has been passed to np.ndarray + */ + __pyx_t_5 = __Pyx_PyInt_As_siz(__pyx_v_n); if (unlikely((__pyx_t_5 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 87, __pyx_L1_error) + __pyx_v_self->_n = __pyx_t_5; + + /* "_mask.pyx":83 + * cdef siz _n + * + * def __cinit__(self, h, w, n): # <<<<<<<<<<<<<< + * self._mask = malloc(h*w*n* sizeof(byte)) + * self._h = h + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_AddTraceback("_mask.Masks.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":93 + * + * # called when passing into np.array() and return an np.ndarray in column-major order + * def __array__(self): # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * shape[0] = self._h*self._w*self._n + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_5Masks_3__array__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_5_mask_5Masks_3__array__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__array__ (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_5Masks_2__array__(((struct __pyx_obj_5_mask_Masks *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_5Masks_2__array__(struct __pyx_obj_5_mask_Masks *__pyx_v_self) { + npy_intp __pyx_v_shape[1]; + PyObject *__pyx_v_ndarray = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + __Pyx_RefNannySetupContext("__array__", 0); + + /* 
"_mask.pyx":95 + * def __array__(self): + * cdef np.npy_intp shape[1] + * shape[0] = self._h*self._w*self._n # <<<<<<<<<<<<<< + * # Create a 1D array, and reshape it to fortran/Matlab column-major array + * ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') + */ + (__pyx_v_shape[0]) = ((((npy_intp)__pyx_v_self->_h) * __pyx_v_self->_w) * __pyx_v_self->_n); + + /* "_mask.pyx":97 + * shape[0] = self._h*self._w*self._n + * # Create a 1D array, and reshape it to fortran/Matlab column-major array + * ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') # <<<<<<<<<<<<<< + * # The _mask allocated by Masks is now handled by ndarray + * PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) + */ + __pyx_t_1 = PyArray_SimpleNewFromData(1, __pyx_v_shape, NPY_UINT8, __pyx_v_self->_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 97, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_reshape); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 97, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_self->_h); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 97, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = __Pyx_PyInt_From_siz(__pyx_v_self->_w); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 97, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyInt_From_siz(__pyx_v_self->_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 97, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = PyTuple_New(3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 97, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_t_4); + __pyx_t_1 = 0; + __pyx_t_3 = 0; + 
__pyx_t_4 = 0; + __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 97, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 97, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + if (PyDict_SetItem(__pyx_t_5, __pyx_n_s_order, __pyx_n_s_F) < 0) __PYX_ERR(0, 97, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 97, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_ndarray = __pyx_t_3; + __pyx_t_3 = 0; + + /* "_mask.pyx":99 + * ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') + * # The _mask allocated by Masks is now handled by ndarray + * PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) # <<<<<<<<<<<<<< + * return ndarray + * + */ + if (!(likely(((__pyx_v_ndarray) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_ndarray, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 99, __pyx_L1_error) + PyArray_ENABLEFLAGS(((PyArrayObject *)__pyx_v_ndarray), NPY_OWNDATA); + + /* "_mask.pyx":100 + * # The _mask allocated by Masks is now handled by ndarray + * PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) + * return ndarray # <<<<<<<<<<<<<< + * + * # internal conversion from Python RLEs object to compressed RLE format + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_ndarray); + __pyx_r = __pyx_v_ndarray; + goto __pyx_L0; + + /* "_mask.pyx":93 + * + * # called when passing into np.array() and return an np.ndarray in column-major order + * def __array__(self): # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * shape[0] = self._h*self._w*self._n + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); 
+ __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("_mask.Masks.__array__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_ndarray); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_5Masks_5__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_5_mask_5Masks_5__reduce_cython__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__reduce_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_5Masks_4__reduce_cython__(((struct __pyx_obj_5_mask_Masks *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_5Masks_4__reduce_cython__(CYTHON_UNUSED struct __pyx_obj_5_mask_Masks *__pyx_v_self) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("__reduce_cython__", 0); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 2, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 2, __pyx_L1_error) + + /* "(tree fragment)":1 + * def __reduce_cython__(self): # <<<<<<<<<<<<<< + * 
raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("_mask.Masks.__reduce_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_5Masks_7__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state); /*proto*/ +static PyObject *__pyx_pw_5_mask_5Masks_7__setstate_cython__(PyObject *__pyx_v_self, PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__setstate_cython__ (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_5Masks_6__setstate_cython__(((struct __pyx_obj_5_mask_Masks *)__pyx_v_self), ((PyObject *)__pyx_v___pyx_state)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_5Masks_6__setstate_cython__(CYTHON_UNUSED struct __pyx_obj_5_mask_Masks *__pyx_v_self, CYTHON_UNUSED PyObject *__pyx_v___pyx_state) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("__setstate_cython__", 0); + + /* "(tree fragment)":4 + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + */ + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_TypeError, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 4, __pyx_L1_error) + 
__Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(1, 4, __pyx_L1_error) + + /* "(tree fragment)":3 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): # <<<<<<<<<<<<<< + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("_mask.Masks.__setstate_cython__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":103 + * + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef bytes py_string + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_1_toString(PyObject *__pyx_self, PyObject *__pyx_v_Rs); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_1_toString = {"_toString", (PyCFunction)__pyx_pw_5_mask_1_toString, METH_O, 0}; +static PyObject *__pyx_pw_5_mask_1_toString(PyObject *__pyx_self, PyObject *__pyx_v_Rs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_toString (wrapper)", 0); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_Rs), __pyx_ptype_5_mask_RLEs, 1, "Rs", 0))) __PYX_ERR(0, 103, __pyx_L1_error) + __pyx_r = __pyx_pf_5_mask__toString(__pyx_self, ((struct __pyx_obj_5_mask_RLEs *)__pyx_v_Rs)); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask__toString(CYTHON_UNUSED PyObject *__pyx_self, struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs) { + siz __pyx_v_n; + PyObject *__pyx_v_py_string = 0; + char *__pyx_v_c_string; + PyObject *__pyx_v_objs = NULL; + siz __pyx_v_i; + PyObject *__pyx_r = NULL; + 
__Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + siz __pyx_t_2; + siz __pyx_t_3; + siz __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + int __pyx_t_8; + __Pyx_RefNannySetupContext("_toString", 0); + + /* "_mask.pyx":104 + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): + * cdef siz n = Rs.n # <<<<<<<<<<<<<< + * cdef bytes py_string + * cdef char* c_string + */ + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_Rs), __pyx_n_s_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 104, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyInt_As_siz(__pyx_t_1); if (unlikely((__pyx_t_2 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 104, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_n = __pyx_t_2; + + /* "_mask.pyx":107 + * cdef bytes py_string + * cdef char* c_string + * objs = [] # <<<<<<<<<<<<<< + * for i in range(n): + * c_string = rleToString( &Rs._R[i] ) + */ + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 107, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v_objs = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "_mask.pyx":108 + * cdef char* c_string + * objs = [] + * for i in range(n): # <<<<<<<<<<<<<< + * c_string = rleToString( &Rs._R[i] ) + * py_string = c_string + */ + __pyx_t_2 = __pyx_v_n; + __pyx_t_3 = __pyx_t_2; + for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) { + __pyx_v_i = __pyx_t_4; + + /* "_mask.pyx":109 + * objs = [] + * for i in range(n): + * c_string = rleToString( &Rs._R[i] ) # <<<<<<<<<<<<<< + * py_string = c_string + * objs.append({ + */ + __pyx_v_c_string = rleToString(((RLE *)(&(__pyx_v_Rs->_R[__pyx_v_i])))); + + /* "_mask.pyx":110 + * for i in range(n): + * c_string = rleToString( &Rs._R[i] ) + * py_string = c_string # <<<<<<<<<<<<<< + * objs.append({ + * 'size': [Rs._R[i].h, Rs._R[i].w], + */ + __pyx_t_1 = __Pyx_PyBytes_FromString(__pyx_v_c_string); if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 110, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_XDECREF_SET(__pyx_v_py_string, ((PyObject*)__pyx_t_1)); + __pyx_t_1 = 0; + + /* "_mask.pyx":112 + * py_string = c_string + * objs.append({ + * 'size': [Rs._R[i].h, Rs._R[i].w], # <<<<<<<<<<<<<< + * 'counts': py_string + * }) + */ + __pyx_t_1 = __Pyx_PyDict_NewPresized(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 112, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_5 = __Pyx_PyInt_From_siz((__pyx_v_Rs->_R[__pyx_v_i]).h); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 112, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_PyInt_From_siz((__pyx_v_Rs->_R[__pyx_v_i]).w); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 112, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_7 = PyList_New(2); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 112, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_5); + PyList_SET_ITEM(__pyx_t_7, 0, __pyx_t_5); + __Pyx_GIVEREF(__pyx_t_6); + PyList_SET_ITEM(__pyx_t_7, 1, __pyx_t_6); + __pyx_t_5 = 0; + __pyx_t_6 = 0; + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_size, __pyx_t_7) < 0) __PYX_ERR(0, 112, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + + /* "_mask.pyx":113 + * objs.append({ + * 'size': [Rs._R[i].h, Rs._R[i].w], + * 'counts': py_string # <<<<<<<<<<<<<< + * }) + * free(c_string) + */ + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_counts, __pyx_v_py_string) < 0) __PYX_ERR(0, 112, __pyx_L1_error) + + /* "_mask.pyx":111 + * c_string = rleToString( &Rs._R[i] ) + * py_string = c_string + * objs.append({ # <<<<<<<<<<<<<< + * 'size': [Rs._R[i].h, Rs._R[i].w], + * 'counts': py_string + */ + __pyx_t_8 = __Pyx_PyList_Append(__pyx_v_objs, __pyx_t_1); if (unlikely(__pyx_t_8 == ((int)-1))) __PYX_ERR(0, 111, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":115 + * 'counts': py_string + * }) + * free(c_string) # <<<<<<<<<<<<<< + * return objs + * + */ + free(__pyx_v_c_string); + } + + /* "_mask.pyx":116 + * }) + * 
free(c_string) + * return objs # <<<<<<<<<<<<<< + * + * # internal conversion from compressed RLE format to Python RLEs object + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "_mask.pyx":103 + * + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef bytes py_string + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_AddTraceback("_mask._toString", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_py_string); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":119 + * + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): # <<<<<<<<<<<<<< + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_3_frString(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_3_frString = {"_frString", (PyCFunction)__pyx_pw_5_mask_3_frString, METH_O, 0}; +static PyObject *__pyx_pw_5_mask_3_frString(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_frString (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_2_frString(__pyx_self, ((PyObject *)__pyx_v_rleObjs)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_2_frString(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + siz __pyx_v_n; + struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs = NULL; + PyObject *__pyx_v_py_string = 0; + char *__pyx_v_c_string; + PyObject *__pyx_v_i = NULL; + PyObject *__pyx_v_obj = NULL; + PyObject *__pyx_r = 
NULL; + __Pyx_RefNannyDeclarations + Py_ssize_t __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *(*__pyx_t_4)(PyObject *); + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + int __pyx_t_7; + PyObject *__pyx_t_8 = NULL; + PyObject *__pyx_t_9 = NULL; + PyObject *__pyx_t_10 = NULL; + PyObject *__pyx_t_11 = NULL; + char *__pyx_t_12; + Py_ssize_t __pyx_t_13; + siz __pyx_t_14; + siz __pyx_t_15; + __Pyx_RefNannySetupContext("_frString", 0); + + /* "_mask.pyx":120 + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): + * cdef siz n = len(rleObjs) # <<<<<<<<<<<<<< + * Rs = RLEs(n) + * cdef bytes py_string + */ + __pyx_t_1 = PyObject_Length(__pyx_v_rleObjs); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(0, 120, __pyx_L1_error) + __pyx_v_n = __pyx_t_1; + + /* "_mask.pyx":121 + * def _frString(rleObjs): + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) # <<<<<<<<<<<<<< + * cdef bytes py_string + * cdef char* c_string + */ + __pyx_t_2 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 121, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = __Pyx_PyObject_CallOneArg(((PyObject *)__pyx_ptype_5_mask_RLEs), __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 121, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_v_Rs = ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_3); + __pyx_t_3 = 0; + + /* "_mask.pyx":124 + * cdef bytes py_string + * cdef char* c_string + * for i, obj in enumerate(rleObjs): # <<<<<<<<<<<<<< + * if PYTHON_VERSION == 2: + * py_string = str(obj['counts']).encode('utf8') + */ + __Pyx_INCREF(__pyx_int_0); + __pyx_t_3 = __pyx_int_0; + if (likely(PyList_CheckExact(__pyx_v_rleObjs)) || PyTuple_CheckExact(__pyx_v_rleObjs)) { + __pyx_t_2 = __pyx_v_rleObjs; __Pyx_INCREF(__pyx_t_2); __pyx_t_1 = 0; + __pyx_t_4 = NULL; + } else { + __pyx_t_1 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_rleObjs); if 
(unlikely(!__pyx_t_2)) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 124, __pyx_L1_error) + } + for (;;) { + if (likely(!__pyx_t_4)) { + if (likely(PyList_CheckExact(__pyx_t_2))) { + if (__pyx_t_1 >= PyList_GET_SIZE(__pyx_t_2)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_5 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 124, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_2, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + } else { + if (__pyx_t_1 >= PyTuple_GET_SIZE(__pyx_t_2)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 124, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_2, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + } + } else { + __pyx_t_5 = __pyx_t_4(__pyx_t_2); + if (unlikely(!__pyx_t_5)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 124, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_5); + } + __Pyx_XDECREF_SET(__pyx_v_obj, __pyx_t_5); + __pyx_t_5 = 0; + __Pyx_INCREF(__pyx_t_3); + __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_3); + __pyx_t_5 = __Pyx_PyInt_AddObjC(__pyx_t_3, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); + __pyx_t_3 = __pyx_t_5; + __pyx_t_5 = 0; + + /* "_mask.pyx":125 + * cdef char* c_string + * for i, obj in enumerate(rleObjs): + * if PYTHON_VERSION == 2: # <<<<<<<<<<<<<< + * py_string = 
str(obj['counts']).encode('utf8') + * elif PYTHON_VERSION == 3: + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_PYTHON_VERSION); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 125, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_PyInt_EqObjC(__pyx_t_5, __pyx_int_2, 2, 0); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 125, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_6); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 125, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (__pyx_t_7) { + + /* "_mask.pyx":126 + * for i, obj in enumerate(rleObjs): + * if PYTHON_VERSION == 2: + * py_string = str(obj['counts']).encode('utf8') # <<<<<<<<<<<<<< + * elif PYTHON_VERSION == 3: + * py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] + */ + __pyx_t_6 = __Pyx_PyObject_Dict_GetItem(__pyx_v_obj, __pyx_n_s_counts); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 126, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_5 = __Pyx_PyObject_CallOneArg(((PyObject *)(&PyString_Type)), __pyx_t_6); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 126, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_encode); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 126, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_6, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 126, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (!(likely(PyBytes_CheckExact(__pyx_t_5))||((__pyx_t_5) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_t_5)->tp_name), 0))) __PYX_ERR(0, 126, __pyx_L1_error) + __Pyx_XDECREF_SET(__pyx_v_py_string, ((PyObject*)__pyx_t_5)); + __pyx_t_5 = 0; + + /* "_mask.pyx":125 + * cdef char* c_string + * for i, obj in 
enumerate(rleObjs): + * if PYTHON_VERSION == 2: # <<<<<<<<<<<<<< + * py_string = str(obj['counts']).encode('utf8') + * elif PYTHON_VERSION == 3: + */ + goto __pyx_L5; + } + + /* "_mask.pyx":127 + * if PYTHON_VERSION == 2: + * py_string = str(obj['counts']).encode('utf8') + * elif PYTHON_VERSION == 3: # <<<<<<<<<<<<<< + * py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] + * else: + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_PYTHON_VERSION); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 127, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_PyInt_EqObjC(__pyx_t_5, __pyx_int_3, 3, 0); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 127, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_6); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 127, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (likely(__pyx_t_7)) { + + /* "_mask.pyx":128 + * py_string = str(obj['counts']).encode('utf8') + * elif PYTHON_VERSION == 3: + * py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] # <<<<<<<<<<<<<< + * else: + * raise Exception('Python version must be 2 or 3') + */ + __pyx_t_5 = __Pyx_PyObject_Dict_GetItem(__pyx_v_obj, __pyx_n_s_counts); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_8 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_t_5)), ((PyObject *)(&PyString_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_8); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_8); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + if (__pyx_t_7) { + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(((PyObject *)(&PyString_Type)), __pyx_n_s_encode); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_9 = 
__Pyx_PyObject_Dict_GetItem(__pyx_v_obj, __pyx_n_s_counts); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + __pyx_t_10 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_5))) { + __pyx_t_10 = PyMethod_GET_SELF(__pyx_t_5); + if (likely(__pyx_t_10)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5); + __Pyx_INCREF(__pyx_t_10); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_5, function); + } + } + if (!__pyx_t_10) { + __pyx_t_8 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_t_9); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; + __Pyx_GOTREF(__pyx_t_8); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[2] = {__pyx_t_10, __pyx_t_9}; + __pyx_t_8 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[2] = {__pyx_t_10, __pyx_t_9}; + __pyx_t_8 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; + } else + #endif + { + __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_GIVEREF(__pyx_t_10); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_10); __pyx_t_10 = NULL; + __Pyx_GIVEREF(__pyx_t_9); + PyTuple_SET_ITEM(__pyx_t_11, 0+1, __pyx_t_9); + __pyx_t_9 = 0; + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_11, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + } + } + 
__Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (!(likely(PyBytes_CheckExact(__pyx_t_8))||((__pyx_t_8) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_t_8)->tp_name), 0))) __PYX_ERR(0, 128, __pyx_L1_error) + __pyx_t_6 = __pyx_t_8; + __pyx_t_8 = 0; + } else { + __pyx_t_8 = __Pyx_PyObject_Dict_GetItem(__pyx_v_obj, __pyx_n_s_counts); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 128, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + if (!(likely(PyBytes_CheckExact(__pyx_t_8))||((__pyx_t_8) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_t_8)->tp_name), 0))) __PYX_ERR(0, 128, __pyx_L1_error) + __pyx_t_6 = __pyx_t_8; + __pyx_t_8 = 0; + } + __Pyx_XDECREF_SET(__pyx_v_py_string, ((PyObject*)__pyx_t_6)); + __pyx_t_6 = 0; + + /* "_mask.pyx":127 + * if PYTHON_VERSION == 2: + * py_string = str(obj['counts']).encode('utf8') + * elif PYTHON_VERSION == 3: # <<<<<<<<<<<<<< + * py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] + * else: + */ + goto __pyx_L5; + } + + /* "_mask.pyx":130 + * py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] + * else: + * raise Exception('Python version must be 2 or 3') # <<<<<<<<<<<<<< + * c_string = py_string + * rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) + */ + /*else*/ { + __pyx_t_6 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 130, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_Raise(__pyx_t_6, 0, 0, 0); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __PYX_ERR(0, 130, __pyx_L1_error) + } + __pyx_L5:; + + /* "_mask.pyx":131 + * else: + * raise Exception('Python version must be 2 or 3') + * c_string = py_string # <<<<<<<<<<<<<< + * rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) + * return Rs + */ + if (unlikely(__pyx_v_py_string == Py_None)) { + 
PyErr_SetString(PyExc_TypeError, "expected bytes, NoneType found"); + __PYX_ERR(0, 131, __pyx_L1_error) + } + __pyx_t_12 = __Pyx_PyBytes_AsWritableString(__pyx_v_py_string); if (unlikely((!__pyx_t_12) && PyErr_Occurred())) __PYX_ERR(0, 131, __pyx_L1_error) + __pyx_v_c_string = __pyx_t_12; + + /* "_mask.pyx":132 + * raise Exception('Python version must be 2 or 3') + * c_string = py_string + * rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) # <<<<<<<<<<<<<< + * return Rs + * + */ + __pyx_t_13 = __Pyx_PyIndex_AsSsize_t(__pyx_v_i); if (unlikely((__pyx_t_13 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(0, 132, __pyx_L1_error) + __pyx_t_6 = __Pyx_PyObject_Dict_GetItem(__pyx_v_obj, __pyx_n_s_size); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 132, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_8 = __Pyx_GetItemInt(__pyx_t_6, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 132, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_14 = __Pyx_PyInt_As_siz(__pyx_t_8); if (unlikely((__pyx_t_14 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 132, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __pyx_t_8 = __Pyx_PyObject_Dict_GetItem(__pyx_v_obj, __pyx_n_s_size); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 132, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __pyx_t_6 = __Pyx_GetItemInt(__pyx_t_8, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 132, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __pyx_t_15 = __Pyx_PyInt_As_siz(__pyx_t_6); if (unlikely((__pyx_t_15 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 132, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + rleFrString(((RLE *)(&(__pyx_v_Rs->_R[__pyx_t_13]))), ((char *)__pyx_v_c_string), __pyx_t_14, __pyx_t_15); + + /* "_mask.pyx":124 + * cdef bytes py_string + * cdef char* c_string + * for i, obj in enumerate(rleObjs): # 
<<<<<<<<<<<<<< + * if PYTHON_VERSION == 2: + * py_string = str(obj['counts']).encode('utf8') + */ + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "_mask.pyx":133 + * c_string = py_string + * rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) + * return Rs # <<<<<<<<<<<<<< + * + * # encode mask to RLEs objects + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __pyx_r = ((PyObject *)__pyx_v_Rs); + goto __pyx_L0; + + /* "_mask.pyx":119 + * + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): # <<<<<<<<<<<<<< + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_XDECREF(__pyx_t_9); + __Pyx_XDECREF(__pyx_t_10); + __Pyx_XDECREF(__pyx_t_11); + __Pyx_AddTraceback("_mask._frString", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_py_string); + __Pyx_XDECREF(__pyx_v_i); + __Pyx_XDECREF(__pyx_v_obj); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":137 + * # encode mask to RLEs objects + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): # <<<<<<<<<<<<<< + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_5encode(PyObject *__pyx_self, PyObject *__pyx_v_mask); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_5encode = {"encode", (PyCFunction)__pyx_pw_5_mask_5encode, METH_O, 0}; +static PyObject *__pyx_pw_5_mask_5encode(PyObject *__pyx_self, PyObject *__pyx_v_mask) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + 
__Pyx_RefNannySetupContext("encode (wrapper)", 0); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_mask), __pyx_ptype_5numpy_ndarray, 1, "mask", 0))) __PYX_ERR(0, 137, __pyx_L1_error) + __pyx_r = __pyx_pf_5_mask_4encode(__pyx_self, ((PyArrayObject *)__pyx_v_mask)); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_4encode(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_mask) { + npy_intp __pyx_v_h; + npy_intp __pyx_v_w; + npy_intp __pyx_v_n; + struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs = 0; + PyObject *__pyx_v_objs = NULL; + __Pyx_LocalBuf_ND __pyx_pybuffernd_mask; + __Pyx_Buffer __pyx_pybuffer_mask; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + npy_intp __pyx_t_1; + npy_intp __pyx_t_2; + npy_intp __pyx_t_3; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + __Pyx_RefNannySetupContext("encode", 0); + __pyx_pybuffer_mask.pybuffer.buf = NULL; + __pyx_pybuffer_mask.refcount = 0; + __pyx_pybuffernd_mask.data = NULL; + __pyx_pybuffernd_mask.rcbuffer = &__pyx_pybuffer_mask; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_mask.rcbuffer->pybuffer, (PyObject*)__pyx_v_mask, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t, PyBUF_FORMAT| PyBUF_F_CONTIGUOUS, 3, 0, __pyx_stack) == -1)) __PYX_ERR(0, 137, __pyx_L1_error) + } + __pyx_pybuffernd_mask.diminfo[0].strides = __pyx_pybuffernd_mask.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_mask.diminfo[0].shape = __pyx_pybuffernd_mask.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_mask.diminfo[1].strides = __pyx_pybuffernd_mask.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_mask.diminfo[1].shape = __pyx_pybuffernd_mask.rcbuffer->pybuffer.shape[1]; __pyx_pybuffernd_mask.diminfo[2].strides = __pyx_pybuffernd_mask.rcbuffer->pybuffer.strides[2]; 
__pyx_pybuffernd_mask.diminfo[2].shape = __pyx_pybuffernd_mask.rcbuffer->pybuffer.shape[2]; + + /* "_mask.pyx":138 + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] # <<<<<<<<<<<<<< + * cdef RLEs Rs = RLEs(n) + * rleEncode(Rs._R,mask.data,h,w,n) + */ + __pyx_t_1 = (__pyx_v_mask->dimensions[0]); + __pyx_t_2 = (__pyx_v_mask->dimensions[1]); + __pyx_t_3 = (__pyx_v_mask->dimensions[2]); + __pyx_v_h = __pyx_t_1; + __pyx_v_w = __pyx_t_2; + __pyx_v_n = __pyx_t_3; + + /* "_mask.pyx":139 + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) # <<<<<<<<<<<<<< + * rleEncode(Rs._R,mask.data,h,w,n) + * objs = _toString(Rs) + */ + __pyx_t_4 = __Pyx_PyInt_From_Py_intptr_t(__pyx_v_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 139, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __Pyx_PyObject_CallOneArg(((PyObject *)__pyx_ptype_5_mask_RLEs), __pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 139, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_v_Rs = ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_5); + __pyx_t_5 = 0; + + /* "_mask.pyx":140 + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + * rleEncode(Rs._R,mask.data,h,w,n) # <<<<<<<<<<<<<< + * objs = _toString(Rs) + * return objs + */ + rleEncode(__pyx_v_Rs->_R, ((byte *)__pyx_v_mask->data), __pyx_v_h, __pyx_v_w, __pyx_v_n); + + /* "_mask.pyx":141 + * cdef RLEs Rs = RLEs(n) + * rleEncode(Rs._R,mask.data,h,w,n) + * objs = _toString(Rs) # <<<<<<<<<<<<<< + * return objs + * + */ + __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 141, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_6 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_4))) { + 
__pyx_t_6 = PyMethod_GET_SELF(__pyx_t_4); + if (likely(__pyx_t_6)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_4, function); + } + } + if (!__pyx_t_6) { + __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_4, ((PyObject *)__pyx_v_Rs)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 141, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_6, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 141, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_5); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_6, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 141, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_5); + } else + #endif + { + __pyx_t_7 = PyTuple_New(1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 141, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_6); __pyx_t_6 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_Rs)); + PyTuple_SET_ITEM(__pyx_t_7, 0+1, ((PyObject *)__pyx_v_Rs)); + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_7, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 141, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } + } + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_v_objs = __pyx_t_5; + __pyx_t_5 = 0; + + /* "_mask.pyx":142 + * rleEncode(Rs._R,mask.data,h,w,n) + * objs = _toString(Rs) + * return objs # <<<<<<<<<<<<<< + * + * # decode mask from compressed list of RLE string or RLEs object + */ + 
__Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "_mask.pyx":137 + * # encode mask to RLEs objects + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): # <<<<<<<<<<<<<< + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_mask.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("_mask.encode", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_mask.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":145 + * + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_7decode(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_7decode = {"decode", (PyCFunction)__pyx_pw_5_mask_7decode, METH_O, 0}; +static PyObject *__pyx_pw_5_mask_7decode(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("decode (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_6decode(__pyx_self, ((PyObject *)__pyx_v_rleObjs)); + + /* function exit code */ + 
__Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_6decode(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs = 0; + siz __pyx_v_h; + siz __pyx_v_w; + siz __pyx_v_n; + struct __pyx_obj_5_mask_Masks *__pyx_v_masks = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + siz __pyx_t_5; + siz __pyx_t_6; + siz __pyx_t_7; + __Pyx_RefNannySetupContext("decode", 0); + + /* "_mask.pyx":146 + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): + * cdef RLEs Rs = _frString(rleObjs) # <<<<<<<<<<<<<< + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + * masks = Masks(h, w, n) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_rleObjs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = 
__Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(__pyx_v_rleObjs); + __Pyx_GIVEREF(__pyx_v_rleObjs); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_rleObjs); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5_mask_RLEs))))) __PYX_ERR(0, 146, __pyx_L1_error) + __pyx_v_Rs = ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "_mask.pyx":147 + * def decode(rleObjs): + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n # <<<<<<<<<<<<<< + * masks = Masks(h, w, n) + * rleDecode(Rs._R, masks._mask, n); + */ + __pyx_t_5 = (__pyx_v_Rs->_R[0]).h; + __pyx_t_6 = (__pyx_v_Rs->_R[0]).w; + __pyx_t_7 = __pyx_v_Rs->_n; + __pyx_v_h = __pyx_t_5; + __pyx_v_w = __pyx_t_6; + __pyx_v_n = __pyx_t_7; + + /* "_mask.pyx":148 + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + * masks = Masks(h, w, n) # <<<<<<<<<<<<<< + * rleDecode(Rs._R, masks._mask, n); + * return np.array(masks) + */ + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_h); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyInt_From_siz(__pyx_v_w); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 148, __pyx_L1_error) + 
__Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_2); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_4); + __pyx_t_1 = 0; + __pyx_t_2 = 0; + __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_5_mask_Masks), __pyx_t_3, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_masks = ((struct __pyx_obj_5_mask_Masks *)__pyx_t_4); + __pyx_t_4 = 0; + + /* "_mask.pyx":149 + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + * masks = Masks(h, w, n) + * rleDecode(Rs._R, masks._mask, n); # <<<<<<<<<<<<<< + * return np.array(masks) + * + */ + rleDecode(((RLE *)__pyx_v_Rs->_R), __pyx_v_masks->_mask, __pyx_v_n); + + /* "_mask.pyx":150 + * masks = Masks(h, w, n) + * rleDecode(Rs._R, masks._mask, n); + * return np.array(masks) # <<<<<<<<<<<<<< + * + * def merge(rleObjs, intersect=0): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 150, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_array); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 150, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_2, ((PyObject *)__pyx_v_masks)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 150, __pyx_L1_error) + 
__Pyx_GOTREF(__pyx_t_4); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, ((PyObject *)__pyx_v_masks)}; + __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 150, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_4); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, ((PyObject *)__pyx_v_masks)}; + __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 150, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_4); + } else + #endif + { + __pyx_t_1 = PyTuple_New(1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 150, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_masks)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_masks)); + PyTuple_SET_ITEM(__pyx_t_1, 0+1, ((PyObject *)__pyx_v_masks)); + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 150, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_r = __pyx_t_4; + __pyx_t_4 = 0; + goto __pyx_L0; + + /* "_mask.pyx":145 + * + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("_mask.decode", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF((PyObject 
*)__pyx_v_masks); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":152 + * return np.array(masks) + * + * def merge(rleObjs, intersect=0): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_9merge(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_9merge = {"merge", (PyCFunction)__pyx_pw_5_mask_9merge, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_5_mask_9merge(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_rleObjs = 0; + PyObject *__pyx_v_intersect = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("merge (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_rleObjs,&__pyx_n_s_intersect,0}; + PyObject* values[2] = {0,0}; + values[1] = ((PyObject *)__pyx_int_0); + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_rleObjs)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (kw_args > 0) { + PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_intersect); + if (value) { values[1] = value; kw_args--; } + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "merge") < 0)) __PYX_ERR(0, 152, __pyx_L3_error) + } + } else { + switch (PyTuple_GET_SIZE(__pyx_args)) { + case 2: values[1] = 
PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + break; + default: goto __pyx_L5_argtuple_error; + } + } + __pyx_v_rleObjs = values[0]; + __pyx_v_intersect = values[1]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("merge", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 152, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.merge", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5_mask_8merge(__pyx_self, __pyx_v_rleObjs, __pyx_v_intersect); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_8merge(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs, PyObject *__pyx_v_intersect) { + struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs = 0; + struct __pyx_obj_5_mask_RLEs *__pyx_v_R = 0; + PyObject *__pyx_v_obj = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + int __pyx_t_5; + __Pyx_RefNannySetupContext("merge", 0); + + /* "_mask.pyx":153 + * + * def merge(rleObjs, intersect=0): + * cdef RLEs Rs = _frString(rleObjs) # <<<<<<<<<<<<<< + * cdef RLEs R = RLEs(1) + * rleMerge(Rs._R, R._R, Rs._n, intersect) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 153, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_1 = 
__Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_rleObjs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 153, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 153, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 153, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 153, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(__pyx_v_rleObjs); + __Pyx_GIVEREF(__pyx_v_rleObjs); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_rleObjs); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 153, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5_mask_RLEs))))) __PYX_ERR(0, 153, __pyx_L1_error) + __pyx_v_Rs = ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "_mask.pyx":154 + * def merge(rleObjs, intersect=0): + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) # <<<<<<<<<<<<<< + * rleMerge(Rs._R, R._R, Rs._n, intersect) + * obj = _toString(R)[0] + */ + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_5_mask_RLEs), __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 154, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v_R = ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "_mask.pyx":155 + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + * rleMerge(Rs._R, R._R, Rs._n, intersect) # <<<<<<<<<<<<<< + * obj = _toString(R)[0] + * return obj + */ + __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_v_intersect); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 155, __pyx_L1_error) + rleMerge(((RLE *)__pyx_v_Rs->_R), ((RLE *)__pyx_v_R->_R), ((siz)__pyx_v_Rs->_n), __pyx_t_5); + + /* "_mask.pyx":156 + * cdef RLEs R = RLEs(1) + * rleMerge(Rs._R, R._R, Rs._n, intersect) + * obj = _toString(R)[0] # <<<<<<<<<<<<<< + * return obj + * + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_4) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, ((PyObject *)__pyx_v_R)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_4, ((PyObject *)__pyx_v_R)}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_4, ((PyObject *)__pyx_v_R)}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 156, __pyx_L1_error) 
+ __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); __pyx_t_4 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_R)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_R)); + PyTuple_SET_ITEM(__pyx_t_3, 0+1, ((PyObject *)__pyx_v_R)); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_3, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_GetItemInt(__pyx_t_1, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 156, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_obj = __pyx_t_2; + __pyx_t_2 = 0; + + /* "_mask.pyx":157 + * rleMerge(Rs._R, R._R, Rs._n, intersect) + * obj = _toString(R)[0] + * return obj # <<<<<<<<<<<<<< + * + * def area(rleObjs): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_obj); + __pyx_r = __pyx_v_obj; + goto __pyx_L0; + + /* "_mask.pyx":152 + * return np.array(masks) + * + * def merge(rleObjs, intersect=0): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("_mask.merge", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF((PyObject *)__pyx_v_R); + __Pyx_XDECREF(__pyx_v_obj); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":159 + * return obj + * + * def area(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* 
sizeof(uint)) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_11area(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_11area = {"area", (PyCFunction)__pyx_pw_5_mask_11area, METH_O, 0}; +static PyObject *__pyx_pw_5_mask_11area(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("area (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_10area(__pyx_self, ((PyObject *)__pyx_v_rleObjs)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_10area(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs = 0; + uint *__pyx_v__a; + npy_intp __pyx_v_shape[1]; + PyObject *__pyx_v_a = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + __Pyx_RefNannySetupContext("area", 0); + + /* "_mask.pyx":160 + * + * def area(rleObjs): + * cdef RLEs Rs = _frString(rleObjs) # <<<<<<<<<<<<<< + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + * rleArea(Rs._R, Rs._n, _a) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 160, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_rleObjs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 160, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + 
PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 160, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 160, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 160, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(__pyx_v_rleObjs); + __Pyx_GIVEREF(__pyx_v_rleObjs); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_rleObjs); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 160, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5_mask_RLEs))))) __PYX_ERR(0, 160, __pyx_L1_error) + __pyx_v_Rs = ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "_mask.pyx":161 + * def area(rleObjs): + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) # <<<<<<<<<<<<<< + * rleArea(Rs._R, Rs._n, _a) + * cdef np.npy_intp shape[1] + */ + __pyx_v__a = ((uint *)malloc((__pyx_v_Rs->_n * (sizeof(unsigned int))))); + + /* "_mask.pyx":162 + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + * rleArea(Rs._R, Rs._n, _a) # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * shape[0] = Rs._n + */ + rleArea(__pyx_v_Rs->_R, __pyx_v_Rs->_n, __pyx_v__a); + + /* 
"_mask.pyx":164 + * rleArea(Rs._R, Rs._n, _a) + * cdef np.npy_intp shape[1] + * shape[0] = Rs._n # <<<<<<<<<<<<<< + * a = np.array((Rs._n, ), dtype=np.uint8) + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + */ + (__pyx_v_shape[0]) = ((npy_intp)__pyx_v_Rs->_n); + + /* "_mask.pyx":165 + * cdef np.npy_intp shape[1] + * shape[0] = Rs._n + * a = np.array((Rs._n, ), dtype=np.uint8) # <<<<<<<<<<<<<< + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + * PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 165, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_array); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 165, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_Rs->_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 165, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 165, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 165, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 165, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 165, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_uint8); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 165, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_dtype, __pyx_t_5) < 0) __PYX_ERR(0, 165, __pyx_L1_error) + 
__Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 165, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_v_a = __pyx_t_5; + __pyx_t_5 = 0; + + /* "_mask.pyx":166 + * shape[0] = Rs._n + * a = np.array((Rs._n, ), dtype=np.uint8) + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) # <<<<<<<<<<<<<< + * PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) + * return a + */ + __pyx_t_5 = PyArray_SimpleNewFromData(1, __pyx_v_shape, NPY_UINT32, __pyx_v__a); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 166, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF_SET(__pyx_v_a, __pyx_t_5); + __pyx_t_5 = 0; + + /* "_mask.pyx":167 + * a = np.array((Rs._n, ), dtype=np.uint8) + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + * PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) # <<<<<<<<<<<<<< + * return a + * + */ + if (!(likely(((__pyx_v_a) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_a, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 167, __pyx_L1_error) + PyArray_ENABLEFLAGS(((PyArrayObject *)__pyx_v_a), NPY_OWNDATA); + + /* "_mask.pyx":168 + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + * PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) + * return a # <<<<<<<<<<<<<< + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 
+ */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_a); + __pyx_r = __pyx_v_a; + goto __pyx_L0; + + /* "_mask.pyx":159 + * return obj + * + * def area(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("_mask.area", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_a); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":171 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_13iou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_13iou = {"iou", (PyCFunction)__pyx_pw_5_mask_13iou, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_5_mask_13iou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_dt = 0; + PyObject *__pyx_v_gt = 0; + PyObject *__pyx_v_pyiscrowd = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("iou (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_dt,&__pyx_n_s_gt,&__pyx_n_s_pyiscrowd,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: 
break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_dt)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_gt)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("iou", 1, 3, 3, 1); __PYX_ERR(0, 171, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_pyiscrowd)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("iou", 1, 3, 3, 2); __PYX_ERR(0, 171, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "iou") < 0)) __PYX_ERR(0, 171, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_dt = values[0]; + __pyx_v_gt = values[1]; + __pyx_v_pyiscrowd = values[2]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("iou", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 171, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.iou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5_mask_12iou(__pyx_self, __pyx_v_dt, __pyx_v_gt, __pyx_v_pyiscrowd); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":172 + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 
+ * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): # <<<<<<<<<<<<<< + * if len(objs) == 0: + * return objs + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_3iou_1_preproc(PyObject *__pyx_self, PyObject *__pyx_v_objs); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_3iou_1_preproc = {"_preproc", (PyCFunction)__pyx_pw_5_mask_3iou_1_preproc, METH_O, 0}; +static PyObject *__pyx_pw_5_mask_3iou_1_preproc(PyObject *__pyx_self, PyObject *__pyx_v_objs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_preproc (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_3iou__preproc(__pyx_self, ((PyObject *)__pyx_v_objs)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_3iou__preproc(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_objs) { + PyObject *__pyx_v_isbox = NULL; + PyObject *__pyx_v_isrle = NULL; + PyObject *__pyx_v_obj = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + Py_ssize_t __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + int __pyx_t_8; + int __pyx_t_9; + PyObject *__pyx_t_10 = NULL; + PyObject *(*__pyx_t_11)(PyObject *); + PyObject *__pyx_t_12 = NULL; + Py_ssize_t __pyx_t_13; + PyObject *__pyx_t_14 = NULL; + __Pyx_RefNannySetupContext("_preproc", 0); + __Pyx_INCREF(__pyx_v_objs); + + /* "_mask.pyx":173 + * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): + * if len(objs) == 0: # <<<<<<<<<<<<<< + * return objs + * if type(objs) == np.ndarray: + */ + __pyx_t_1 = PyObject_Length(__pyx_v_objs); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(0, 173, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_1 == 0) != 0); + if (__pyx_t_2) { + + /* "_mask.pyx":174 + * def _preproc(objs): + * if len(objs) == 0: + * return objs # <<<<<<<<<<<<<< + * if type(objs) == np.ndarray: + * if len(objs.shape) == 1: + */ 
+ __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "_mask.pyx":173 + * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): + * if len(objs) == 0: # <<<<<<<<<<<<<< + * return objs + * if type(objs) == np.ndarray: + */ + } + + /* "_mask.pyx":175 + * if len(objs) == 0: + * return objs + * if type(objs) == np.ndarray: # <<<<<<<<<<<<<< + * if len(objs.shape) == 1: + * objs = objs.reshape((objs[0], 1)) + */ + __pyx_t_3 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_objs)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 175, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 175, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_2) { + + /* "_mask.pyx":176 + * return objs + * if type(objs) == np.ndarray: + * if len(objs.shape) == 1: # <<<<<<<<<<<<<< + * objs = objs.reshape((objs[0], 1)) + * # check if it's Nx4 bbox + */ + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = ((__pyx_t_1 == 1) != 0); + if (__pyx_t_2) { + + /* "_mask.pyx":177 + * if type(objs) == np.ndarray: + * if len(objs.shape) == 1: + * objs = objs.reshape((objs[0], 1)) # <<<<<<<<<<<<<< + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: + */ + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_reshape); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __Pyx_GetItemInt(__pyx_v_objs, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = 
PyTuple_New(2); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_5); + __Pyx_INCREF(__pyx_int_1); + __Pyx_GIVEREF(__pyx_int_1); + PyTuple_SET_ITEM(__pyx_t_6, 1, __pyx_int_1); + __pyx_t_5 = 0; + __pyx_t_5 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4); + if (likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_4, function); + } + } + if (!__pyx_t_5) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_6); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_t_6}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_t_6}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + { + __pyx_t_7 = PyTuple_New(1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_5); __pyx_t_5 = NULL; + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_7, 0+1, __pyx_t_6); + __pyx_t_6 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_7, NULL); if (unlikely(!__pyx_t_3)) 
__PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } + } + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_3); + __pyx_t_3 = 0; + + /* "_mask.pyx":176 + * return objs + * if type(objs) == np.ndarray: + * if len(objs.shape) == 1: # <<<<<<<<<<<<<< + * objs = objs.reshape((objs[0], 1)) + * # check if it's Nx4 bbox + */ + } + + /* "_mask.pyx":179 + * objs = objs.reshape((objs[0], 1)) + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: # <<<<<<<<<<<<<< + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + * objs = objs.astype(np.double) + */ + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_8 = ((!((__pyx_t_1 == 2) != 0)) != 0); + if (!__pyx_t_8) { + } else { + __pyx_t_2 = __pyx_t_8; + goto __pyx_L7_bool_binop_done; + } + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_GetItemInt(__pyx_t_3, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyInt_EqObjC(__pyx_t_4, __pyx_int_4, 4, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_9 = ((!__pyx_t_8) != 0); + __pyx_t_2 = __pyx_t_9; + __pyx_L7_bool_binop_done:; + if 
(unlikely(__pyx_t_2)) { + + /* "_mask.pyx":180 + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') # <<<<<<<<<<<<<< + * objs = objs.astype(np.double) + * elif type(objs) == list: + */ + __pyx_t_3 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 180, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(0, 180, __pyx_L1_error) + + /* "_mask.pyx":179 + * objs = objs.reshape((objs[0], 1)) + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: # <<<<<<<<<<<<<< + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + * objs = objs.astype(np.double) + */ + } + + /* "_mask.pyx":181 + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + * objs = objs.astype(np.double) # <<<<<<<<<<<<<< + * elif type(objs) == list: + * # check if list is in box format and convert it to np.ndarray + */ + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_astype); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_7 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_double); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_4); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); + 
__Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_4, function); + } + } + if (!__pyx_t_7) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_6); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_6}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_6}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + { + __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_7); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_7); __pyx_t_7 = NULL; + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_6); + __pyx_t_6 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_5, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } + } + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_3); + __pyx_t_3 = 0; + + /* "_mask.pyx":175 + * if len(objs) == 0: + * return objs + * if type(objs) == np.ndarray: # <<<<<<<<<<<<<< + * if len(objs.shape) == 1: + * objs = objs.reshape((objs[0], 1)) + */ + goto __pyx_L4; + } + + /* "_mask.pyx":182 + * raise Exception('numpy ndarray input is only for *bounding boxes* and 
should have Nx4 dimension') + * objs = objs.astype(np.double) + * elif type(objs) == list: # <<<<<<<<<<<<<< + * # check if list is in box format and convert it to np.ndarray + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + */ + __pyx_t_3 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_objs)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 182, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 182, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (likely(__pyx_t_2)) { + + /* "_mask.pyx":184 + * elif type(objs) == list: + * # check if list is in box format and convert it to np.ndarray + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) # <<<<<<<<<<<<<< + * isrle = np.all(np.array([type(obj) == dict for obj in objs])) + * if isbox: + */ + __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_all); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_6, __pyx_n_s_array); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = PyList_New(0); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (likely(PyList_CheckExact(__pyx_v_objs)) || PyTuple_CheckExact(__pyx_v_objs)) { + __pyx_t_10 = __pyx_v_objs; __Pyx_INCREF(__pyx_t_10); __pyx_t_1 = 0; + __pyx_t_11 = NULL; + } else { + __pyx_t_1 = -1; 
__pyx_t_10 = PyObject_GetIter(__pyx_v_objs); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_11 = Py_TYPE(__pyx_t_10)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 184, __pyx_L1_error) + } + for (;;) { + if (likely(!__pyx_t_11)) { + if (likely(PyList_CheckExact(__pyx_t_10))) { + if (__pyx_t_1 >= PyList_GET_SIZE(__pyx_t_10)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_12 = PyList_GET_ITEM(__pyx_t_10, __pyx_t_1); __Pyx_INCREF(__pyx_t_12); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 184, __pyx_L1_error) + #else + __pyx_t_12 = PySequence_ITEM(__pyx_t_10, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + #endif + } else { + if (__pyx_t_1 >= PyTuple_GET_SIZE(__pyx_t_10)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_12 = PyTuple_GET_ITEM(__pyx_t_10, __pyx_t_1); __Pyx_INCREF(__pyx_t_12); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 184, __pyx_L1_error) + #else + __pyx_t_12 = PySequence_ITEM(__pyx_t_10, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + #endif + } + } else { + __pyx_t_12 = __pyx_t_11(__pyx_t_10); + if (unlikely(!__pyx_t_12)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 184, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_12); + } + __Pyx_XDECREF_SET(__pyx_v_obj, __pyx_t_12); + __pyx_t_12 = 0; + __pyx_t_13 = PyObject_Length(__pyx_v_obj); if (unlikely(__pyx_t_13 == ((Py_ssize_t)-1))) __PYX_ERR(0, 184, __pyx_L1_error) + __pyx_t_2 = (__pyx_t_13 == 4); + if (__pyx_t_2) { + } else { + __pyx_t_14 = __Pyx_PyBool_FromLong(__pyx_t_2); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_14); + __pyx_t_12 = __pyx_t_14; + __pyx_t_14 = 0; + 
goto __pyx_L11_bool_binop_done; + } + __pyx_t_14 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_14); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 184, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_14); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 184, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; + } else { + __Pyx_INCREF(__pyx_t_14); + __pyx_t_12 = __pyx_t_14; + __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; + goto __pyx_L11_bool_binop_done; + } + __pyx_t_14 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_14); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_INCREF(__pyx_t_14); + __pyx_t_12 = __pyx_t_14; + __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; + __pyx_L11_bool_binop_done:; + if (unlikely(__Pyx_ListComp_Append(__pyx_t_6, (PyObject*)__pyx_t_12))) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + } + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __pyx_t_10 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_7))) { + __pyx_t_10 = PyMethod_GET_SELF(__pyx_t_7); + if (likely(__pyx_t_10)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_7); + __Pyx_INCREF(__pyx_t_10); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_7, function); + } + } + if (!__pyx_t_10) { + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_7, __pyx_t_6); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_4); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_7)) { + PyObject *__pyx_temp[2] = {__pyx_t_10, __pyx_t_6}; + __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_7, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_6); 
__pyx_t_6 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_7)) { + PyObject *__pyx_temp[2] = {__pyx_t_10, __pyx_t_6}; + __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_7, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + { + __pyx_t_12 = PyTuple_New(1+1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_GIVEREF(__pyx_t_10); PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_10); __pyx_t_10 = NULL; + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_12, 0+1, __pyx_t_6); + __pyx_t_6 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_7, __pyx_t_12, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + } + } + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_5))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_5); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5); + __Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_5, function); + } + } + if (!__pyx_t_7) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_4}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { + PyObject 
*__pyx_temp[2] = {__pyx_t_7, __pyx_t_4}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + { + __pyx_t_12 = PyTuple_New(1+1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_GIVEREF(__pyx_t_7); PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_7); __pyx_t_7 = NULL; + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_12, 0+1, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_12, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 184, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + } + } + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_isbox = __pyx_t_3; + __pyx_t_3 = 0; + + /* "_mask.pyx":185 + * # check if list is in box format and convert it to np.ndarray + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + * isrle = np.all(np.array([type(obj) == dict for obj in objs])) # <<<<<<<<<<<<<< + * if isbox: + * objs = np.array(objs, dtype=np.double) + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_all); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_array); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = PyList_New(0); if (unlikely(!__pyx_t_4)) 
__PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + if (likely(PyList_CheckExact(__pyx_v_objs)) || PyTuple_CheckExact(__pyx_v_objs)) { + __pyx_t_6 = __pyx_v_objs; __Pyx_INCREF(__pyx_t_6); __pyx_t_1 = 0; + __pyx_t_11 = NULL; + } else { + __pyx_t_1 = -1; __pyx_t_6 = PyObject_GetIter(__pyx_v_objs); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_11 = Py_TYPE(__pyx_t_6)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 185, __pyx_L1_error) + } + for (;;) { + if (likely(!__pyx_t_11)) { + if (likely(PyList_CheckExact(__pyx_t_6))) { + if (__pyx_t_1 >= PyList_GET_SIZE(__pyx_t_6)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_10 = PyList_GET_ITEM(__pyx_t_6, __pyx_t_1); __Pyx_INCREF(__pyx_t_10); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 185, __pyx_L1_error) + #else + __pyx_t_10 = PySequence_ITEM(__pyx_t_6, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + #endif + } else { + if (__pyx_t_1 >= PyTuple_GET_SIZE(__pyx_t_6)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_10 = PyTuple_GET_ITEM(__pyx_t_6, __pyx_t_1); __Pyx_INCREF(__pyx_t_10); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 185, __pyx_L1_error) + #else + __pyx_t_10 = PySequence_ITEM(__pyx_t_6, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + #endif + } + } else { + __pyx_t_10 = __pyx_t_11(__pyx_t_6); + if (unlikely(!__pyx_t_10)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 185, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_10); + } + __Pyx_XDECREF_SET(__pyx_v_obj, __pyx_t_10); + __pyx_t_10 = 0; + __pyx_t_10 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)(&PyDict_Type)), Py_EQ); 
__Pyx_XGOTREF(__pyx_t_10); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 185, __pyx_L1_error) + if (unlikely(__Pyx_ListComp_Append(__pyx_t_4, (PyObject*)__pyx_t_10))) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + } + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_7))) { + __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_7); + if (likely(__pyx_t_6)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_7); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_7, function); + } + } + if (!__pyx_t_6) { + __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_7, __pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_5); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_7)) { + PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_t_4}; + __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_7, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_7)) { + PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_t_4}; + __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_7, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + { + __pyx_t_10 = PyTuple_New(1+1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_10, 0, __pyx_t_6); __pyx_t_6 = NULL; + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_10, 0+1, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_7, __pyx_t_10, NULL); if (unlikely(!__pyx_t_5)) 
__PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + } + } + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_12))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_12); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_12); + __Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_12, function); + } + } + if (!__pyx_t_7) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_12, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_12)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_5}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_12, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_12)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_5}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_12, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + { + __pyx_t_10 = PyTuple_New(1+1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __Pyx_GIVEREF(__pyx_t_7); PyTuple_SET_ITEM(__pyx_t_10, 0, __pyx_t_7); __pyx_t_7 = NULL; + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_10, 0+1, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_12, __pyx_t_10, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + } + } + 
__Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + __pyx_v_isrle = __pyx_t_3; + __pyx_t_3 = 0; + + /* "_mask.pyx":186 + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + * isrle = np.all(np.array([type(obj) == dict for obj in objs])) + * if isbox: # <<<<<<<<<<<<<< + * objs = np.array(objs, dtype=np.double) + * if len(objs.shape) == 1: + */ + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_v_isbox); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 186, __pyx_L1_error) + if (__pyx_t_2) { + + /* "_mask.pyx":187 + * isrle = np.all(np.array([type(obj) == dict for obj in objs])) + * if isbox: + * objs = np.array(objs, dtype=np.double) # <<<<<<<<<<<<<< + * if len(objs.shape) == 1: + * objs = objs.reshape((1,objs.shape[0])) + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_array); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_v_objs); + __Pyx_GIVEREF(__pyx_v_objs); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_v_objs); + __pyx_t_10 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_double); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (PyDict_SetItem(__pyx_t_10, __pyx_n_s_dtype, __pyx_t_7) < 0) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = 
__Pyx_PyObject_Call(__pyx_t_12, __pyx_t_3, __pyx_t_10); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_7); + __pyx_t_7 = 0; + + /* "_mask.pyx":188 + * if isbox: + * objs = np.array(objs, dtype=np.double) + * if len(objs.shape) == 1: # <<<<<<<<<<<<<< + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: + */ + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 188, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_1 = PyObject_Length(__pyx_t_7); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(0, 188, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_2 = ((__pyx_t_1 == 1) != 0); + if (__pyx_t_2) { + + /* "_mask.pyx":189 + * objs = np.array(objs, dtype=np.double) + * if len(objs.shape) == 1: + * objs = objs.reshape((1,objs.shape[0])) # <<<<<<<<<<<<<< + * elif isrle: + * objs = _frString(objs) + */ + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_reshape); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_12 = __Pyx_GetItemInt(__pyx_t_3, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_int_1); + __Pyx_GIVEREF(__pyx_int_1); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_int_1); + __Pyx_GIVEREF(__pyx_t_12); + PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_12); + __pyx_t_12 = 0; + __pyx_t_12 = NULL; + if 
(CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_10))) { + __pyx_t_12 = PyMethod_GET_SELF(__pyx_t_10); + if (likely(__pyx_t_12)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_10); + __Pyx_INCREF(__pyx_t_12); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_10, function); + } + } + if (!__pyx_t_12) { + __pyx_t_7 = __Pyx_PyObject_CallOneArg(__pyx_t_10, __pyx_t_3); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_7); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_10)) { + PyObject *__pyx_temp[2] = {__pyx_t_12, __pyx_t_3}; + __pyx_t_7 = __Pyx_PyFunction_FastCall(__pyx_t_10, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_10)) { + PyObject *__pyx_temp[2] = {__pyx_t_12, __pyx_t_3}; + __pyx_t_7 = __Pyx_PyCFunction_FastCall(__pyx_t_10, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } else + #endif + { + __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_12); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_12); __pyx_t_12 = NULL; + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_3); + __pyx_t_3 = 0; + __pyx_t_7 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_t_5, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } + } + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_7); + __pyx_t_7 = 0; + + /* "_mask.pyx":188 + * if isbox: + * objs = np.array(objs, dtype=np.double) + * 
if len(objs.shape) == 1: # <<<<<<<<<<<<<< + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: + */ + } + + /* "_mask.pyx":186 + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + * isrle = np.all(np.array([type(obj) == dict for obj in objs])) + * if isbox: # <<<<<<<<<<<<<< + * objs = np.array(objs, dtype=np.double) + * if len(objs.shape) == 1: + */ + goto __pyx_L16; + } + + /* "_mask.pyx":190 + * if len(objs.shape) == 1: + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: # <<<<<<<<<<<<<< + * objs = _frString(objs) + * else: + */ + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_v_isrle); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 190, __pyx_L1_error) + if (likely(__pyx_t_2)) { + + /* "_mask.pyx":191 + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: + * objs = _frString(objs) # <<<<<<<<<<<<<< + * else: + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') + */ + __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_5 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_10))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_10); + if (likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_10); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_10, function); + } + } + if (!__pyx_t_5) { + __pyx_t_7 = __Pyx_PyObject_CallOneArg(__pyx_t_10, __pyx_v_objs); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_10)) { + PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_v_objs}; + __pyx_t_7 = __Pyx_PyFunction_FastCall(__pyx_t_10, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_7); + } else + #endif + #if CYTHON_FAST_PYCCALL + 
if (__Pyx_PyFastCFunction_Check(__pyx_t_10)) { + PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_v_objs}; + __pyx_t_7 = __Pyx_PyCFunction_FastCall(__pyx_t_10, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_7); + } else + #endif + { + __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_5); __pyx_t_5 = NULL; + __Pyx_INCREF(__pyx_v_objs); + __Pyx_GIVEREF(__pyx_v_objs); + PyTuple_SET_ITEM(__pyx_t_3, 0+1, __pyx_v_objs); + __pyx_t_7 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_t_3, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } + } + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_7); + __pyx_t_7 = 0; + + /* "_mask.pyx":190 + * if len(objs.shape) == 1: + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: # <<<<<<<<<<<<<< + * objs = _frString(objs) + * else: + */ + goto __pyx_L16; + } + + /* "_mask.pyx":193 + * objs = _frString(objs) + * else: + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') # <<<<<<<<<<<<<< + * else: + * raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + */ + /*else*/ { + __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 193, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_Raise(__pyx_t_7, 0, 0, 0); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __PYX_ERR(0, 193, __pyx_L1_error) + } + __pyx_L16:; + + /* "_mask.pyx":182 + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + * objs = objs.astype(np.double) + * elif type(objs) == list: # <<<<<<<<<<<<<< + * # check if list is in box format and convert it to np.ndarray + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + */ + goto __pyx_L4; + } + + /* "_mask.pyx":195 + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') + * else: + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') # <<<<<<<<<<<<<< + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + /*else*/ { + __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_Raise(__pyx_t_7, 0, 0, 0); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __PYX_ERR(0, 195, __pyx_L1_error) + } + __pyx_L4:; + + /* "_mask.pyx":196 + * else: + * raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs # <<<<<<<<<<<<<< + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "_mask.pyx":172 + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): # <<<<<<<<<<<<<< + * if len(objs) == 0: + * return objs + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_10); + __Pyx_XDECREF(__pyx_t_12); + __Pyx_XDECREF(__pyx_t_14); + __Pyx_AddTraceback("_mask.iou._preproc", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_isbox); + __Pyx_XDECREF(__pyx_v_isrle); + __Pyx_XDECREF(__pyx_v_obj); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":197 + * raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_3iou_3_rleIou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_3iou_3_rleIou = {"_rleIou", (PyCFunction)__pyx_pw_5_mask_3iou_3_rleIou, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_5_mask_3iou_3_rleIou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + struct __pyx_obj_5_mask_RLEs *__pyx_v_dt = 0; + struct __pyx_obj_5_mask_RLEs *__pyx_v_gt = 0; + PyArrayObject *__pyx_v_iscrowd = 0; + siz __pyx_v_m; + siz __pyx_v_n; + PyArrayObject *__pyx_v__iou = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_rleIou (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_dt,&__pyx_n_s_gt,&__pyx_n_s_iscrowd,&__pyx_n_s_m,&__pyx_n_s_n,&__pyx_n_s_iou,0}; + PyObject* values[6] = {0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + 
default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_dt)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_gt)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 1); __PYX_ERR(0, 197, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_iscrowd)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 2); __PYX_ERR(0, 197, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_m)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 3); __PYX_ERR(0, 197, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_n)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 4); __PYX_ERR(0, 197, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_iou)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 5); __PYX_ERR(0, 197, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "_rleIou") < 0)) __PYX_ERR(0, 197, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 6) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + } + __pyx_v_dt = ((struct __pyx_obj_5_mask_RLEs 
*)values[0]); + __pyx_v_gt = ((struct __pyx_obj_5_mask_RLEs *)values[1]); + __pyx_v_iscrowd = ((PyArrayObject *)values[2]); + __pyx_v_m = __Pyx_PyInt_As_siz(values[3]); if (unlikely((__pyx_v_m == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 197, __pyx_L3_error) + __pyx_v_n = __Pyx_PyInt_As_siz(values[4]); if (unlikely((__pyx_v_n == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 197, __pyx_L3_error) + __pyx_v__iou = ((PyArrayObject *)values[5]); + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 197, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.iou._rleIou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_dt), __pyx_ptype_5_mask_RLEs, 1, "dt", 0))) __PYX_ERR(0, 197, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_gt), __pyx_ptype_5_mask_RLEs, 1, "gt", 0))) __PYX_ERR(0, 197, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_iscrowd), __pyx_ptype_5numpy_ndarray, 1, "iscrowd", 0))) __PYX_ERR(0, 197, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v__iou), __pyx_ptype_5numpy_ndarray, 1, "_iou", 0))) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_r = __pyx_pf_5_mask_3iou_2_rleIou(__pyx_self, __pyx_v_dt, __pyx_v_gt, __pyx_v_iscrowd, __pyx_v_m, __pyx_v_n, __pyx_v__iou); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_3iou_2_rleIou(CYTHON_UNUSED PyObject *__pyx_self, struct __pyx_obj_5_mask_RLEs *__pyx_v_dt, struct __pyx_obj_5_mask_RLEs *__pyx_v_gt, PyArrayObject *__pyx_v_iscrowd, siz __pyx_v_m, siz __pyx_v_n, PyArrayObject *__pyx_v__iou) { + __Pyx_LocalBuf_ND __pyx_pybuffernd__iou; + __Pyx_Buffer 
__pyx_pybuffer__iou; + __Pyx_LocalBuf_ND __pyx_pybuffernd_iscrowd; + __Pyx_Buffer __pyx_pybuffer_iscrowd; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_rleIou", 0); + __pyx_pybuffer_iscrowd.pybuffer.buf = NULL; + __pyx_pybuffer_iscrowd.refcount = 0; + __pyx_pybuffernd_iscrowd.data = NULL; + __pyx_pybuffernd_iscrowd.rcbuffer = &__pyx_pybuffer_iscrowd; + __pyx_pybuffer__iou.pybuffer.buf = NULL; + __pyx_pybuffer__iou.refcount = 0; + __pyx_pybuffernd__iou.data = NULL; + __pyx_pybuffernd__iou.rcbuffer = &__pyx_pybuffer__iou; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer, (PyObject*)__pyx_v_iscrowd, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 197, __pyx_L1_error) + } + __pyx_pybuffernd_iscrowd.diminfo[0].strides = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_iscrowd.diminfo[0].shape = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.shape[0]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd__iou.rcbuffer->pybuffer, (PyObject*)__pyx_v__iou, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 197, __pyx_L1_error) + } + __pyx_pybuffernd__iou.diminfo[0].strides = __pyx_pybuffernd__iou.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd__iou.diminfo[0].shape = __pyx_pybuffernd__iou.rcbuffer->pybuffer.shape[0]; + + /* "_mask.pyx":198 + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) # <<<<<<<<<<<<<< + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, 
gt.data, m, n, iscrowd.data, _iou.data ) + */ + rleIou(((RLE *)__pyx_v_dt->_R), ((RLE *)__pyx_v_gt->_R), __pyx_v_m, __pyx_v_n, ((byte *)__pyx_v_iscrowd->data), ((double *)__pyx_v__iou->data)); + + /* "_mask.pyx":197 + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd__iou.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("_mask.iou._rleIou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd__iou.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":199 + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * 
bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_3iou_5_bbIou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_3iou_5_bbIou = {"_bbIou", (PyCFunction)__pyx_pw_5_mask_3iou_5_bbIou, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_5_mask_3iou_5_bbIou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_dt = 0; + PyArrayObject *__pyx_v_gt = 0; + PyArrayObject *__pyx_v_iscrowd = 0; + siz __pyx_v_m; + siz __pyx_v_n; + PyArrayObject *__pyx_v__iou = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_bbIou (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_dt,&__pyx_n_s_gt,&__pyx_n_s_iscrowd,&__pyx_n_s_m,&__pyx_n_s_n,&__pyx_n_s_iou,0}; + PyObject* values[6] = {0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_dt)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_gt)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 1); __PYX_ERR(0, 
199, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_iscrowd)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 2); __PYX_ERR(0, 199, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_m)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 3); __PYX_ERR(0, 199, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_n)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 4); __PYX_ERR(0, 199, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_iou)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 5); __PYX_ERR(0, 199, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "_bbIou") < 0)) __PYX_ERR(0, 199, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 6) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + } + __pyx_v_dt = ((PyArrayObject *)values[0]); + __pyx_v_gt = ((PyArrayObject *)values[1]); + __pyx_v_iscrowd = ((PyArrayObject *)values[2]); + __pyx_v_m = __Pyx_PyInt_As_siz(values[3]); if (unlikely((__pyx_v_m == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 199, __pyx_L3_error) + __pyx_v_n = __Pyx_PyInt_As_siz(values[4]); if (unlikely((__pyx_v_n == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 199, __pyx_L3_error) + __pyx_v__iou = ((PyArrayObject *)values[5]); + } + goto 
__pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 199, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.iou._bbIou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_dt), __pyx_ptype_5numpy_ndarray, 1, "dt", 0))) __PYX_ERR(0, 199, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_gt), __pyx_ptype_5numpy_ndarray, 1, "gt", 0))) __PYX_ERR(0, 199, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_iscrowd), __pyx_ptype_5numpy_ndarray, 1, "iscrowd", 0))) __PYX_ERR(0, 199, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v__iou), __pyx_ptype_5numpy_ndarray, 1, "_iou", 0))) __PYX_ERR(0, 199, __pyx_L1_error) + __pyx_r = __pyx_pf_5_mask_3iou_4_bbIou(__pyx_self, __pyx_v_dt, __pyx_v_gt, __pyx_v_iscrowd, __pyx_v_m, __pyx_v_n, __pyx_v__iou); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_3iou_4_bbIou(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_dt, PyArrayObject *__pyx_v_gt, PyArrayObject *__pyx_v_iscrowd, siz __pyx_v_m, siz __pyx_v_n, PyArrayObject *__pyx_v__iou) { + __Pyx_LocalBuf_ND __pyx_pybuffernd__iou; + __Pyx_Buffer __pyx_pybuffer__iou; + __Pyx_LocalBuf_ND __pyx_pybuffernd_dt; + __Pyx_Buffer __pyx_pybuffer_dt; + __Pyx_LocalBuf_ND __pyx_pybuffernd_gt; + __Pyx_Buffer __pyx_pybuffer_gt; + __Pyx_LocalBuf_ND __pyx_pybuffernd_iscrowd; + __Pyx_Buffer __pyx_pybuffer_iscrowd; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_bbIou", 0); + __pyx_pybuffer_dt.pybuffer.buf = NULL; + __pyx_pybuffer_dt.refcount = 0; + __pyx_pybuffernd_dt.data = NULL; + __pyx_pybuffernd_dt.rcbuffer = 
&__pyx_pybuffer_dt; + __pyx_pybuffer_gt.pybuffer.buf = NULL; + __pyx_pybuffer_gt.refcount = 0; + __pyx_pybuffernd_gt.data = NULL; + __pyx_pybuffernd_gt.rcbuffer = &__pyx_pybuffer_gt; + __pyx_pybuffer_iscrowd.pybuffer.buf = NULL; + __pyx_pybuffer_iscrowd.refcount = 0; + __pyx_pybuffernd_iscrowd.data = NULL; + __pyx_pybuffernd_iscrowd.rcbuffer = &__pyx_pybuffer_iscrowd; + __pyx_pybuffer__iou.pybuffer.buf = NULL; + __pyx_pybuffer__iou.refcount = 0; + __pyx_pybuffernd__iou.data = NULL; + __pyx_pybuffernd__iou.rcbuffer = &__pyx_pybuffer__iou; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_dt.rcbuffer->pybuffer, (PyObject*)__pyx_v_dt, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 199, __pyx_L1_error) + } + __pyx_pybuffernd_dt.diminfo[0].strides = __pyx_pybuffernd_dt.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_dt.diminfo[0].shape = __pyx_pybuffernd_dt.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_dt.diminfo[1].strides = __pyx_pybuffernd_dt.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_dt.diminfo[1].shape = __pyx_pybuffernd_dt.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_gt.rcbuffer->pybuffer, (PyObject*)__pyx_v_gt, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 199, __pyx_L1_error) + } + __pyx_pybuffernd_gt.diminfo[0].strides = __pyx_pybuffernd_gt.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_gt.diminfo[0].shape = __pyx_pybuffernd_gt.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_gt.diminfo[1].strides = __pyx_pybuffernd_gt.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_gt.diminfo[1].shape = __pyx_pybuffernd_gt.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer, 
(PyObject*)__pyx_v_iscrowd, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 199, __pyx_L1_error) + } + __pyx_pybuffernd_iscrowd.diminfo[0].strides = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_iscrowd.diminfo[0].shape = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.shape[0]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd__iou.rcbuffer->pybuffer, (PyObject*)__pyx_v__iou, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 199, __pyx_L1_error) + } + __pyx_pybuffernd__iou.diminfo[0].strides = __pyx_pybuffernd__iou.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd__iou.diminfo[0].shape = __pyx_pybuffernd__iou.rcbuffer->pybuffer.shape[0]; + + /* "_mask.pyx":200 + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) # <<<<<<<<<<<<<< + * def _len(obj): + * cdef siz N = 0 + */ + bbIou(((BB)__pyx_v_dt->data), ((BB)__pyx_v_gt->data), __pyx_v_m, __pyx_v_n, ((byte *)__pyx_v_iscrowd->data), ((double *)__pyx_v__iou->data)); + + /* "_mask.pyx":199 + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject 
*__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd__iou.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_dt.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_gt.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("_mask.iou._bbIou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd__iou.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_dt.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_gt.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":201 + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): # <<<<<<<<<<<<<< + * cdef siz N = 0 + * if type(obj) == RLEs: + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_3iou_7_len(PyObject *__pyx_self, PyObject *__pyx_v_obj); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_3iou_7_len = {"_len", (PyCFunction)__pyx_pw_5_mask_3iou_7_len, METH_O, 0}; +static PyObject *__pyx_pw_5_mask_3iou_7_len(PyObject *__pyx_self, PyObject *__pyx_v_obj) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_len (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_3iou_6_len(__pyx_self, ((PyObject *)__pyx_v_obj)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject 
*__pyx_pf_5_mask_3iou_6_len(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj) { + siz __pyx_v_N; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_t_2; + siz __pyx_t_3; + Py_ssize_t __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + __Pyx_RefNannySetupContext("_len", 0); + + /* "_mask.pyx":202 + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + * cdef siz N = 0 # <<<<<<<<<<<<<< + * if type(obj) == RLEs: + * N = obj.n + */ + __pyx_v_N = 0; + + /* "_mask.pyx":203 + * def _len(obj): + * cdef siz N = 0 + * if type(obj) == RLEs: # <<<<<<<<<<<<<< + * N = obj.n + * elif len(obj)==0: + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)__pyx_ptype_5_mask_RLEs), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 203, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 203, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_2) { + + /* "_mask.pyx":204 + * cdef siz N = 0 + * if type(obj) == RLEs: + * N = obj.n # <<<<<<<<<<<<<< + * elif len(obj)==0: + * pass + */ + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 204, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = __Pyx_PyInt_As_siz(__pyx_t_1); if (unlikely((__pyx_t_3 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 204, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_N = __pyx_t_3; + + /* "_mask.pyx":203 + * def _len(obj): + * cdef siz N = 0 + * if type(obj) == RLEs: # <<<<<<<<<<<<<< + * N = obj.n + * elif len(obj)==0: + */ + goto __pyx_L3; + } + + /* "_mask.pyx":205 + * if type(obj) == RLEs: + * N = obj.n + * elif len(obj)==0: # <<<<<<<<<<<<<< + * pass + * elif type(obj) == np.ndarray: + */ + __pyx_t_4 = PyObject_Length(__pyx_v_obj); if (unlikely(__pyx_t_4 == ((Py_ssize_t)-1))) __PYX_ERR(0, 205, __pyx_L1_error) + __pyx_t_2 = 
((__pyx_t_4 == 0) != 0); + if (__pyx_t_2) { + goto __pyx_L3; + } + + /* "_mask.pyx":207 + * elif len(obj)==0: + * pass + * elif type(obj) == np.ndarray: # <<<<<<<<<<<<<< + * N = obj.shape[0] + * return N + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 207, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 207, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_2) { + + /* "_mask.pyx":208 + * pass + * elif type(obj) == np.ndarray: + * N = obj.shape[0] # <<<<<<<<<<<<<< + * return N + * # convert iscrowd to numpy array + */ + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_shape); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 208, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_5 = __Pyx_GetItemInt(__pyx_t_1, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 208, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_3 = __Pyx_PyInt_As_siz(__pyx_t_5); if (unlikely((__pyx_t_3 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 208, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_N = __pyx_t_3; + + /* "_mask.pyx":207 + * elif len(obj)==0: + * pass + * elif type(obj) == np.ndarray: # <<<<<<<<<<<<<< + * N = obj.shape[0] + * return N + */ + } + __pyx_L3:; + + /* "_mask.pyx":209 + * elif type(obj) == np.ndarray: + * N = obj.shape[0] + * return N # <<<<<<<<<<<<<< + * # convert iscrowd to numpy array + * cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_5 = __Pyx_PyInt_From_siz(__pyx_v_N); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 209, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_r = __pyx_t_5; + __pyx_t_5 = 0; + goto __pyx_L0; + + /* "_mask.pyx":201 + * def 
_bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): # <<<<<<<<<<<<<< + * cdef siz N = 0 + * if type(obj) == RLEs: + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("_mask.iou._len", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":171 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + +static PyObject *__pyx_pf_5_mask_12iou(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_dt, PyObject *__pyx_v_gt, PyObject *__pyx_v_pyiscrowd) { + PyObject *__pyx_v__preproc = 0; + PyObject *__pyx_v__rleIou = 0; + PyObject *__pyx_v__bbIou = 0; + PyObject *__pyx_v__len = 0; + PyArrayObject *__pyx_v_iscrowd = 0; + siz __pyx_v_m; + siz __pyx_v_n; + double *__pyx_v__iou; + npy_intp __pyx_v_shape[1]; + PyObject *__pyx_v__iouFun = NULL; + PyObject *__pyx_v_iou = NULL; + __Pyx_LocalBuf_ND __pyx_pybuffernd_iscrowd; + __Pyx_Buffer __pyx_pybuffer_iscrowd; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyArrayObject *__pyx_t_6 = NULL; + siz __pyx_t_7; + int __pyx_t_8; + int __pyx_t_9; + int __pyx_t_10; + PyObject *__pyx_t_11 = NULL; + __Pyx_RefNannySetupContext("iou", 0); + __Pyx_INCREF(__pyx_v_dt); + __Pyx_INCREF(__pyx_v_gt); + __pyx_pybuffer_iscrowd.pybuffer.buf = NULL; + __pyx_pybuffer_iscrowd.refcount = 0; + __pyx_pybuffernd_iscrowd.data = NULL; + __pyx_pybuffernd_iscrowd.rcbuffer = 
&__pyx_pybuffer_iscrowd; + + /* "_mask.pyx":172 + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): # <<<<<<<<<<<<<< + * if len(objs) == 0: + * return objs + */ + __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_5_mask_3iou_1_preproc, 0, __pyx_n_s_iou_locals__preproc, NULL, __pyx_n_s_mask, __pyx_d, ((PyObject *)__pyx_codeobj__12)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 172, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v__preproc = __pyx_t_1; + __pyx_t_1 = 0; + + /* "_mask.pyx":197 + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_5_mask_3iou_3_rleIou, 0, __pyx_n_s_iou_locals__rleIou, NULL, __pyx_n_s_mask, __pyx_d, ((PyObject *)__pyx_codeobj__14)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 197, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v__rleIou = __pyx_t_1; + __pyx_t_1 = 0; + + /* "_mask.pyx":199 + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + */ + __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_5_mask_3iou_5_bbIou, 0, __pyx_n_s_iou_locals__bbIou, NULL, 
__pyx_n_s_mask, __pyx_d, ((PyObject *)__pyx_codeobj__16)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 199, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v__bbIou = __pyx_t_1; + __pyx_t_1 = 0; + + /* "_mask.pyx":201 + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): # <<<<<<<<<<<<<< + * cdef siz N = 0 + * if type(obj) == RLEs: + */ + __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_5_mask_3iou_7_len, 0, __pyx_n_s_iou_locals__len, NULL, __pyx_n_s_mask, __pyx_d, ((PyObject *)__pyx_codeobj__18)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 201, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v__len = __pyx_t_1; + __pyx_t_1 = 0; + + /* "_mask.pyx":211 + * return N + * # convert iscrowd to numpy array + * cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) # <<<<<<<<<<<<<< + * # simple type checking + * cdef siz m, n + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 211, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_array); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 211, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 211, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_v_pyiscrowd); + __Pyx_GIVEREF(__pyx_v_pyiscrowd); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_pyiscrowd); + __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 211, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 211, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_uint8); if 
(unlikely(!__pyx_t_5)) __PYX_ERR(0, 211, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_5) < 0) __PYX_ERR(0, 211, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 211, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (!(likely(((__pyx_t_5) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_5, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 211, __pyx_L1_error) + __pyx_t_6 = ((PyArrayObject *)__pyx_t_5); + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer, (PyObject*)__pyx_t_6, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) { + __pyx_v_iscrowd = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.buf = NULL; + __PYX_ERR(0, 211, __pyx_L1_error) + } else {__pyx_pybuffernd_iscrowd.diminfo[0].strides = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_iscrowd.diminfo[0].shape = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.shape[0]; + } + } + __pyx_t_6 = 0; + __pyx_v_iscrowd = ((PyArrayObject *)__pyx_t_5); + __pyx_t_5 = 0; + + /* "_mask.pyx":214 + * # simple type checking + * cdef siz m, n + * dt = _preproc(dt) # <<<<<<<<<<<<<< + * gt = _preproc(gt) + * m = _len(dt) + */ + __pyx_t_5 = __pyx_pf_5_mask_3iou__preproc(__pyx_v__preproc, __pyx_v_dt); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF_SET(__pyx_v_dt, __pyx_t_5); + __pyx_t_5 = 0; + + /* "_mask.pyx":215 + * cdef siz m, n + * dt = _preproc(dt) + * gt = _preproc(gt) # <<<<<<<<<<<<<< + * m = _len(dt) + * n = _len(gt) + */ + __pyx_t_5 = 
__pyx_pf_5_mask_3iou__preproc(__pyx_v__preproc, __pyx_v_gt); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 215, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF_SET(__pyx_v_gt, __pyx_t_5); + __pyx_t_5 = 0; + + /* "_mask.pyx":216 + * dt = _preproc(dt) + * gt = _preproc(gt) + * m = _len(dt) # <<<<<<<<<<<<<< + * n = _len(gt) + * if m == 0 or n == 0: + */ + __pyx_t_5 = __pyx_pf_5_mask_3iou_6_len(__pyx_v__len, __pyx_v_dt); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 216, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_7 = __Pyx_PyInt_As_siz(__pyx_t_5); if (unlikely((__pyx_t_7 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 216, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_m = __pyx_t_7; + + /* "_mask.pyx":217 + * gt = _preproc(gt) + * m = _len(dt) + * n = _len(gt) # <<<<<<<<<<<<<< + * if m == 0 or n == 0: + * return [] + */ + __pyx_t_5 = __pyx_pf_5_mask_3iou_6_len(__pyx_v__len, __pyx_v_gt); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 217, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_7 = __Pyx_PyInt_As_siz(__pyx_t_5); if (unlikely((__pyx_t_7 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 217, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_n = __pyx_t_7; + + /* "_mask.pyx":218 + * m = _len(dt) + * n = _len(gt) + * if m == 0 or n == 0: # <<<<<<<<<<<<<< + * return [] + * if not type(dt) == type(gt): + */ + __pyx_t_9 = ((__pyx_v_m == 0) != 0); + if (!__pyx_t_9) { + } else { + __pyx_t_8 = __pyx_t_9; + goto __pyx_L4_bool_binop_done; + } + __pyx_t_9 = ((__pyx_v_n == 0) != 0); + __pyx_t_8 = __pyx_t_9; + __pyx_L4_bool_binop_done:; + if (__pyx_t_8) { + + /* "_mask.pyx":219 + * n = _len(gt) + * if m == 0 or n == 0: + * return [] # <<<<<<<<<<<<<< + * if not type(dt) == type(gt): + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_5 = PyList_New(0); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 219, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_r 
= __pyx_t_5; + __pyx_t_5 = 0; + goto __pyx_L0; + + /* "_mask.pyx":218 + * m = _len(dt) + * n = _len(gt) + * if m == 0 or n == 0: # <<<<<<<<<<<<<< + * return [] + * if not type(dt) == type(gt): + */ + } + + /* "_mask.pyx":220 + * if m == 0 or n == 0: + * return [] + * if not type(dt) == type(gt): # <<<<<<<<<<<<<< + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') + * + */ + __pyx_t_5 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_dt)), ((PyObject *)Py_TYPE(__pyx_v_gt)), Py_EQ); __Pyx_XGOTREF(__pyx_t_5); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 220, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 220, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_9 = ((!__pyx_t_8) != 0); + if (unlikely(__pyx_t_9)) { + + /* "_mask.pyx":221 + * return [] + * if not type(dt) == type(gt): + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') # <<<<<<<<<<<<<< + * + * # define local variables + */ + __pyx_t_5 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__19, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 221, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_Raise(__pyx_t_5, 0, 0, 0); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __PYX_ERR(0, 221, __pyx_L1_error) + + /* "_mask.pyx":220 + * if m == 0 or n == 0: + * return [] + * if not type(dt) == type(gt): # <<<<<<<<<<<<<< + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') + * + */ + } + + /* "_mask.pyx":224 + * + * # define local variables + * cdef double* _iou = 0 # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * # check type and assign iou function + */ + __pyx_v__iou = ((double *)0); + + /* "_mask.pyx":227 + * cdef np.npy_intp shape[1] + * # check type and assign iou function + * if type(dt) == RLEs: # <<<<<<<<<<<<<< + * _iouFun = _rleIou + * elif type(dt) == 
np.ndarray: + */ + __pyx_t_5 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_dt)), ((PyObject *)__pyx_ptype_5_mask_RLEs), Py_EQ); __Pyx_XGOTREF(__pyx_t_5); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 227, __pyx_L1_error) + __pyx_t_9 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_9 < 0)) __PYX_ERR(0, 227, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (__pyx_t_9) { + + /* "_mask.pyx":228 + * # check type and assign iou function + * if type(dt) == RLEs: + * _iouFun = _rleIou # <<<<<<<<<<<<<< + * elif type(dt) == np.ndarray: + * _iouFun = _bbIou + */ + __Pyx_INCREF(__pyx_v__rleIou); + __pyx_v__iouFun = __pyx_v__rleIou; + + /* "_mask.pyx":227 + * cdef np.npy_intp shape[1] + * # check type and assign iou function + * if type(dt) == RLEs: # <<<<<<<<<<<<<< + * _iouFun = _rleIou + * elif type(dt) == np.ndarray: + */ + goto __pyx_L7; + } + + /* "_mask.pyx":229 + * if type(dt) == RLEs: + * _iouFun = _rleIou + * elif type(dt) == np.ndarray: # <<<<<<<<<<<<<< + * _iouFun = _bbIou + * else: + */ + __pyx_t_5 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_dt)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_5); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 229, __pyx_L1_error) + __pyx_t_9 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_9 < 0)) __PYX_ERR(0, 229, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (likely(__pyx_t_9)) { + + /* "_mask.pyx":230 + * _iouFun = _rleIou + * elif type(dt) == np.ndarray: + * _iouFun = _bbIou # <<<<<<<<<<<<<< + * else: + * raise Exception('input data type not allowed.') + */ + __Pyx_INCREF(__pyx_v__bbIou); + __pyx_v__iouFun = __pyx_v__bbIou; + + /* "_mask.pyx":229 + * if type(dt) == RLEs: + * _iouFun = _rleIou + * elif type(dt) == np.ndarray: # <<<<<<<<<<<<<< + * _iouFun = _bbIou + * else: + */ + goto __pyx_L7; + } + + /* "_mask.pyx":232 + * _iouFun = _bbIou + * else: + * raise Exception('input data type not allowed.') # <<<<<<<<<<<<<< + * _iou = malloc(m*n* sizeof(double)) + 
* iou = np.zeros((m*n, ), dtype=np.double) + */ + /*else*/ { + __pyx_t_5 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__20, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 232, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_Raise(__pyx_t_5, 0, 0, 0); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __PYX_ERR(0, 232, __pyx_L1_error) + } + __pyx_L7:; + + /* "_mask.pyx":233 + * else: + * raise Exception('input data type not allowed.') + * _iou = malloc(m*n* sizeof(double)) # <<<<<<<<<<<<<< + * iou = np.zeros((m*n, ), dtype=np.double) + * shape[0] = m*n + */ + __pyx_v__iou = ((double *)malloc(((__pyx_v_m * __pyx_v_n) * (sizeof(double))))); + + /* "_mask.pyx":234 + * raise Exception('input data type not allowed.') + * _iou = malloc(m*n* sizeof(double)) + * iou = np.zeros((m*n, ), dtype=np.double) # <<<<<<<<<<<<<< + * shape[0] = m*n + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_zeros); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyInt_From_siz((__pyx_v_m * __pyx_v_n)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = PyTuple_New(1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = 
__Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_double); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_4) < 0) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, __pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_iou = __pyx_t_4; + __pyx_t_4 = 0; + + /* "_mask.pyx":235 + * _iou = malloc(m*n* sizeof(double)) + * iou = np.zeros((m*n, ), dtype=np.double) + * shape[0] = m*n # <<<<<<<<<<<<<< + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + */ + (__pyx_v_shape[0]) = (((npy_intp)__pyx_v_m) * __pyx_v_n); + + /* "_mask.pyx":236 + * iou = np.zeros((m*n, ), dtype=np.double) + * shape[0] = m*n + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) # <<<<<<<<<<<<<< + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + * _iouFun(dt, gt, iscrowd, m, n, iou) + */ + __pyx_t_4 = PyArray_SimpleNewFromData(1, __pyx_v_shape, NPY_DOUBLE, __pyx_v__iou); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 236, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF_SET(__pyx_v_iou, __pyx_t_4); + __pyx_t_4 = 0; + + /* "_mask.pyx":237 + * shape[0] = m*n + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) # <<<<<<<<<<<<<< + * _iouFun(dt, gt, iscrowd, m, n, iou) + * return iou.reshape((m,n), order='F') + */ + if (!(likely(((__pyx_v_iou) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_iou, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 
237, __pyx_L1_error) + PyArray_ENABLEFLAGS(((PyArrayObject *)__pyx_v_iou), NPY_OWNDATA); + + /* "_mask.pyx":238 + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + * _iouFun(dt, gt, iscrowd, m, n, iou) # <<<<<<<<<<<<<< + * return iou.reshape((m,n), order='F') + * + */ + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_m); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 238, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_5 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 238, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_INCREF(__pyx_v__iouFun); + __pyx_t_3 = __pyx_v__iouFun; __pyx_t_2 = NULL; + __pyx_t_10 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_2)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_2); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + __pyx_t_10 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[7] = {__pyx_t_2, __pyx_v_dt, __pyx_v_gt, ((PyObject *)__pyx_v_iscrowd), __pyx_t_1, __pyx_t_5, __pyx_v_iou}; + __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_10, 6+__pyx_t_10); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 238, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[7] = {__pyx_t_2, __pyx_v_dt, __pyx_v_gt, ((PyObject *)__pyx_v_iscrowd), __pyx_t_1, __pyx_t_5, __pyx_v_iou}; + __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_10, 6+__pyx_t_10); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 238, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 
= 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + { + __pyx_t_11 = PyTuple_New(6+__pyx_t_10); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 238, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + if (__pyx_t_2) { + __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_2); __pyx_t_2 = NULL; + } + __Pyx_INCREF(__pyx_v_dt); + __Pyx_GIVEREF(__pyx_v_dt); + PyTuple_SET_ITEM(__pyx_t_11, 0+__pyx_t_10, __pyx_v_dt); + __Pyx_INCREF(__pyx_v_gt); + __Pyx_GIVEREF(__pyx_v_gt); + PyTuple_SET_ITEM(__pyx_t_11, 1+__pyx_t_10, __pyx_v_gt); + __Pyx_INCREF(((PyObject *)__pyx_v_iscrowd)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_iscrowd)); + PyTuple_SET_ITEM(__pyx_t_11, 2+__pyx_t_10, ((PyObject *)__pyx_v_iscrowd)); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_11, 3+__pyx_t_10, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_11, 4+__pyx_t_10, __pyx_t_5); + __Pyx_INCREF(__pyx_v_iou); + __Pyx_GIVEREF(__pyx_v_iou); + PyTuple_SET_ITEM(__pyx_t_11, 5+__pyx_t_10, __pyx_v_iou); + __pyx_t_1 = 0; + __pyx_t_5 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_11, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 238, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + + /* "_mask.pyx":239 + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + * _iouFun(dt, gt, iscrowd, m, n, iou) + * return iou.reshape((m,n), order='F') # <<<<<<<<<<<<<< + * + * def toBbox( rleObjs ): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_iou, __pyx_n_s_reshape); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 239, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = __Pyx_PyInt_From_siz(__pyx_v_m); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 239, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_11 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 239, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __pyx_t_5 = 
PyTuple_New(2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 239, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_3); + __Pyx_GIVEREF(__pyx_t_11); + PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_11); + __pyx_t_3 = 0; + __pyx_t_11 = 0; + __pyx_t_11 = PyTuple_New(1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 239, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 239, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + if (PyDict_SetItem(__pyx_t_5, __pyx_n_s_order, __pyx_n_s_F) < 0) __PYX_ERR(0, 239, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_11, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 239, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_r = __pyx_t_3; + __pyx_t_3 = 0; + goto __pyx_L0; + + /* "_mask.pyx":171 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 
+ * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_11); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("_mask.iou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF(__pyx_v__preproc); + __Pyx_XDECREF(__pyx_v__rleIou); + __Pyx_XDECREF(__pyx_v__bbIou); + __Pyx_XDECREF(__pyx_v__len); + __Pyx_XDECREF((PyObject *)__pyx_v_iscrowd); + __Pyx_XDECREF(__pyx_v__iouFun); + __Pyx_XDECREF(__pyx_v_iou); + __Pyx_XDECREF(__pyx_v_dt); + __Pyx_XDECREF(__pyx_v_gt); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":241 + * return iou.reshape((m,n), order='F') + * + * def toBbox( rleObjs ): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_15toBbox(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_15toBbox = {"toBbox", (PyCFunction)__pyx_pw_5_mask_15toBbox, METH_O, 0}; +static PyObject *__pyx_pw_5_mask_15toBbox(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("toBbox (wrapper)", 0); + __pyx_r = __pyx_pf_5_mask_14toBbox(__pyx_self, ((PyObject *)__pyx_v_rleObjs)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static 
PyObject *__pyx_pf_5_mask_14toBbox(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs = 0; + siz __pyx_v_n; + BB __pyx_v__bb; + npy_intp __pyx_v_shape[1]; + PyObject *__pyx_v_bb = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + siz __pyx_t_5; + PyObject *__pyx_t_6 = NULL; + __Pyx_RefNannySetupContext("toBbox", 0); + + /* "_mask.pyx":242 + * + * def toBbox( rleObjs ): + * cdef RLEs Rs = _frString(rleObjs) # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef BB _bb = malloc(4*n* sizeof(double)) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 242, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_rleObjs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 242, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 242, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 242, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + 
__Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 242, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(__pyx_v_rleObjs); + __Pyx_GIVEREF(__pyx_v_rleObjs); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_rleObjs); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 242, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5_mask_RLEs))))) __PYX_ERR(0, 242, __pyx_L1_error) + __pyx_v_Rs = ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "_mask.pyx":243 + * def toBbox( rleObjs ): + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n # <<<<<<<<<<<<<< + * cdef BB _bb = malloc(4*n* sizeof(double)) + * rleToBbox( Rs._R, _bb, n ) + */ + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_Rs), __pyx_n_s_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 243, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_5 = __Pyx_PyInt_As_siz(__pyx_t_1); if (unlikely((__pyx_t_5 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 243, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_n = __pyx_t_5; + + /* "_mask.pyx":244 + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + * cdef BB _bb = malloc(4*n* sizeof(double)) # <<<<<<<<<<<<<< + * rleToBbox( Rs._R, _bb, n ) + * cdef np.npy_intp shape[1] + */ + __pyx_v__bb = ((BB)malloc(((4 * __pyx_v_n) * (sizeof(double))))); + + /* "_mask.pyx":245 + * cdef siz n = Rs.n + * cdef BB _bb = malloc(4*n* sizeof(double)) + * rleToBbox( Rs._R, _bb, n ) # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * shape[0] = 4*n + */ + rleToBbox(((RLE const *)__pyx_v_Rs->_R), __pyx_v__bb, __pyx_v_n); + + /* "_mask.pyx":247 + * rleToBbox( 
Rs._R, _bb, n ) + * cdef np.npy_intp shape[1] + * shape[0] = 4*n # <<<<<<<<<<<<<< + * bb = np.array((1,4*n), dtype=np.double) + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) + */ + (__pyx_v_shape[0]) = (((npy_intp)4) * __pyx_v_n); + + /* "_mask.pyx":248 + * cdef np.npy_intp shape[1] + * shape[0] = 4*n + * bb = np.array((1,4*n), dtype=np.double) # <<<<<<<<<<<<<< + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) + * PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_array); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyInt_From_siz((4 * __pyx_v_n)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_INCREF(__pyx_int_1); + __Pyx_GIVEREF(__pyx_int_1); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_int_1); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_double); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 
= 0; + if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_dtype, __pyx_t_6) < 0) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 248, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_v_bb = __pyx_t_6; + __pyx_t_6 = 0; + + /* "_mask.pyx":249 + * shape[0] = 4*n + * bb = np.array((1,4*n), dtype=np.double) + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) # <<<<<<<<<<<<<< + * PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) + * return bb + */ + __pyx_t_4 = PyArray_SimpleNewFromData(1, __pyx_v_shape, NPY_DOUBLE, __pyx_v__bb); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_reshape); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_4); + __Pyx_INCREF(__pyx_int_4); + __Pyx_GIVEREF(__pyx_int_4); + PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_int_4); + __pyx_t_4 = 0; + __pyx_t_4 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_1); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_1, function); + } + } + if (!__pyx_t_4) { + __pyx_t_6 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 249, __pyx_L1_error) 
+ __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_6); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_2}; + __pyx_t_6 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_2}; + __pyx_t_6 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + } else + #endif + { + __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); __pyx_t_4 = NULL; + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_3, 0+1, __pyx_t_2); + __pyx_t_2 = 0; + __pyx_t_6 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_3, NULL); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF_SET(__pyx_v_bb, __pyx_t_6); + __pyx_t_6 = 0; + + /* "_mask.pyx":250 + * bb = np.array((1,4*n), dtype=np.double) + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) + * PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) # <<<<<<<<<<<<<< + * return bb + * + */ + if (!(likely(((__pyx_v_bb) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_bb, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 250, __pyx_L1_error) + PyArray_ENABLEFLAGS(((PyArrayObject *)__pyx_v_bb), NPY_OWNDATA); + + /* "_mask.pyx":251 + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, 
_bb).reshape((n, 4)) + * PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) + * return bb # <<<<<<<<<<<<<< + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_bb); + __pyx_r = __pyx_v_bb; + goto __pyx_L0; + + /* "_mask.pyx":241 + * return iou.reshape((m,n), order='F') + * + * def toBbox( rleObjs ): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_AddTraceback("_mask.toBbox", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_bb); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":253 + * return bb + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_17frBbox(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_17frBbox = {"frBbox", (PyCFunction)__pyx_pw_5_mask_17frBbox, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_5_mask_17frBbox(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_bb = 0; + siz __pyx_v_h; + siz __pyx_v_w; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("frBbox (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_bb,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = 
PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_bb)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frBbox", 1, 3, 3, 1); __PYX_ERR(0, 253, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frBbox", 1, 3, 3, 2); __PYX_ERR(0, 253, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "frBbox") < 0)) __PYX_ERR(0, 253, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_bb = ((PyArrayObject *)values[0]); + __pyx_v_h = __Pyx_PyInt_As_siz(values[1]); if (unlikely((__pyx_v_h == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 253, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_siz(values[2]); if (unlikely((__pyx_v_w == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 253, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("frBbox", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 253, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.frBbox", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if 
(unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_bb), __pyx_ptype_5numpy_ndarray, 1, "bb", 0))) __PYX_ERR(0, 253, __pyx_L1_error) + __pyx_r = __pyx_pf_5_mask_16frBbox(__pyx_self, __pyx_v_bb, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_16frBbox(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_bb, siz __pyx_v_h, siz __pyx_v_w) { + siz __pyx_v_n; + struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs = NULL; + PyObject *__pyx_v_objs = NULL; + __Pyx_LocalBuf_ND __pyx_pybuffernd_bb; + __Pyx_Buffer __pyx_pybuffer_bb; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + __Pyx_RefNannySetupContext("frBbox", 0); + __pyx_pybuffer_bb.pybuffer.buf = NULL; + __pyx_pybuffer_bb.refcount = 0; + __pyx_pybuffernd_bb.data = NULL; + __pyx_pybuffernd_bb.rcbuffer = &__pyx_pybuffer_bb; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_bb.rcbuffer->pybuffer, (PyObject*)__pyx_v_bb, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 253, __pyx_L1_error) + } + __pyx_pybuffernd_bb.diminfo[0].strides = __pyx_pybuffernd_bb.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_bb.diminfo[0].shape = __pyx_pybuffernd_bb.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_bb.diminfo[1].strides = __pyx_pybuffernd_bb.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_bb.diminfo[1].shape = __pyx_pybuffernd_bb.rcbuffer->pybuffer.shape[1]; + + /* "_mask.pyx":254 + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): + * cdef siz n = bb.shape[0] # <<<<<<<<<<<<<< + * Rs = RLEs(n) + * rleFrBbox( Rs._R, bb.data, h, w, n ) + */ + __pyx_v_n = (__pyx_v_bb->dimensions[0]); + + /* "_mask.pyx":255 + * def 
frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) # <<<<<<<<<<<<<< + * rleFrBbox( Rs._R, bb.data, h, w, n ) + * objs = _toString(Rs) + */ + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 255, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_CallOneArg(((PyObject *)__pyx_ptype_5_mask_RLEs), __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 255, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_Rs = ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_2); + __pyx_t_2 = 0; + + /* "_mask.pyx":256 + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + * rleFrBbox( Rs._R, bb.data, h, w, n ) # <<<<<<<<<<<<<< + * objs = _toString(Rs) + * return objs + */ + rleFrBbox(((RLE *)__pyx_v_Rs->_R), ((BB const )__pyx_v_bb->data), __pyx_v_h, __pyx_v_w, __pyx_v_n); + + /* "_mask.pyx":257 + * Rs = RLEs(n) + * rleFrBbox( Rs._R, bb.data, h, w, n ) + * objs = _toString(Rs) # <<<<<<<<<<<<<< + * return objs + * + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_1); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_1, function); + } + } + if (!__pyx_t_3) { + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_1, ((PyObject *)__pyx_v_Rs)); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 257, __pyx_L1_error) + 
__Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_2); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_2); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_Rs)); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, ((PyObject *)__pyx_v_Rs)); + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_4, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_objs = __pyx_t_2; + __pyx_t_2 = 0; + + /* "_mask.pyx":258 + * rleFrBbox( Rs._R, bb.data, h, w, n ) + * objs = _toString(Rs) + * return objs # <<<<<<<<<<<<<< + * + * def frPoly( poly, siz h, siz w ): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "_mask.pyx":253 + * return bb + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_bb.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + 
__Pyx_AddTraceback("_mask.frBbox", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_bb.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":260 + * return objs + * + * def frPoly( poly, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_19frPoly(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_19frPoly = {"frPoly", (PyCFunction)__pyx_pw_5_mask_19frPoly, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_5_mask_19frPoly(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_poly = 0; + siz __pyx_v_h; + siz __pyx_v_w; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("frPoly (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_poly,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_poly)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + 
__Pyx_RaiseArgtupleInvalid("frPoly", 1, 3, 3, 1); __PYX_ERR(0, 260, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frPoly", 1, 3, 3, 2); __PYX_ERR(0, 260, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "frPoly") < 0)) __PYX_ERR(0, 260, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_poly = values[0]; + __pyx_v_h = __Pyx_PyInt_As_siz(values[1]); if (unlikely((__pyx_v_h == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 260, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_siz(values[2]); if (unlikely((__pyx_v_w == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 260, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("frPoly", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 260, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.frPoly", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5_mask_18frPoly(__pyx_self, __pyx_v_poly, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_18frPoly(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_poly, siz __pyx_v_h, siz __pyx_v_w) { + PyArrayObject *__pyx_v_np_poly = 0; + Py_ssize_t __pyx_v_n; + struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs = NULL; + PyObject *__pyx_v_i = NULL; + PyObject *__pyx_v_p = NULL; + PyObject *__pyx_v_objs = NULL; + __Pyx_LocalBuf_ND __pyx_pybuffernd_np_poly; + __Pyx_Buffer 
__pyx_pybuffer_np_poly; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + Py_ssize_t __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *(*__pyx_t_4)(PyObject *); + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + PyObject *__pyx_t_9 = NULL; + PyArrayObject *__pyx_t_10 = NULL; + int __pyx_t_11; + PyObject *__pyx_t_12 = NULL; + PyObject *__pyx_t_13 = NULL; + PyObject *__pyx_t_14 = NULL; + Py_ssize_t __pyx_t_15; + Py_ssize_t __pyx_t_16; + __Pyx_RefNannySetupContext("frPoly", 0); + __pyx_pybuffer_np_poly.pybuffer.buf = NULL; + __pyx_pybuffer_np_poly.refcount = 0; + __pyx_pybuffernd_np_poly.data = NULL; + __pyx_pybuffernd_np_poly.rcbuffer = &__pyx_pybuffer_np_poly; + + /* "_mask.pyx":262 + * def frPoly( poly, siz h, siz w ): + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) # <<<<<<<<<<<<<< + * Rs = RLEs(n) + * for i, p in enumerate(poly): + */ + __pyx_t_1 = PyObject_Length(__pyx_v_poly); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(0, 262, __pyx_L1_error) + __pyx_v_n = __pyx_t_1; + + /* "_mask.pyx":263 + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + * Rs = RLEs(n) # <<<<<<<<<<<<<< + * for i, p in enumerate(poly): + * np_poly = np.array(p, dtype=np.double, order='F') + */ + __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_n); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 263, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = __Pyx_PyObject_CallOneArg(((PyObject *)__pyx_ptype_5_mask_RLEs), __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 263, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_v_Rs = ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_3); + __pyx_t_3 = 0; + + /* "_mask.pyx":264 + * n = len(poly) + * Rs = RLEs(n) + * for i, p in enumerate(poly): # <<<<<<<<<<<<<< + * np_poly = np.array(p, dtype=np.double, order='F') + * rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) + 
*/ + __Pyx_INCREF(__pyx_int_0); + __pyx_t_3 = __pyx_int_0; + if (likely(PyList_CheckExact(__pyx_v_poly)) || PyTuple_CheckExact(__pyx_v_poly)) { + __pyx_t_2 = __pyx_v_poly; __Pyx_INCREF(__pyx_t_2); __pyx_t_1 = 0; + __pyx_t_4 = NULL; + } else { + __pyx_t_1 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_poly); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 264, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 264, __pyx_L1_error) + } + for (;;) { + if (likely(!__pyx_t_4)) { + if (likely(PyList_CheckExact(__pyx_t_2))) { + if (__pyx_t_1 >= PyList_GET_SIZE(__pyx_t_2)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_5 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 264, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_2, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 264, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + } else { + if (__pyx_t_1 >= PyTuple_GET_SIZE(__pyx_t_2)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 264, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_2, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 264, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + } + } else { + __pyx_t_5 = __pyx_t_4(__pyx_t_2); + if (unlikely(!__pyx_t_5)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 264, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_5); + } + __Pyx_XDECREF_SET(__pyx_v_p, __pyx_t_5); + __pyx_t_5 = 0; + __Pyx_INCREF(__pyx_t_3); + __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_3); + __pyx_t_5 = __Pyx_PyInt_AddObjC(__pyx_t_3, __pyx_int_1, 1, 0); if 
(unlikely(!__pyx_t_5)) __PYX_ERR(0, 264, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); + __pyx_t_3 = __pyx_t_5; + __pyx_t_5 = 0; + + /* "_mask.pyx":265 + * Rs = RLEs(n) + * for i, p in enumerate(poly): + * np_poly = np.array(p, dtype=np.double, order='F') # <<<<<<<<<<<<<< + * rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) + * objs = _toString(Rs) + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 265, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_array); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 265, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = PyTuple_New(1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 265, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_INCREF(__pyx_v_p); + __Pyx_GIVEREF(__pyx_v_p); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_v_p); + __pyx_t_7 = __Pyx_PyDict_NewPresized(2); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 265, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_8 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 265, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __pyx_t_9 = __Pyx_PyObject_GetAttrStr(__pyx_t_8, __pyx_n_s_double); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 265, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + if (PyDict_SetItem(__pyx_t_7, __pyx_n_s_dtype, __pyx_t_9) < 0) __PYX_ERR(0, 265, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; + if (PyDict_SetItem(__pyx_t_7, __pyx_n_s_order, __pyx_n_s_F) < 0) __PYX_ERR(0, 265, __pyx_L1_error) + __pyx_t_9 = __Pyx_PyObject_Call(__pyx_t_6, __pyx_t_5, __pyx_t_7); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 265, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + if (!(likely(((__pyx_t_9) == Py_None) || 
likely(__Pyx_TypeTest(__pyx_t_9, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 265, __pyx_L1_error) + __pyx_t_10 = ((PyArrayObject *)__pyx_t_9); + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer); + __pyx_t_11 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer, (PyObject*)__pyx_t_10, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack); + if (unlikely(__pyx_t_11 < 0)) { + PyErr_Fetch(&__pyx_t_12, &__pyx_t_13, &__pyx_t_14); + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer, (PyObject*)__pyx_v_np_poly, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) { + Py_XDECREF(__pyx_t_12); Py_XDECREF(__pyx_t_13); Py_XDECREF(__pyx_t_14); + __Pyx_RaiseBufferFallbackError(); + } else { + PyErr_Restore(__pyx_t_12, __pyx_t_13, __pyx_t_14); + } + __pyx_t_12 = __pyx_t_13 = __pyx_t_14 = 0; + } + __pyx_pybuffernd_np_poly.diminfo[0].strides = __pyx_pybuffernd_np_poly.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_np_poly.diminfo[0].shape = __pyx_pybuffernd_np_poly.rcbuffer->pybuffer.shape[0]; + if (unlikely(__pyx_t_11 < 0)) __PYX_ERR(0, 265, __pyx_L1_error) + } + __pyx_t_10 = 0; + __Pyx_XDECREF_SET(__pyx_v_np_poly, ((PyArrayObject *)__pyx_t_9)); + __pyx_t_9 = 0; + + /* "_mask.pyx":266 + * for i, p in enumerate(poly): + * np_poly = np.array(p, dtype=np.double, order='F') + * rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) # <<<<<<<<<<<<<< + * objs = _toString(Rs) + * return objs + */ + __pyx_t_15 = __Pyx_PyIndex_AsSsize_t(__pyx_v_i); if (unlikely((__pyx_t_15 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(0, 266, __pyx_L1_error) + __pyx_t_16 = PyObject_Length(__pyx_v_p); if (unlikely(__pyx_t_16 == ((Py_ssize_t)-1))) __PYX_ERR(0, 266, __pyx_L1_error) + rleFrPoly(((RLE *)(&(__pyx_v_Rs->_R[__pyx_t_15]))), ((double const *)__pyx_v_np_poly->data), 
((siz)__Pyx_div_Py_ssize_t(__pyx_t_16, 2)), __pyx_v_h, __pyx_v_w); + + /* "_mask.pyx":264 + * n = len(poly) + * Rs = RLEs(n) + * for i, p in enumerate(poly): # <<<<<<<<<<<<<< + * np_poly = np.array(p, dtype=np.double, order='F') + * rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) + */ + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + + /* "_mask.pyx":267 + * np_poly = np.array(p, dtype=np.double, order='F') + * rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) + * objs = _toString(Rs) # <<<<<<<<<<<<<< + * return objs + * + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 267, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_9 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_9 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_9)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_9); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_9) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_2, ((PyObject *)__pyx_v_Rs)); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 267, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_9, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 267, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_9); __pyx_t_9 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_9, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 267, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_9); __pyx_t_9 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else + #endif + { + __pyx_t_7 = 
PyTuple_New(1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 267, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_9); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_9); __pyx_t_9 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_Rs)); + PyTuple_SET_ITEM(__pyx_t_7, 0+1, ((PyObject *)__pyx_v_Rs)); + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 267, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_v_objs = __pyx_t_3; + __pyx_t_3 = 0; + + /* "_mask.pyx":268 + * rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) + * objs = _toString(Rs) + * return objs # <<<<<<<<<<<<<< + * + * def frUncompressedRLE(ucRles, siz h, siz w): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "_mask.pyx":260 + * return objs + * + * def frPoly( poly, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_XDECREF(__pyx_t_9); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("_mask.frPoly", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_np_poly); + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_i); + __Pyx_XDECREF(__pyx_v_p); + 
__Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":270 + * return objs + * + * def frUncompressedRLE(ucRles, siz h, siz w): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.uint32_t, ndim=1] cnts + * cdef RLE R + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_21frUncompressedRLE(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_21frUncompressedRLE = {"frUncompressedRLE", (PyCFunction)__pyx_pw_5_mask_21frUncompressedRLE, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_5_mask_21frUncompressedRLE(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_ucRles = 0; + CYTHON_UNUSED siz __pyx_v_h; + CYTHON_UNUSED siz __pyx_v_w; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("frUncompressedRLE (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_ucRles,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ucRles)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frUncompressedRLE", 1, 3, 3, 1); __PYX_ERR(0, 270, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = 
__Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frUncompressedRLE", 1, 3, 3, 2); __PYX_ERR(0, 270, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "frUncompressedRLE") < 0)) __PYX_ERR(0, 270, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_ucRles = values[0]; + __pyx_v_h = __Pyx_PyInt_As_siz(values[1]); if (unlikely((__pyx_v_h == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 270, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_siz(values[2]); if (unlikely((__pyx_v_w == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 270, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("frUncompressedRLE", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 270, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.frUncompressedRLE", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5_mask_20frUncompressedRLE(__pyx_self, __pyx_v_ucRles, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_20frUncompressedRLE(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_ucRles, CYTHON_UNUSED siz __pyx_v_h, CYTHON_UNUSED siz __pyx_v_w) { + PyArrayObject *__pyx_v_cnts = 0; + RLE __pyx_v_R; + uint *__pyx_v_data; + Py_ssize_t __pyx_v_n; + PyObject *__pyx_v_objs = NULL; + Py_ssize_t __pyx_v_i; + struct __pyx_obj_5_mask_RLEs *__pyx_v_Rs = NULL; + Py_ssize_t __pyx_v_j; + __Pyx_LocalBuf_ND __pyx_pybuffernd_cnts; + __Pyx_Buffer __pyx_pybuffer_cnts; + PyObject 
*__pyx_r = NULL; + __Pyx_RefNannyDeclarations + Py_ssize_t __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + Py_ssize_t __pyx_t_3; + Py_ssize_t __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + PyArrayObject *__pyx_t_9 = NULL; + int __pyx_t_10; + PyObject *__pyx_t_11 = NULL; + PyObject *__pyx_t_12 = NULL; + PyObject *__pyx_t_13 = NULL; + Py_ssize_t __pyx_t_14; + Py_ssize_t __pyx_t_15; + Py_ssize_t __pyx_t_16; + Py_ssize_t __pyx_t_17; + RLE __pyx_t_18; + siz __pyx_t_19; + int __pyx_t_20; + __Pyx_RefNannySetupContext("frUncompressedRLE", 0); + __pyx_pybuffer_cnts.pybuffer.buf = NULL; + __pyx_pybuffer_cnts.refcount = 0; + __pyx_pybuffernd_cnts.data = NULL; + __pyx_pybuffernd_cnts.rcbuffer = &__pyx_pybuffer_cnts; + + /* "_mask.pyx":274 + * cdef RLE R + * cdef uint *data + * n = len(ucRles) # <<<<<<<<<<<<<< + * objs = [] + * for i in range(n): + */ + __pyx_t_1 = PyObject_Length(__pyx_v_ucRles); if (unlikely(__pyx_t_1 == ((Py_ssize_t)-1))) __PYX_ERR(0, 274, __pyx_L1_error) + __pyx_v_n = __pyx_t_1; + + /* "_mask.pyx":275 + * cdef uint *data + * n = len(ucRles) + * objs = [] # <<<<<<<<<<<<<< + * for i in range(n): + * Rs = RLEs(1) + */ + __pyx_t_2 = PyList_New(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_v_objs = ((PyObject*)__pyx_t_2); + __pyx_t_2 = 0; + + /* "_mask.pyx":276 + * n = len(ucRles) + * objs = [] + * for i in range(n): # <<<<<<<<<<<<<< + * Rs = RLEs(1) + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + */ + __pyx_t_1 = __pyx_v_n; + __pyx_t_3 = __pyx_t_1; + for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) { + __pyx_v_i = __pyx_t_4; + + /* "_mask.pyx":277 + * objs = [] + * for i in range(n): + * Rs = RLEs(1) # <<<<<<<<<<<<<< + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + * # time for malloc can be saved here but it's fine + */ + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_5_mask_RLEs), 
__pyx_tuple__21, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_XDECREF_SET(__pyx_v_Rs, ((struct __pyx_obj_5_mask_RLEs *)__pyx_t_2)); + __pyx_t_2 = 0; + + /* "_mask.pyx":278 + * for i in range(n): + * Rs = RLEs(1) + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) # <<<<<<<<<<<<<< + * # time for malloc can be saved here but it's fine + * data = malloc(len(cnts)* sizeof(uint)) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_array); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_GetItemInt(__pyx_v_ucRles, __pyx_v_i, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_6 = __Pyx_PyObject_Dict_GetItem(__pyx_t_2, __pyx_n_s_counts); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_6); + __pyx_t_6 = 0; + __pyx_t_6 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_7 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_uint32); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + if (PyDict_SetItem(__pyx_t_6, __pyx_n_s_dtype, __pyx_t_8) < 0) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + 
__pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_2, __pyx_t_6); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (!(likely(((__pyx_t_8) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_8, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 278, __pyx_L1_error) + __pyx_t_9 = ((PyArrayObject *)__pyx_t_8); + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer); + __pyx_t_10 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer, (PyObject*)__pyx_t_9, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack); + if (unlikely(__pyx_t_10 < 0)) { + PyErr_Fetch(&__pyx_t_11, &__pyx_t_12, &__pyx_t_13); + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer, (PyObject*)__pyx_v_cnts, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) { + Py_XDECREF(__pyx_t_11); Py_XDECREF(__pyx_t_12); Py_XDECREF(__pyx_t_13); + __Pyx_RaiseBufferFallbackError(); + } else { + PyErr_Restore(__pyx_t_11, __pyx_t_12, __pyx_t_13); + } + __pyx_t_11 = __pyx_t_12 = __pyx_t_13 = 0; + } + __pyx_pybuffernd_cnts.diminfo[0].strides = __pyx_pybuffernd_cnts.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_cnts.diminfo[0].shape = __pyx_pybuffernd_cnts.rcbuffer->pybuffer.shape[0]; + if (unlikely(__pyx_t_10 < 0)) __PYX_ERR(0, 278, __pyx_L1_error) + } + __pyx_t_9 = 0; + __Pyx_XDECREF_SET(__pyx_v_cnts, ((PyArrayObject *)__pyx_t_8)); + __pyx_t_8 = 0; + + /* "_mask.pyx":280 + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + * # time for malloc can be saved here but it's fine + * data = malloc(len(cnts)* sizeof(uint)) # <<<<<<<<<<<<<< + * for j in range(len(cnts)): + * data[j] = cnts[j] + */ + __pyx_t_14 = PyObject_Length(((PyObject *)__pyx_v_cnts)); if 
(unlikely(__pyx_t_14 == ((Py_ssize_t)-1))) __PYX_ERR(0, 280, __pyx_L1_error) + __pyx_v_data = ((uint *)malloc((__pyx_t_14 * (sizeof(unsigned int))))); + + /* "_mask.pyx":281 + * # time for malloc can be saved here but it's fine + * data = malloc(len(cnts)* sizeof(uint)) + * for j in range(len(cnts)): # <<<<<<<<<<<<<< + * data[j] = cnts[j] + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + */ + __pyx_t_14 = PyObject_Length(((PyObject *)__pyx_v_cnts)); if (unlikely(__pyx_t_14 == ((Py_ssize_t)-1))) __PYX_ERR(0, 281, __pyx_L1_error) + __pyx_t_15 = __pyx_t_14; + for (__pyx_t_16 = 0; __pyx_t_16 < __pyx_t_15; __pyx_t_16+=1) { + __pyx_v_j = __pyx_t_16; + + /* "_mask.pyx":282 + * data = malloc(len(cnts)* sizeof(uint)) + * for j in range(len(cnts)): + * data[j] = cnts[j] # <<<<<<<<<<<<<< + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + * Rs._R[0] = R + */ + __pyx_t_17 = __pyx_v_j; + __pyx_t_10 = -1; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_cnts.diminfo[0].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_10 = 0; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_cnts.diminfo[0].shape)) __pyx_t_10 = 0; + if (unlikely(__pyx_t_10 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_10); + __PYX_ERR(0, 282, __pyx_L1_error) + } + (__pyx_v_data[__pyx_v_j]) = ((uint)(*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_uint32_t *, __pyx_pybuffernd_cnts.rcbuffer->pybuffer.buf, __pyx_t_17, __pyx_pybuffernd_cnts.diminfo[0].strides))); + } + + /* "_mask.pyx":283 + * for j in range(len(cnts)): + * data[j] = cnts[j] + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) # <<<<<<<<<<<<<< + * Rs._R[0] = R + * objs.append(_toString(Rs)[0]) + */ + __pyx_t_8 = __Pyx_GetItemInt(__pyx_v_ucRles, __pyx_v_i, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __pyx_t_6 = __Pyx_PyObject_Dict_GetItem(__pyx_t_8, __pyx_n_s_size); if (unlikely(!__pyx_t_6)) 
__PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __pyx_t_8 = __Pyx_GetItemInt(__pyx_t_6, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_19 = __Pyx_PyInt_As_siz(__pyx_t_8); if (unlikely((__pyx_t_19 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __pyx_t_18.h = __pyx_t_19; + __pyx_t_8 = __Pyx_GetItemInt(__pyx_v_ucRles, __pyx_v_i, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __pyx_t_6 = __Pyx_PyObject_Dict_GetItem(__pyx_t_8, __pyx_n_s_size); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __pyx_t_8 = __Pyx_GetItemInt(__pyx_t_6, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_19 = __Pyx_PyInt_As_siz(__pyx_t_8); if (unlikely((__pyx_t_19 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __pyx_t_18.w = __pyx_t_19; + __pyx_t_14 = PyObject_Length(((PyObject *)__pyx_v_cnts)); if (unlikely(__pyx_t_14 == ((Py_ssize_t)-1))) __PYX_ERR(0, 283, __pyx_L1_error) + __pyx_t_18.m = __pyx_t_14; + __pyx_t_18.cnts = ((uint *)__pyx_v_data); + __pyx_v_R = __pyx_t_18; + + /* "_mask.pyx":284 + * data[j] = cnts[j] + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + * Rs._R[0] = R # <<<<<<<<<<<<<< + * objs.append(_toString(Rs)[0]) + * return objs + */ + (__pyx_v_Rs->_R[0]) = __pyx_v_R; + + /* "_mask.pyx":285 + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + * Rs._R[0] = R + * objs.append(_toString(Rs)[0]) # <<<<<<<<<<<<<< + * 
return objs + * + */ + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_2 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_6))) { + __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_6); + if (likely(__pyx_t_2)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6); + __Pyx_INCREF(__pyx_t_2); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_6, function); + } + } + if (!__pyx_t_2) { + __pyx_t_8 = __Pyx_PyObject_CallOneArg(__pyx_t_6, ((PyObject *)__pyx_v_Rs)); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_6)) { + PyObject *__pyx_temp[2] = {__pyx_t_2, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_8 = __Pyx_PyFunction_FastCall(__pyx_t_6, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_8); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_6)) { + PyObject *__pyx_temp[2] = {__pyx_t_2, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_8 = __Pyx_PyCFunction_FastCall(__pyx_t_6, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_8); + } else + #endif + { + __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_2); __pyx_t_2 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_Rs)); + PyTuple_SET_ITEM(__pyx_t_5, 0+1, ((PyObject *)__pyx_v_Rs)); + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_6, __pyx_t_5, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } + } + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; 
+ __pyx_t_6 = __Pyx_GetItemInt(__pyx_t_8, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __pyx_t_20 = __Pyx_PyList_Append(__pyx_v_objs, __pyx_t_6); if (unlikely(__pyx_t_20 == ((int)-1))) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } + + /* "_mask.pyx":286 + * Rs._R[0] = R + * objs.append(_toString(Rs)[0]) + * return objs # <<<<<<<<<<<<<< + * + * def frPyObjects(pyobj, h, w): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "_mask.pyx":270 + * return objs + * + * def frUncompressedRLE(ucRles, siz h, siz w): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.uint32_t, ndim=1] cnts + * cdef RLE R + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("_mask.frUncompressedRLE", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_cnts); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "_mask.pyx":288 + * return objs + * + * def frPyObjects(pyobj, h, w): # <<<<<<<<<<<<<< + * # encode rle from a list of python objects + * if type(pyobj) == np.ndarray: + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5_mask_23frPyObjects(PyObject *__pyx_self, 
PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_5_mask_23frPyObjects = {"frPyObjects", (PyCFunction)__pyx_pw_5_mask_23frPyObjects, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_5_mask_23frPyObjects(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_pyobj = 0; + PyObject *__pyx_v_h = 0; + PyObject *__pyx_v_w = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("frPyObjects (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyobj,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_pyobj)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frPyObjects", 1, 3, 3, 1); __PYX_ERR(0, 288, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frPyObjects", 1, 3, 3, 2); __PYX_ERR(0, 288, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "frPyObjects") < 0)) __PYX_ERR(0, 288, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto 
__pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_pyobj = values[0]; + __pyx_v_h = values[1]; + __pyx_v_w = values[2]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("frPyObjects", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 288, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("_mask.frPyObjects", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_5_mask_22frPyObjects(__pyx_self, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5_mask_22frPyObjects(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_pyobj, PyObject *__pyx_v_h, PyObject *__pyx_v_w) { + PyObject *__pyx_v_objs = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + int __pyx_t_5; + PyObject *__pyx_t_6 = NULL; + int __pyx_t_7; + Py_ssize_t __pyx_t_8; + int __pyx_t_9; + PyObject *__pyx_t_10 = NULL; + __Pyx_RefNannySetupContext("frPyObjects", 0); + + /* "_mask.pyx":290 + * def frPyObjects(pyobj, h, w): + * # encode rle from a list of python objects + * if type(pyobj) == np.ndarray: # <<<<<<<<<<<<<< + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) == 4: + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 290, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 290, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_2) { + + /* 
"_mask.pyx":291 + * # encode rle from a list of python objects + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w) # <<<<<<<<<<<<<< + * elif type(pyobj) == list and len(pyobj[0]) == 4: + * objs = frBbox(pyobj, h, w) + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_frBbox); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 291, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = NULL; + __pyx_t_5 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + __pyx_t_5 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_4, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 291, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_4, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 291, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_6 = PyTuple_New(3+__pyx_t_5); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 291, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (__pyx_t_4) { + __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __pyx_t_4 = NULL; + } + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyTuple_SET_ITEM(__pyx_t_6, 0+__pyx_t_5, __pyx_v_pyobj); + __Pyx_INCREF(__pyx_v_h); + __Pyx_GIVEREF(__pyx_v_h); + PyTuple_SET_ITEM(__pyx_t_6, 1+__pyx_t_5, __pyx_v_h); + 
__Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_6, 2+__pyx_t_5, __pyx_v_w); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 291, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_objs = __pyx_t_1; + __pyx_t_1 = 0; + + /* "_mask.pyx":290 + * def frPyObjects(pyobj, h, w): + * # encode rle from a list of python objects + * if type(pyobj) == np.ndarray: # <<<<<<<<<<<<<< + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) == 4: + */ + goto __pyx_L3; + } + + /* "_mask.pyx":292 + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) == 4: # <<<<<<<<<<<<<< + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) > 4: + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 292, __pyx_L1_error) + __pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 292, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_7) { + } else { + __pyx_t_2 = __pyx_t_7; + goto __pyx_L4_bool_binop_done; + } + __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_pyobj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 292, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_8 = PyObject_Length(__pyx_t_1); if (unlikely(__pyx_t_8 == ((Py_ssize_t)-1))) __PYX_ERR(0, 292, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_7 = ((__pyx_t_8 == 4) != 0); + __pyx_t_2 = __pyx_t_7; + __pyx_L4_bool_binop_done:; + if (__pyx_t_2) { + + /* "_mask.pyx":293 + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) == 4: + * objs = frBbox(pyobj, h, w) # <<<<<<<<<<<<<< + * elif type(pyobj) == list and len(pyobj[0]) 
> 4: + * objs = frPoly(pyobj, h, w) + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_frBbox); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 293, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_6 = NULL; + __pyx_t_5 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_6)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + __pyx_t_5 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_6, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 293, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_6, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 293, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_4 = PyTuple_New(3+__pyx_t_5); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 293, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + if (__pyx_t_6) { + __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_6); __pyx_t_6 = NULL; + } + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyTuple_SET_ITEM(__pyx_t_4, 0+__pyx_t_5, __pyx_v_pyobj); + __Pyx_INCREF(__pyx_v_h); + __Pyx_GIVEREF(__pyx_v_h); + PyTuple_SET_ITEM(__pyx_t_4, 1+__pyx_t_5, __pyx_v_h); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_4, 2+__pyx_t_5, __pyx_v_w); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 293, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_objs = __pyx_t_1; + __pyx_t_1 = 0; + + /* "_mask.pyx":292 + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) == 4: # <<<<<<<<<<<<<< + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) > 4: + */ + goto __pyx_L3; + } + + /* "_mask.pyx":294 + * elif type(pyobj) == list and len(pyobj[0]) == 4: + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) > 4: # <<<<<<<<<<<<<< + * objs = frPoly(pyobj, h, w) + * elif type(pyobj) == list and type(pyobj[0]) == dict \ + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 294, __pyx_L1_error) + __pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 294, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_7) { + } else { + __pyx_t_2 = __pyx_t_7; + goto __pyx_L6_bool_binop_done; + } + __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_pyobj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 294, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_8 = PyObject_Length(__pyx_t_1); if (unlikely(__pyx_t_8 == ((Py_ssize_t)-1))) __PYX_ERR(0, 294, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_7 = ((__pyx_t_8 > 4) != 0); + __pyx_t_2 = __pyx_t_7; + __pyx_L6_bool_binop_done:; + if (__pyx_t_2) { + + /* "_mask.pyx":295 + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) > 4: + * objs = frPoly(pyobj, h, w) # <<<<<<<<<<<<<< + * elif type(pyobj) == list and type(pyobj[0]) == dict \ + * and 'counts' in pyobj[0] and 'size' in pyobj[0]: + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_frPoly); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 295, __pyx_L1_error) + 
__Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = NULL; + __pyx_t_5 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + __pyx_t_5 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_4, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 295, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_4, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 295, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_6 = PyTuple_New(3+__pyx_t_5); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 295, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (__pyx_t_4) { + __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __pyx_t_4 = NULL; + } + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyTuple_SET_ITEM(__pyx_t_6, 0+__pyx_t_5, __pyx_v_pyobj); + __Pyx_INCREF(__pyx_v_h); + __Pyx_GIVEREF(__pyx_v_h); + PyTuple_SET_ITEM(__pyx_t_6, 1+__pyx_t_5, __pyx_v_h); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_6, 2+__pyx_t_5, __pyx_v_w); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 295, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_objs = __pyx_t_1; + 
__pyx_t_1 = 0; + + /* "_mask.pyx":294 + * elif type(pyobj) == list and len(pyobj[0]) == 4: + * objs = frBbox(pyobj, h, w) + * elif type(pyobj) == list and len(pyobj[0]) > 4: # <<<<<<<<<<<<<< + * objs = frPoly(pyobj, h, w) + * elif type(pyobj) == list and type(pyobj[0]) == dict \ + */ + goto __pyx_L3; + } + + /* "_mask.pyx":296 + * elif type(pyobj) == list and len(pyobj[0]) > 4: + * objs = frPoly(pyobj, h, w) + * elif type(pyobj) == list and type(pyobj[0]) == dict \ # <<<<<<<<<<<<<< + * and 'counts' in pyobj[0] and 'size' in pyobj[0]: + * objs = frUncompressedRLE(pyobj, h, w) + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 296, __pyx_L1_error) + __pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 296, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_7) { + } else { + __pyx_t_2 = __pyx_t_7; + goto __pyx_L8_bool_binop_done; + } + + /* "_mask.pyx":297 + * objs = frPoly(pyobj, h, w) + * elif type(pyobj) == list and type(pyobj[0]) == dict \ + * and 'counts' in pyobj[0] and 'size' in pyobj[0]: # <<<<<<<<<<<<<< + * objs = frUncompressedRLE(pyobj, h, w) + * # encode rle from single python object + */ + __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_pyobj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 296, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + + /* "_mask.pyx":296 + * elif type(pyobj) == list and len(pyobj[0]) > 4: + * objs = frPoly(pyobj, h, w) + * elif type(pyobj) == list and type(pyobj[0]) == dict \ # <<<<<<<<<<<<<< + * and 'counts' in pyobj[0] and 'size' in pyobj[0]: + * objs = frUncompressedRLE(pyobj, h, w) + */ + __pyx_t_3 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_t_1)), ((PyObject *)(&PyDict_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 296, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + 
__pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 296, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_7) { + } else { + __pyx_t_2 = __pyx_t_7; + goto __pyx_L8_bool_binop_done; + } + + /* "_mask.pyx":297 + * objs = frPoly(pyobj, h, w) + * elif type(pyobj) == list and type(pyobj[0]) == dict \ + * and 'counts' in pyobj[0] and 'size' in pyobj[0]: # <<<<<<<<<<<<<< + * objs = frUncompressedRLE(pyobj, h, w) + * # encode rle from single python object + */ + __pyx_t_3 = __Pyx_GetItemInt(__pyx_v_pyobj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 297, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_7 = (__Pyx_PySequence_ContainsTF(__pyx_n_s_counts, __pyx_t_3, Py_EQ)); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 297, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_9 = (__pyx_t_7 != 0); + if (__pyx_t_9) { + } else { + __pyx_t_2 = __pyx_t_9; + goto __pyx_L8_bool_binop_done; + } + __pyx_t_3 = __Pyx_GetItemInt(__pyx_v_pyobj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 297, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_9 = (__Pyx_PySequence_ContainsTF(__pyx_n_s_size, __pyx_t_3, Py_EQ)); if (unlikely(__pyx_t_9 < 0)) __PYX_ERR(0, 297, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_7 = (__pyx_t_9 != 0); + __pyx_t_2 = __pyx_t_7; + __pyx_L8_bool_binop_done:; + + /* "_mask.pyx":296 + * elif type(pyobj) == list and len(pyobj[0]) > 4: + * objs = frPoly(pyobj, h, w) + * elif type(pyobj) == list and type(pyobj[0]) == dict \ # <<<<<<<<<<<<<< + * and 'counts' in pyobj[0] and 'size' in pyobj[0]: + * objs = frUncompressedRLE(pyobj, h, w) + */ + if (__pyx_t_2) { + + /* "_mask.pyx":298 + * elif type(pyobj) == list and type(pyobj[0]) == dict \ + * and 'counts' in pyobj[0] and 'size' in pyobj[0]: + * objs = frUncompressedRLE(pyobj, h, w) # <<<<<<<<<<<<<< + * # encode rle from single python object + * elif 
type(pyobj) == list and len(pyobj) == 4: + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_frUncompressedRLE); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 298, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_6 = NULL; + __pyx_t_5 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_1); + if (likely(__pyx_t_6)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_1, function); + __pyx_t_5 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[4] = {__pyx_t_6, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 298, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[4] = {__pyx_t_6, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 298, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else + #endif + { + __pyx_t_4 = PyTuple_New(3+__pyx_t_5); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 298, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + if (__pyx_t_6) { + __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_6); __pyx_t_6 = NULL; + } + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyTuple_SET_ITEM(__pyx_t_4, 0+__pyx_t_5, __pyx_v_pyobj); + __Pyx_INCREF(__pyx_v_h); + __Pyx_GIVEREF(__pyx_v_h); + PyTuple_SET_ITEM(__pyx_t_4, 1+__pyx_t_5, __pyx_v_h); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_4, 2+__pyx_t_5, __pyx_v_w); + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_4, NULL); if (unlikely(!__pyx_t_3)) 
__PYX_ERR(0, 298, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_objs = __pyx_t_3; + __pyx_t_3 = 0; + + /* "_mask.pyx":296 + * elif type(pyobj) == list and len(pyobj[0]) > 4: + * objs = frPoly(pyobj, h, w) + * elif type(pyobj) == list and type(pyobj[0]) == dict \ # <<<<<<<<<<<<<< + * and 'counts' in pyobj[0] and 'size' in pyobj[0]: + * objs = frUncompressedRLE(pyobj, h, w) + */ + goto __pyx_L3; + } + + /* "_mask.pyx":300 + * objs = frUncompressedRLE(pyobj, h, w) + * # encode rle from single python object + * elif type(pyobj) == list and len(pyobj) == 4: # <<<<<<<<<<<<<< + * objs = frBbox([pyobj], h, w)[0] + * elif type(pyobj) == list and len(pyobj) > 4: + */ + __pyx_t_3 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 300, __pyx_L1_error) + __pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 300, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_7) { + } else { + __pyx_t_2 = __pyx_t_7; + goto __pyx_L12_bool_binop_done; + } + __pyx_t_8 = PyObject_Length(__pyx_v_pyobj); if (unlikely(__pyx_t_8 == ((Py_ssize_t)-1))) __PYX_ERR(0, 300, __pyx_L1_error) + __pyx_t_7 = ((__pyx_t_8 == 4) != 0); + __pyx_t_2 = __pyx_t_7; + __pyx_L12_bool_binop_done:; + if (__pyx_t_2) { + + /* "_mask.pyx":301 + * # encode rle from single python object + * elif type(pyobj) == list and len(pyobj) == 4: + * objs = frBbox([pyobj], h, w)[0] # <<<<<<<<<<<<<< + * elif type(pyobj) == list and len(pyobj) > 4: + * objs = frPoly([pyobj], h, w)[0] + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_frBbox); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 301, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = PyList_New(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 301, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_INCREF(__pyx_v_pyobj); + 
__Pyx_GIVEREF(__pyx_v_pyobj); + PyList_SET_ITEM(__pyx_t_4, 0, __pyx_v_pyobj); + __pyx_t_6 = NULL; + __pyx_t_5 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_1); + if (likely(__pyx_t_6)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_1, function); + __pyx_t_5 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[4] = {__pyx_t_6, __pyx_t_4, __pyx_v_h, __pyx_v_w}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 301, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[4] = {__pyx_t_6, __pyx_t_4, __pyx_v_h, __pyx_v_w}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 301, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + { + __pyx_t_10 = PyTuple_New(3+__pyx_t_5); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 301, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + if (__pyx_t_6) { + __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_10, 0, __pyx_t_6); __pyx_t_6 = NULL; + } + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_10, 0+__pyx_t_5, __pyx_t_4); + __Pyx_INCREF(__pyx_v_h); + __Pyx_GIVEREF(__pyx_v_h); + PyTuple_SET_ITEM(__pyx_t_10, 1+__pyx_t_5, __pyx_v_h); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_10, 2+__pyx_t_5, __pyx_v_w); + __pyx_t_4 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_10, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 301, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + 
__Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_GetItemInt(__pyx_t_3, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 301, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_objs = __pyx_t_1; + __pyx_t_1 = 0; + + /* "_mask.pyx":300 + * objs = frUncompressedRLE(pyobj, h, w) + * # encode rle from single python object + * elif type(pyobj) == list and len(pyobj) == 4: # <<<<<<<<<<<<<< + * objs = frBbox([pyobj], h, w)[0] + * elif type(pyobj) == list and len(pyobj) > 4: + */ + goto __pyx_L3; + } + + /* "_mask.pyx":302 + * elif type(pyobj) == list and len(pyobj) == 4: + * objs = frBbox([pyobj], h, w)[0] + * elif type(pyobj) == list and len(pyobj) > 4: # <<<<<<<<<<<<<< + * objs = frPoly([pyobj], h, w)[0] + * elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 302, __pyx_L1_error) + __pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 302, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_7) { + } else { + __pyx_t_2 = __pyx_t_7; + goto __pyx_L14_bool_binop_done; + } + __pyx_t_8 = PyObject_Length(__pyx_v_pyobj); if (unlikely(__pyx_t_8 == ((Py_ssize_t)-1))) __PYX_ERR(0, 302, __pyx_L1_error) + __pyx_t_7 = ((__pyx_t_8 > 4) != 0); + __pyx_t_2 = __pyx_t_7; + __pyx_L14_bool_binop_done:; + if (__pyx_t_2) { + + /* "_mask.pyx":303 + * objs = frBbox([pyobj], h, w)[0] + * elif type(pyobj) == list and len(pyobj) > 4: + * objs = frPoly([pyobj], h, w)[0] # <<<<<<<<<<<<<< + * elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: + * objs = frUncompressedRLE([pyobj], h, w)[0] + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_frPoly); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 303, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_10 = PyList_New(1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 303, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyList_SET_ITEM(__pyx_t_10, 0, __pyx_v_pyobj); + __pyx_t_4 = NULL; + __pyx_t_5 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + __pyx_t_5 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_4, __pyx_t_10, __pyx_v_h, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 303, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_4, __pyx_t_10, __pyx_v_h, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 303, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + } else + #endif + { + __pyx_t_6 = PyTuple_New(3+__pyx_t_5); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 303, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (__pyx_t_4) { + __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __pyx_t_4 = NULL; + } + __Pyx_GIVEREF(__pyx_t_10); + PyTuple_SET_ITEM(__pyx_t_6, 0+__pyx_t_5, __pyx_t_10); + __Pyx_INCREF(__pyx_v_h); + __Pyx_GIVEREF(__pyx_v_h); + PyTuple_SET_ITEM(__pyx_t_6, 1+__pyx_t_5, __pyx_v_h); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_6, 
2+__pyx_t_5, __pyx_v_w); + __pyx_t_10 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 303, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_GetItemInt(__pyx_t_1, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 303, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_objs = __pyx_t_3; + __pyx_t_3 = 0; + + /* "_mask.pyx":302 + * elif type(pyobj) == list and len(pyobj) == 4: + * objs = frBbox([pyobj], h, w)[0] + * elif type(pyobj) == list and len(pyobj) > 4: # <<<<<<<<<<<<<< + * objs = frPoly([pyobj], h, w)[0] + * elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: + */ + goto __pyx_L3; + } + + /* "_mask.pyx":304 + * elif type(pyobj) == list and len(pyobj) > 4: + * objs = frPoly([pyobj], h, w)[0] + * elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: # <<<<<<<<<<<<<< + * objs = frUncompressedRLE([pyobj], h, w)[0] + * else: + */ + __pyx_t_3 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)(&PyDict_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 304, __pyx_L1_error) + __pyx_t_7 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 304, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_7) { + } else { + __pyx_t_2 = __pyx_t_7; + goto __pyx_L16_bool_binop_done; + } + __pyx_t_7 = (__Pyx_PySequence_ContainsTF(__pyx_n_s_counts, __pyx_v_pyobj, Py_EQ)); if (unlikely(__pyx_t_7 < 0)) __PYX_ERR(0, 304, __pyx_L1_error) + __pyx_t_9 = (__pyx_t_7 != 0); + if (__pyx_t_9) { + } else { + __pyx_t_2 = __pyx_t_9; + goto __pyx_L16_bool_binop_done; + } + __pyx_t_9 = (__Pyx_PySequence_ContainsTF(__pyx_n_s_size, __pyx_v_pyobj, Py_EQ)); if (unlikely(__pyx_t_9 < 0)) __PYX_ERR(0, 304, __pyx_L1_error) + __pyx_t_7 = (__pyx_t_9 != 
0); + __pyx_t_2 = __pyx_t_7; + __pyx_L16_bool_binop_done:; + if (likely(__pyx_t_2)) { + + /* "_mask.pyx":305 + * objs = frPoly([pyobj], h, w)[0] + * elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: + * objs = frUncompressedRLE([pyobj], h, w)[0] # <<<<<<<<<<<<<< + * else: + * raise Exception('input type is not supported.') + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_frUncompressedRLE); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 305, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_6 = PyList_New(1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 305, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyList_SET_ITEM(__pyx_t_6, 0, __pyx_v_pyobj); + __pyx_t_10 = NULL; + __pyx_t_5 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_10 = PyMethod_GET_SELF(__pyx_t_1); + if (likely(__pyx_t_10)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + __Pyx_INCREF(__pyx_t_10); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_1, function); + __pyx_t_5 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[4] = {__pyx_t_10, __pyx_t_6, __pyx_v_h, __pyx_v_w}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 305, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[4] = {__pyx_t_10, __pyx_t_6, __pyx_v_h, __pyx_v_w}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_5, 3+__pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 305, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + { + __pyx_t_4 = PyTuple_New(3+__pyx_t_5); if 
(unlikely(!__pyx_t_4)) __PYX_ERR(0, 305, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + if (__pyx_t_10) { + __Pyx_GIVEREF(__pyx_t_10); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_10); __pyx_t_10 = NULL; + } + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_4, 0+__pyx_t_5, __pyx_t_6); + __Pyx_INCREF(__pyx_v_h); + __Pyx_GIVEREF(__pyx_v_h); + PyTuple_SET_ITEM(__pyx_t_4, 1+__pyx_t_5, __pyx_v_h); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_4, 2+__pyx_t_5, __pyx_v_w); + __pyx_t_6 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_4, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 305, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_GetItemInt(__pyx_t_3, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 305, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_objs = __pyx_t_1; + __pyx_t_1 = 0; + + /* "_mask.pyx":304 + * elif type(pyobj) == list and len(pyobj) > 4: + * objs = frPoly([pyobj], h, w)[0] + * elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: # <<<<<<<<<<<<<< + * objs = frUncompressedRLE([pyobj], h, w)[0] + * else: + */ + goto __pyx_L3; + } + + /* "_mask.pyx":307 + * objs = frUncompressedRLE([pyobj], h, w)[0] + * else: + * raise Exception('input type is not supported.') # <<<<<<<<<<<<<< + * return objs + */ + /*else*/ { + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__22, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 307, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(0, 307, __pyx_L1_error) + } + __pyx_L3:; + + /* "_mask.pyx":308 + * else: + * raise Exception('input type is not supported.') + * return objs # <<<<<<<<<<<<<< + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r 
= __pyx_v_objs; + goto __pyx_L0; + + /* "_mask.pyx":288 + * return objs + * + * def frPyObjects(pyobj, h, w): # <<<<<<<<<<<<<< + * # encode rle from a list of python objects + * if type(pyobj) == np.ndarray: + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_10); + __Pyx_AddTraceback("_mask.frPyObjects", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":215 + * # experimental exception made for __getbuffer__ and __releasebuffer__ + * # -- the details of this may change. + * def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<< + * # This implementation of getbuffer is geared towards Cython + * # requirements, and does not yet fulfill the PEP. 
+ */ + +/* Python wrapper */ +static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/ +static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0); + __pyx_r = __pyx_pf_5numpy_7ndarray___getbuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { + int __pyx_v_i; + int __pyx_v_ndim; + int __pyx_v_endian_detector; + int __pyx_v_little_endian; + int __pyx_v_t; + char *__pyx_v_f; + PyArray_Descr *__pyx_v_descr = 0; + int __pyx_v_offset; + int __pyx_r; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + int __pyx_t_5; + int __pyx_t_6; + PyObject *__pyx_t_7 = NULL; + char *__pyx_t_8; + if (__pyx_v_info == NULL) { + PyErr_SetString(PyExc_BufferError, "PyObject_GetBuffer: view==NULL argument is obsolete"); + return -1; + } + __Pyx_RefNannySetupContext("__getbuffer__", 0); + __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(__pyx_v_info->obj); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":222 + * + * cdef int i, ndim + * cdef int endian_detector = 1 # <<<<<<<<<<<<<< + * cdef bint little_endian = ((&endian_detector)[0] != 0) + * + */ + __pyx_v_endian_detector = 1; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":223 + * cdef int i, ndim + * cdef int endian_detector = 1 + * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<< + * + * ndim = 
PyArray_NDIM(self) + */ + __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":225 + * cdef bint little_endian = ((&endian_detector)[0] != 0) + * + * ndim = PyArray_NDIM(self) # <<<<<<<<<<<<<< + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + */ + __pyx_v_ndim = PyArray_NDIM(__pyx_v_self); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":227 + * ndim = PyArray_NDIM(self) + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") + */ + __pyx_t_2 = (((__pyx_v_flags & PyBUF_C_CONTIGUOUS) == PyBUF_C_CONTIGUOUS) != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L4_bool_binop_done; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":228 + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): # <<<<<<<<<<<<<< + * raise ValueError(u"ndarray is not C contiguous") + * + */ + __pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_C_CONTIGUOUS) != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L4_bool_binop_done:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":227 + * ndim = PyArray_NDIM(self) + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") + */ + if (unlikely(__pyx_t_1)) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":229 + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + * and not 
PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") # <<<<<<<<<<<<<< + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__23, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 229, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(2, 229, __pyx_L1_error) + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":227 + * ndim = PyArray_NDIM(self) + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":231 + * raise ValueError(u"ndarray is not C contiguous") + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") + */ + __pyx_t_2 = (((__pyx_v_flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS) != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L7_bool_binop_done; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":232 + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): # <<<<<<<<<<<<<< + * raise ValueError(u"ndarray is not Fortran contiguous") + * + */ + __pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_F_CONTIGUOUS) != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L7_bool_binop_done:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":231 + * raise ValueError(u"ndarray is not C 
contiguous") + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") + */ + if (unlikely(__pyx_t_1)) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233 + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") # <<<<<<<<<<<<<< + * + * info.buf = PyArray_DATA(self) + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__24, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 233, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(2, 233, __pyx_L1_error) + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":231 + * raise ValueError(u"ndarray is not C contiguous") + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":235 + * raise ValueError(u"ndarray is not Fortran contiguous") + * + * info.buf = PyArray_DATA(self) # <<<<<<<<<<<<<< + * info.ndim = ndim + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + */ + __pyx_v_info->buf = PyArray_DATA(__pyx_v_self); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":236 + * + * info.buf = PyArray_DATA(self) + * info.ndim = ndim # <<<<<<<<<<<<<< + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + * # Allocate new buffer for strides and shape info. 
+ */ + __pyx_v_info->ndim = __pyx_v_ndim; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237 + * info.buf = PyArray_DATA(self) + * info.ndim = ndim + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * # Allocate new buffer for strides and shape info. + * # This is allocated as one block, strides first. + */ + __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0); + if (__pyx_t_1) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":240 + * # Allocate new buffer for strides and shape info. + * # This is allocated as one block, strides first. + * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) # <<<<<<<<<<<<<< + * info.shape = info.strides + ndim + * for i in range(ndim): + */ + __pyx_v_info->strides = ((Py_ssize_t *)PyObject_Malloc((((sizeof(Py_ssize_t)) * 2) * ((size_t)__pyx_v_ndim)))); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":241 + * # This is allocated as one block, strides first. 
+ * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) + * info.shape = info.strides + ndim # <<<<<<<<<<<<<< + * for i in range(ndim): + * info.strides[i] = PyArray_STRIDES(self)[i] + */ + __pyx_v_info->shape = (__pyx_v_info->strides + __pyx_v_ndim); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":242 + * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) + * info.shape = info.strides + ndim + * for i in range(ndim): # <<<<<<<<<<<<<< + * info.strides[i] = PyArray_STRIDES(self)[i] + * info.shape[i] = PyArray_DIMS(self)[i] + */ + __pyx_t_4 = __pyx_v_ndim; + __pyx_t_5 = __pyx_t_4; + for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) { + __pyx_v_i = __pyx_t_6; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":243 + * info.shape = info.strides + ndim + * for i in range(ndim): + * info.strides[i] = PyArray_STRIDES(self)[i] # <<<<<<<<<<<<<< + * info.shape[i] = PyArray_DIMS(self)[i] + * else: + */ + (__pyx_v_info->strides[__pyx_v_i]) = (PyArray_STRIDES(__pyx_v_self)[__pyx_v_i]); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":244 + * for i in range(ndim): + * info.strides[i] = PyArray_STRIDES(self)[i] + * info.shape[i] = PyArray_DIMS(self)[i] # <<<<<<<<<<<<<< + * else: + * info.strides = PyArray_STRIDES(self) + */ + (__pyx_v_info->shape[__pyx_v_i]) = (PyArray_DIMS(__pyx_v_self)[__pyx_v_i]); + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237 + * info.buf = PyArray_DATA(self) + * info.ndim = ndim + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * # Allocate new buffer for strides and shape info. + * # This is allocated as one block, strides first. 
+ */ + goto __pyx_L9; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":246 + * info.shape[i] = PyArray_DIMS(self)[i] + * else: + * info.strides = PyArray_STRIDES(self) # <<<<<<<<<<<<<< + * info.shape = PyArray_DIMS(self) + * info.suboffsets = NULL + */ + /*else*/ { + __pyx_v_info->strides = ((Py_ssize_t *)PyArray_STRIDES(__pyx_v_self)); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":247 + * else: + * info.strides = PyArray_STRIDES(self) + * info.shape = PyArray_DIMS(self) # <<<<<<<<<<<<<< + * info.suboffsets = NULL + * info.itemsize = PyArray_ITEMSIZE(self) + */ + __pyx_v_info->shape = ((Py_ssize_t *)PyArray_DIMS(__pyx_v_self)); + } + __pyx_L9:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":248 + * info.strides = PyArray_STRIDES(self) + * info.shape = PyArray_DIMS(self) + * info.suboffsets = NULL # <<<<<<<<<<<<<< + * info.itemsize = PyArray_ITEMSIZE(self) + * info.readonly = not PyArray_ISWRITEABLE(self) + */ + __pyx_v_info->suboffsets = NULL; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":249 + * info.shape = PyArray_DIMS(self) + * info.suboffsets = NULL + * info.itemsize = PyArray_ITEMSIZE(self) # <<<<<<<<<<<<<< + * info.readonly = not PyArray_ISWRITEABLE(self) + * + */ + __pyx_v_info->itemsize = PyArray_ITEMSIZE(__pyx_v_self); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":250 + * info.suboffsets = NULL + * info.itemsize = PyArray_ITEMSIZE(self) + * info.readonly = not PyArray_ISWRITEABLE(self) # <<<<<<<<<<<<<< + * + * cdef int t + */ + __pyx_v_info->readonly = (!(PyArray_ISWRITEABLE(__pyx_v_self) != 0)); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":253 + * + * cdef int t + * cdef char* f = NULL # 
<<<<<<<<<<<<<< + * cdef dtype descr = self.descr + * cdef int offset + */ + __pyx_v_f = NULL; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":254 + * cdef int t + * cdef char* f = NULL + * cdef dtype descr = self.descr # <<<<<<<<<<<<<< + * cdef int offset + * + */ + __pyx_t_3 = ((PyObject *)__pyx_v_self->descr); + __Pyx_INCREF(__pyx_t_3); + __pyx_v_descr = ((PyArray_Descr *)__pyx_t_3); + __pyx_t_3 = 0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":257 + * cdef int offset + * + * info.obj = self # <<<<<<<<<<<<<< + * + * if not PyDataType_HASFIELDS(descr): + */ + __Pyx_INCREF(((PyObject *)__pyx_v_self)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_self)); + __Pyx_GOTREF(__pyx_v_info->obj); + __Pyx_DECREF(__pyx_v_info->obj); + __pyx_v_info->obj = ((PyObject *)__pyx_v_self); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259 + * info.obj = self + * + * if not PyDataType_HASFIELDS(descr): # <<<<<<<<<<<<<< + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or + */ + __pyx_t_1 = ((!(PyDataType_HASFIELDS(__pyx_v_descr) != 0)) != 0); + if (__pyx_t_1) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":260 + * + * if not PyDataType_HASFIELDS(descr): + * t = descr.type_num # <<<<<<<<<<<<<< + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): + */ + __pyx_t_4 = __pyx_v_descr->type_num; + __pyx_v_t = __pyx_t_4; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":261 + * if not PyDataType_HASFIELDS(descr): + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + 
*/ + __pyx_t_2 = ((__pyx_v_descr->byteorder == '>') != 0); + if (!__pyx_t_2) { + goto __pyx_L15_next_or; + } else { + } + __pyx_t_2 = (__pyx_v_little_endian != 0); + if (!__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L14_bool_binop_done; + } + __pyx_L15_next_or:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":262 + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<< + * raise ValueError(u"Non-native byte order not supported") + * if t == NPY_BYTE: f = "b" + */ + __pyx_t_2 = ((__pyx_v_descr->byteorder == '<') != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L14_bool_binop_done; + } + __pyx_t_2 = ((!(__pyx_v_little_endian != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L14_bool_binop_done:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":261 + * if not PyDataType_HASFIELDS(descr): + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + if (unlikely(__pyx_t_1)) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":263 + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__25, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 263, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(2, 263, __pyx_L1_error) + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":261 + * if not PyDataType_HASFIELDS(descr): + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":264 + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + * if t == NPY_BYTE: f = "b" # <<<<<<<<<<<<<< + * elif t == NPY_UBYTE: f = "B" + * elif t == NPY_SHORT: f = "h" + */ + switch (__pyx_v_t) { + case NPY_BYTE: + __pyx_v_f = ((char *)"b"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":265 + * raise ValueError(u"Non-native byte order not supported") + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" # <<<<<<<<<<<<<< + * elif t == NPY_SHORT: f = "h" + * elif t == NPY_USHORT: f = "H" + */ + case NPY_UBYTE: + __pyx_v_f = ((char *)"B"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":266 + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" + * elif t == NPY_SHORT: f = "h" # <<<<<<<<<<<<<< + * elif t == NPY_USHORT: f = "H" + * elif t == NPY_INT: f = "i" + */ + case NPY_SHORT: + __pyx_v_f = ((char *)"h"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":267 + * elif t == NPY_UBYTE: f = "B" + * elif t == NPY_SHORT: f = "h" + * elif t == NPY_USHORT: f = "H" # <<<<<<<<<<<<<< + * elif t == NPY_INT: f = "i" + * elif t == NPY_UINT: f = "I" + */ + case NPY_USHORT: + __pyx_v_f = ((char *)"H"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":268 + * elif t == 
NPY_SHORT: f = "h" + * elif t == NPY_USHORT: f = "H" + * elif t == NPY_INT: f = "i" # <<<<<<<<<<<<<< + * elif t == NPY_UINT: f = "I" + * elif t == NPY_LONG: f = "l" + */ + case NPY_INT: + __pyx_v_f = ((char *)"i"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":269 + * elif t == NPY_USHORT: f = "H" + * elif t == NPY_INT: f = "i" + * elif t == NPY_UINT: f = "I" # <<<<<<<<<<<<<< + * elif t == NPY_LONG: f = "l" + * elif t == NPY_ULONG: f = "L" + */ + case NPY_UINT: + __pyx_v_f = ((char *)"I"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":270 + * elif t == NPY_INT: f = "i" + * elif t == NPY_UINT: f = "I" + * elif t == NPY_LONG: f = "l" # <<<<<<<<<<<<<< + * elif t == NPY_ULONG: f = "L" + * elif t == NPY_LONGLONG: f = "q" + */ + case NPY_LONG: + __pyx_v_f = ((char *)"l"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":271 + * elif t == NPY_UINT: f = "I" + * elif t == NPY_LONG: f = "l" + * elif t == NPY_ULONG: f = "L" # <<<<<<<<<<<<<< + * elif t == NPY_LONGLONG: f = "q" + * elif t == NPY_ULONGLONG: f = "Q" + */ + case NPY_ULONG: + __pyx_v_f = ((char *)"L"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":272 + * elif t == NPY_LONG: f = "l" + * elif t == NPY_ULONG: f = "L" + * elif t == NPY_LONGLONG: f = "q" # <<<<<<<<<<<<<< + * elif t == NPY_ULONGLONG: f = "Q" + * elif t == NPY_FLOAT: f = "f" + */ + case NPY_LONGLONG: + __pyx_v_f = ((char *)"q"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":273 + * elif t == NPY_ULONG: f = "L" + * elif t == NPY_LONGLONG: f = "q" + * elif t == NPY_ULONGLONG: f = "Q" # <<<<<<<<<<<<<< + * elif t == NPY_FLOAT: f = "f" + * elif t == NPY_DOUBLE: f = "d" + */ + case NPY_ULONGLONG: + __pyx_v_f = 
((char *)"Q"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274 + * elif t == NPY_LONGLONG: f = "q" + * elif t == NPY_ULONGLONG: f = "Q" + * elif t == NPY_FLOAT: f = "f" # <<<<<<<<<<<<<< + * elif t == NPY_DOUBLE: f = "d" + * elif t == NPY_LONGDOUBLE: f = "g" + */ + case NPY_FLOAT: + __pyx_v_f = ((char *)"f"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":275 + * elif t == NPY_ULONGLONG: f = "Q" + * elif t == NPY_FLOAT: f = "f" + * elif t == NPY_DOUBLE: f = "d" # <<<<<<<<<<<<<< + * elif t == NPY_LONGDOUBLE: f = "g" + * elif t == NPY_CFLOAT: f = "Zf" + */ + case NPY_DOUBLE: + __pyx_v_f = ((char *)"d"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":276 + * elif t == NPY_FLOAT: f = "f" + * elif t == NPY_DOUBLE: f = "d" + * elif t == NPY_LONGDOUBLE: f = "g" # <<<<<<<<<<<<<< + * elif t == NPY_CFLOAT: f = "Zf" + * elif t == NPY_CDOUBLE: f = "Zd" + */ + case NPY_LONGDOUBLE: + __pyx_v_f = ((char *)"g"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":277 + * elif t == NPY_DOUBLE: f = "d" + * elif t == NPY_LONGDOUBLE: f = "g" + * elif t == NPY_CFLOAT: f = "Zf" # <<<<<<<<<<<<<< + * elif t == NPY_CDOUBLE: f = "Zd" + * elif t == NPY_CLONGDOUBLE: f = "Zg" + */ + case NPY_CFLOAT: + __pyx_v_f = ((char *)"Zf"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":278 + * elif t == NPY_LONGDOUBLE: f = "g" + * elif t == NPY_CFLOAT: f = "Zf" + * elif t == NPY_CDOUBLE: f = "Zd" # <<<<<<<<<<<<<< + * elif t == NPY_CLONGDOUBLE: f = "Zg" + * elif t == NPY_OBJECT: f = "O" + */ + case NPY_CDOUBLE: + __pyx_v_f = ((char *)"Zd"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":279 + * 
elif t == NPY_CFLOAT: f = "Zf" + * elif t == NPY_CDOUBLE: f = "Zd" + * elif t == NPY_CLONGDOUBLE: f = "Zg" # <<<<<<<<<<<<<< + * elif t == NPY_OBJECT: f = "O" + * else: + */ + case NPY_CLONGDOUBLE: + __pyx_v_f = ((char *)"Zg"); + break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":280 + * elif t == NPY_CDOUBLE: f = "Zd" + * elif t == NPY_CLONGDOUBLE: f = "Zg" + * elif t == NPY_OBJECT: f = "O" # <<<<<<<<<<<<<< + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + */ + case NPY_OBJECT: + __pyx_v_f = ((char *)"O"); + break; + default: + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":282 + * elif t == NPY_OBJECT: f = "O" + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<< + * info.format = f + * return + */ + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_t); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_7 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_t_3); if (unlikely(!__pyx_t_7)) __PYX_ERR(2, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_7); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(2, 282, __pyx_L1_error) + break; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":283 + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + * info.format = f # <<<<<<<<<<<<<< + * return + * else: + */ + __pyx_v_info->format = __pyx_v_f; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":284 + * raise 
ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + * info.format = f + * return # <<<<<<<<<<<<<< + * else: + * info.format = PyObject_Malloc(_buffer_format_string_len) + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259 + * info.obj = self + * + * if not PyDataType_HASFIELDS(descr): # <<<<<<<<<<<<<< + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":286 + * return + * else: + * info.format = PyObject_Malloc(_buffer_format_string_len) # <<<<<<<<<<<<<< + * info.format[0] = c'^' # Native data types, manual alignment + * offset = 0 + */ + /*else*/ { + __pyx_v_info->format = ((char *)PyObject_Malloc(0xFF)); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":287 + * else: + * info.format = PyObject_Malloc(_buffer_format_string_len) + * info.format[0] = c'^' # Native data types, manual alignment # <<<<<<<<<<<<<< + * offset = 0 + * f = _util_dtypestring(descr, info.format + 1, + */ + (__pyx_v_info->format[0]) = '^'; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":288 + * info.format = PyObject_Malloc(_buffer_format_string_len) + * info.format[0] = c'^' # Native data types, manual alignment + * offset = 0 # <<<<<<<<<<<<<< + * f = _util_dtypestring(descr, info.format + 1, + * info.format + _buffer_format_string_len, + */ + __pyx_v_offset = 0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":289 + * info.format[0] = c'^' # Native data types, manual alignment + * offset = 0 + * f = _util_dtypestring(descr, info.format + 1, # <<<<<<<<<<<<<< + * info.format + _buffer_format_string_len, + * &offset) + */ + __pyx_t_8 = 
__pyx_f_5numpy__util_dtypestring(__pyx_v_descr, (__pyx_v_info->format + 1), (__pyx_v_info->format + 0xFF), (&__pyx_v_offset)); if (unlikely(__pyx_t_8 == ((char *)NULL))) __PYX_ERR(2, 289, __pyx_L1_error) + __pyx_v_f = __pyx_t_8; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":292 + * info.format + _buffer_format_string_len, + * &offset) + * f[0] = c'\0' # Terminate format string # <<<<<<<<<<<<<< + * + * def __releasebuffer__(ndarray self, Py_buffer* info): + */ + (__pyx_v_f[0]) = '\x00'; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":215 + * # experimental exception made for __getbuffer__ and __releasebuffer__ + * # -- the details of this may change. + * def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<< + * # This implementation of getbuffer is geared towards Cython + * # requirements, and does not yet fulfill the PEP. + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_AddTraceback("numpy.ndarray.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + if (__pyx_v_info->obj != NULL) { + __Pyx_GOTREF(__pyx_v_info->obj); + __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0; + } + goto __pyx_L2; + __pyx_L0:; + if (__pyx_v_info->obj == Py_None) { + __Pyx_GOTREF(__pyx_v_info->obj); + __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0; + } + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_descr); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":294 + * f[0] = c'\0' # Terminate format string + * + * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<< + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) + */ + +/* Python wrapper */ +static CYTHON_UNUSED void 
__pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info); /*proto*/ +static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__releasebuffer__ (wrapper)", 0); + __pyx_pf_5numpy_7ndarray_2__releasebuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info) { + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("__releasebuffer__", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":295 + * + * def __releasebuffer__(ndarray self, Py_buffer* info): + * if PyArray_HASFIELDS(self): # <<<<<<<<<<<<<< + * PyObject_Free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + */ + __pyx_t_1 = (PyArray_HASFIELDS(__pyx_v_self) != 0); + if (__pyx_t_1) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":296 + * def __releasebuffer__(ndarray self, Py_buffer* info): + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) # <<<<<<<<<<<<<< + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + * PyObject_Free(info.strides) + */ + PyObject_Free(__pyx_v_info->format); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":295 + * + * def __releasebuffer__(ndarray self, Py_buffer* info): + * if PyArray_HASFIELDS(self): # <<<<<<<<<<<<<< + * PyObject_Free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":297 + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # 
<<<<<<<<<<<<<< + * PyObject_Free(info.strides) + * # info.shape was stored after info.strides in the same block + */ + __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0); + if (__pyx_t_1) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":298 + * PyObject_Free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + * PyObject_Free(info.strides) # <<<<<<<<<<<<<< + * # info.shape was stored after info.strides in the same block + * + */ + PyObject_Free(__pyx_v_info->strides); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":297 + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * PyObject_Free(info.strides) + * # info.shape was stored after info.strides in the same block + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":294 + * f[0] = c'\0' # Terminate format string + * + * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<< + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":775 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":776 + * + * cdef inline object PyArray_MultiIterNew1(a): + * return PyArray_MultiIterNew(1, a) # <<<<<<<<<<<<<< + * + * 
cdef inline object PyArray_MultiIterNew2(a, b): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 776, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":775 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":778 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":779 + * + * cdef inline object PyArray_MultiIterNew2(a, b): + * return PyArray_MultiIterNew(2, a, b) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 779, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":778 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":781 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":782 + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + * return PyArray_MultiIterNew(3, a, b, c) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 782, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":781 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + + /* function exit code */ + 
__pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":784 + * return PyArray_MultiIterNew(3, a, b, c) + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":785 + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + * return PyArray_MultiIterNew(4, a, b, c, d) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 785, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":784 + * return PyArray_MultiIterNew(3, a, b, c) + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + 
return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":787 + * return PyArray_MultiIterNew(4, a, b, c, d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":788 + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + * return PyArray_MultiIterNew(5, a, b, c, d, e) # <<<<<<<<<<<<<< + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 788, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":787 + * return PyArray_MultiIterNew(4, a, b, c, d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":790 + * return PyArray_MultiIterNew(5, a, b, c, d, 
e) + * + * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__pyx_v_d) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("PyDataType_SHAPE", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791 + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< + * return d.subarray.shape + * else: + */ + __pyx_t_1 = (PyDataType_HASSUBARRAY(__pyx_v_d) != 0); + if (__pyx_t_1) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":792 + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape # <<<<<<<<<<<<<< + * else: + * return () + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject*)__pyx_v_d->subarray->shape)); + __pyx_r = ((PyObject*)__pyx_v_d->subarray->shape); + goto __pyx_L0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791 + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< + * return d.subarray.shape + * else: + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794 + * return d.subarray.shape + * else: + * return () # <<<<<<<<<<<<<< + * + * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: + */ + /*else*/ { + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_empty_tuple); + __pyx_r = __pyx_empty_tuple; + goto __pyx_L0; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":790 + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + * cdef inline tuple 
PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":796 + * return () + * + * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<< + * # Recursive utility function used in __getbuffer__ to get format + * # string. The new location in the format string is returned. + */ + +static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx_v_descr, char *__pyx_v_f, char *__pyx_v_end, int *__pyx_v_offset) { + PyArray_Descr *__pyx_v_child = 0; + int __pyx_v_endian_detector; + int __pyx_v_little_endian; + PyObject *__pyx_v_fields = 0; + PyObject *__pyx_v_childname = NULL; + PyObject *__pyx_v_new_offset = NULL; + PyObject *__pyx_v_t = NULL; + char *__pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + Py_ssize_t __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + int __pyx_t_5; + int __pyx_t_6; + int __pyx_t_7; + long __pyx_t_8; + char *__pyx_t_9; + __Pyx_RefNannySetupContext("_util_dtypestring", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":801 + * + * cdef dtype child + * cdef int endian_detector = 1 # <<<<<<<<<<<<<< + * cdef bint little_endian = ((&endian_detector)[0] != 0) + * cdef tuple fields + */ + __pyx_v_endian_detector = 1; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":802 + * cdef dtype child + * cdef int endian_detector = 1 + * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<< + * cdef tuple fields + * + */ + __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0); + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":805 + * cdef tuple fields + * + * for childname in descr.names: # <<<<<<<<<<<<<< + * fields = descr.fields[childname] + * child, new_offset = fields + */ + if (unlikely(__pyx_v_descr->names == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); + __PYX_ERR(2, 805, __pyx_L1_error) + } + __pyx_t_1 = __pyx_v_descr->names; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0; + for (;;) { + if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_3); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(2, 805, __pyx_L1_error) + #else + __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 805, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + #endif + __Pyx_XDECREF_SET(__pyx_v_childname, __pyx_t_3); + __pyx_t_3 = 0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":806 + * + * for childname in descr.names: + * fields = descr.fields[childname] # <<<<<<<<<<<<<< + * child, new_offset = fields + * + */ + if (unlikely(__pyx_v_descr->fields == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(2, 806, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_descr->fields, __pyx_v_childname); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 806, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + if (!(likely(PyTuple_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "tuple", Py_TYPE(__pyx_t_3)->tp_name), 0))) __PYX_ERR(2, 806, __pyx_L1_error) + __Pyx_XDECREF_SET(__pyx_v_fields, ((PyObject*)__pyx_t_3)); + __pyx_t_3 = 0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":807 + * for 
childname in descr.names: + * fields = descr.fields[childname] + * child, new_offset = fields # <<<<<<<<<<<<<< + * + * if (end - f) - (new_offset - offset[0]) < 15: + */ + if (likely(__pyx_v_fields != Py_None)) { + PyObject* sequence = __pyx_v_fields; + Py_ssize_t size = __Pyx_PySequence_SIZE(sequence); + if (unlikely(size != 2)) { + if (size > 2) __Pyx_RaiseTooManyValuesError(2); + else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); + __PYX_ERR(2, 807, __pyx_L1_error) + } + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0); + __pyx_t_4 = PyTuple_GET_ITEM(sequence, 1); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_4); + #else + __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 807, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 807, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + #endif + } else { + __Pyx_RaiseNoneNotIterableError(); __PYX_ERR(2, 807, __pyx_L1_error) + } + if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_dtype))))) __PYX_ERR(2, 807, __pyx_L1_error) + __Pyx_XDECREF_SET(__pyx_v_child, ((PyArray_Descr *)__pyx_t_3)); + __pyx_t_3 = 0; + __Pyx_XDECREF_SET(__pyx_v_new_offset, __pyx_t_4); + __pyx_t_4 = 0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":809 + * child, new_offset = fields + * + * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + */ + __pyx_t_4 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 809, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyNumber_Subtract(__pyx_v_new_offset, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 809, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + 
__pyx_t_5 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) __PYX_ERR(2, 809, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = ((((__pyx_v_end - __pyx_v_f) - ((int)__pyx_t_5)) < 15) != 0); + if (unlikely(__pyx_t_6)) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":810 + * + * if (end - f) - (new_offset - offset[0]) < 15: + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") # <<<<<<<<<<<<<< + * + * if ((child.byteorder == c'>' and little_endian) or + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__26, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 810, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(2, 810, __pyx_L1_error) + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":809 + * child, new_offset = fields + * + * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":812 + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + __pyx_t_7 = ((__pyx_v_child->byteorder == '>') != 0); + if (!__pyx_t_7) { + goto __pyx_L8_next_or; + } else { + } + __pyx_t_7 = (__pyx_v_little_endian != 0); + if (!__pyx_t_7) { + } else { + __pyx_t_6 = __pyx_t_7; + goto __pyx_L7_bool_binop_done; + } + __pyx_L8_next_or:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":813 + 
* + * if ((child.byteorder == c'>' and little_endian) or + * (child.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<< + * raise ValueError(u"Non-native byte order not supported") + * # One could encode it in the format string and have Cython + */ + __pyx_t_7 = ((__pyx_v_child->byteorder == '<') != 0); + if (__pyx_t_7) { + } else { + __pyx_t_6 = __pyx_t_7; + goto __pyx_L7_bool_binop_done; + } + __pyx_t_7 = ((!(__pyx_v_little_endian != 0)) != 0); + __pyx_t_6 = __pyx_t_7; + __pyx_L7_bool_binop_done:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":812 + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + if (unlikely(__pyx_t_6)) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":814 + * if ((child.byteorder == c'>' and little_endian) or + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * # One could encode it in the format string and have Cython + * # complain instead, BUT: < and > in format strings also imply + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__27, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 814, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(2, 814, __pyx_L1_error) + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":812 + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (child.byteorder == c'<' and not little_endian)): + * raise 
ValueError(u"Non-native byte order not supported") + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":824 + * + * # Output padding bytes + * while offset[0] < new_offset: # <<<<<<<<<<<<<< + * f[0] = 120 # "x"; pad byte + * f += 1 + */ + while (1) { + __pyx_t_3 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 824, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_t_3, __pyx_v_new_offset, Py_LT); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 824, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 824, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (!__pyx_t_6) break; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":825 + * # Output padding bytes + * while offset[0] < new_offset: + * f[0] = 120 # "x"; pad byte # <<<<<<<<<<<<<< + * f += 1 + * offset[0] += 1 + */ + (__pyx_v_f[0]) = 0x78; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":826 + * while offset[0] < new_offset: + * f[0] = 120 # "x"; pad byte + * f += 1 # <<<<<<<<<<<<<< + * offset[0] += 1 + * + */ + __pyx_v_f = (__pyx_v_f + 1); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":827 + * f[0] = 120 # "x"; pad byte + * f += 1 + * offset[0] += 1 # <<<<<<<<<<<<<< + * + * offset[0] += child.itemsize + */ + __pyx_t_8 = 0; + (__pyx_v_offset[__pyx_t_8]) = ((__pyx_v_offset[__pyx_t_8]) + 1); + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":829 + * offset[0] += 1 + * + * offset[0] += child.itemsize # <<<<<<<<<<<<<< + * + * if not PyDataType_HASFIELDS(child): + */ + __pyx_t_8 = 0; + (__pyx_v_offset[__pyx_t_8]) = 
((__pyx_v_offset[__pyx_t_8]) + __pyx_v_child->elsize); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":831 + * offset[0] += child.itemsize + * + * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<< + * t = child.type_num + * if end - f < 5: + */ + __pyx_t_6 = ((!(PyDataType_HASFIELDS(__pyx_v_child) != 0)) != 0); + if (__pyx_t_6) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":832 + * + * if not PyDataType_HASFIELDS(child): + * t = child.type_num # <<<<<<<<<<<<<< + * if end - f < 5: + * raise RuntimeError(u"Format string allocated too short.") + */ + __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_child->type_num); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 832, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_XDECREF_SET(__pyx_v_t, __pyx_t_4); + __pyx_t_4 = 0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":833 + * if not PyDataType_HASFIELDS(child): + * t = child.type_num + * if end - f < 5: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short.") + * + */ + __pyx_t_6 = (((__pyx_v_end - __pyx_v_f) < 5) != 0); + if (unlikely(__pyx_t_6)) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":834 + * t = child.type_num + * if end - f < 5: + * raise RuntimeError(u"Format string allocated too short.") # <<<<<<<<<<<<<< + * + * # Until ticket #99 is fixed, use integers to avoid warnings + */ + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__28, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 834, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_Raise(__pyx_t_4, 0, 0, 0); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __PYX_ERR(2, 834, __pyx_L1_error) + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":833 + * if not PyDataType_HASFIELDS(child): + 
* t = child.type_num + * if end - f < 5: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short.") + * + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":837 + * + * # Until ticket #99 is fixed, use integers to avoid warnings + * if t == NPY_BYTE: f[0] = 98 #"b" # <<<<<<<<<<<<<< + * elif t == NPY_UBYTE: f[0] = 66 #"B" + * elif t == NPY_SHORT: f[0] = 104 #"h" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_BYTE); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 837, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 837, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 837, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 98; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":838 + * # Until ticket #99 is fixed, use integers to avoid warnings + * if t == NPY_BYTE: f[0] = 98 #"b" + * elif t == NPY_UBYTE: f[0] = 66 #"B" # <<<<<<<<<<<<<< + * elif t == NPY_SHORT: f[0] = 104 #"h" + * elif t == NPY_USHORT: f[0] = 72 #"H" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_UBYTE); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 838, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 838, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 838, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 66; + goto __pyx_L15; + } + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":839 + * if t == NPY_BYTE: f[0] = 98 #"b" + * elif t == NPY_UBYTE: f[0] = 66 #"B" + * elif t == NPY_SHORT: f[0] = 104 #"h" # <<<<<<<<<<<<<< + * elif t == NPY_USHORT: f[0] = 72 #"H" + * elif t == NPY_INT: f[0] = 105 #"i" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_SHORT); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 839, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 839, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 839, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x68; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":840 + * elif t == NPY_UBYTE: f[0] = 66 #"B" + * elif t == NPY_SHORT: f[0] = 104 #"h" + * elif t == NPY_USHORT: f[0] = 72 #"H" # <<<<<<<<<<<<<< + * elif t == NPY_INT: f[0] = 105 #"i" + * elif t == NPY_UINT: f[0] = 73 #"I" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_USHORT); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 840, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 840, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 840, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 72; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":841 + * elif t == NPY_SHORT: f[0] = 104 #"h" + * elif t == NPY_USHORT: f[0] = 72 #"H" + * elif t == NPY_INT: f[0] = 
105 #"i" # <<<<<<<<<<<<<< + * elif t == NPY_UINT: f[0] = 73 #"I" + * elif t == NPY_LONG: f[0] = 108 #"l" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_INT); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 841, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 841, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 841, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x69; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":842 + * elif t == NPY_USHORT: f[0] = 72 #"H" + * elif t == NPY_INT: f[0] = 105 #"i" + * elif t == NPY_UINT: f[0] = 73 #"I" # <<<<<<<<<<<<<< + * elif t == NPY_LONG: f[0] = 108 #"l" + * elif t == NPY_ULONG: f[0] = 76 #"L" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_UINT); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 842, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 842, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 842, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 73; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":843 + * elif t == NPY_INT: f[0] = 105 #"i" + * elif t == NPY_UINT: f[0] = 73 #"I" + * elif t == NPY_LONG: f[0] = 108 #"l" # <<<<<<<<<<<<<< + * elif t == NPY_ULONG: f[0] = 76 #"L" + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONG); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 843, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 843, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 843, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x6C; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":844 + * elif t == NPY_UINT: f[0] = 73 #"I" + * elif t == NPY_LONG: f[0] = 108 #"l" + * elif t == NPY_ULONG: f[0] = 76 #"L" # <<<<<<<<<<<<<< + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_ULONG); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 844, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 844, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 844, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 76; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":845 + * elif t == NPY_LONG: f[0] = 108 #"l" + * elif t == NPY_ULONG: f[0] = 76 #"L" + * elif t == NPY_LONGLONG: f[0] = 113 #"q" # <<<<<<<<<<<<<< + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + * elif t == NPY_FLOAT: f[0] = 102 #"f" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONGLONG); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 845, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 845, __pyx_L1_error) + 
__Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 845, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x71; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":846 + * elif t == NPY_ULONG: f[0] = 76 #"L" + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" # <<<<<<<<<<<<<< + * elif t == NPY_FLOAT: f[0] = 102 #"f" + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_ULONGLONG); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 846, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 846, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 846, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 81; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":847 + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + * elif t == NPY_FLOAT: f[0] = 102 #"f" # <<<<<<<<<<<<<< + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_FLOAT); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 847, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 847, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 847, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); 
__pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x66; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":848 + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + * elif t == NPY_FLOAT: f[0] = 102 #"f" + * elif t == NPY_DOUBLE: f[0] = 100 #"d" # <<<<<<<<<<<<<< + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_DOUBLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 848, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 848, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 848, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x64; + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":849 + * elif t == NPY_FLOAT: f[0] = 102 #"f" + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" # <<<<<<<<<<<<<< + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONGDOUBLE); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 849, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 849, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 849, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x67; + goto __pyx_L15; + } + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":850 + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf # <<<<<<<<<<<<<< + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CFLOAT); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 850, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 850, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 850, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 90; + (__pyx_v_f[1]) = 0x66; + __pyx_v_f = (__pyx_v_f + 1); + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":851 + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd # <<<<<<<<<<<<<< + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + * elif t == NPY_OBJECT: f[0] = 79 #"O" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CDOUBLE); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 851, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 851, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 851, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 90; + (__pyx_v_f[1]) = 0x64; + __pyx_v_f 
= (__pyx_v_f + 1); + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":852 + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg # <<<<<<<<<<<<<< + * elif t == NPY_OBJECT: f[0] = 79 #"O" + * else: + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CLONGDOUBLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 852, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 852, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 852, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 90; + (__pyx_v_f[1]) = 0x67; + __pyx_v_f = (__pyx_v_f + 1); + goto __pyx_L15; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":853 + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + * elif t == NPY_OBJECT: f[0] = 79 #"O" # <<<<<<<<<<<<<< + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_OBJECT); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 853, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 853, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(2, 853, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (likely(__pyx_t_6)) { + (__pyx_v_f[0]) = 79; + goto __pyx_L15; + } + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":855 + * elif t == NPY_OBJECT: f[0] = 79 #"O" + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<< + * f += 1 + * else: + */ + /*else*/ { + __pyx_t_3 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_v_t); if (unlikely(!__pyx_t_3)) __PYX_ERR(2, 855, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(2, 855, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_Raise(__pyx_t_4, 0, 0, 0); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __PYX_ERR(2, 855, __pyx_L1_error) + } + __pyx_L15:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":856 + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + * f += 1 # <<<<<<<<<<<<<< + * else: + * # Cython ignores struct boundary information ("T{...}"), + */ + __pyx_v_f = (__pyx_v_f + 1); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":831 + * offset[0] += child.itemsize + * + * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<< + * t = child.type_num + * if end - f < 5: + */ + goto __pyx_L13; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":860 + * # Cython ignores struct boundary information ("T{...}"), + * # so don't output it + * f = _util_dtypestring(child, f, end, offset) # <<<<<<<<<<<<<< + * return f + * + */ + /*else*/ { + __pyx_t_9 = __pyx_f_5numpy__util_dtypestring(__pyx_v_child, __pyx_v_f, __pyx_v_end, __pyx_v_offset); if (unlikely(__pyx_t_9 == ((char *)NULL))) __PYX_ERR(2, 860, __pyx_L1_error) + __pyx_v_f = __pyx_t_9; + } + __pyx_L13:; + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":805 + * cdef tuple fields + * + * for childname in descr.names: # <<<<<<<<<<<<<< + * fields = descr.fields[childname] + * child, new_offset = fields + */ + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":861 + * # so don't output it + * f = _util_dtypestring(child, f, end, offset) + * return f # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = __pyx_v_f; + goto __pyx_L0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":796 + * return () + * + * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<< + * # Recursive utility function used in __getbuffer__ to get format + * # string. The new location in the format string is returned. + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("numpy._util_dtypestring", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_child); + __Pyx_XDECREF(__pyx_v_fields); + __Pyx_XDECREF(__pyx_v_childname); + __Pyx_XDECREF(__pyx_v_new_offset); + __Pyx_XDECREF(__pyx_v_t); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":977 + * + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * cdef PyObject* baseptr + * if base is None: + */ + +static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) { + PyObject *__pyx_v_baseptr; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + __Pyx_RefNannySetupContext("set_array_base", 0); + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":979 + * cdef inline void set_array_base(ndarray arr, object base): + * cdef PyObject* baseptr + * if base is None: # <<<<<<<<<<<<<< + * baseptr = NULL + * else: + */ + __pyx_t_1 = (__pyx_v_base == Py_None); + __pyx_t_2 = (__pyx_t_1 != 0); + if (__pyx_t_2) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":980 + * cdef PyObject* baseptr + * if base is None: + * baseptr = NULL # <<<<<<<<<<<<<< + * else: + * Py_INCREF(base) # important to do this before decref below! + */ + __pyx_v_baseptr = NULL; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":979 + * cdef inline void set_array_base(ndarray arr, object base): + * cdef PyObject* baseptr + * if base is None: # <<<<<<<<<<<<<< + * baseptr = NULL + * else: + */ + goto __pyx_L3; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":982 + * baseptr = NULL + * else: + * Py_INCREF(base) # important to do this before decref below! # <<<<<<<<<<<<<< + * baseptr = base + * Py_XDECREF(arr.base) + */ + /*else*/ { + Py_INCREF(__pyx_v_base); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":983 + * else: + * Py_INCREF(base) # important to do this before decref below! + * baseptr = base # <<<<<<<<<<<<<< + * Py_XDECREF(arr.base) + * arr.base = baseptr + */ + __pyx_v_baseptr = ((PyObject *)__pyx_v_base); + } + __pyx_L3:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":984 + * Py_INCREF(base) # important to do this before decref below! 
+ * baseptr = base + * Py_XDECREF(arr.base) # <<<<<<<<<<<<<< + * arr.base = baseptr + * + */ + Py_XDECREF(__pyx_v_arr->base); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":985 + * baseptr = base + * Py_XDECREF(arr.base) + * arr.base = baseptr # <<<<<<<<<<<<<< + * + * cdef inline object get_array_base(ndarray arr): + */ + __pyx_v_arr->base = __pyx_v_baseptr; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":977 + * + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * cdef PyObject* baseptr + * if base is None: + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":987 + * arr.base = baseptr + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * if arr.base is NULL: + * return None + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("get_array_base", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":988 + * + * cdef inline object get_array_base(ndarray arr): + * if arr.base is NULL: # <<<<<<<<<<<<<< + * return None + * else: + */ + __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0); + if (__pyx_t_1) { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":989 + * cdef inline object get_array_base(ndarray arr): + * if arr.base is NULL: + * return None # <<<<<<<<<<<<<< + * else: + * return arr.base + */ + __Pyx_XDECREF(__pyx_r); + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":988 + * + * cdef inline 
object get_array_base(ndarray arr): + * if arr.base is NULL: # <<<<<<<<<<<<<< + * return None + * else: + */ + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":991 + * return None + * else: + * return arr.base # <<<<<<<<<<<<<< + * + * + */ + /*else*/ { + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject *)__pyx_v_arr->base)); + __pyx_r = ((PyObject *)__pyx_v_arr->base); + goto __pyx_L0; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":987 + * arr.base = baseptr + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * if arr.base is NULL: + * return None + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":996 + * # Versions of the import_* functions which are more suitable for + * # Cython code. + * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * _import_array() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("import_array", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":997 + * # Cython code. 
+ * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * _import_array() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":998 + * cdef inline int import_array() except -1: + * try: + * _import_array() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") + */ + __pyx_t_4 = _import_array(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 998, __pyx_L3_error) + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":997 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * _import_array() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":999 + * try: + * _import_array() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.multiarray failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(2, 999, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1000 + * _import_array() + * except Exception: + * raise 
ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__29, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(2, 1000, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(2, 1000, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":997 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * _import_array() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":996 + * # Versions of the import_* functions which are more suitable for + * # Cython code. 
+ * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * _import_array() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1002 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("import_umath", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1003 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1004 + * cdef inline int import_umath() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 1004, __pyx_L3_error) + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1003 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1005 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(2, 1005, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1006 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_ufunc() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__30, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(2, 1006, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(2, 1006, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1003 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + 
__Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1002 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1008 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("import_ufunc", 0); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1009 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1010 + 
* cdef inline int import_ufunc() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(2, 1010, __pyx_L3_error) + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1009 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1011 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(2, 1011, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1012 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__31, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(2, 1012, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(2, 1012, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1009 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1008 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_tp_new_5_mask_RLEs(PyTypeObject *t, PyObject *a, PyObject *k) { + PyObject *o; + if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { + o = (*t->tp_alloc)(t, 0); + } else { + o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); + } + if (unlikely(!o)) return 0; + if (unlikely(__pyx_pw_5_mask_4RLEs_1__cinit__(o, a, k) < 0)) goto bad; + return o; + bad: + Py_DECREF(o); o = 0; + return NULL; +} + +static void __pyx_tp_dealloc_5_mask_RLEs(PyObject *o) { + #if CYTHON_USE_TP_FINALIZE + if (unlikely(PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE) && Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { + if (PyObject_CallFinalizerFromDealloc(o)) return; + } + #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_5_mask_4RLEs_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } + 
(*Py_TYPE(o)->tp_free)(o); +} + +static PyObject *__pyx_tp_getattro_5_mask_RLEs(PyObject *o, PyObject *n) { + PyObject *v = __Pyx_PyObject_GenericGetAttr(o, n); + if (!v && PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + v = __pyx_pw_5_mask_4RLEs_5__getattr__(o, n); + } + return v; +} + +static PyMethodDef __pyx_methods_5_mask_RLEs[] = { + {"__getattr__", (PyCFunction)__pyx_pw_5_mask_4RLEs_5__getattr__, METH_O|METH_COEXIST, 0}, + {"__reduce_cython__", (PyCFunction)__pyx_pw_5_mask_4RLEs_7__reduce_cython__, METH_NOARGS, 0}, + {"__setstate_cython__", (PyCFunction)__pyx_pw_5_mask_4RLEs_9__setstate_cython__, METH_O, 0}, + {0, 0, 0, 0} +}; + +static PyTypeObject __pyx_type_5_mask_RLEs = { + PyVarObject_HEAD_INIT(0, 0) + "_mask.RLEs", /*tp_name*/ + sizeof(struct __pyx_obj_5_mask_RLEs), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_5_mask_RLEs, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, /*tp_as_async*/ + #endif + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + __pyx_tp_getattro_5_mask_RLEs, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + 0, /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + __pyx_methods_5_mask_RLEs, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + __pyx_tp_new_5_mask_RLEs, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + 
#if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif +}; + +static PyObject *__pyx_tp_new_5_mask_Masks(PyTypeObject *t, PyObject *a, PyObject *k) { + PyObject *o; + if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { + o = (*t->tp_alloc)(t, 0); + } else { + o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); + } + if (unlikely(!o)) return 0; + if (unlikely(__pyx_pw_5_mask_5Masks_1__cinit__(o, a, k) < 0)) goto bad; + return o; + bad: + Py_DECREF(o); o = 0; + return NULL; +} + +static void __pyx_tp_dealloc_5_mask_Masks(PyObject *o) { + #if CYTHON_USE_TP_FINALIZE + if (unlikely(PyType_HasFeature(Py_TYPE(o), Py_TPFLAGS_HAVE_FINALIZE) && Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { + if (PyObject_CallFinalizerFromDealloc(o)) return; + } + #endif + (*Py_TYPE(o)->tp_free)(o); +} + +static PyMethodDef __pyx_methods_5_mask_Masks[] = { + {"__array__", (PyCFunction)__pyx_pw_5_mask_5Masks_3__array__, METH_NOARGS, 0}, + {"__reduce_cython__", (PyCFunction)__pyx_pw_5_mask_5Masks_5__reduce_cython__, METH_NOARGS, 0}, + {"__setstate_cython__", (PyCFunction)__pyx_pw_5_mask_5Masks_7__setstate_cython__, METH_O, 0}, + {0, 0, 0, 0} +}; + +static PyTypeObject __pyx_type_5_mask_Masks = { + PyVarObject_HEAD_INIT(0, 0) + "_mask.Masks", /*tp_name*/ + sizeof(struct __pyx_obj_5_mask_Masks), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_5_mask_Masks, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, /*tp_as_async*/ + #endif + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + 0, /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 
0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + __pyx_methods_5_mask_Masks, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + __pyx_tp_new_5_mask_Masks, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + #if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif +}; + +static PyMethodDef __pyx_methods[] = { + {0, 0, 0, 0} +}; + +#if PY_MAJOR_VERSION >= 3 +#if CYTHON_PEP489_MULTI_PHASE_INIT +static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); /*proto*/ +static int __pyx_pymod_exec__mask(PyObject* module); /*proto*/ +static PyModuleDef_Slot __pyx_moduledef_slots[] = { + {Py_mod_create, (void*)__pyx_pymod_create}, + {Py_mod_exec, (void*)__pyx_pymod_exec__mask}, + {0, NULL} +}; +#endif + +static struct PyModuleDef __pyx_moduledef = { + PyModuleDef_HEAD_INIT, + "_mask", + 0, /* m_doc */ + #if CYTHON_PEP489_MULTI_PHASE_INIT + 0, /* m_size */ + #else + -1, /* m_size */ + #endif + __pyx_methods /* m_methods */, + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_moduledef_slots, /* m_slots */ + #else + NULL, /* m_reload */ + #endif + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_n_s_AttributeError, __pyx_k_AttributeError, sizeof(__pyx_k_AttributeError), 0, 0, 1, 1}, + {&__pyx_n_s_F, __pyx_k_F, sizeof(__pyx_k_F), 0, 0, 1, 1}, + {&__pyx_kp_u_Format_string_allocated_too_shor, __pyx_k_Format_string_allocated_too_shor, sizeof(__pyx_k_Format_string_allocated_too_shor), 0, 1, 0, 0}, + {&__pyx_kp_u_Format_string_allocated_too_shor_2, __pyx_k_Format_string_allocated_too_shor_2, sizeof(__pyx_k_Format_string_allocated_too_shor_2), 0, 1, 0, 0}, + 
{&__pyx_n_s_ImportError, __pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1}, + {&__pyx_n_s_N, __pyx_k_N, sizeof(__pyx_k_N), 0, 0, 1, 1}, + {&__pyx_kp_u_Non_native_byte_order_not_suppor, __pyx_k_Non_native_byte_order_not_suppor, sizeof(__pyx_k_Non_native_byte_order_not_suppor), 0, 1, 0, 0}, + {&__pyx_n_s_PYTHON_VERSION, __pyx_k_PYTHON_VERSION, sizeof(__pyx_k_PYTHON_VERSION), 0, 0, 1, 1}, + {&__pyx_kp_s_Python_version_must_be_2_or_3, __pyx_k_Python_version_must_be_2_or_3, sizeof(__pyx_k_Python_version_must_be_2_or_3), 0, 0, 1, 0}, + {&__pyx_n_s_R, __pyx_k_R, sizeof(__pyx_k_R), 0, 0, 1, 1}, + {&__pyx_n_s_Rs, __pyx_k_Rs, sizeof(__pyx_k_Rs), 0, 0, 1, 1}, + {&__pyx_n_s_RuntimeError, __pyx_k_RuntimeError, sizeof(__pyx_k_RuntimeError), 0, 0, 1, 1}, + {&__pyx_kp_s_The_dt_and_gt_should_have_the_sa, __pyx_k_The_dt_and_gt_should_have_the_sa, sizeof(__pyx_k_The_dt_and_gt_should_have_the_sa), 0, 0, 1, 0}, + {&__pyx_n_s_TypeError, __pyx_k_TypeError, sizeof(__pyx_k_TypeError), 0, 0, 1, 1}, + {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1}, + {&__pyx_n_s_a, __pyx_k_a, sizeof(__pyx_k_a), 0, 0, 1, 1}, + {&__pyx_n_s_a_2, __pyx_k_a_2, sizeof(__pyx_k_a_2), 0, 0, 1, 1}, + {&__pyx_n_s_all, __pyx_k_all, sizeof(__pyx_k_all), 0, 0, 1, 1}, + {&__pyx_n_s_area, __pyx_k_area, sizeof(__pyx_k_area), 0, 0, 1, 1}, + {&__pyx_n_s_array, __pyx_k_array, sizeof(__pyx_k_array), 0, 0, 1, 1}, + {&__pyx_n_s_astype, __pyx_k_astype, sizeof(__pyx_k_astype), 0, 0, 1, 1}, + {&__pyx_n_s_author, __pyx_k_author, sizeof(__pyx_k_author), 0, 0, 1, 1}, + {&__pyx_n_s_bb, __pyx_k_bb, sizeof(__pyx_k_bb), 0, 0, 1, 1}, + {&__pyx_n_s_bbIou, __pyx_k_bbIou, sizeof(__pyx_k_bbIou), 0, 0, 1, 1}, + {&__pyx_n_s_bb_2, __pyx_k_bb_2, sizeof(__pyx_k_bb_2), 0, 0, 1, 1}, + {&__pyx_n_s_c_string, __pyx_k_c_string, sizeof(__pyx_k_c_string), 0, 0, 1, 1}, + {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1}, + {&__pyx_n_s_cnts, __pyx_k_cnts, 
sizeof(__pyx_k_cnts), 0, 0, 1, 1}, + {&__pyx_n_s_counts, __pyx_k_counts, sizeof(__pyx_k_counts), 0, 0, 1, 1}, + {&__pyx_n_s_data, __pyx_k_data, sizeof(__pyx_k_data), 0, 0, 1, 1}, + {&__pyx_n_s_decode, __pyx_k_decode, sizeof(__pyx_k_decode), 0, 0, 1, 1}, + {&__pyx_n_s_double, __pyx_k_double, sizeof(__pyx_k_double), 0, 0, 1, 1}, + {&__pyx_n_s_dt, __pyx_k_dt, sizeof(__pyx_k_dt), 0, 0, 1, 1}, + {&__pyx_n_s_dtype, __pyx_k_dtype, sizeof(__pyx_k_dtype), 0, 0, 1, 1}, + {&__pyx_n_s_encode, __pyx_k_encode, sizeof(__pyx_k_encode), 0, 0, 1, 1}, + {&__pyx_n_s_enumerate, __pyx_k_enumerate, sizeof(__pyx_k_enumerate), 0, 0, 1, 1}, + {&__pyx_n_s_frBbox, __pyx_k_frBbox, sizeof(__pyx_k_frBbox), 0, 0, 1, 1}, + {&__pyx_n_s_frPoly, __pyx_k_frPoly, sizeof(__pyx_k_frPoly), 0, 0, 1, 1}, + {&__pyx_n_s_frPyObjects, __pyx_k_frPyObjects, sizeof(__pyx_k_frPyObjects), 0, 0, 1, 1}, + {&__pyx_n_s_frString, __pyx_k_frString, sizeof(__pyx_k_frString), 0, 0, 1, 1}, + {&__pyx_n_s_frUncompressedRLE, __pyx_k_frUncompressedRLE, sizeof(__pyx_k_frUncompressedRLE), 0, 0, 1, 1}, + {&__pyx_n_s_getstate, __pyx_k_getstate, sizeof(__pyx_k_getstate), 0, 0, 1, 1}, + {&__pyx_n_s_gt, __pyx_k_gt, sizeof(__pyx_k_gt), 0, 0, 1, 1}, + {&__pyx_n_s_h, __pyx_k_h, sizeof(__pyx_k_h), 0, 0, 1, 1}, + {&__pyx_n_s_i, __pyx_k_i, sizeof(__pyx_k_i), 0, 0, 1, 1}, + {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, + {&__pyx_kp_s_input_data_type_not_allowed, __pyx_k_input_data_type_not_allowed, sizeof(__pyx_k_input_data_type_not_allowed), 0, 0, 1, 0}, + {&__pyx_kp_s_input_type_is_not_supported, __pyx_k_input_type_is_not_supported, sizeof(__pyx_k_input_type_is_not_supported), 0, 0, 1, 0}, + {&__pyx_n_s_intersect, __pyx_k_intersect, sizeof(__pyx_k_intersect), 0, 0, 1, 1}, + {&__pyx_n_s_iou, __pyx_k_iou, sizeof(__pyx_k_iou), 0, 0, 1, 1}, + {&__pyx_n_s_iouFun, __pyx_k_iouFun, sizeof(__pyx_k_iouFun), 0, 0, 1, 1}, + {&__pyx_n_s_iou_2, __pyx_k_iou_2, sizeof(__pyx_k_iou_2), 0, 0, 1, 1}, + {&__pyx_n_s_iou_locals__bbIou, 
__pyx_k_iou_locals__bbIou, sizeof(__pyx_k_iou_locals__bbIou), 0, 0, 1, 1}, + {&__pyx_n_s_iou_locals__len, __pyx_k_iou_locals__len, sizeof(__pyx_k_iou_locals__len), 0, 0, 1, 1}, + {&__pyx_n_s_iou_locals__preproc, __pyx_k_iou_locals__preproc, sizeof(__pyx_k_iou_locals__preproc), 0, 0, 1, 1}, + {&__pyx_n_s_iou_locals__rleIou, __pyx_k_iou_locals__rleIou, sizeof(__pyx_k_iou_locals__rleIou), 0, 0, 1, 1}, + {&__pyx_n_s_isbox, __pyx_k_isbox, sizeof(__pyx_k_isbox), 0, 0, 1, 1}, + {&__pyx_n_s_iscrowd, __pyx_k_iscrowd, sizeof(__pyx_k_iscrowd), 0, 0, 1, 1}, + {&__pyx_n_s_isrle, __pyx_k_isrle, sizeof(__pyx_k_isrle), 0, 0, 1, 1}, + {&__pyx_n_s_j, __pyx_k_j, sizeof(__pyx_k_j), 0, 0, 1, 1}, + {&__pyx_n_s_len, __pyx_k_len, sizeof(__pyx_k_len), 0, 0, 1, 1}, + {&__pyx_kp_s_list_input_can_be_bounding_box_N, __pyx_k_list_input_can_be_bounding_box_N, sizeof(__pyx_k_list_input_can_be_bounding_box_N), 0, 0, 1, 0}, + {&__pyx_n_s_m, __pyx_k_m, sizeof(__pyx_k_m), 0, 0, 1, 1}, + {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_s_mask, __pyx_k_mask, sizeof(__pyx_k_mask), 0, 0, 1, 1}, + {&__pyx_n_s_mask_2, __pyx_k_mask_2, sizeof(__pyx_k_mask_2), 0, 0, 1, 1}, + {&__pyx_kp_s_mask_pyx, __pyx_k_mask_pyx, sizeof(__pyx_k_mask_pyx), 0, 0, 1, 0}, + {&__pyx_n_s_masks, __pyx_k_masks, sizeof(__pyx_k_masks), 0, 0, 1, 1}, + {&__pyx_n_s_merge, __pyx_k_merge, sizeof(__pyx_k_merge), 0, 0, 1, 1}, + {&__pyx_n_s_n, __pyx_k_n, sizeof(__pyx_k_n), 0, 0, 1, 1}, + {&__pyx_n_s_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 1, 1}, + {&__pyx_kp_u_ndarray_is_not_C_contiguous, __pyx_k_ndarray_is_not_C_contiguous, sizeof(__pyx_k_ndarray_is_not_C_contiguous), 0, 1, 0, 0}, + {&__pyx_kp_u_ndarray_is_not_Fortran_contiguou, __pyx_k_ndarray_is_not_Fortran_contiguou, sizeof(__pyx_k_ndarray_is_not_Fortran_contiguou), 0, 1, 0, 0}, + {&__pyx_kp_s_no_default___reduce___due_to_non, __pyx_k_no_default___reduce___due_to_non, sizeof(__pyx_k_no_default___reduce___due_to_non), 0, 0, 1, 0}, + {&__pyx_n_s_np, 
__pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1}, + {&__pyx_n_s_np_poly, __pyx_k_np_poly, sizeof(__pyx_k_np_poly), 0, 0, 1, 1}, + {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1}, + {&__pyx_kp_s_numpy_core_multiarray_failed_to, __pyx_k_numpy_core_multiarray_failed_to, sizeof(__pyx_k_numpy_core_multiarray_failed_to), 0, 0, 1, 0}, + {&__pyx_kp_s_numpy_core_umath_failed_to_impor, __pyx_k_numpy_core_umath_failed_to_impor, sizeof(__pyx_k_numpy_core_umath_failed_to_impor), 0, 0, 1, 0}, + {&__pyx_kp_s_numpy_ndarray_input_is_only_for, __pyx_k_numpy_ndarray_input_is_only_for, sizeof(__pyx_k_numpy_ndarray_input_is_only_for), 0, 0, 1, 0}, + {&__pyx_n_s_obj, __pyx_k_obj, sizeof(__pyx_k_obj), 0, 0, 1, 1}, + {&__pyx_n_s_objs, __pyx_k_objs, sizeof(__pyx_k_objs), 0, 0, 1, 1}, + {&__pyx_n_s_order, __pyx_k_order, sizeof(__pyx_k_order), 0, 0, 1, 1}, + {&__pyx_n_s_p, __pyx_k_p, sizeof(__pyx_k_p), 0, 0, 1, 1}, + {&__pyx_n_s_poly, __pyx_k_poly, sizeof(__pyx_k_poly), 0, 0, 1, 1}, + {&__pyx_n_s_preproc, __pyx_k_preproc, sizeof(__pyx_k_preproc), 0, 0, 1, 1}, + {&__pyx_n_s_py_string, __pyx_k_py_string, sizeof(__pyx_k_py_string), 0, 0, 1, 1}, + {&__pyx_n_s_pyiscrowd, __pyx_k_pyiscrowd, sizeof(__pyx_k_pyiscrowd), 0, 0, 1, 1}, + {&__pyx_n_s_pyobj, __pyx_k_pyobj, sizeof(__pyx_k_pyobj), 0, 0, 1, 1}, + {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1}, + {&__pyx_n_s_reduce, __pyx_k_reduce, sizeof(__pyx_k_reduce), 0, 0, 1, 1}, + {&__pyx_n_s_reduce_cython, __pyx_k_reduce_cython, sizeof(__pyx_k_reduce_cython), 0, 0, 1, 1}, + {&__pyx_n_s_reduce_ex, __pyx_k_reduce_ex, sizeof(__pyx_k_reduce_ex), 0, 0, 1, 1}, + {&__pyx_n_s_reshape, __pyx_k_reshape, sizeof(__pyx_k_reshape), 0, 0, 1, 1}, + {&__pyx_n_s_rleIou, __pyx_k_rleIou, sizeof(__pyx_k_rleIou), 0, 0, 1, 1}, + {&__pyx_n_s_rleObjs, __pyx_k_rleObjs, sizeof(__pyx_k_rleObjs), 0, 0, 1, 1}, + {&__pyx_n_s_setstate, __pyx_k_setstate, sizeof(__pyx_k_setstate), 0, 0, 1, 1}, + {&__pyx_n_s_setstate_cython, __pyx_k_setstate_cython, 
sizeof(__pyx_k_setstate_cython), 0, 0, 1, 1}, + {&__pyx_n_s_shape, __pyx_k_shape, sizeof(__pyx_k_shape), 0, 0, 1, 1}, + {&__pyx_n_s_size, __pyx_k_size, sizeof(__pyx_k_size), 0, 0, 1, 1}, + {&__pyx_n_s_sys, __pyx_k_sys, sizeof(__pyx_k_sys), 0, 0, 1, 1}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {&__pyx_n_s_toBbox, __pyx_k_toBbox, sizeof(__pyx_k_toBbox), 0, 0, 1, 1}, + {&__pyx_n_s_toString, __pyx_k_toString, sizeof(__pyx_k_toString), 0, 0, 1, 1}, + {&__pyx_n_s_tsungyi, __pyx_k_tsungyi, sizeof(__pyx_k_tsungyi), 0, 0, 1, 1}, + {&__pyx_n_s_ucRles, __pyx_k_ucRles, sizeof(__pyx_k_ucRles), 0, 0, 1, 1}, + {&__pyx_n_s_uint32, __pyx_k_uint32, sizeof(__pyx_k_uint32), 0, 0, 1, 1}, + {&__pyx_n_s_uint8, __pyx_k_uint8, sizeof(__pyx_k_uint8), 0, 0, 1, 1}, + {&__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_k_unknown_dtype_code_in_numpy_pxd, sizeof(__pyx_k_unknown_dtype_code_in_numpy_pxd), 0, 1, 0, 0}, + {&__pyx_kp_s_unrecognized_type_The_following, __pyx_k_unrecognized_type_The_following, sizeof(__pyx_k_unrecognized_type_The_following), 0, 0, 1, 0}, + {&__pyx_n_s_utf8, __pyx_k_utf8, sizeof(__pyx_k_utf8), 0, 0, 1, 1}, + {&__pyx_n_s_version_info, __pyx_k_version_info, sizeof(__pyx_k_version_info), 0, 0, 1, 1}, + {&__pyx_n_s_w, __pyx_k_w, sizeof(__pyx_k_w), 0, 0, 1, 1}, + {&__pyx_n_s_zeros, __pyx_k_zeros, sizeof(__pyx_k_zeros), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static int __Pyx_InitCachedBuiltins(void) { + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 67, __pyx_L1_error) + __pyx_builtin_AttributeError = __Pyx_GetBuiltinName(__pyx_n_s_AttributeError); if (!__pyx_builtin_AttributeError) __PYX_ERR(0, 73, __pyx_L1_error) + __pyx_builtin_TypeError = __Pyx_GetBuiltinName(__pyx_n_s_TypeError); if (!__pyx_builtin_TypeError) __PYX_ERR(1, 2, __pyx_L1_error) + __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 124, __pyx_L1_error) + 
__pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(2, 229, __pyx_L1_error) + __pyx_builtin_RuntimeError = __Pyx_GetBuiltinName(__pyx_n_s_RuntimeError); if (!__pyx_builtin_RuntimeError) __PYX_ERR(2, 810, __pyx_L1_error) + __pyx_builtin_ImportError = __Pyx_GetBuiltinName(__pyx_n_s_ImportError); if (!__pyx_builtin_ImportError) __PYX_ERR(2, 1000, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +static int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + __pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_s_no_default___reduce___due_to_non); if (unlikely(!__pyx_tuple_)) __PYX_ERR(1, 2, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple_); + __Pyx_GIVEREF(__pyx_tuple_); + + /* "(tree fragment)":4 + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + */ + __pyx_tuple__2 = PyTuple_Pack(1, __pyx_kp_s_no_default___reduce___due_to_non); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(1, 4, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__2); + __Pyx_GIVEREF(__pyx_tuple__2); + + /* "(tree fragment)":2 + * def __reduce_cython__(self): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + */ + __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s_no_default___reduce___due_to_non); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(1, 2, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__3); + 
__Pyx_GIVEREF(__pyx_tuple__3); + + /* "(tree fragment)":4 + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") + * def __setstate_cython__(self, __pyx_state): + * raise TypeError("no default __reduce__ due to non-trivial __cinit__") # <<<<<<<<<<<<<< + */ + __pyx_tuple__4 = PyTuple_Pack(1, __pyx_kp_s_no_default___reduce___due_to_non); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(1, 4, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__4); + __Pyx_GIVEREF(__pyx_tuple__4); + + /* "_mask.pyx":126 + * for i, obj in enumerate(rleObjs): + * if PYTHON_VERSION == 2: + * py_string = str(obj['counts']).encode('utf8') # <<<<<<<<<<<<<< + * elif PYTHON_VERSION == 3: + * py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] + */ + __pyx_tuple__5 = PyTuple_Pack(1, __pyx_n_s_utf8); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 126, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__5); + __Pyx_GIVEREF(__pyx_tuple__5); + + /* "_mask.pyx":130 + * py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] + * else: + * raise Exception('Python version must be 2 or 3') # <<<<<<<<<<<<<< + * c_string = py_string + * rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) + */ + __pyx_tuple__6 = PyTuple_Pack(1, __pyx_kp_s_Python_version_must_be_2_or_3); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 130, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__6); + __Pyx_GIVEREF(__pyx_tuple__6); + + /* "_mask.pyx":154 + * def merge(rleObjs, intersect=0): + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) # <<<<<<<<<<<<<< + * rleMerge(Rs._R, R._R, Rs._n, intersect) + * obj = _toString(R)[0] + */ + __pyx_tuple__7 = PyTuple_Pack(1, __pyx_int_1); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(0, 154, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__7); + __Pyx_GIVEREF(__pyx_tuple__7); + + /* "_mask.pyx":180 + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: + * raise Exception('numpy ndarray input 
is only for *bounding boxes* and should have Nx4 dimension') # <<<<<<<<<<<<<< + * objs = objs.astype(np.double) + * elif type(objs) == list: + */ + __pyx_tuple__8 = PyTuple_Pack(1, __pyx_kp_s_numpy_ndarray_input_is_only_for); if (unlikely(!__pyx_tuple__8)) __PYX_ERR(0, 180, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__8); + __Pyx_GIVEREF(__pyx_tuple__8); + + /* "_mask.pyx":193 + * objs = _frString(objs) + * else: + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') # <<<<<<<<<<<<<< + * else: + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + */ + __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_s_list_input_can_be_bounding_box_N); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(0, 193, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__9); + __Pyx_GIVEREF(__pyx_tuple__9); + + /* "_mask.pyx":195 + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') + * else: + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') # <<<<<<<<<<<<<< + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + __pyx_tuple__10 = PyTuple_Pack(1, __pyx_kp_s_unrecognized_type_The_following); if (unlikely(!__pyx_tuple__10)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__10); + __Pyx_GIVEREF(__pyx_tuple__10); + + /* "_mask.pyx":172 + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 
+ * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): # <<<<<<<<<<<<<< + * if len(objs) == 0: + * return objs + */ + __pyx_tuple__11 = PyTuple_Pack(4, __pyx_n_s_objs, __pyx_n_s_isbox, __pyx_n_s_isrle, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(0, 172, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__11); + __Pyx_GIVEREF(__pyx_tuple__11); + __pyx_codeobj__12 = (PyObject*)__Pyx_PyCode_New(1, 0, 4, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__11, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_preproc, 172, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__12)) __PYX_ERR(0, 172, __pyx_L1_error) + + /* "_mask.pyx":197 + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + __pyx_tuple__13 = PyTuple_Pack(6, __pyx_n_s_dt, __pyx_n_s_gt, __pyx_n_s_iscrowd, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_iou); if (unlikely(!__pyx_tuple__13)) __PYX_ERR(0, 197, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__13); + __Pyx_GIVEREF(__pyx_tuple__13); + __pyx_codeobj__14 = (PyObject*)__Pyx_PyCode_New(6, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__13, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_rleIou, 197, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__14)) __PYX_ERR(0, 197, __pyx_L1_error) + + /* "_mask.pyx":199 + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, 
gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + */ + __pyx_tuple__15 = PyTuple_Pack(6, __pyx_n_s_dt, __pyx_n_s_gt, __pyx_n_s_iscrowd, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_iou); if (unlikely(!__pyx_tuple__15)) __PYX_ERR(0, 199, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__15); + __Pyx_GIVEREF(__pyx_tuple__15); + __pyx_codeobj__16 = (PyObject*)__Pyx_PyCode_New(6, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__15, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_bbIou, 199, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__16)) __PYX_ERR(0, 199, __pyx_L1_error) + + /* "_mask.pyx":201 + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): # <<<<<<<<<<<<<< + * cdef siz N = 0 + * if type(obj) == RLEs: + */ + __pyx_tuple__17 = PyTuple_Pack(2, __pyx_n_s_obj, __pyx_n_s_N); if (unlikely(!__pyx_tuple__17)) __PYX_ERR(0, 201, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__17); + __Pyx_GIVEREF(__pyx_tuple__17); + __pyx_codeobj__18 = (PyObject*)__Pyx_PyCode_New(1, 0, 2, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__17, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_len, 201, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__18)) __PYX_ERR(0, 201, __pyx_L1_error) + + /* "_mask.pyx":221 + * return [] + * if not type(dt) == type(gt): + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') # <<<<<<<<<<<<<< + * + * # define local 
variables + */ + __pyx_tuple__19 = PyTuple_Pack(1, __pyx_kp_s_The_dt_and_gt_should_have_the_sa); if (unlikely(!__pyx_tuple__19)) __PYX_ERR(0, 221, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__19); + __Pyx_GIVEREF(__pyx_tuple__19); + + /* "_mask.pyx":232 + * _iouFun = _bbIou + * else: + * raise Exception('input data type not allowed.') # <<<<<<<<<<<<<< + * _iou = malloc(m*n* sizeof(double)) + * iou = np.zeros((m*n, ), dtype=np.double) + */ + __pyx_tuple__20 = PyTuple_Pack(1, __pyx_kp_s_input_data_type_not_allowed); if (unlikely(!__pyx_tuple__20)) __PYX_ERR(0, 232, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__20); + __Pyx_GIVEREF(__pyx_tuple__20); + + /* "_mask.pyx":277 + * objs = [] + * for i in range(n): + * Rs = RLEs(1) # <<<<<<<<<<<<<< + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + * # time for malloc can be saved here but it's fine + */ + __pyx_tuple__21 = PyTuple_Pack(1, __pyx_int_1); if (unlikely(!__pyx_tuple__21)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__21); + __Pyx_GIVEREF(__pyx_tuple__21); + + /* "_mask.pyx":307 + * objs = frUncompressedRLE([pyobj], h, w)[0] + * else: + * raise Exception('input type is not supported.') # <<<<<<<<<<<<<< + * return objs + */ + __pyx_tuple__22 = PyTuple_Pack(1, __pyx_kp_s_input_type_is_not_supported); if (unlikely(!__pyx_tuple__22)) __PYX_ERR(0, 307, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__22); + __Pyx_GIVEREF(__pyx_tuple__22); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":229 + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") # <<<<<<<<<<<<<< + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + */ + __pyx_tuple__23 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_C_contiguous); if (unlikely(!__pyx_tuple__23)) __PYX_ERR(2, 229, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__23); + 
__Pyx_GIVEREF(__pyx_tuple__23); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233 + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") # <<<<<<<<<<<<<< + * + * info.buf = PyArray_DATA(self) + */ + __pyx_tuple__24 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_Fortran_contiguou); if (unlikely(!__pyx_tuple__24)) __PYX_ERR(2, 233, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__24); + __Pyx_GIVEREF(__pyx_tuple__24); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":263 + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" + */ + __pyx_tuple__25 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__25)) __PYX_ERR(2, 263, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__25); + __Pyx_GIVEREF(__pyx_tuple__25); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":810 + * + * if (end - f) - (new_offset - offset[0]) < 15: + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") # <<<<<<<<<<<<<< + * + * if ((child.byteorder == c'>' and little_endian) or + */ + __pyx_tuple__26 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor); if (unlikely(!__pyx_tuple__26)) __PYX_ERR(2, 810, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__26); + __Pyx_GIVEREF(__pyx_tuple__26); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":814 + * if ((child.byteorder == c'>' and little_endian) or + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte 
order not supported") # <<<<<<<<<<<<<< + * # One could encode it in the format string and have Cython + * # complain instead, BUT: < and > in format strings also imply + */ + __pyx_tuple__27 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__27)) __PYX_ERR(2, 814, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__27); + __Pyx_GIVEREF(__pyx_tuple__27); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":834 + * t = child.type_num + * if end - f < 5: + * raise RuntimeError(u"Format string allocated too short.") # <<<<<<<<<<<<<< + * + * # Until ticket #99 is fixed, use integers to avoid warnings + */ + __pyx_tuple__28 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor_2); if (unlikely(!__pyx_tuple__28)) __PYX_ERR(2, 834, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__28); + __Pyx_GIVEREF(__pyx_tuple__28); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1000 + * _import_array() + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_tuple__29 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_multiarray_failed_to); if (unlikely(!__pyx_tuple__29)) __PYX_ERR(2, 1000, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__29); + __Pyx_GIVEREF(__pyx_tuple__29); + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1006 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_ufunc() except -1: + */ + __pyx_tuple__30 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__30)) __PYX_ERR(2, 1006, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__30); + __Pyx_GIVEREF(__pyx_tuple__30); + + /* 
"../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1012 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + */ + __pyx_tuple__31 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__31)) __PYX_ERR(2, 1012, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__31); + __Pyx_GIVEREF(__pyx_tuple__31); + + /* "_mask.pyx":103 + * + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef bytes py_string + */ + __pyx_tuple__32 = PyTuple_Pack(6, __pyx_n_s_Rs, __pyx_n_s_n, __pyx_n_s_py_string, __pyx_n_s_c_string, __pyx_n_s_objs, __pyx_n_s_i); if (unlikely(!__pyx_tuple__32)) __PYX_ERR(0, 103, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__32); + __Pyx_GIVEREF(__pyx_tuple__32); + __pyx_codeobj__33 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__32, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_toString, 103, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__33)) __PYX_ERR(0, 103, __pyx_L1_error) + + /* "_mask.pyx":119 + * + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): # <<<<<<<<<<<<<< + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) + */ + __pyx_tuple__34 = PyTuple_Pack(7, __pyx_n_s_rleObjs, __pyx_n_s_n, __pyx_n_s_Rs, __pyx_n_s_py_string, __pyx_n_s_c_string, __pyx_n_s_i, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__34)) __PYX_ERR(0, 119, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__34); + __Pyx_GIVEREF(__pyx_tuple__34); + __pyx_codeobj__35 = (PyObject*)__Pyx_PyCode_New(1, 0, 7, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__34, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_frString, 119, __pyx_empty_bytes); if 
(unlikely(!__pyx_codeobj__35)) __PYX_ERR(0, 119, __pyx_L1_error) + + /* "_mask.pyx":137 + * # encode mask to RLEs objects + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): # <<<<<<<<<<<<<< + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + */ + __pyx_tuple__36 = PyTuple_Pack(6, __pyx_n_s_mask_2, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_n, __pyx_n_s_Rs, __pyx_n_s_objs); if (unlikely(!__pyx_tuple__36)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__36); + __Pyx_GIVEREF(__pyx_tuple__36); + __pyx_codeobj__37 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__36, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_encode, 137, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__37)) __PYX_ERR(0, 137, __pyx_L1_error) + + /* "_mask.pyx":145 + * + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + */ + __pyx_tuple__38 = PyTuple_Pack(6, __pyx_n_s_rleObjs, __pyx_n_s_Rs, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_n, __pyx_n_s_masks); if (unlikely(!__pyx_tuple__38)) __PYX_ERR(0, 145, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__38); + __Pyx_GIVEREF(__pyx_tuple__38); + __pyx_codeobj__39 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__38, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_decode, 145, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__39)) __PYX_ERR(0, 145, __pyx_L1_error) + + /* "_mask.pyx":152 + * return np.array(masks) + * + * def merge(rleObjs, intersect=0): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + */ + __pyx_tuple__40 = PyTuple_Pack(5, 
__pyx_n_s_rleObjs, __pyx_n_s_intersect, __pyx_n_s_Rs, __pyx_n_s_R, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__40)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__40); + __Pyx_GIVEREF(__pyx_tuple__40); + __pyx_codeobj__41 = (PyObject*)__Pyx_PyCode_New(2, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__40, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_merge, 152, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__41)) __PYX_ERR(0, 152, __pyx_L1_error) + + /* "_mask.pyx":159 + * return obj + * + * def area(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + */ + __pyx_tuple__42 = PyTuple_Pack(5, __pyx_n_s_rleObjs, __pyx_n_s_Rs, __pyx_n_s_a, __pyx_n_s_shape, __pyx_n_s_a_2); if (unlikely(!__pyx_tuple__42)) __PYX_ERR(0, 159, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__42); + __Pyx_GIVEREF(__pyx_tuple__42); + __pyx_codeobj__43 = (PyObject*)__Pyx_PyCode_New(1, 0, 5, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__42, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_area, 159, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__43)) __PYX_ERR(0, 159, __pyx_L1_error) + + /* "_mask.pyx":171 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 
+ * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + __pyx_tuple__44 = PyTuple_Pack(18, __pyx_n_s_dt, __pyx_n_s_gt, __pyx_n_s_pyiscrowd, __pyx_n_s_preproc, __pyx_n_s_preproc, __pyx_n_s_rleIou, __pyx_n_s_rleIou, __pyx_n_s_bbIou, __pyx_n_s_bbIou, __pyx_n_s_len, __pyx_n_s_len, __pyx_n_s_iscrowd, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_iou, __pyx_n_s_shape, __pyx_n_s_iouFun, __pyx_n_s_iou_2); if (unlikely(!__pyx_tuple__44)) __PYX_ERR(0, 171, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__44); + __Pyx_GIVEREF(__pyx_tuple__44); + __pyx_codeobj__45 = (PyObject*)__Pyx_PyCode_New(3, 0, 18, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__44, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_iou_2, 171, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__45)) __PYX_ERR(0, 171, __pyx_L1_error) + + /* "_mask.pyx":241 + * return iou.reshape((m,n), order='F') + * + * def toBbox( rleObjs ): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + */ + __pyx_tuple__46 = PyTuple_Pack(6, __pyx_n_s_rleObjs, __pyx_n_s_Rs, __pyx_n_s_n, __pyx_n_s_bb_2, __pyx_n_s_shape, __pyx_n_s_bb); if (unlikely(!__pyx_tuple__46)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__46); + __Pyx_GIVEREF(__pyx_tuple__46); + __pyx_codeobj__47 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__46, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_toBbox, 241, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__47)) __PYX_ERR(0, 241, __pyx_L1_error) + + /* "_mask.pyx":253 + * return bb + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + */ + __pyx_tuple__48 = PyTuple_Pack(6, __pyx_n_s_bb, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_n, __pyx_n_s_Rs, __pyx_n_s_objs); if 
(unlikely(!__pyx_tuple__48)) __PYX_ERR(0, 253, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__48); + __Pyx_GIVEREF(__pyx_tuple__48); + __pyx_codeobj__49 = (PyObject*)__Pyx_PyCode_New(3, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__48, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_frBbox, 253, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__49)) __PYX_ERR(0, 253, __pyx_L1_error) + + /* "_mask.pyx":260 + * return objs + * + * def frPoly( poly, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + */ + __pyx_tuple__50 = PyTuple_Pack(9, __pyx_n_s_poly, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_np_poly, __pyx_n_s_n, __pyx_n_s_Rs, __pyx_n_s_i, __pyx_n_s_p, __pyx_n_s_objs); if (unlikely(!__pyx_tuple__50)) __PYX_ERR(0, 260, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__50); + __Pyx_GIVEREF(__pyx_tuple__50); + __pyx_codeobj__51 = (PyObject*)__Pyx_PyCode_New(3, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__50, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_frPoly, 260, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__51)) __PYX_ERR(0, 260, __pyx_L1_error) + + /* "_mask.pyx":270 + * return objs + * + * def frUncompressedRLE(ucRles, siz h, siz w): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.uint32_t, ndim=1] cnts + * cdef RLE R + */ + __pyx_tuple__52 = PyTuple_Pack(11, __pyx_n_s_ucRles, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_cnts, __pyx_n_s_R, __pyx_n_s_data, __pyx_n_s_n, __pyx_n_s_objs, __pyx_n_s_i, __pyx_n_s_Rs, __pyx_n_s_j); if (unlikely(!__pyx_tuple__52)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__52); + __Pyx_GIVEREF(__pyx_tuple__52); + __pyx_codeobj__53 = (PyObject*)__Pyx_PyCode_New(3, 0, 11, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__52, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, 
__pyx_n_s_frUncompressedRLE, 270, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__53)) __PYX_ERR(0, 270, __pyx_L1_error) + + /* "_mask.pyx":288 + * return objs + * + * def frPyObjects(pyobj, h, w): # <<<<<<<<<<<<<< + * # encode rle from a list of python objects + * if type(pyobj) == np.ndarray: + */ + __pyx_tuple__54 = PyTuple_Pack(4, __pyx_n_s_pyobj, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_objs); if (unlikely(!__pyx_tuple__54)) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__54); + __Pyx_GIVEREF(__pyx_tuple__54); + __pyx_codeobj__55 = (PyObject*)__Pyx_PyCode_New(3, 0, 4, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__54, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_mask_pyx, __pyx_n_s_frPyObjects, 288, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__55)) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_int_1 = PyInt_FromLong(1); if (unlikely(!__pyx_int_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_int_2 = PyInt_FromLong(2); if (unlikely(!__pyx_int_2)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_int_3 = PyInt_FromLong(3); if (unlikely(!__pyx_int_3)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_int_4 = PyInt_FromLong(4); if (unlikely(!__pyx_int_4)) __PYX_ERR(0, 1, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +static int __Pyx_modinit_global_init_code(void); /*proto*/ +static int __Pyx_modinit_variable_export_code(void); /*proto*/ +static int __Pyx_modinit_function_export_code(void); /*proto*/ +static int __Pyx_modinit_type_init_code(void); /*proto*/ +static int __Pyx_modinit_type_import_code(void); /*proto*/ +static int __Pyx_modinit_variable_import_code(void); /*proto*/ 
+static int __Pyx_modinit_function_import_code(void); /*proto*/ + +static int __Pyx_modinit_global_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0); + /*--- Global init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_variable_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0); + /*--- Variable export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0); + /*--- Function export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0); + /*--- Type init code ---*/ + if (PyType_Ready(&__pyx_type_5_mask_RLEs) < 0) __PYX_ERR(0, 56, __pyx_L1_error) + __pyx_type_5_mask_RLEs.tp_print = 0; + if (PyObject_SetAttrString(__pyx_m, "RLEs", (PyObject *)&__pyx_type_5_mask_RLEs) < 0) __PYX_ERR(0, 56, __pyx_L1_error) + if (__Pyx_setup_reduce((PyObject*)&__pyx_type_5_mask_RLEs) < 0) __PYX_ERR(0, 56, __pyx_L1_error) + __pyx_ptype_5_mask_RLEs = &__pyx_type_5_mask_RLEs; + if (PyType_Ready(&__pyx_type_5_mask_Masks) < 0) __PYX_ERR(0, 77, __pyx_L1_error) + __pyx_type_5_mask_Masks.tp_print = 0; + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_5_mask_Masks.tp_dictoffset && __pyx_type_5_mask_Masks.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_5_mask_Masks.tp_getattro = __Pyx_PyObject_GenericGetAttr; + } + if (PyObject_SetAttrString(__pyx_m, "Masks", (PyObject *)&__pyx_type_5_mask_Masks) < 0) __PYX_ERR(0, 77, __pyx_L1_error) + if (__Pyx_setup_reduce((PyObject*)&__pyx_type_5_mask_Masks) < 0) __PYX_ERR(0, 77, __pyx_L1_error) + __pyx_ptype_5_mask_Masks = &__pyx_type_5_mask_Masks; + 
__Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_modinit_type_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0); + /*--- Type import code ---*/ + __pyx_ptype_7cpython_4type_type = __Pyx_ImportType(__Pyx_BUILTIN_MODULE_NAME, "type", + #if defined(PYPY_VERSION_NUM) && PYPY_VERSION_NUM < 0x050B0000 + sizeof(PyTypeObject), + #else + sizeof(PyHeapTypeObject), + #endif + 0); if (unlikely(!__pyx_ptype_7cpython_4type_type)) __PYX_ERR(3, 9, __pyx_L1_error) + __pyx_ptype_5numpy_dtype = __Pyx_ImportType("numpy", "dtype", sizeof(PyArray_Descr), 0); if (unlikely(!__pyx_ptype_5numpy_dtype)) __PYX_ERR(2, 164, __pyx_L1_error) + __pyx_ptype_5numpy_flatiter = __Pyx_ImportType("numpy", "flatiter", sizeof(PyArrayIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_flatiter)) __PYX_ERR(2, 186, __pyx_L1_error) + __pyx_ptype_5numpy_broadcast = __Pyx_ImportType("numpy", "broadcast", sizeof(PyArrayMultiIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_broadcast)) __PYX_ERR(2, 190, __pyx_L1_error) + __pyx_ptype_5numpy_ndarray = __Pyx_ImportType("numpy", "ndarray", sizeof(PyArrayObject), 0); if (unlikely(!__pyx_ptype_5numpy_ndarray)) __PYX_ERR(2, 199, __pyx_L1_error) + __pyx_ptype_5numpy_ufunc = __Pyx_ImportType("numpy", "ufunc", sizeof(PyUFuncObject), 0); if (unlikely(!__pyx_ptype_5numpy_ufunc)) __PYX_ERR(2, 872, __pyx_L1_error) + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_modinit_variable_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0); + /*--- Variable import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0); + /*--- Function 
import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + + +#if PY_MAJOR_VERSION < 3 +#ifdef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC void +#else +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#endif +#else +#ifdef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC PyObject * +#else +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#endif +#endif +#ifndef CYTHON_SMALL_CODE +#if defined(__clang__) + #define CYTHON_SMALL_CODE +#elif defined(__GNUC__) && (!(defined(__cplusplus)) || (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4))) + #define CYTHON_SMALL_CODE __attribute__((cold)) +#else + #define CYTHON_SMALL_CODE +#endif +#endif + + +#if PY_MAJOR_VERSION < 3 +__Pyx_PyMODINIT_FUNC init_mask(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC init_mask(void) +#else +__Pyx_PyMODINIT_FUNC PyInit__mask(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC PyInit__mask(void) +#if CYTHON_PEP489_MULTI_PHASE_INIT +{ + return PyModuleDef_Init(&__pyx_moduledef); +} +static int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name) { + PyObject *value = PyObject_GetAttrString(spec, from_name); + int result = 0; + if (likely(value)) { + result = PyDict_SetItemString(moddict, to_name, value); + Py_DECREF(value); + } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + } else { + result = -1; + } + return result; +} +static PyObject* __pyx_pymod_create(PyObject *spec, CYTHON_UNUSED PyModuleDef *def) { + PyObject *module = NULL, *moddict, *modname; + if (__pyx_m) + return __Pyx_NewRef(__pyx_m); + modname = PyObject_GetAttrString(spec, "name"); + if (unlikely(!modname)) goto bad; + module = PyModule_NewObject(modname); + Py_DECREF(modname); + if (unlikely(!module)) goto bad; + moddict = PyModule_GetDict(module); + if (unlikely(!moddict)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "loader", "__loader__") < 0)) goto bad; + if 
(unlikely(__Pyx_copy_spec_to_module(spec, moddict, "origin", "__file__") < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "parent", "__package__") < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "submodule_search_locations", "__path__") < 0)) goto bad; + return module; +bad: + Py_XDECREF(module); + return NULL; +} + + +static int __pyx_pymod_exec__mask(PyObject *__pyx_pyinit_module) +#endif +#endif +{ + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + int __pyx_t_3; + __Pyx_RefNannyDeclarations + #if CYTHON_PEP489_MULTI_PHASE_INIT + if (__pyx_m && __pyx_m == __pyx_pyinit_module) return 0; + #elif PY_MAJOR_VERSION >= 3 + if (__pyx_m) return __Pyx_NewRef(__pyx_m); + #endif + #if CYTHON_REFNANNY +__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); +if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); +} +#endif + __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit__mask(void)", 0); + if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pyx_CyFunction_USED + if (__pyx_CyFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Coroutine_USED + if (__pyx_Coroutine_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_AsyncGen_USED + if 
(__pyx_AsyncGen_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_StopAsyncIteration_USED + if (__pyx_StopAsyncIteration_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + #ifdef WITH_THREAD /* Python build with threading support? */ + PyEval_InitThreads(); + #endif + #endif + /*--- Module creation code ---*/ + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_m = __pyx_pyinit_module; + Py_INCREF(__pyx_m); + #else + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4("_mask", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_cython_runtime = PyImport_AddModule((char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error) + #if CYTHON_COMPILING_IN_PYPY + Py_INCREF(__pyx_b); + #endif + if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + /*--- Initialize various global constants etc. 
---*/ + if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + if (__pyx_module_is_main__mask) { + if (PyObject_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error) + if (!PyDict_GetItemString(modules, "_mask")) { + if (unlikely(PyDict_SetItemString(modules, "_mask", __pyx_m) < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + } + } + #endif + /*--- Builtin init code ---*/ + if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Constants init code ---*/ + if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Global type/function init code ---*/ + (void)__Pyx_modinit_global_init_code(); + (void)__Pyx_modinit_variable_export_code(); + (void)__Pyx_modinit_function_export_code(); + if (unlikely(__Pyx_modinit_type_init_code() != 0)) goto __pyx_L1_error; + if (unlikely(__Pyx_modinit_type_import_code() != 0)) goto __pyx_L1_error; + (void)__Pyx_modinit_variable_import_code(); + (void)__Pyx_modinit_function_import_code(); + /*--- Execution code ---*/ + #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) + if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + + /* "_mask.pyx":11 + * #************************************************************************** + * + * __author__ = 'tsungyi' # <<<<<<<<<<<<<< + * + * import sys + */ + if (PyDict_SetItem(__pyx_d, __pyx_n_s_author, __pyx_n_s_tsungyi) < 0) __PYX_ERR(0, 11, __pyx_L1_error) + + /* "_mask.pyx":13 + * __author__ = 'tsungyi' + * + * import sys # <<<<<<<<<<<<<< + * PYTHON_VERSION = sys.version_info[0] + * + */ + __pyx_t_1 = __Pyx_Import(__pyx_n_s_sys, 0, -1); if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 13, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_sys, __pyx_t_1) < 0) __PYX_ERR(0, 13, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":14 + * + * import sys + * PYTHON_VERSION = sys.version_info[0] # <<<<<<<<<<<<<< + * + * # import both Python-level and C-level symbols of Numpy + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_sys); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 14, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_version_info); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 14, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_GetItemInt(__pyx_t_2, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 14, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (PyDict_SetItem(__pyx_d, __pyx_n_s_PYTHON_VERSION, __pyx_t_1) < 0) __PYX_ERR(0, 14, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":18 + * # import both Python-level and C-level symbols of Numpy + * # the API uses Numpy to interface C and Python + * import numpy as np # <<<<<<<<<<<<<< + * cimport numpy as np + * from libc.stdlib cimport malloc, free + */ + __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 18, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":23 + * + * # intialized Numpy. must do. 
+ * np.import_array() # <<<<<<<<<<<<<< + * + * # import numpy C function + */ + __pyx_t_3 = __pyx_f_5numpy_import_array(); if (unlikely(__pyx_t_3 == ((int)-1))) __PYX_ERR(0, 23, __pyx_L1_error) + + /* "_mask.pyx":103 + * + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef bytes py_string + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_1_toString, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 103, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_toString, __pyx_t_1) < 0) __PYX_ERR(0, 103, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":119 + * + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): # <<<<<<<<<<<<<< + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_3_frString, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 119, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frString, __pyx_t_1) < 0) __PYX_ERR(0, 119, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":137 + * # encode mask to RLEs objects + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): # <<<<<<<<<<<<<< + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_5encode, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_encode, __pyx_t_1) < 0) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":145 + * + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = 
_frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_7decode, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 145, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_decode, __pyx_t_1) < 0) __PYX_ERR(0, 145, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":152 + * return np.array(masks) + * + * def merge(rleObjs, intersect=0): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_9merge, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_merge, __pyx_t_1) < 0) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":159 + * return obj + * + * def area(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_11area, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 159, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_area, __pyx_t_1) < 0) __PYX_ERR(0, 159, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":171 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 
+ * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_13iou, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 171, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_iou_2, __pyx_t_1) < 0) __PYX_ERR(0, 171, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":241 + * return iou.reshape((m,n), order='F') + * + * def toBbox( rleObjs ): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_15toBbox, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_toBbox, __pyx_t_1) < 0) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":253 + * return bb + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_17frBbox, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 253, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frBbox, __pyx_t_1) < 0) __PYX_ERR(0, 253, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":260 + * return objs + * + * def frPoly( poly, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_19frPoly, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 260, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frPoly, __pyx_t_1) < 0) __PYX_ERR(0, 260, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":270 + * return objs + * + * def frUncompressedRLE(ucRles, siz h, siz w): # <<<<<<<<<<<<<< + * cdef 
np.ndarray[np.uint32_t, ndim=1] cnts + * cdef RLE R + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_21frUncompressedRLE, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frUncompressedRLE, __pyx_t_1) < 0) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":288 + * return objs + * + * def frPyObjects(pyobj, h, w): # <<<<<<<<<<<<<< + * # encode rle from a list of python objects + * if type(pyobj) == np.ndarray: + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5_mask_23frPyObjects, NULL, __pyx_n_s_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frPyObjects, __pyx_t_1) < 0) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "_mask.pyx":1 + * # distutils: language = c # <<<<<<<<<<<<<< + * # distutils: sources = maskApi.c + * + */ + __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "../../../../../../../root/anaconda2/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1008 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /*--- Wrapped vars code ---*/ + + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + if (__pyx_m) { + if (__pyx_d) { + __Pyx_AddTraceback("init _mask", 0, __pyx_lineno, __pyx_filename); + } + Py_DECREF(__pyx_m); __pyx_m = 0; + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init _mask"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if CYTHON_PEP489_MULTI_PHASE_INIT + return 
(__pyx_m != NULL) ? 0 : -1; + #elif PY_MAJOR_VERSION >= 3 + return __pyx_m; + #else + return; + #endif +} + +/* --- Runtime support code --- */ +/* Refnanny */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule((char *)modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, (char *)"RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif + +/* PyObjectGetAttrStr */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#endif + +/* GetBuiltinName */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name) { + PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name); + if (unlikely(!result)) { + PyErr_Format(PyExc_NameError, +#if PY_MAJOR_VERSION >= 3 + "name '%U' is not defined", name); +#else + "name '%.200s' is not defined", PyString_AS_STRING(name)); +#endif + } + return result; +} + +/* RaiseDoubleKeywords */ +static void __Pyx_RaiseDoubleKeywordsError( + const char* func_name, + PyObject* kw_name) +{ + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION >= 3 + "%s() got multiple values for keyword argument '%U'", func_name, kw_name); + #else + "%s() got multiple values for keyword argument '%s'", func_name, + PyString_AsString(kw_name)); + #endif +} + +/* ParseKeywords */ +static int __Pyx_ParseOptionalKeywords( + PyObject *kwds, + PyObject **argnames[], + PyObject *kwds2, + PyObject *values[], + Py_ssize_t num_pos_args, + const char* function_name) +{ + PyObject *key = 0, *value = 0; + 
Py_ssize_t pos = 0; + PyObject*** name; + PyObject*** first_kw_arg = argnames + num_pos_args; + while (PyDict_Next(kwds, &pos, &key, &value)) { + name = first_kw_arg; + while (*name && (**name != key)) name++; + if (*name) { + values[name-argnames] = value; + continue; + } + name = first_kw_arg; + #if PY_MAJOR_VERSION < 3 + if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) { + while (*name) { + if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) + && _PyString_Eq(**name, key)) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + if ((**argname == key) || ( + (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key)) + && _PyString_Eq(**argname, key))) { + goto arg_passed_twice; + } + argname++; + } + } + } else + #endif + if (likely(PyUnicode_Check(key))) { + while (*name) { + int cmp = (**name == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 : + #endif + PyUnicode_Compare(**name, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + int cmp = (**argname == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 
1 : + #endif + PyUnicode_Compare(**argname, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) goto arg_passed_twice; + argname++; + } + } + } else + goto invalid_keyword_type; + if (kwds2) { + if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad; + } else { + goto invalid_keyword; + } + } + return 0; +arg_passed_twice: + __Pyx_RaiseDoubleKeywordsError(function_name, key); + goto bad; +invalid_keyword_type: + PyErr_Format(PyExc_TypeError, + "%.200s() keywords must be strings", function_name); + goto bad; +invalid_keyword: + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION < 3 + "%.200s() got an unexpected keyword argument '%.200s'", + function_name, PyString_AsString(key)); + #else + "%s() got an unexpected keyword argument '%U'", + function_name, key); + #endif +bad: + return -1; +} + +/* RaiseArgTupleInvalid */ +static void __Pyx_RaiseArgtupleInvalid( + const char* func_name, + int exact, + Py_ssize_t num_min, + Py_ssize_t num_max, + Py_ssize_t num_found) +{ + Py_ssize_t num_expected; + const char *more_or_less; + if (num_found < num_min) { + num_expected = num_min; + more_or_less = "at least"; + } else { + num_expected = num_max; + more_or_less = "at most"; + } + if (exact) { + more_or_less = "exactly"; + } + PyErr_Format(PyExc_TypeError, + "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)", + func_name, more_or_less, num_expected, + (num_expected == 1) ? 
"" : "s", num_found); +} + +/* BytesEquals */ +static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals) { +#if CYTHON_COMPILING_IN_PYPY + return PyObject_RichCompareBool(s1, s2, equals); +#else + if (s1 == s2) { + return (equals == Py_EQ); + } else if (PyBytes_CheckExact(s1) & PyBytes_CheckExact(s2)) { + const char *ps1, *ps2; + Py_ssize_t length = PyBytes_GET_SIZE(s1); + if (length != PyBytes_GET_SIZE(s2)) + return (equals == Py_NE); + ps1 = PyBytes_AS_STRING(s1); + ps2 = PyBytes_AS_STRING(s2); + if (ps1[0] != ps2[0]) { + return (equals == Py_NE); + } else if (length == 1) { + return (equals == Py_EQ); + } else { + int result; +#if CYTHON_USE_UNICODE_INTERNALS + Py_hash_t hash1, hash2; + hash1 = ((PyBytesObject*)s1)->ob_shash; + hash2 = ((PyBytesObject*)s2)->ob_shash; + if (hash1 != hash2 && hash1 != -1 && hash2 != -1) { + return (equals == Py_NE); + } +#endif + result = memcmp(ps1, ps2, (size_t)length); + return (equals == Py_EQ) ? (result == 0) : (result != 0); + } + } else if ((s1 == Py_None) & PyBytes_CheckExact(s2)) { + return (equals == Py_NE); + } else if ((s2 == Py_None) & PyBytes_CheckExact(s1)) { + return (equals == Py_NE); + } else { + int result; + PyObject* py_result = PyObject_RichCompare(s1, s2, equals); + if (!py_result) + return -1; + result = __Pyx_PyObject_IsTrue(py_result); + Py_DECREF(py_result); + return result; + } +#endif +} + +/* UnicodeEquals */ +static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals) { +#if CYTHON_COMPILING_IN_PYPY + return PyObject_RichCompareBool(s1, s2, equals); +#else +#if PY_MAJOR_VERSION < 3 + PyObject* owned_ref = NULL; +#endif + int s1_is_unicode, s2_is_unicode; + if (s1 == s2) { + goto return_eq; + } + s1_is_unicode = PyUnicode_CheckExact(s1); + s2_is_unicode = PyUnicode_CheckExact(s2); +#if PY_MAJOR_VERSION < 3 + if ((s1_is_unicode & (!s2_is_unicode)) && PyString_CheckExact(s2)) { + owned_ref = PyUnicode_FromObject(s2); + if (unlikely(!owned_ref)) + 
return -1; + s2 = owned_ref; + s2_is_unicode = 1; + } else if ((s2_is_unicode & (!s1_is_unicode)) && PyString_CheckExact(s1)) { + owned_ref = PyUnicode_FromObject(s1); + if (unlikely(!owned_ref)) + return -1; + s1 = owned_ref; + s1_is_unicode = 1; + } else if (((!s2_is_unicode) & (!s1_is_unicode))) { + return __Pyx_PyBytes_Equals(s1, s2, equals); + } +#endif + if (s1_is_unicode & s2_is_unicode) { + Py_ssize_t length; + int kind; + void *data1, *data2; + if (unlikely(__Pyx_PyUnicode_READY(s1) < 0) || unlikely(__Pyx_PyUnicode_READY(s2) < 0)) + return -1; + length = __Pyx_PyUnicode_GET_LENGTH(s1); + if (length != __Pyx_PyUnicode_GET_LENGTH(s2)) { + goto return_ne; + } +#if CYTHON_USE_UNICODE_INTERNALS + { + Py_hash_t hash1, hash2; + #if CYTHON_PEP393_ENABLED + hash1 = ((PyASCIIObject*)s1)->hash; + hash2 = ((PyASCIIObject*)s2)->hash; + #else + hash1 = ((PyUnicodeObject*)s1)->hash; + hash2 = ((PyUnicodeObject*)s2)->hash; + #endif + if (hash1 != hash2 && hash1 != -1 && hash2 != -1) { + goto return_ne; + } + } +#endif + kind = __Pyx_PyUnicode_KIND(s1); + if (kind != __Pyx_PyUnicode_KIND(s2)) { + goto return_ne; + } + data1 = __Pyx_PyUnicode_DATA(s1); + data2 = __Pyx_PyUnicode_DATA(s2); + if (__Pyx_PyUnicode_READ(kind, data1, 0) != __Pyx_PyUnicode_READ(kind, data2, 0)) { + goto return_ne; + } else if (length == 1) { + goto return_eq; + } else { + int result = memcmp(data1, data2, (size_t)(length * kind)); + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + return (equals == Py_EQ) ? 
(result == 0) : (result != 0); + } + } else if ((s1 == Py_None) & s2_is_unicode) { + goto return_ne; + } else if ((s2 == Py_None) & s1_is_unicode) { + goto return_ne; + } else { + int result; + PyObject* py_result = PyObject_RichCompare(s1, s2, equals); + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + if (!py_result) + return -1; + result = __Pyx_PyObject_IsTrue(py_result); + Py_DECREF(py_result); + return result; + } +return_eq: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + return (equals == Py_EQ); +return_ne: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + return (equals == Py_NE); +#endif +} + +/* PyCFunctionFastCall */ +#if CYTHON_FAST_PYCCALL +static CYTHON_INLINE PyObject * __Pyx_PyCFunction_FastCall(PyObject *func_obj, PyObject **args, Py_ssize_t nargs) { + PyCFunctionObject *func = (PyCFunctionObject*)func_obj; + PyCFunction meth = PyCFunction_GET_FUNCTION(func); + PyObject *self = PyCFunction_GET_SELF(func); + int flags = PyCFunction_GET_FLAGS(func); + assert(PyCFunction_Check(func)); + assert(METH_FASTCALL == (flags & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS))); + assert(nargs >= 0); + assert(nargs == 0 || args != NULL); + /* _PyCFunction_FastCallDict() must not be called with an exception set, + because it may clear it (directly or indirectly) and so the + caller loses its exception */ + assert(!PyErr_Occurred()); + if ((PY_VERSION_HEX < 0x030700A0) || unlikely(flags & METH_KEYWORDS)) { + return (*((__Pyx_PyCFunctionFastWithKeywords)meth)) (self, args, nargs, NULL); + } else { + return (*((__Pyx_PyCFunctionFast)meth)) (self, args, nargs); + } +} +#endif + +/* PyFunctionFastCall */ +#if CYTHON_FAST_PYCALL +#include "frameobject.h" +static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObject **args, Py_ssize_t na, + PyObject *globals) { + PyFrameObject *f; + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject **fastlocals; + Py_ssize_t i; + PyObject *result; + 
assert(globals != NULL); + /* XXX Perhaps we should create a specialized + PyFrame_New() that doesn't take locals, but does + take builtins without sanity checking them. + */ + assert(tstate != NULL); + f = PyFrame_New(tstate, co, globals, NULL); + if (f == NULL) { + return NULL; + } + fastlocals = f->f_localsplus; + for (i = 0; i < na; i++) { + Py_INCREF(*args); + fastlocals[i] = *args++; + } + result = PyEval_EvalFrameEx(f,0); + ++tstate->recursion_depth; + Py_DECREF(f); + --tstate->recursion_depth; + return result; +} +#if 1 || PY_VERSION_HEX < 0x030600B1 +static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs) { + PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func); + PyObject *globals = PyFunction_GET_GLOBALS(func); + PyObject *argdefs = PyFunction_GET_DEFAULTS(func); + PyObject *closure; +#if PY_MAJOR_VERSION >= 3 + PyObject *kwdefs; +#endif + PyObject *kwtuple, **k; + PyObject **d; + Py_ssize_t nd; + Py_ssize_t nk; + PyObject *result; + assert(kwargs == NULL || PyDict_Check(kwargs)); + nk = kwargs ? 
PyDict_Size(kwargs) : 0; + if (Py_EnterRecursiveCall((char*)" while calling a Python object")) { + return NULL; + } + if ( +#if PY_MAJOR_VERSION >= 3 + co->co_kwonlyargcount == 0 && +#endif + likely(kwargs == NULL || nk == 0) && + co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) { + if (argdefs == NULL && co->co_argcount == nargs) { + result = __Pyx_PyFunction_FastCallNoKw(co, args, nargs, globals); + goto done; + } + else if (nargs == 0 && argdefs != NULL + && co->co_argcount == Py_SIZE(argdefs)) { + /* function called with no arguments, but all parameters have + a default value: use default values as arguments .*/ + args = &PyTuple_GET_ITEM(argdefs, 0); + result =__Pyx_PyFunction_FastCallNoKw(co, args, Py_SIZE(argdefs), globals); + goto done; + } + } + if (kwargs != NULL) { + Py_ssize_t pos, i; + kwtuple = PyTuple_New(2 * nk); + if (kwtuple == NULL) { + result = NULL; + goto done; + } + k = &PyTuple_GET_ITEM(kwtuple, 0); + pos = i = 0; + while (PyDict_Next(kwargs, &pos, &k[i], &k[i+1])) { + Py_INCREF(k[i]); + Py_INCREF(k[i+1]); + i += 2; + } + nk = i / 2; + } + else { + kwtuple = NULL; + k = NULL; + } + closure = PyFunction_GET_CLOSURE(func); +#if PY_MAJOR_VERSION >= 3 + kwdefs = PyFunction_GET_KW_DEFAULTS(func); +#endif + if (argdefs != NULL) { + d = &PyTuple_GET_ITEM(argdefs, 0); + nd = Py_SIZE(argdefs); + } + else { + d = NULL; + nd = 0; + } +#if PY_MAJOR_VERSION >= 3 + result = PyEval_EvalCodeEx((PyObject*)co, globals, (PyObject *)NULL, + args, nargs, + k, (int)nk, + d, (int)nd, kwdefs, closure); +#else + result = PyEval_EvalCodeEx(co, globals, (PyObject *)NULL, + args, nargs, + k, (int)nk, + d, (int)nd, closure); +#endif + Py_XDECREF(kwtuple); +done: + Py_LeaveRecursiveCall(); + return result; +} +#endif +#endif + +/* PyObjectCall */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) { + PyObject *result; + ternaryfunc call = func->ob_type->tp_call; + if 
(unlikely(!call)) + return PyObject_Call(func, arg, kw); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = (*call)(func, arg, kw); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* PyObjectCallMethO */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg) { + PyObject *self, *result; + PyCFunction cfunc; + cfunc = PyCFunction_GET_FUNCTION(func); + self = PyCFunction_GET_SELF(func); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = cfunc(self, arg); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* PyObjectCallOneArg */ +#if CYTHON_COMPILING_IN_CPYTHON +static PyObject* __Pyx__PyObject_CallOneArg(PyObject *func, PyObject *arg) { + PyObject *result; + PyObject *args = PyTuple_New(1); + if (unlikely(!args)) return NULL; + Py_INCREF(arg); + PyTuple_SET_ITEM(args, 0, arg); + result = __Pyx_PyObject_Call(func, args, NULL); + Py_DECREF(args); + return result; +} +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) { +#if CYTHON_FAST_PYCALL + if (PyFunction_Check(func)) { + return __Pyx_PyFunction_FastCall(func, &arg, 1); + } +#endif + if (likely(PyCFunction_Check(func))) { + if (likely(PyCFunction_GET_FLAGS(func) & METH_O)) { + return __Pyx_PyObject_CallMethO(func, arg); +#if CYTHON_FAST_PYCCALL + } else if (PyCFunction_GET_FLAGS(func) & METH_FASTCALL) { + return __Pyx_PyCFunction_FastCall(func, &arg, 1); +#endif + } + } + return __Pyx__PyObject_CallOneArg(func, arg); +} +#else +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject 
*func, PyObject *arg) { + PyObject *result; + PyObject *args = PyTuple_Pack(1, arg); + if (unlikely(!args)) return NULL; + result = __Pyx_PyObject_Call(func, args, NULL); + Py_DECREF(args); + return result; +} +#endif + +/* PyErrFetchRestore */ +#if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + tmp_type = tstate->curexc_type; + tmp_value = tstate->curexc_value; + tmp_tb = tstate->curexc_traceback; + tstate->curexc_type = type; + tstate->curexc_value = value; + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + *type = tstate->curexc_type; + *value = tstate->curexc_value; + *tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +} +#endif + +/* RaiseException */ +#if PY_MAJOR_VERSION < 3 +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, + CYTHON_UNUSED PyObject *cause) { + __Pyx_PyThreadState_declare + Py_XINCREF(type); + if (!value || value == Py_None) + value = NULL; + else + Py_INCREF(value); + if (!tb || tb == Py_None) + tb = NULL; + else { + Py_INCREF(tb); + if (!PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto raise_error; + } + } + if (PyType_Check(type)) { +#if CYTHON_COMPILING_IN_PYPY + if (!value) { + Py_INCREF(Py_None); + value = Py_None; + } +#endif + PyErr_NormalizeException(&type, &value, &tb); + } else { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto raise_error; + } + value = type; + type = (PyObject*) Py_TYPE(type); + Py_INCREF(type); + if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) 
{ + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto raise_error; + } + } + __Pyx_PyThreadState_assign + __Pyx_ErrRestore(type, value, tb); + return; +raise_error: + Py_XDECREF(value); + Py_XDECREF(type); + Py_XDECREF(tb); + return; +} +#else +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) { + PyObject* owned_instance = NULL; + if (tb == Py_None) { + tb = 0; + } else if (tb && !PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto bad; + } + if (value == Py_None) + value = 0; + if (PyExceptionInstance_Check(type)) { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto bad; + } + value = type; + type = (PyObject*) Py_TYPE(value); + } else if (PyExceptionClass_Check(type)) { + PyObject *instance_class = NULL; + if (value && PyExceptionInstance_Check(value)) { + instance_class = (PyObject*) Py_TYPE(value); + if (instance_class != type) { + int is_subclass = PyObject_IsSubclass(instance_class, type); + if (!is_subclass) { + instance_class = NULL; + } else if (unlikely(is_subclass == -1)) { + goto bad; + } else { + type = instance_class; + } + } + } + if (!instance_class) { + PyObject *args; + if (!value) + args = PyTuple_New(0); + else if (PyTuple_Check(value)) { + Py_INCREF(value); + args = value; + } else + args = PyTuple_Pack(1, value); + if (!args) + goto bad; + owned_instance = PyObject_Call(type, args, NULL); + Py_DECREF(args); + if (!owned_instance) + goto bad; + value = owned_instance; + if (!PyExceptionInstance_Check(value)) { + PyErr_Format(PyExc_TypeError, + "calling %R should have returned an instance of " + "BaseException, not %R", + type, Py_TYPE(value)); + goto bad; + } + } + } else { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto bad; + } + if (cause) { + PyObject *fixed_cause; + if 
(cause == Py_None) { + fixed_cause = NULL; + } else if (PyExceptionClass_Check(cause)) { + fixed_cause = PyObject_CallObject(cause, NULL); + if (fixed_cause == NULL) + goto bad; + } else if (PyExceptionInstance_Check(cause)) { + fixed_cause = cause; + Py_INCREF(fixed_cause); + } else { + PyErr_SetString(PyExc_TypeError, + "exception causes must derive from " + "BaseException"); + goto bad; + } + PyException_SetCause(value, fixed_cause); + } + PyErr_SetObject(type, value); + if (tb) { +#if CYTHON_COMPILING_IN_PYPY + PyObject *tmp_type, *tmp_value, *tmp_tb; + PyErr_Fetch(&tmp_type, &tmp_value, &tmp_tb); + Py_INCREF(tb); + PyErr_Restore(tmp_type, tmp_value, tb); + Py_XDECREF(tmp_tb); +#else + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject* tmp_tb = tstate->curexc_traceback; + if (tb != tmp_tb) { + Py_INCREF(tb); + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_tb); + } +#endif + } +bad: + Py_XDECREF(owned_instance); + return; +} +#endif + +/* ExtTypeTest */ +static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type) { + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + if (likely(__Pyx_TypeCheck(obj, type))) + return 1; + PyErr_Format(PyExc_TypeError, "Cannot convert %.200s to %.200s", + Py_TYPE(obj)->tp_name, type->tp_name); + return 0; +} + +/* ArgTypeTest */ +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact) +{ + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + else if (exact) { + #if PY_MAJOR_VERSION == 2 + if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; + #endif + } + else { + if (likely(__Pyx_TypeCheck(obj, type))) return 1; + } + PyErr_Format(PyExc_TypeError, + "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", + name, type->tp_name, Py_TYPE(obj)->tp_name); + return 0; +} + +/* PyIntBinop */ +#if !CYTHON_COMPILING_IN_PYPY 
+static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED long intval, CYTHON_UNUSED int inplace) { + #if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(op1))) { + const long b = intval; + long x; + long a = PyInt_AS_LONG(op1); + x = (long)((unsigned long)a + b); + if (likely((x^a) >= 0 || (x^b) >= 0)) + return PyInt_FromLong(x); + return PyLong_Type.tp_as_number->nb_add(op1, op2); + } + #endif + #if CYTHON_USE_PYLONG_INTERNALS + if (likely(PyLong_CheckExact(op1))) { + const long b = intval; + long a, x; +#ifdef HAVE_LONG_LONG + const PY_LONG_LONG llb = intval; + PY_LONG_LONG lla, llx; +#endif + const digit* digits = ((PyLongObject*)op1)->ob_digit; + const Py_ssize_t size = Py_SIZE(op1); + if (likely(__Pyx_sst_abs(size) <= 1)) { + a = likely(size) ? digits[0] : 0; + if (size == -1) a = -a; + } else { + switch (size) { + case -2: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + a = -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 2 * PyLong_SHIFT) { + lla = -(PY_LONG_LONG) (((((unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + CYTHON_FALLTHROUGH; + case 2: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + a = (long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 2 * PyLong_SHIFT) { + lla = (PY_LONG_LONG) (((((unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + CYTHON_FALLTHROUGH; + case -3: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + a = -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 3 * PyLong_SHIFT) { + lla = 
-(PY_LONG_LONG) (((((((unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + CYTHON_FALLTHROUGH; + case 3: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + a = (long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 3 * PyLong_SHIFT) { + lla = (PY_LONG_LONG) (((((((unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + CYTHON_FALLTHROUGH; + case -4: + if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + a = -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 4 * PyLong_SHIFT) { + lla = -(PY_LONG_LONG) (((((((((unsigned PY_LONG_LONG)digits[3]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + CYTHON_FALLTHROUGH; + case 4: + if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + a = (long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 4 * PyLong_SHIFT) { + lla = (PY_LONG_LONG) (((((((((unsigned PY_LONG_LONG)digits[3]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + CYTHON_FALLTHROUGH; + default: return 
PyLong_Type.tp_as_number->nb_add(op1, op2); + } + } + x = a + b; + return PyLong_FromLong(x); +#ifdef HAVE_LONG_LONG + long_long: + llx = lla + llb; + return PyLong_FromLongLong(llx); +#endif + + + } + #endif + if (PyFloat_CheckExact(op1)) { + const long b = intval; + double a = PyFloat_AS_DOUBLE(op1); + double result; + PyFPE_START_PROTECT("add", return NULL) + result = ((double)a) + (double)b; + PyFPE_END_PROTECT(result) + return PyFloat_FromDouble(result); + } + return (inplace ? PyNumber_InPlaceAdd : PyNumber_Add)(op1, op2); +} +#endif + +/* PyIntBinop */ +#if !CYTHON_COMPILING_IN_PYPY +static PyObject* __Pyx_PyInt_EqObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED long intval, CYTHON_UNUSED int inplace) { + if (op1 == op2) { + Py_RETURN_TRUE; + } + #if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(op1))) { + const long b = intval; + long a = PyInt_AS_LONG(op1); + if (a == b) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + } + #endif + #if CYTHON_USE_PYLONG_INTERNALS + if (likely(PyLong_CheckExact(op1))) { + const long b = intval; + long a; + const digit* digits = ((PyLongObject*)op1)->ob_digit; + const Py_ssize_t size = Py_SIZE(op1); + if (likely(__Pyx_sst_abs(size) <= 1)) { + a = likely(size) ? 
digits[0] : 0; + if (size == -1) a = -a; + } else { + switch (size) { + case -2: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + a = -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; + } + CYTHON_FALLTHROUGH; + case 2: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + a = (long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; + } + CYTHON_FALLTHROUGH; + case -3: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + a = -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; + } + CYTHON_FALLTHROUGH; + case 3: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + a = (long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; + } + CYTHON_FALLTHROUGH; + case -4: + if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + a = -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; + } + CYTHON_FALLTHROUGH; + case 4: + if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + a = (long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; + } + CYTHON_FALLTHROUGH; + #if PyLong_SHIFT < 30 && PyLong_SHIFT != 15 + default: return PyLong_Type.tp_richcompare(op1, op2, Py_EQ); + #else + default: Py_RETURN_FALSE; + #endif + } + } + if (a == b) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + } + #endif + if (PyFloat_CheckExact(op1)) { + const long b = intval; + double a = PyFloat_AS_DOUBLE(op1); + if ((double)a == (double)b) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + } + return PyObject_RichCompare(op1, op2, Py_EQ); +} +#endif + +/* GetModuleGlobalName */ +static CYTHON_INLINE 
PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { + PyObject *result; +#if !CYTHON_AVOID_BORROWED_REFS +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 + result = _PyDict_GetItem_KnownHash(__pyx_d, name, ((PyASCIIObject *) name)->hash); + if (likely(result)) { + Py_INCREF(result); + } else if (unlikely(PyErr_Occurred())) { + result = NULL; + } else { +#else + result = PyDict_GetItem(__pyx_d, name); + if (likely(result)) { + Py_INCREF(result); + } else { +#endif +#else + result = PyObject_GetItem(__pyx_d, name); + if (!result) { + PyErr_Clear(); +#endif + result = __Pyx_GetBuiltinName(name); + } + return result; +} + +/* DictGetItem */ + #if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY +static PyObject *__Pyx_PyDict_GetItem(PyObject *d, PyObject* key) { + PyObject *value; + value = PyDict_GetItemWithError(d, key); + if (unlikely(!value)) { + if (!PyErr_Occurred()) { + PyObject* args = PyTuple_Pack(1, key); + if (likely(args)) + PyErr_SetObject(PyExc_KeyError, args); + Py_XDECREF(args); + } + return NULL; + } + Py_INCREF(value); + return value; +} +#endif + +/* GetItemInt */ + static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) { + PyObject *r; + if (!j) return NULL; + r = PyObject_GetItem(o, j); + Py_DECREF(j); + return r; +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i, + CYTHON_NCP_UNUSED int wraparound, + CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + Py_ssize_t wrapped_i = i; + if (wraparound & unlikely(i < 0)) { + wrapped_i += PyList_GET_SIZE(o); + } + if ((!boundscheck) || likely((0 <= wrapped_i) & (wrapped_i < PyList_GET_SIZE(o)))) { + PyObject *r = PyList_GET_ITEM(o, wrapped_i); + Py_INCREF(r); + return r; + } + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +#else + return PySequence_GetItem(o, i); +#endif +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i, + 
CYTHON_NCP_UNUSED int wraparound, + CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + Py_ssize_t wrapped_i = i; + if (wraparound & unlikely(i < 0)) { + wrapped_i += PyTuple_GET_SIZE(o); + } + if ((!boundscheck) || likely((0 <= wrapped_i) & (wrapped_i < PyTuple_GET_SIZE(o)))) { + PyObject *r = PyTuple_GET_ITEM(o, wrapped_i); + Py_INCREF(r); + return r; + } + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +#else + return PySequence_GetItem(o, i); +#endif +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, int is_list, + CYTHON_NCP_UNUSED int wraparound, + CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS && CYTHON_USE_TYPE_SLOTS + if (is_list || PyList_CheckExact(o)) { + Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyList_GET_SIZE(o); + if ((!boundscheck) || (likely((n >= 0) & (n < PyList_GET_SIZE(o))))) { + PyObject *r = PyList_GET_ITEM(o, n); + Py_INCREF(r); + return r; + } + } + else if (PyTuple_CheckExact(o)) { + Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? 
i : i + PyTuple_GET_SIZE(o); + if ((!boundscheck) || likely((n >= 0) & (n < PyTuple_GET_SIZE(o)))) { + PyObject *r = PyTuple_GET_ITEM(o, n); + Py_INCREF(r); + return r; + } + } else { + PySequenceMethods *m = Py_TYPE(o)->tp_as_sequence; + if (likely(m && m->sq_item)) { + if (wraparound && unlikely(i < 0) && likely(m->sq_length)) { + Py_ssize_t l = m->sq_length(o); + if (likely(l >= 0)) { + i += l; + } else { + if (!PyErr_ExceptionMatches(PyExc_OverflowError)) + return NULL; + PyErr_Clear(); + } + } + return m->sq_item(o, i); + } + } +#else + if (is_list || PySequence_Check(o)) { + return PySequence_GetItem(o, i); + } +#endif + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +} + +/* IsLittleEndian */ + static CYTHON_INLINE int __Pyx_Is_Little_Endian(void) +{ + union { + uint32_t u32; + uint8_t u8[4]; + } S; + S.u32 = 0x01020304; + return S.u8[0] == 4; +} + +/* BufferFormatCheck */ + static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type) { + stack[0].field = &ctx->root; + stack[0].parent_offset = 0; + ctx->root.type = type; + ctx->root.name = "buffer dtype"; + ctx->root.offset = 0; + ctx->head = stack; + ctx->head->field = &ctx->root; + ctx->fmt_offset = 0; + ctx->head->parent_offset = 0; + ctx->new_packmode = '@'; + ctx->enc_packmode = '@'; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->is_complex = 0; + ctx->is_valid_array = 0; + ctx->struct_alignment = 0; + while (type->typegroup == 'S') { + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = 0; + type = type->fields->type; + } +} +static int __Pyx_BufFmt_ParseNumber(const char** ts) { + int count; + const char* t = *ts; + if (*t < '0' || *t > '9') { + return -1; + } else { + count = *t++ - '0'; + while (*t >= '0' && *t < '9') { + count *= 10; + count += *t++ - '0'; + } + } + *ts = t; + return count; +} +static int __Pyx_BufFmt_ExpectNumber(const char **ts) { + int number = 
__Pyx_BufFmt_ParseNumber(ts); + if (number == -1) + PyErr_Format(PyExc_ValueError,\ + "Does not understand character buffer dtype format string ('%c')", **ts); + return number; +} +static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) { + PyErr_Format(PyExc_ValueError, + "Unexpected format string character: '%c'", ch); +} +static const char* __Pyx_BufFmt_DescribeTypeChar(char ch, int is_complex) { + switch (ch) { + case 'c': return "'char'"; + case 'b': return "'signed char'"; + case 'B': return "'unsigned char'"; + case 'h': return "'short'"; + case 'H': return "'unsigned short'"; + case 'i': return "'int'"; + case 'I': return "'unsigned int'"; + case 'l': return "'long'"; + case 'L': return "'unsigned long'"; + case 'q': return "'long long'"; + case 'Q': return "'unsigned long long'"; + case 'f': return (is_complex ? "'complex float'" : "'float'"); + case 'd': return (is_complex ? "'complex double'" : "'double'"); + case 'g': return (is_complex ? "'complex long double'" : "'long double'"); + case 'T': return "a struct"; + case 'O': return "Python object"; + case 'P': return "a pointer"; + case 's': case 'p': return "a string"; + case 0: return "end"; + default: return "unparseable format string"; + } +} +static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return 2; + case 'i': case 'I': case 'l': case 'L': return 4; + case 'q': case 'Q': return 8; + case 'f': return (is_complex ? 8 : 4); + case 'd': return (is_complex ? 
16 : 8); + case 'g': { + PyErr_SetString(PyExc_ValueError, "Python does not define a standard format string size for long double ('g').."); + return 0; + } + case 'O': case 'P': return sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) { + switch (ch) { + case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(short); + case 'i': case 'I': return sizeof(int); + case 'l': case 'L': return sizeof(long); + #ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(PY_LONG_LONG); + #endif + case 'f': return sizeof(float) * (is_complex ? 2 : 1); + case 'd': return sizeof(double) * (is_complex ? 2 : 1); + case 'g': return sizeof(long double) * (is_complex ? 2 : 1); + case 'O': case 'P': return sizeof(void*); + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +typedef struct { char c; short x; } __Pyx_st_short; +typedef struct { char c; int x; } __Pyx_st_int; +typedef struct { char c; long x; } __Pyx_st_long; +typedef struct { char c; float x; } __Pyx_st_float; +typedef struct { char c; double x; } __Pyx_st_double; +typedef struct { char c; long double x; } __Pyx_st_longdouble; +typedef struct { char c; void *x; } __Pyx_st_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_st_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_st_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_st_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_st_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_st_float) - sizeof(float); + case 'd': return sizeof(__Pyx_st_double) - 
sizeof(double); + case 'g': return sizeof(__Pyx_st_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_st_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +/* These are for computing the padding at the end of the struct to align + on the first member of the struct. This will probably the same as above, + but we don't have any guarantees. + */ +typedef struct { short x; char c; } __Pyx_pad_short; +typedef struct { int x; char c; } __Pyx_pad_int; +typedef struct { long x; char c; } __Pyx_pad_long; +typedef struct { float x; char c; } __Pyx_pad_float; +typedef struct { double x; char c; } __Pyx_pad_double; +typedef struct { long double x; char c; } __Pyx_pad_longdouble; +typedef struct { void *x; char c; } __Pyx_pad_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_pad_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_pad_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_pad_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_pad_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_pad_float) - sizeof(float); + case 'd': return sizeof(__Pyx_pad_double) - sizeof(double); + case 'g': return sizeof(__Pyx_pad_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_pad_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) { + switch (ch) { + case 'c': + return 'H'; + case 'b': case 'h': case 'i': + case 'l': case 'q': case 's': case 'p': + return 'I'; + case 'B': case 'H': case 'I': case 'L': case 'Q': + return 'U'; + case 'f': 
case 'd': case 'g': + return (is_complex ? 'C' : 'R'); + case 'O': + return 'O'; + case 'P': + return 'P'; + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) { + if (ctx->head == NULL || ctx->head->field == &ctx->root) { + const char* expected; + const char* quote; + if (ctx->head == NULL) { + expected = "end"; + quote = ""; + } else { + expected = ctx->head->field->type->name; + quote = "'"; + } + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected %s%s%s but got %s", + quote, expected, quote, + __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex)); + } else { + __Pyx_StructField* field = ctx->head->field; + __Pyx_StructField* parent = (ctx->head - 1)->field; + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected '%s' but got %s in '%s.%s'", + field->type->name, __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex), + parent->type->name, field->name); + } +} +static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) { + char group; + size_t size, offset, arraysize = 1; + if (ctx->enc_type == 0) return 0; + if (ctx->head->field->type->arraysize[0]) { + int i, ndim = 0; + if (ctx->enc_type == 's' || ctx->enc_type == 'p') { + ctx->is_valid_array = ctx->head->field->type->ndim == 1; + ndim = 1; + if (ctx->enc_count != ctx->head->field->type->arraysize[0]) { + PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %zu", + ctx->head->field->type->arraysize[0], ctx->enc_count); + return -1; + } + } + if (!ctx->is_valid_array) { + PyErr_Format(PyExc_ValueError, "Expected %d dimensions, got %d", + ctx->head->field->type->ndim, ndim); + return -1; + } + for (i = 0; i < ctx->head->field->type->ndim; i++) { + arraysize *= ctx->head->field->type->arraysize[i]; + } + ctx->is_valid_array = 0; + ctx->enc_count = 1; + } + group = __Pyx_BufFmt_TypeCharToGroup(ctx->enc_type, ctx->is_complex); + do { + __Pyx_StructField* 
field = ctx->head->field; + __Pyx_TypeInfo* type = field->type; + if (ctx->enc_packmode == '@' || ctx->enc_packmode == '^') { + size = __Pyx_BufFmt_TypeCharToNativeSize(ctx->enc_type, ctx->is_complex); + } else { + size = __Pyx_BufFmt_TypeCharToStandardSize(ctx->enc_type, ctx->is_complex); + } + if (ctx->enc_packmode == '@') { + size_t align_at = __Pyx_BufFmt_TypeCharToAlignment(ctx->enc_type, ctx->is_complex); + size_t align_mod_offset; + if (align_at == 0) return -1; + align_mod_offset = ctx->fmt_offset % align_at; + if (align_mod_offset > 0) ctx->fmt_offset += align_at - align_mod_offset; + if (ctx->struct_alignment == 0) + ctx->struct_alignment = __Pyx_BufFmt_TypeCharToPadding(ctx->enc_type, + ctx->is_complex); + } + if (type->size != size || type->typegroup != group) { + if (type->typegroup == 'C' && type->fields != NULL) { + size_t parent_offset = ctx->head->parent_offset + field->offset; + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = parent_offset; + continue; + } + if ((type->typegroup == 'H' || group == 'H') && type->size == size) { + } else { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + } + offset = ctx->head->parent_offset + field->offset; + if (ctx->fmt_offset != offset) { + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch; next field is at offset %" CYTHON_FORMAT_SSIZE_T "d but %" CYTHON_FORMAT_SSIZE_T "d expected", + (Py_ssize_t)ctx->fmt_offset, (Py_ssize_t)offset); + return -1; + } + ctx->fmt_offset += size; + if (arraysize) + ctx->fmt_offset += (arraysize - 1) * size; + --ctx->enc_count; + while (1) { + if (field == &ctx->root) { + ctx->head = NULL; + if (ctx->enc_count != 0) { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + break; + } + ctx->head->field = ++field; + if (field->type == NULL) { + --ctx->head; + field = ctx->head->field; + continue; + } else if (field->type->typegroup == 'S') { + size_t parent_offset = ctx->head->parent_offset + field->offset; + if (field->type->fields->type == 
NULL) continue; + field = field->type->fields; + ++ctx->head; + ctx->head->field = field; + ctx->head->parent_offset = parent_offset; + break; + } else { + break; + } + } + } while (ctx->enc_count); + ctx->enc_type = 0; + ctx->is_complex = 0; + return 0; +} +static PyObject * +__pyx_buffmt_parse_array(__Pyx_BufFmt_Context* ctx, const char** tsp) +{ + const char *ts = *tsp; + int i = 0, number; + int ndim = ctx->head->field->type->ndim; +; + ++ts; + if (ctx->new_count != 1) { + PyErr_SetString(PyExc_ValueError, + "Cannot handle repeated arrays in format string"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + while (*ts && *ts != ')') { + switch (*ts) { + case ' ': case '\f': case '\r': case '\n': case '\t': case '\v': continue; + default: break; + } + number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + if (i < ndim && (size_t) number != ctx->head->field->type->arraysize[i]) + return PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %d", + ctx->head->field->type->arraysize[i], number); + if (*ts != ',' && *ts != ')') + return PyErr_Format(PyExc_ValueError, + "Expected a comma in format string, got '%c'", *ts); + if (*ts == ',') ts++; + i++; + } + if (i != ndim) + return PyErr_Format(PyExc_ValueError, "Expected %d dimension(s), got %d", + ctx->head->field->type->ndim, i); + if (!*ts) { + PyErr_SetString(PyExc_ValueError, + "Unexpected end of format string, expected ')'"); + return NULL; + } + ctx->is_valid_array = 1; + ctx->new_count = 1; + *tsp = ++ts; + return Py_None; +} +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts) { + int got_Z = 0; + while (1) { + switch(*ts) { + case 0: + if (ctx->enc_type != 0 && ctx->head == NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + if (ctx->head != NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + return ts; + case ' ': + 
case '\r': + case '\n': + ++ts; + break; + case '<': + if (!__Pyx_Is_Little_Endian()) { + PyErr_SetString(PyExc_ValueError, "Little-endian buffer not supported on big-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '>': + case '!': + if (__Pyx_Is_Little_Endian()) { + PyErr_SetString(PyExc_ValueError, "Big-endian buffer not supported on little-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '=': + case '@': + case '^': + ctx->new_packmode = *ts++; + break; + case 'T': + { + const char* ts_after_sub; + size_t i, struct_count = ctx->new_count; + size_t struct_alignment = ctx->struct_alignment; + ctx->new_count = 1; + ++ts; + if (*ts != '{') { + PyErr_SetString(PyExc_ValueError, "Buffer acquisition: Expected '{' after 'T'"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + ctx->enc_count = 0; + ctx->struct_alignment = 0; + ++ts; + ts_after_sub = ts; + for (i = 0; i != struct_count; ++i) { + ts_after_sub = __Pyx_BufFmt_CheckString(ctx, ts); + if (!ts_after_sub) return NULL; + } + ts = ts_after_sub; + if (struct_alignment) ctx->struct_alignment = struct_alignment; + } + break; + case '}': + { + size_t alignment = ctx->struct_alignment; + ++ts; + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + if (alignment && ctx->fmt_offset % alignment) { + ctx->fmt_offset += alignment - (ctx->fmt_offset % alignment); + } + } + return ts; + case 'x': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->fmt_offset += ctx->new_count; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->enc_packmode = ctx->new_packmode; + ++ts; + break; + case 'Z': + got_Z = 1; + ++ts; + if (*ts != 'f' && *ts != 'd' && *ts != 'g') { + __Pyx_BufFmt_RaiseUnexpectedChar('Z'); + return NULL; + } + CYTHON_FALLTHROUGH; + case 'c': case 'b': case 'B': case 'h': case 'H': case 'i': case 'I': + case 'l': case 'L': 
case 'q': case 'Q': + case 'f': case 'd': case 'g': + case 'O': case 'p': + if (ctx->enc_type == *ts && got_Z == ctx->is_complex && + ctx->enc_packmode == ctx->new_packmode) { + ctx->enc_count += ctx->new_count; + ctx->new_count = 1; + got_Z = 0; + ++ts; + break; + } + CYTHON_FALLTHROUGH; + case 's': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_count = ctx->new_count; + ctx->enc_packmode = ctx->new_packmode; + ctx->enc_type = *ts; + ctx->is_complex = got_Z; + ++ts; + ctx->new_count = 1; + got_Z = 0; + break; + case ':': + ++ts; + while(*ts != ':') ++ts; + ++ts; + break; + case '(': + if (!__pyx_buffmt_parse_array(ctx, &ts)) return NULL; + break; + default: + { + int number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + ctx->new_count = (size_t)number; + } + } + } +} + +/* BufferGetAndValidate */ + static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info) { + if (unlikely(info->buf == NULL)) return; + if (info->suboffsets == __Pyx_minusones) info->suboffsets = NULL; + __Pyx_ReleaseBuffer(info); +} +static void __Pyx_ZeroBuffer(Py_buffer* buf) { + buf->buf = NULL; + buf->obj = NULL; + buf->strides = __Pyx_zeros; + buf->shape = __Pyx_zeros; + buf->suboffsets = __Pyx_minusones; +} +static int __Pyx__GetBufferAndValidate( + Py_buffer* buf, PyObject* obj, __Pyx_TypeInfo* dtype, int flags, + int nd, int cast, __Pyx_BufFmt_StackElem* stack) +{ + buf->buf = NULL; + if (unlikely(__Pyx_GetBuffer(obj, buf, flags) == -1)) { + __Pyx_ZeroBuffer(buf); + return -1; + } + if (unlikely(buf->ndim != nd)) { + PyErr_Format(PyExc_ValueError, + "Buffer has wrong number of dimensions (expected %d, got %d)", + nd, buf->ndim); + goto fail; + } + if (!cast) { + __Pyx_BufFmt_Context ctx; + __Pyx_BufFmt_Init(&ctx, stack, dtype); + if (!__Pyx_BufFmt_CheckString(&ctx, buf->format)) goto fail; + } + if (unlikely((unsigned)buf->itemsize != dtype->size)) { + PyErr_Format(PyExc_ValueError, + "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "d 
byte%s) does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "d byte%s)", + buf->itemsize, (buf->itemsize > 1) ? "s" : "", + dtype->name, (Py_ssize_t)dtype->size, (dtype->size > 1) ? "s" : ""); + goto fail; + } + if (buf->suboffsets == NULL) buf->suboffsets = __Pyx_minusones; + return 0; +fail:; + __Pyx_SafeReleaseBuffer(buf); + return -1; +} + +/* FetchCommonType */ + static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type) { + PyObject* fake_module; + PyTypeObject* cached_type = NULL; + fake_module = PyImport_AddModule((char*) "_cython_" CYTHON_ABI); + if (!fake_module) return NULL; + Py_INCREF(fake_module); + cached_type = (PyTypeObject*) PyObject_GetAttrString(fake_module, type->tp_name); + if (cached_type) { + if (!PyType_Check((PyObject*)cached_type)) { + PyErr_Format(PyExc_TypeError, + "Shared Cython type %.200s is not a type object", + type->tp_name); + goto bad; + } + if (cached_type->tp_basicsize != type->tp_basicsize) { + PyErr_Format(PyExc_TypeError, + "Shared Cython type %.200s has the wrong size, try recompiling", + type->tp_name); + goto bad; + } + } else { + if (!PyErr_ExceptionMatches(PyExc_AttributeError)) goto bad; + PyErr_Clear(); + if (PyType_Ready(type) < 0) goto bad; + if (PyObject_SetAttrString(fake_module, type->tp_name, (PyObject*) type) < 0) + goto bad; + Py_INCREF(type); + cached_type = type; + } +done: + Py_DECREF(fake_module); + return cached_type; +bad: + Py_XDECREF(cached_type); + cached_type = NULL; + goto done; +} + +/* CythonFunction */ + #include +static PyObject * +__Pyx_CyFunction_get_doc(__pyx_CyFunctionObject *op, CYTHON_UNUSED void *closure) +{ + if (unlikely(op->func_doc == NULL)) { + if (op->func.m_ml->ml_doc) { +#if PY_MAJOR_VERSION >= 3 + op->func_doc = PyUnicode_FromString(op->func.m_ml->ml_doc); +#else + op->func_doc = PyString_FromString(op->func.m_ml->ml_doc); +#endif + if (unlikely(op->func_doc == NULL)) + return NULL; + } else { + Py_INCREF(Py_None); + return Py_None; + } + } + Py_INCREF(op->func_doc); + 
return op->func_doc; +} +static int +__Pyx_CyFunction_set_doc(__pyx_CyFunctionObject *op, PyObject *value) +{ + PyObject *tmp = op->func_doc; + if (value == NULL) { + value = Py_None; + } + Py_INCREF(value); + op->func_doc = value; + Py_XDECREF(tmp); + return 0; +} +static PyObject * +__Pyx_CyFunction_get_name(__pyx_CyFunctionObject *op) +{ + if (unlikely(op->func_name == NULL)) { +#if PY_MAJOR_VERSION >= 3 + op->func_name = PyUnicode_InternFromString(op->func.m_ml->ml_name); +#else + op->func_name = PyString_InternFromString(op->func.m_ml->ml_name); +#endif + if (unlikely(op->func_name == NULL)) + return NULL; + } + Py_INCREF(op->func_name); + return op->func_name; +} +static int +__Pyx_CyFunction_set_name(__pyx_CyFunctionObject *op, PyObject *value) +{ + PyObject *tmp; +#if PY_MAJOR_VERSION >= 3 + if (unlikely(value == NULL || !PyUnicode_Check(value))) { +#else + if (unlikely(value == NULL || !PyString_Check(value))) { +#endif + PyErr_SetString(PyExc_TypeError, + "__name__ must be set to a string object"); + return -1; + } + tmp = op->func_name; + Py_INCREF(value); + op->func_name = value; + Py_XDECREF(tmp); + return 0; +} +static PyObject * +__Pyx_CyFunction_get_qualname(__pyx_CyFunctionObject *op) +{ + Py_INCREF(op->func_qualname); + return op->func_qualname; +} +static int +__Pyx_CyFunction_set_qualname(__pyx_CyFunctionObject *op, PyObject *value) +{ + PyObject *tmp; +#if PY_MAJOR_VERSION >= 3 + if (unlikely(value == NULL || !PyUnicode_Check(value))) { +#else + if (unlikely(value == NULL || !PyString_Check(value))) { +#endif + PyErr_SetString(PyExc_TypeError, + "__qualname__ must be set to a string object"); + return -1; + } + tmp = op->func_qualname; + Py_INCREF(value); + op->func_qualname = value; + Py_XDECREF(tmp); + return 0; +} +static PyObject * +__Pyx_CyFunction_get_self(__pyx_CyFunctionObject *m, CYTHON_UNUSED void *closure) +{ + PyObject *self; + self = m->func_closure; + if (self == NULL) + self = Py_None; + Py_INCREF(self); + return self; +} +static 
PyObject * +__Pyx_CyFunction_get_dict(__pyx_CyFunctionObject *op) +{ + if (unlikely(op->func_dict == NULL)) { + op->func_dict = PyDict_New(); + if (unlikely(op->func_dict == NULL)) + return NULL; + } + Py_INCREF(op->func_dict); + return op->func_dict; +} +static int +__Pyx_CyFunction_set_dict(__pyx_CyFunctionObject *op, PyObject *value) +{ + PyObject *tmp; + if (unlikely(value == NULL)) { + PyErr_SetString(PyExc_TypeError, + "function's dictionary may not be deleted"); + return -1; + } + if (unlikely(!PyDict_Check(value))) { + PyErr_SetString(PyExc_TypeError, + "setting function's dictionary to a non-dict"); + return -1; + } + tmp = op->func_dict; + Py_INCREF(value); + op->func_dict = value; + Py_XDECREF(tmp); + return 0; +} +static PyObject * +__Pyx_CyFunction_get_globals(__pyx_CyFunctionObject *op) +{ + Py_INCREF(op->func_globals); + return op->func_globals; +} +static PyObject * +__Pyx_CyFunction_get_closure(CYTHON_UNUSED __pyx_CyFunctionObject *op) +{ + Py_INCREF(Py_None); + return Py_None; +} +static PyObject * +__Pyx_CyFunction_get_code(__pyx_CyFunctionObject *op) +{ + PyObject* result = (op->func_code) ? 
op->func_code : Py_None; + Py_INCREF(result); + return result; +} +static int +__Pyx_CyFunction_init_defaults(__pyx_CyFunctionObject *op) { + int result = 0; + PyObject *res = op->defaults_getter((PyObject *) op); + if (unlikely(!res)) + return -1; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + op->defaults_tuple = PyTuple_GET_ITEM(res, 0); + Py_INCREF(op->defaults_tuple); + op->defaults_kwdict = PyTuple_GET_ITEM(res, 1); + Py_INCREF(op->defaults_kwdict); + #else + op->defaults_tuple = PySequence_ITEM(res, 0); + if (unlikely(!op->defaults_tuple)) result = -1; + else { + op->defaults_kwdict = PySequence_ITEM(res, 1); + if (unlikely(!op->defaults_kwdict)) result = -1; + } + #endif + Py_DECREF(res); + return result; +} +static int +__Pyx_CyFunction_set_defaults(__pyx_CyFunctionObject *op, PyObject* value) { + PyObject* tmp; + if (!value) { + value = Py_None; + } else if (value != Py_None && !PyTuple_Check(value)) { + PyErr_SetString(PyExc_TypeError, + "__defaults__ must be set to a tuple object"); + return -1; + } + Py_INCREF(value); + tmp = op->defaults_tuple; + op->defaults_tuple = value; + Py_XDECREF(tmp); + return 0; +} +static PyObject * +__Pyx_CyFunction_get_defaults(__pyx_CyFunctionObject *op) { + PyObject* result = op->defaults_tuple; + if (unlikely(!result)) { + if (op->defaults_getter) { + if (__Pyx_CyFunction_init_defaults(op) < 0) return NULL; + result = op->defaults_tuple; + } else { + result = Py_None; + } + } + Py_INCREF(result); + return result; +} +static int +__Pyx_CyFunction_set_kwdefaults(__pyx_CyFunctionObject *op, PyObject* value) { + PyObject* tmp; + if (!value) { + value = Py_None; + } else if (value != Py_None && !PyDict_Check(value)) { + PyErr_SetString(PyExc_TypeError, + "__kwdefaults__ must be set to a dict object"); + return -1; + } + Py_INCREF(value); + tmp = op->defaults_kwdict; + op->defaults_kwdict = value; + Py_XDECREF(tmp); + return 0; +} +static PyObject * +__Pyx_CyFunction_get_kwdefaults(__pyx_CyFunctionObject 
*op) { + PyObject* result = op->defaults_kwdict; + if (unlikely(!result)) { + if (op->defaults_getter) { + if (__Pyx_CyFunction_init_defaults(op) < 0) return NULL; + result = op->defaults_kwdict; + } else { + result = Py_None; + } + } + Py_INCREF(result); + return result; +} +static int +__Pyx_CyFunction_set_annotations(__pyx_CyFunctionObject *op, PyObject* value) { + PyObject* tmp; + if (!value || value == Py_None) { + value = NULL; + } else if (!PyDict_Check(value)) { + PyErr_SetString(PyExc_TypeError, + "__annotations__ must be set to a dict object"); + return -1; + } + Py_XINCREF(value); + tmp = op->func_annotations; + op->func_annotations = value; + Py_XDECREF(tmp); + return 0; +} +static PyObject * +__Pyx_CyFunction_get_annotations(__pyx_CyFunctionObject *op) { + PyObject* result = op->func_annotations; + if (unlikely(!result)) { + result = PyDict_New(); + if (unlikely(!result)) return NULL; + op->func_annotations = result; + } + Py_INCREF(result); + return result; +} +static PyGetSetDef __pyx_CyFunction_getsets[] = { + {(char *) "func_doc", (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0}, + {(char *) "__doc__", (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0}, + {(char *) "func_name", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0}, + {(char *) "__name__", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0}, + {(char *) "__qualname__", (getter)__Pyx_CyFunction_get_qualname, (setter)__Pyx_CyFunction_set_qualname, 0, 0}, + {(char *) "__self__", (getter)__Pyx_CyFunction_get_self, 0, 0, 0}, + {(char *) "func_dict", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0}, + {(char *) "__dict__", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0}, + {(char *) "func_globals", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0}, + {(char *) "__globals__", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0}, + {(char *) 
"func_closure", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0}, + {(char *) "__closure__", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0}, + {(char *) "func_code", (getter)__Pyx_CyFunction_get_code, 0, 0, 0}, + {(char *) "__code__", (getter)__Pyx_CyFunction_get_code, 0, 0, 0}, + {(char *) "func_defaults", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0}, + {(char *) "__defaults__", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0}, + {(char *) "__kwdefaults__", (getter)__Pyx_CyFunction_get_kwdefaults, (setter)__Pyx_CyFunction_set_kwdefaults, 0, 0}, + {(char *) "__annotations__", (getter)__Pyx_CyFunction_get_annotations, (setter)__Pyx_CyFunction_set_annotations, 0, 0}, + {0, 0, 0, 0, 0} +}; +static PyMemberDef __pyx_CyFunction_members[] = { + {(char *) "__module__", T_OBJECT, offsetof(PyCFunctionObject, m_module), PY_WRITE_RESTRICTED, 0}, + {0, 0, 0, 0, 0} +}; +static PyObject * +__Pyx_CyFunction_reduce(__pyx_CyFunctionObject *m, CYTHON_UNUSED PyObject *args) +{ +#if PY_MAJOR_VERSION >= 3 + return PyUnicode_FromString(m->func.m_ml->ml_name); +#else + return PyString_FromString(m->func.m_ml->ml_name); +#endif +} +static PyMethodDef __pyx_CyFunction_methods[] = { + {"__reduce__", (PyCFunction)__Pyx_CyFunction_reduce, METH_VARARGS, 0}, + {0, 0, 0, 0} +}; +#if PY_VERSION_HEX < 0x030500A0 +#define __Pyx_CyFunction_weakreflist(cyfunc) ((cyfunc)->func_weakreflist) +#else +#define __Pyx_CyFunction_weakreflist(cyfunc) ((cyfunc)->func.m_weakreflist) +#endif +static PyObject *__Pyx_CyFunction_New(PyTypeObject *type, PyMethodDef *ml, int flags, PyObject* qualname, + PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) { + __pyx_CyFunctionObject *op = PyObject_GC_New(__pyx_CyFunctionObject, type); + if (op == NULL) + return NULL; + op->flags = flags; + __Pyx_CyFunction_weakreflist(op) = NULL; + op->func.m_ml = ml; + op->func.m_self = (PyObject *) op; + Py_XINCREF(closure); + op->func_closure 
= closure; + Py_XINCREF(module); + op->func.m_module = module; + op->func_dict = NULL; + op->func_name = NULL; + Py_INCREF(qualname); + op->func_qualname = qualname; + op->func_doc = NULL; + op->func_classobj = NULL; + op->func_globals = globals; + Py_INCREF(op->func_globals); + Py_XINCREF(code); + op->func_code = code; + op->defaults_pyobjects = 0; + op->defaults = NULL; + op->defaults_tuple = NULL; + op->defaults_kwdict = NULL; + op->defaults_getter = NULL; + op->func_annotations = NULL; + PyObject_GC_Track(op); + return (PyObject *) op; +} +static int +__Pyx_CyFunction_clear(__pyx_CyFunctionObject *m) +{ + Py_CLEAR(m->func_closure); + Py_CLEAR(m->func.m_module); + Py_CLEAR(m->func_dict); + Py_CLEAR(m->func_name); + Py_CLEAR(m->func_qualname); + Py_CLEAR(m->func_doc); + Py_CLEAR(m->func_globals); + Py_CLEAR(m->func_code); + Py_CLEAR(m->func_classobj); + Py_CLEAR(m->defaults_tuple); + Py_CLEAR(m->defaults_kwdict); + Py_CLEAR(m->func_annotations); + if (m->defaults) { + PyObject **pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m); + int i; + for (i = 0; i < m->defaults_pyobjects; i++) + Py_XDECREF(pydefaults[i]); + PyObject_Free(m->defaults); + m->defaults = NULL; + } + return 0; +} +static void __Pyx__CyFunction_dealloc(__pyx_CyFunctionObject *m) +{ + if (__Pyx_CyFunction_weakreflist(m) != NULL) + PyObject_ClearWeakRefs((PyObject *) m); + __Pyx_CyFunction_clear(m); + PyObject_GC_Del(m); +} +static void __Pyx_CyFunction_dealloc(__pyx_CyFunctionObject *m) +{ + PyObject_GC_UnTrack(m); + __Pyx__CyFunction_dealloc(m); +} +static int __Pyx_CyFunction_traverse(__pyx_CyFunctionObject *m, visitproc visit, void *arg) +{ + Py_VISIT(m->func_closure); + Py_VISIT(m->func.m_module); + Py_VISIT(m->func_dict); + Py_VISIT(m->func_name); + Py_VISIT(m->func_qualname); + Py_VISIT(m->func_doc); + Py_VISIT(m->func_globals); + Py_VISIT(m->func_code); + Py_VISIT(m->func_classobj); + Py_VISIT(m->defaults_tuple); + Py_VISIT(m->defaults_kwdict); + if (m->defaults) { + PyObject 
**pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m); + int i; + for (i = 0; i < m->defaults_pyobjects; i++) + Py_VISIT(pydefaults[i]); + } + return 0; +} +static PyObject *__Pyx_CyFunction_descr_get(PyObject *func, PyObject *obj, PyObject *type) +{ + __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func; + if (m->flags & __Pyx_CYFUNCTION_STATICMETHOD) { + Py_INCREF(func); + return func; + } + if (m->flags & __Pyx_CYFUNCTION_CLASSMETHOD) { + if (type == NULL) + type = (PyObject *)(Py_TYPE(obj)); + return __Pyx_PyMethod_New(func, type, (PyObject *)(Py_TYPE(type))); + } + if (obj == Py_None) + obj = NULL; + return __Pyx_PyMethod_New(func, obj, type); +} +static PyObject* +__Pyx_CyFunction_repr(__pyx_CyFunctionObject *op) +{ +#if PY_MAJOR_VERSION >= 3 + return PyUnicode_FromFormat("", + op->func_qualname, (void *)op); +#else + return PyString_FromFormat("", + PyString_AsString(op->func_qualname), (void *)op); +#endif +} +static PyObject * __Pyx_CyFunction_CallMethod(PyObject *func, PyObject *self, PyObject *arg, PyObject *kw) { + PyCFunctionObject* f = (PyCFunctionObject*)func; + PyCFunction meth = f->m_ml->ml_meth; + Py_ssize_t size; + switch (f->m_ml->ml_flags & (METH_VARARGS | METH_KEYWORDS | METH_NOARGS | METH_O)) { + case METH_VARARGS: + if (likely(kw == NULL || PyDict_Size(kw) == 0)) + return (*meth)(self, arg); + break; + case METH_VARARGS | METH_KEYWORDS: + return (*(PyCFunctionWithKeywords)meth)(self, arg, kw); + case METH_NOARGS: + if (likely(kw == NULL || PyDict_Size(kw) == 0)) { + size = PyTuple_GET_SIZE(arg); + if (likely(size == 0)) + return (*meth)(self, NULL); + PyErr_Format(PyExc_TypeError, + "%.200s() takes no arguments (%" CYTHON_FORMAT_SSIZE_T "d given)", + f->m_ml->ml_name, size); + return NULL; + } + break; + case METH_O: + if (likely(kw == NULL || PyDict_Size(kw) == 0)) { + size = PyTuple_GET_SIZE(arg); + if (likely(size == 1)) { + PyObject *result, *arg0; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + arg0 = 
PyTuple_GET_ITEM(arg, 0); + #else + arg0 = PySequence_ITEM(arg, 0); if (unlikely(!arg0)) return NULL; + #endif + result = (*meth)(self, arg0); + #if !(CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS) + Py_DECREF(arg0); + #endif + return result; + } + PyErr_Format(PyExc_TypeError, + "%.200s() takes exactly one argument (%" CYTHON_FORMAT_SSIZE_T "d given)", + f->m_ml->ml_name, size); + return NULL; + } + break; + default: + PyErr_SetString(PyExc_SystemError, "Bad call flags in " + "__Pyx_CyFunction_Call. METH_OLDARGS is no " + "longer supported!"); + return NULL; + } + PyErr_Format(PyExc_TypeError, "%.200s() takes no keyword arguments", + f->m_ml->ml_name); + return NULL; +} +static CYTHON_INLINE PyObject *__Pyx_CyFunction_Call(PyObject *func, PyObject *arg, PyObject *kw) { + return __Pyx_CyFunction_CallMethod(func, ((PyCFunctionObject*)func)->m_self, arg, kw); +} +static PyObject *__Pyx_CyFunction_CallAsMethod(PyObject *func, PyObject *args, PyObject *kw) { + PyObject *result; + __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func; + if ((cyfunc->flags & __Pyx_CYFUNCTION_CCLASS) && !(cyfunc->flags & __Pyx_CYFUNCTION_STATICMETHOD)) { + Py_ssize_t argc; + PyObject *new_args; + PyObject *self; + argc = PyTuple_GET_SIZE(args); + new_args = PyTuple_GetSlice(args, 1, argc); + if (unlikely(!new_args)) + return NULL; + self = PyTuple_GetItem(args, 0); + if (unlikely(!self)) { + Py_DECREF(new_args); + return NULL; + } + result = __Pyx_CyFunction_CallMethod(func, self, new_args, kw); + Py_DECREF(new_args); + } else { + result = __Pyx_CyFunction_Call(func, args, kw); + } + return result; +} +static PyTypeObject __pyx_CyFunctionType_type = { + PyVarObject_HEAD_INIT(0, 0) + "cython_function_or_method", + sizeof(__pyx_CyFunctionObject), + 0, + (destructor) __Pyx_CyFunction_dealloc, + 0, + 0, + 0, +#if PY_MAJOR_VERSION < 3 + 0, +#else + 0, +#endif + (reprfunc) __Pyx_CyFunction_repr, + 0, + 0, + 0, + 0, + __Pyx_CyFunction_CallAsMethod, + 0, + 0, + 0, + 0, + 
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, + 0, + (traverseproc) __Pyx_CyFunction_traverse, + (inquiry) __Pyx_CyFunction_clear, + 0, +#if PY_VERSION_HEX < 0x030500A0 + offsetof(__pyx_CyFunctionObject, func_weakreflist), +#else + offsetof(PyCFunctionObject, m_weakreflist), +#endif + 0, + 0, + __pyx_CyFunction_methods, + __pyx_CyFunction_members, + __pyx_CyFunction_getsets, + 0, + 0, + __Pyx_CyFunction_descr_get, + 0, + offsetof(__pyx_CyFunctionObject, func_dict), + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +#if PY_VERSION_HEX >= 0x030400a1 + 0, +#endif +}; +static int __pyx_CyFunction_init(void) { + __pyx_CyFunctionType = __Pyx_FetchCommonType(&__pyx_CyFunctionType_type); + if (unlikely(__pyx_CyFunctionType == NULL)) { + return -1; + } + return 0; +} +static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *func, size_t size, int pyobjects) { + __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func; + m->defaults = PyObject_Malloc(size); + if (unlikely(!m->defaults)) + return PyErr_NoMemory(); + memset(m->defaults, 0, size); + m->defaults_pyobjects = pyobjects; + return m->defaults; +} +static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *func, PyObject *tuple) { + __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func; + m->defaults_tuple = tuple; + Py_INCREF(tuple); +} +static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *func, PyObject *dict) { + __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func; + m->defaults_kwdict = dict; + Py_INCREF(dict); +} +static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *func, PyObject *dict) { + __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func; + m->func_annotations = dict; + Py_INCREF(dict); +} + +/* BufferFallbackError */ + static void __Pyx_RaiseBufferFallbackError(void) { + PyErr_SetString(PyExc_ValueError, + "Buffer acquisition failed on assignment; and then reacquiring the old buffer failed too!"); +} + +/* None */ + 
static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t a, Py_ssize_t b) { + Py_ssize_t q = a / b; + Py_ssize_t r = a - q*b; + q -= ((r != 0) & ((r ^ b) < 0)); + return q; +} + +/* BufferIndexError */ + static void __Pyx_RaiseBufferIndexError(int axis) { + PyErr_Format(PyExc_IndexError, + "Out of bounds on buffer access (axis %d)", axis); +} + +/* RaiseTooManyValuesToUnpack */ + static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) { + PyErr_Format(PyExc_ValueError, + "too many values to unpack (expected %" CYTHON_FORMAT_SSIZE_T "d)", expected); +} + +/* RaiseNeedMoreValuesToUnpack */ + static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index) { + PyErr_Format(PyExc_ValueError, + "need more than %" CYTHON_FORMAT_SSIZE_T "d value%.1s to unpack", + index, (index == 1) ? "" : "s"); +} + +/* RaiseNoneIterError */ + static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); +} + +/* SaveResetException */ + #if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + #if PY_VERSION_HEX >= 0x030700A3 + *type = tstate->exc_state.exc_type; + *value = tstate->exc_state.exc_value; + *tb = tstate->exc_state.exc_traceback; + #else + *type = tstate->exc_type; + *value = tstate->exc_value; + *tb = tstate->exc_traceback; + #endif + Py_XINCREF(*type); + Py_XINCREF(*value); + Py_XINCREF(*tb); +} +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + #if PY_VERSION_HEX >= 0x030700A3 + tmp_type = tstate->exc_state.exc_type; + tmp_value = tstate->exc_state.exc_value; + tmp_tb = tstate->exc_state.exc_traceback; + tstate->exc_state.exc_type = type; + tstate->exc_state.exc_value = value; + tstate->exc_state.exc_traceback = tb; + #else + tmp_type = 
tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = type; + tstate->exc_value = value; + tstate->exc_traceback = tb; + #endif + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +#endif + +/* PyErrExceptionMatches */ + #if CYTHON_FAST_THREAD_STATE +static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(tuple); +#if PY_MAJOR_VERSION >= 3 + for (i=0; icurexc_type; + if (exc_type == err) return 1; + if (unlikely(!exc_type)) return 0; + if (unlikely(PyTuple_Check(err))) + return __Pyx_PyErr_ExceptionMatchesTuple(exc_type, err); + return __Pyx_PyErr_GivenExceptionMatches(exc_type, err); +} +#endif + +/* GetException */ + #if CYTHON_FAST_THREAD_STATE +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb) { +#endif + PyObject *local_type, *local_value, *local_tb; +#if CYTHON_FAST_THREAD_STATE + PyObject *tmp_type, *tmp_value, *tmp_tb; + local_type = tstate->curexc_type; + local_value = tstate->curexc_value; + local_tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +#else + PyErr_Fetch(&local_type, &local_value, &local_tb); +#endif + PyErr_NormalizeException(&local_type, &local_value, &local_tb); +#if CYTHON_FAST_THREAD_STATE + if (unlikely(tstate->curexc_type)) +#else + if (unlikely(PyErr_Occurred())) +#endif + goto bad; + #if PY_MAJOR_VERSION >= 3 + if (local_tb) { + if (unlikely(PyException_SetTraceback(local_value, local_tb) < 0)) + goto bad; + } + #endif + Py_XINCREF(local_tb); + Py_XINCREF(local_type); + Py_XINCREF(local_value); + *type = local_type; + *value = local_value; + *tb = local_tb; +#if CYTHON_FAST_THREAD_STATE + #if PY_VERSION_HEX >= 0x030700A3 + tmp_type = tstate->exc_state.exc_type; + tmp_value = 
tstate->exc_state.exc_value; + tmp_tb = tstate->exc_state.exc_traceback; + tstate->exc_state.exc_type = local_type; + tstate->exc_state.exc_value = local_value; + tstate->exc_state.exc_traceback = local_tb; + #else + tmp_type = tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = local_type; + tstate->exc_value = local_value; + tstate->exc_traceback = local_tb; + #endif + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +#else + PyErr_SetExcInfo(local_type, local_value, local_tb); +#endif + return 0; +bad: + *type = 0; + *value = 0; + *tb = 0; + Py_XDECREF(local_type); + Py_XDECREF(local_value); + Py_XDECREF(local_tb); + return -1; +} + +/* PyObject_GenericGetAttrNoDict */ + #if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static PyObject *__Pyx_RaiseGenericGetAttributeError(PyTypeObject *tp, PyObject *attr_name) { + PyErr_Format(PyExc_AttributeError, +#if PY_MAJOR_VERSION >= 3 + "'%.50s' object has no attribute '%U'", + tp->tp_name, attr_name); +#else + "'%.50s' object has no attribute '%.400s'", + tp->tp_name, PyString_AS_STRING(attr_name)); +#endif + return NULL; +} +static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name) { + PyObject *descr; + PyTypeObject *tp = Py_TYPE(obj); + if (unlikely(!PyString_Check(attr_name))) { + return PyObject_GenericGetAttr(obj, attr_name); + } + assert(!tp->tp_dictoffset); + descr = _PyType_Lookup(tp, attr_name); + if (unlikely(!descr)) { + return __Pyx_RaiseGenericGetAttributeError(tp, attr_name); + } + Py_INCREF(descr); + #if PY_MAJOR_VERSION < 3 + if (likely(PyType_HasFeature(Py_TYPE(descr), Py_TPFLAGS_HAVE_CLASS))) + #endif + { + descrgetfunc f = Py_TYPE(descr)->tp_descr_get; + if (unlikely(f)) { + PyObject *res = f(descr, obj, (PyObject *)tp); + Py_DECREF(descr); + return res; + } + } + return descr; +} +#endif + +/* PyObject_GenericGetAttr */ + #if CYTHON_USE_TYPE_SLOTS 
&& CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name) { + if (unlikely(Py_TYPE(obj)->tp_dictoffset)) { + return PyObject_GenericGetAttr(obj, attr_name); + } + return __Pyx_PyObject_GenericGetAttrNoDict(obj, attr_name); +} +#endif + +/* SetupReduce */ + static int __Pyx_setup_reduce_is_named(PyObject* meth, PyObject* name) { + int ret; + PyObject *name_attr; + name_attr = __Pyx_PyObject_GetAttrStr(meth, __pyx_n_s_name); + if (likely(name_attr)) { + ret = PyObject_RichCompareBool(name_attr, name, Py_EQ); + } else { + ret = -1; + } + if (unlikely(ret < 0)) { + PyErr_Clear(); + ret = 0; + } + Py_XDECREF(name_attr); + return ret; +} +static int __Pyx_setup_reduce(PyObject* type_obj) { + int ret = 0; + PyObject *object_reduce = NULL; + PyObject *object_reduce_ex = NULL; + PyObject *reduce = NULL; + PyObject *reduce_ex = NULL; + PyObject *reduce_cython = NULL; + PyObject *setstate = NULL; + PyObject *setstate_cython = NULL; +#if CYTHON_USE_PYTYPE_LOOKUP + if (_PyType_Lookup((PyTypeObject*)type_obj, __pyx_n_s_getstate)) goto GOOD; +#else + if (PyObject_HasAttr(type_obj, __pyx_n_s_getstate)) goto GOOD; +#endif +#if CYTHON_USE_PYTYPE_LOOKUP + object_reduce_ex = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_reduce_ex); if (!object_reduce_ex) goto BAD; +#else + object_reduce_ex = __Pyx_PyObject_GetAttrStr((PyObject*)&PyBaseObject_Type, __pyx_n_s_reduce_ex); if (!object_reduce_ex) goto BAD; +#endif + reduce_ex = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_reduce_ex); if (unlikely(!reduce_ex)) goto BAD; + if (reduce_ex == object_reduce_ex) { +#if CYTHON_USE_PYTYPE_LOOKUP + object_reduce = _PyType_Lookup(&PyBaseObject_Type, __pyx_n_s_reduce); if (!object_reduce) goto BAD; +#else + object_reduce = __Pyx_PyObject_GetAttrStr((PyObject*)&PyBaseObject_Type, __pyx_n_s_reduce); if (!object_reduce) goto BAD; +#endif + reduce = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_reduce); if 
(unlikely(!reduce)) goto BAD; + if (reduce == object_reduce || __Pyx_setup_reduce_is_named(reduce, __pyx_n_s_reduce_cython)) { + reduce_cython = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_reduce_cython); if (unlikely(!reduce_cython)) goto BAD; + ret = PyDict_SetItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_reduce, reduce_cython); if (unlikely(ret < 0)) goto BAD; + ret = PyDict_DelItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_reduce_cython); if (unlikely(ret < 0)) goto BAD; + setstate = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_setstate); + if (!setstate) PyErr_Clear(); + if (!setstate || __Pyx_setup_reduce_is_named(setstate, __pyx_n_s_setstate_cython)) { + setstate_cython = __Pyx_PyObject_GetAttrStr(type_obj, __pyx_n_s_setstate_cython); if (unlikely(!setstate_cython)) goto BAD; + ret = PyDict_SetItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_setstate, setstate_cython); if (unlikely(ret < 0)) goto BAD; + ret = PyDict_DelItem(((PyTypeObject*)type_obj)->tp_dict, __pyx_n_s_setstate_cython); if (unlikely(ret < 0)) goto BAD; + } + PyType_Modified((PyTypeObject*)type_obj); + } + } + goto GOOD; +BAD: + if (!PyErr_Occurred()) + PyErr_Format(PyExc_RuntimeError, "Unable to initialize pickling for %s", ((PyTypeObject*)type_obj)->tp_name); + ret = -1; +GOOD: +#if !CYTHON_USE_PYTYPE_LOOKUP + Py_XDECREF(object_reduce); + Py_XDECREF(object_reduce_ex); +#endif + Py_XDECREF(reduce); + Py_XDECREF(reduce_ex); + Py_XDECREF(reduce_cython); + Py_XDECREF(setstate); + Py_XDECREF(setstate_cython); + return ret; +} + +/* Import */ + static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { + PyObject *empty_list = 0; + PyObject *module = 0; + PyObject *global_dict = 0; + PyObject *empty_dict = 0; + PyObject *list; + #if PY_MAJOR_VERSION < 3 + PyObject *py_import; + py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import); + if (!py_import) + goto bad; + #endif + if (from_list) + list = from_list; + else { + empty_list = PyList_New(0); + if 
(!empty_list) + goto bad; + list = empty_list; + } + global_dict = PyModule_GetDict(__pyx_m); + if (!global_dict) + goto bad; + empty_dict = PyDict_New(); + if (!empty_dict) + goto bad; + { + #if PY_MAJOR_VERSION >= 3 + if (level == -1) { + if (strchr(__Pyx_MODULE_NAME, '.')) { + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, 1); + if (!module) { + if (!PyErr_ExceptionMatches(PyExc_ImportError)) + goto bad; + PyErr_Clear(); + } + } + level = 0; + } + #endif + if (!module) { + #if PY_MAJOR_VERSION < 3 + PyObject *py_level = PyInt_FromLong(level); + if (!py_level) + goto bad; + module = PyObject_CallFunctionObjArgs(py_import, + name, global_dict, empty_dict, list, py_level, NULL); + Py_DECREF(py_level); + #else + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, level); + #endif + } + } +bad: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(py_import); + #endif + Py_XDECREF(empty_list); + Py_XDECREF(empty_dict); + return module; +} + +/* CLineInTraceback */ + #ifndef CYTHON_CLINE_IN_TRACEBACK +static int __Pyx_CLineForTraceback(CYTHON_UNUSED PyThreadState *tstate, int c_line) { + PyObject *use_cline; + PyObject *ptype, *pvalue, *ptraceback; +#if CYTHON_COMPILING_IN_CPYTHON + PyObject **cython_runtime_dict; +#endif + if (unlikely(!__pyx_cython_runtime)) { + return c_line; + } + __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback); +#if CYTHON_COMPILING_IN_CPYTHON + cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime); + if (likely(cython_runtime_dict)) { + use_cline = __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback); + } else +#endif + { + PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback); + if (use_cline_obj) { + use_cline = PyObject_Not(use_cline_obj) ? 
Py_False : Py_True; + Py_DECREF(use_cline_obj); + } else { + PyErr_Clear(); + use_cline = NULL; + } + } + if (!use_cline) { + c_line = 0; + PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False); + } + else if (PyObject_Not(use_cline) != 0) { + c_line = 0; + } + __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback); + return c_line; +} +#endif + +/* CodeObjectCache */ + static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + int start = 0, mid = 0, end = count - 1; + if (end >= 0 && code_line > entries[end].code_line) { + return count; + } + while (start < end) { + mid = start + (end - start) / 2; + if (code_line < entries[mid].code_line) { + end = mid; + } else if (code_line > entries[mid].code_line) { + start = mid + 1; + } else { + return mid; + } + } + if (code_line <= entries[mid].code_line) { + return mid; + } else { + return mid + 1; + } +} +static PyCodeObject *__pyx_find_code_object(int code_line) { + PyCodeObject* code_object; + int pos; + if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) { + return NULL; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) { + return NULL; + } + code_object = __pyx_code_cache.entries[pos].code_object; + Py_INCREF(code_object); + return code_object; +} +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { + int pos, i; + __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries; + if (unlikely(!code_line)) { + return; + } + if (unlikely(!entries)) { + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry)); + if (likely(entries)) { + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = 64; + __pyx_code_cache.count = 1; + entries[0].code_line = code_line; + entries[0].code_object = code_object; 
+ Py_INCREF(code_object); + } + return; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) { + PyCodeObject* tmp = entries[pos].code_object; + entries[pos].code_object = code_object; + Py_DECREF(tmp); + return; + } + if (__pyx_code_cache.count == __pyx_code_cache.max_count) { + int new_max = __pyx_code_cache.max_count + 64; + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( + __pyx_code_cache.entries, (size_t)new_max*sizeof(__Pyx_CodeObjectCacheEntry)); + if (unlikely(!entries)) { + return; + } + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = new_max; + } + for (i=__pyx_code_cache.count; i>pos; i--) { + entries[i] = entries[i-1]; + } + entries[pos].code_line = code_line; + entries[pos].code_object = code_object; + __pyx_code_cache.count++; + Py_INCREF(code_object); +} + +/* AddTraceback */ + #include "compile.h" +#include "frameobject.h" +#include "traceback.h" +static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( + const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_srcfile = 0; + PyObject *py_funcname = 0; + #if PY_MAJOR_VERSION < 3 + py_srcfile = PyString_FromString(filename); + #else + py_srcfile = PyUnicode_FromString(filename); + #endif + if (!py_srcfile) goto bad; + if (c_line) { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #else + py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #endif + } + else { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromString(funcname); + #else + py_funcname = PyUnicode_FromString(funcname); + #endif + } + if (!py_funcname) goto bad; + py_code = __Pyx_PyCode_New( + 0, + 0, + 0, + 0, + 0, + __pyx_empty_bytes, /*PyObject *code,*/ + __pyx_empty_tuple, /*PyObject *consts,*/ + 
__pyx_empty_tuple, /*PyObject *names,*/ + __pyx_empty_tuple, /*PyObject *varnames,*/ + __pyx_empty_tuple, /*PyObject *freevars,*/ + __pyx_empty_tuple, /*PyObject *cellvars,*/ + py_srcfile, /*PyObject *filename,*/ + py_funcname, /*PyObject *name,*/ + py_line, + __pyx_empty_bytes /*PyObject *lnotab*/ + ); + Py_DECREF(py_srcfile); + Py_DECREF(py_funcname); + return py_code; +bad: + Py_XDECREF(py_srcfile); + Py_XDECREF(py_funcname); + return NULL; +} +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyFrameObject *py_frame = 0; + PyThreadState *tstate = __Pyx_PyThreadState_Current; + if (c_line) { + c_line = __Pyx_CLineForTraceback(tstate, c_line); + } + py_code = __pyx_find_code_object(c_line ? -c_line : py_line); + if (!py_code) { + py_code = __Pyx_CreateCodeObjectForTraceback( + funcname, c_line, py_line, filename); + if (!py_code) goto bad; + __pyx_insert_code_object(c_line ? -c_line : py_line, py_code); + } + py_frame = PyFrame_New( + tstate, /*PyThreadState *tstate,*/ + py_code, /*PyCodeObject *code,*/ + __pyx_d, /*PyObject *globals,*/ + 0 /*PyObject *locals*/ + ); + if (!py_frame) goto bad; + __Pyx_PyFrame_SetLineNumber(py_frame, py_line); + PyTraceBack_Here(py_frame); +bad: + Py_XDECREF(py_code); + Py_XDECREF(py_frame); +} + +#if PY_MAJOR_VERSION < 3 +static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) { + if (PyObject_CheckBuffer(obj)) return PyObject_GetBuffer(obj, view, flags); + if (__Pyx_TypeCheck(obj, __pyx_ptype_5numpy_ndarray)) return __pyx_pw_5numpy_7ndarray_1__getbuffer__(obj, view, flags); + PyErr_Format(PyExc_TypeError, "'%.200s' does not have the buffer interface", Py_TYPE(obj)->tp_name); + return -1; +} +static void __Pyx_ReleaseBuffer(Py_buffer *view) { + PyObject *obj = view->obj; + if (!obj) return; + if (PyObject_CheckBuffer(obj)) { + PyBuffer_Release(view); + return; + } + if ((0)) {} + else if (__Pyx_TypeCheck(obj, 
__pyx_ptype_5numpy_ndarray)) __pyx_pw_5numpy_7ndarray_3__releasebuffer__(obj, view); + view->obj = NULL; + Py_DECREF(obj); +} +#endif + + + /* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { + const long neg_one = (long) -1, const_zero = (long) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +/* CIntFromPyVerify */ + #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0) +#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1) +#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\ + {\ + func_type value = func_value;\ + if (sizeof(target_type) < sizeof(func_type)) {\ + if (unlikely(value != (func_type) (target_type) value)) {\ + func_type zero = 0;\ + if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\ + return (target_type) -1;\ + if (is_unsigned && unlikely(value < zero))\ + goto raise_neg_overflow;\ + else\ + goto raise_overflow;\ + }\ + }\ + return (target_type) value;\ + } + +/* CIntToPy */ + static CYTHON_INLINE PyObject* 
__Pyx_PyInt_From_siz(siz value) { + const siz neg_one = (siz) -1, const_zero = (siz) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(siz) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(siz) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(siz) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(siz) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(siz) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(siz), + little, !is_unsigned); + } +} + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_Py_intptr_t(Py_intptr_t value) { + const Py_intptr_t neg_one = (Py_intptr_t) -1, const_zero = (Py_intptr_t) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(Py_intptr_t) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(Py_intptr_t) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(Py_intptr_t) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(Py_intptr_t) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(Py_intptr_t) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(Py_intptr_t), + little, !is_unsigned); + } 
+} + +/* Declarations */ + #if CYTHON_CCOMPLEX + #ifdef __cplusplus + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + return ::std::complex< float >(x, y); + } + #else + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + return x + y*(__pyx_t_float_complex)_Complex_I; + } + #endif +#else + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + __pyx_t_float_complex z; + z.real = x; + z.imag = y; + return z; + } +#endif + +/* Arithmetic */ + #if CYTHON_CCOMPLEX +#else + static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + return (a.real == b.real) && (a.imag == b.imag); + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real + b.real; + z.imag = a.imag + b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real - b.real; + z.imag = a.imag - b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real * b.real - a.imag * b.imag; + z.imag = a.real * b.imag + a.imag * b.real; + return z; + } + #if 1 + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + if (b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real); + } else if (fabsf(b.real) >= fabsf(b.imag)) { + if (b.real == 0 && b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.imag); + } else { + float r = b.imag / b.real; + float s = 1.0 / (b.real + b.imag * r); + return __pyx_t_float_complex_from_parts( + (a.real + a.imag * r) * s, (a.imag - a.real * r) 
* s); + } + } else { + float r = b.real / b.imag; + float s = 1.0 / (b.imag + b.real * r); + return __pyx_t_float_complex_from_parts( + (a.real * r + a.imag) * s, (a.imag * r - a.real) * s); + } + } + #else + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + if (b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real); + } else { + float denom = b.real * b.real + b.imag * b.imag; + return __pyx_t_float_complex_from_parts( + (a.real * b.real + a.imag * b.imag) / denom, + (a.imag * b.real - a.real * b.imag) / denom); + } + } + #endif + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex a) { + __pyx_t_float_complex z; + z.real = -a.real; + z.imag = -a.imag; + return z; + } + static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex a) { + return (a.real == 0) && (a.imag == 0); + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex a) { + __pyx_t_float_complex z; + z.real = a.real; + z.imag = -a.imag; + return z; + } + #if 1 + static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex z) { + #if !defined(HAVE_HYPOT) || defined(_MSC_VER) + return sqrtf(z.real*z.real + z.imag*z.imag); + #else + return hypotf(z.real, z.imag); + #endif + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + float r, lnr, theta, z_r, z_theta; + if (b.imag == 0 && b.real == (int)b.real) { + if (b.real < 0) { + float denom = a.real * a.real + a.imag * a.imag; + a.real = a.real / denom; + a.imag = -a.imag / denom; + b.real = -b.real; + } + switch ((int)b.real) { + case 0: + z.real = 1; + z.imag = 0; + return z; + case 1: + return a; + case 2: + z = __Pyx_c_prod_float(a, a); + return __Pyx_c_prod_float(a, a); + case 3: + z = __Pyx_c_prod_float(a, a); + return __Pyx_c_prod_float(z, a); + case 4: + z = __Pyx_c_prod_float(a, 
a); + return __Pyx_c_prod_float(z, z); + } + } + if (a.imag == 0) { + if (a.real == 0) { + return a; + } else if (b.imag == 0) { + z.real = powf(a.real, b.real); + z.imag = 0; + return z; + } else if (a.real > 0) { + r = a.real; + theta = 0; + } else { + r = -a.real; + theta = atan2f(0, -1); + } + } else { + r = __Pyx_c_abs_float(a); + theta = atan2f(a.imag, a.real); + } + lnr = logf(r); + z_r = expf(lnr * b.real - theta * b.imag); + z_theta = theta * b.real + lnr * b.imag; + z.real = z_r * cosf(z_theta); + z.imag = z_r * sinf(z_theta); + return z; + } + #endif +#endif + +/* Declarations */ + #if CYTHON_CCOMPLEX + #ifdef __cplusplus + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + return ::std::complex< double >(x, y); + } + #else + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + return x + y*(__pyx_t_double_complex)_Complex_I; + } + #endif +#else + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + __pyx_t_double_complex z; + z.real = x; + z.imag = y; + return z; + } +#endif + +/* Arithmetic */ + #if CYTHON_CCOMPLEX +#else + static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + return (a.real == b.real) && (a.imag == b.imag); + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real + b.real; + z.imag = a.imag + b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real - b.real; + z.imag = a.imag - b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real * b.real - a.imag * b.imag; + z.imag = 
a.real * b.imag + a.imag * b.real; + return z; + } + #if 1 + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + if (b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real); + } else if (fabs(b.real) >= fabs(b.imag)) { + if (b.real == 0 && b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.imag); + } else { + double r = b.imag / b.real; + double s = 1.0 / (b.real + b.imag * r); + return __pyx_t_double_complex_from_parts( + (a.real + a.imag * r) * s, (a.imag - a.real * r) * s); + } + } else { + double r = b.real / b.imag; + double s = 1.0 / (b.imag + b.real * r); + return __pyx_t_double_complex_from_parts( + (a.real * r + a.imag) * s, (a.imag * r - a.real) * s); + } + } + #else + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + if (b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real); + } else { + double denom = b.real * b.real + b.imag * b.imag; + return __pyx_t_double_complex_from_parts( + (a.real * b.real + a.imag * b.imag) / denom, + (a.imag * b.real - a.real * b.imag) / denom); + } + } + #endif + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex a) { + __pyx_t_double_complex z; + z.real = -a.real; + z.imag = -a.imag; + return z; + } + static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex a) { + return (a.real == 0) && (a.imag == 0); + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex a) { + __pyx_t_double_complex z; + z.real = a.real; + z.imag = -a.imag; + return z; + } + #if 1 + static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex z) { + #if !defined(HAVE_HYPOT) || defined(_MSC_VER) + return sqrt(z.real*z.real + z.imag*z.imag); + #else + return hypot(z.real, z.imag); + #endif + } + static CYTHON_INLINE 
__pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + double r, lnr, theta, z_r, z_theta; + if (b.imag == 0 && b.real == (int)b.real) { + if (b.real < 0) { + double denom = a.real * a.real + a.imag * a.imag; + a.real = a.real / denom; + a.imag = -a.imag / denom; + b.real = -b.real; + } + switch ((int)b.real) { + case 0: + z.real = 1; + z.imag = 0; + return z; + case 1: + return a; + case 2: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(a, a); + case 3: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(z, a); + case 4: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(z, z); + } + } + if (a.imag == 0) { + if (a.real == 0) { + return a; + } else if (b.imag == 0) { + z.real = pow(a.real, b.real); + z.imag = 0; + return z; + } else if (a.real > 0) { + r = a.real; + theta = 0; + } else { + r = -a.real; + theta = atan2(0, -1); + } + } else { + r = __Pyx_c_abs_double(a); + theta = atan2(a.imag, a.real); + } + lnr = log(r); + z_r = exp(lnr * b.real - theta * b.imag); + z_theta = theta * b.real + lnr * b.imag; + z.real = z_r * cos(z_theta); + z.imag = z_r * sin(z_theta); + return z; + } + #endif +#endif + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { + const int neg_one = (int) -1, const_zero = (int) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(int) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(int) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(int) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); 
+#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(int), + little, !is_unsigned); + } +} + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__NPY_TYPES(enum NPY_TYPES value) { + const enum NPY_TYPES neg_one = (enum NPY_TYPES) -1, const_zero = (enum NPY_TYPES) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(enum NPY_TYPES) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(enum NPY_TYPES) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum NPY_TYPES) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(enum NPY_TYPES) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum NPY_TYPES) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(enum NPY_TYPES), + little, !is_unsigned); + } +} + +/* CIntFromPy */ + static CYTHON_INLINE siz __Pyx_PyInt_As_siz(PyObject *x) { + const siz neg_one = (siz) -1, const_zero = (siz) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(siz) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(siz, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (siz) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (siz) 0; + case 
1: __PYX_VERIFY_RETURN_INT(siz, digit, digits[0]) + case 2: + if (8 * sizeof(siz) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(siz, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(siz) >= 2 * PyLong_SHIFT) { + return (siz) (((((siz)digits[1]) << PyLong_SHIFT) | (siz)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(siz) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(siz, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(siz) >= 3 * PyLong_SHIFT) { + return (siz) (((((((siz)digits[2]) << PyLong_SHIFT) | (siz)digits[1]) << PyLong_SHIFT) | (siz)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(siz) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(siz, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(siz) >= 4 * PyLong_SHIFT) { + return (siz) (((((((((siz)digits[3]) << PyLong_SHIFT) | (siz)digits[2]) << PyLong_SHIFT) | (siz)digits[1]) << PyLong_SHIFT) | (siz)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (siz) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(siz) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(siz, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(siz) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(siz, unsigned PY_LONG_LONG, 
PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (siz) 0; + case -1: __PYX_VERIFY_RETURN_INT(siz, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(siz, digit, +digits[0]) + case -2: + if (8 * sizeof(siz) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(siz, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(siz) - 1 > 2 * PyLong_SHIFT) { + return (siz) (((siz)-1)*(((((siz)digits[1]) << PyLong_SHIFT) | (siz)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(siz) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(siz, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(siz) - 1 > 2 * PyLong_SHIFT) { + return (siz) ((((((siz)digits[1]) << PyLong_SHIFT) | (siz)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(siz) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(siz, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(siz) - 1 > 3 * PyLong_SHIFT) { + return (siz) (((siz)-1)*(((((((siz)digits[2]) << PyLong_SHIFT) | (siz)digits[1]) << PyLong_SHIFT) | (siz)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(siz) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(siz, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(siz) - 1 > 3 * PyLong_SHIFT) { + return (siz) ((((((((siz)digits[2]) << PyLong_SHIFT) | (siz)digits[1]) << PyLong_SHIFT) | 
(siz)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(siz) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(siz, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(siz) - 1 > 4 * PyLong_SHIFT) { + return (siz) (((siz)-1)*(((((((((siz)digits[3]) << PyLong_SHIFT) | (siz)digits[2]) << PyLong_SHIFT) | (siz)digits[1]) << PyLong_SHIFT) | (siz)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(siz) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(siz, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(siz) - 1 > 4 * PyLong_SHIFT) { + return (siz) ((((((((((siz)digits[3]) << PyLong_SHIFT) | (siz)digits[2]) << PyLong_SHIFT) | (siz)digits[1]) << PyLong_SHIFT) | (siz)digits[0]))); + } + } + break; + } +#endif + if (sizeof(siz) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(siz, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(siz) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(siz, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + siz val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, 
sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (siz) -1; + } + } else { + siz val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (siz) -1; + val = __Pyx_PyInt_As_siz(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to siz"); + return (siz) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to siz"); + return (siz) -1; +} + +/* CIntFromPy */ + static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *x) { + const size_t neg_one = (size_t) -1, const_zero = (size_t) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(size_t) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(size_t, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (size_t) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (size_t) 0; + case 1: __PYX_VERIFY_RETURN_INT(size_t, digit, digits[0]) + case 2: + if (8 * sizeof(size_t) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(size_t) >= 2 * PyLong_SHIFT) { + return (size_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(size_t) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * 
sizeof(size_t) >= 3 * PyLong_SHIFT) { + return (size_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(size_t) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(size_t) >= 4 * PyLong_SHIFT) { + return (size_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (size_t) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(size_t) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(size_t, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(size_t) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(size_t, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (size_t) 0; + case -1: __PYX_VERIFY_RETURN_INT(size_t, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(size_t, digit, +digits[0]) + case -2: + if (8 * sizeof(size_t) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(size_t, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(size_t) - 1 > 2 * PyLong_SHIFT) { + return (size_t) 
(((size_t)-1)*(((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(size_t) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(size_t) - 1 > 2 * PyLong_SHIFT) { + return (size_t) ((((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(size_t) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(size_t, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(size_t) - 1 > 3 * PyLong_SHIFT) { + return (size_t) (((size_t)-1)*(((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(size_t) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(size_t) - 1 > 3 * PyLong_SHIFT) { + return (size_t) ((((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(size_t) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(size_t, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(size_t) - 1 > 4 * PyLong_SHIFT) { + return (size_t) (((size_t)-1)*(((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) 
<< PyLong_SHIFT) | (size_t)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(size_t) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(size_t) - 1 > 4 * PyLong_SHIFT) { + return (size_t) ((((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]))); + } + } + break; + } +#endif + if (sizeof(size_t) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(size_t, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(size_t) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(size_t, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + size_t val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (size_t) -1; + } + } else { + size_t val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (size_t) -1; + val = __Pyx_PyInt_As_size_t(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to size_t"); + return (size_t) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert 
negative value to size_t"); + return (size_t) -1; +} + +/* CIntFromPy */ + static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { + const int neg_one = (int) -1, const_zero = (int) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case 1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0]) + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) { + return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) { + return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) { + 
return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (int) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(int, digit, +digits[0]) + case -2: + if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned 
long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + } +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + 
} else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (int) -1; + } + } else { + int val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to int"); + return (int) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; +} + +/* CIntFromPy */ + static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { + const long neg_one = (long) -1, const_zero = (long) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case 1: __PYX_VERIFY_RETURN_INT(long, digit, digits[0]) + 
case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 2 * PyLong_SHIFT) { + return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 3 * PyLong_SHIFT) { + return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 4 * PyLong_SHIFT) { + return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (long) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } 
else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case -1: __PYX_VERIFY_RETURN_INT(long, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(long, digit, +digits[0]) + case -2: + if (8 * sizeof(long) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); 
+ } + } + break; + case -4: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + } +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, 
sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to long"); + return (long) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; +} + +/* FastTypeChecks */ + #if CYTHON_COMPILING_IN_CPYTHON +static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) { + while (a) { + a = a->tp_base; + if (a == b) + return 1; + } + return b == &PyBaseObject_Type; +} +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) { + PyObject *mro; + if (a == b) return 1; + mro = a->tp_mro; + if (likely(mro)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(mro); + for (i = 0; i < n; i++) { + if (PyTuple_GET_ITEM(mro, i) == (PyObject *)b) + return 1; + } + return 0; + } + return __Pyx_InBases(a, b); +} +#if PY_MAJOR_VERSION == 2 +static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) { + PyObject *exception, *value, *tb; + int res; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&exception, &value, &tb); + res = exc_type1 ? PyObject_IsSubclass(err, exc_type1) : 0; + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + if (!res) { + res = PyObject_IsSubclass(err, exc_type2); + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + } + __Pyx_ErrRestore(exception, value, tb); + return res; +} +#else +static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) { + int res = exc_type1 ? 
__Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type1) : 0; + if (!res) { + res = __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type2); + } + return res; +} +#endif +static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + assert(PyExceptionClass_Check(exc_type)); + n = PyTuple_GET_SIZE(tuple); +#if PY_MAJOR_VERSION >= 3 + for (i=0; itp_basicsize; +#else + py_basicsize = PyObject_GetAttrString(result, "__basicsize__"); + if (!py_basicsize) + goto bad; + basicsize = PyLong_AsSsize_t(py_basicsize); + Py_DECREF(py_basicsize); + py_basicsize = 0; + if (basicsize == (Py_ssize_t)-1 && PyErr_Occurred()) + goto bad; +#endif + if (!strict && (size_t)basicsize > size) { + PyOS_snprintf(warning, sizeof(warning), + "%s.%s size changed, may indicate binary incompatibility. Expected %zd, got %zd", + module_name, class_name, basicsize, size); + if (PyErr_WarnEx(NULL, warning, 0) < 0) goto bad; + } + else if ((size_t)basicsize != size) { + PyErr_Format(PyExc_ValueError, + "%.200s.%.200s has the wrong size, try recompiling. 
Expected %zd, got %zd", + module_name, class_name, basicsize, size); + goto bad; + } + return (PyTypeObject *)result; +bad: + Py_XDECREF(py_module); + Py_XDECREF(result); + return NULL; +} +#endif + +/* InitStrings */ + static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + while (t->p) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + if (PyObject_Hash(*t->p) == -1) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, (Py_ssize_t)strlen(c_str)); +} +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +#if !CYTHON_PEP393_ENABLED +static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +} +#else +static CYTHON_INLINE const char* 
__Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (likely(PyUnicode_IS_ASCII(o))) { + *length = PyUnicode_GET_LENGTH(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else + return PyUnicode_AsUTF8AndSize(o, length); +#endif +} +#endif +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && +#endif + PyUnicode_Check(o)) { + return __Pyx_PyUnicode_AsStringAndSize(o, length); + } else +#endif +#if (!CYTHON_COMPILING_IN_PYPY) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE)) + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) { +#if PY_MAJOR_VERSION >= 3 + if (PyLong_Check(result)) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "__int__ returned non-int (type %.200s). 
" + "The ability to return an instance of a strict subclass of int " + "is deprecated, and may be removed in a future version of Python.", + Py_TYPE(result)->tp_name)) { + Py_DECREF(result); + return NULL; + } + return result; + } +#endif + PyErr_Format(PyExc_TypeError, + "__%.4s__ returned non-%.4s (type %.200s)", + type_name, type_name, Py_TYPE(result)->tp_name); + Py_DECREF(result); + return NULL; +} +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) { +#if CYTHON_USE_TYPE_SLOTS + PyNumberMethods *m; +#endif + const char *name = NULL; + PyObject *res = NULL; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x) || PyLong_Check(x))) +#else + if (likely(PyLong_Check(x))) +#endif + return __Pyx_NewRef(x); +#if CYTHON_USE_TYPE_SLOTS + m = Py_TYPE(x)->tp_as_number; + #if PY_MAJOR_VERSION < 3 + if (m && m->nb_int) { + name = "int"; + res = m->nb_int(x); + } + else if (m && m->nb_long) { + name = "long"; + res = m->nb_long(x); + } + #else + if (likely(m && m->nb_int)) { + name = "int"; + res = m->nb_int(x); + } + #endif +#else + if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) { + res = PyNumber_Int(x); + } +#endif + if (likely(res)) { +#if PY_MAJOR_VERSION < 3 + if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) { +#else + if (unlikely(!PyLong_CheckExact(res))) { +#endif + return __Pyx_PyNumber_IntOrLongWrongResultType(res, name); + } + } + else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "an integer is required"); + } + return res; +} +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { + Py_ssize_t ival; + PyObject *x; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(b))) { + if (sizeof(Py_ssize_t) >= sizeof(long)) + return PyInt_AS_LONG(b); + else + return PyInt_AsSsize_t(x); + } +#endif + if (likely(PyLong_CheckExact(b))) { + #if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)b)->ob_digit; + const Py_ssize_t size = Py_SIZE(b); + if (likely(__Pyx_sst_abs(size) <= 1)) { + 
ival = likely(size) ? digits[0] : 0; + if (size == -1) ival = -ival; + return ival; + } else { + switch (size) { + case 2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + } + } + #endif + return PyLong_AsSsize_t(b); + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); + return ival; +} +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) { + return b ? 
__Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False); +} +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { + return PyInt_FromSize_t(ival); +} + + +#endif /* Py_PYTHON_H */ diff --git a/insightface/detection/retinaface/rcnn/pycocotools/_mask.pyx b/insightface/detection/retinaface/rcnn/pycocotools/_mask.pyx new file mode 100644 index 0000000000000000000000000000000000000000..1c3e127a1c05f0542a7b9c75c5433902bc71b8c3 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/_mask.pyx @@ -0,0 +1,308 @@ +# distutils: language = c +# distutils: sources = maskApi.c + +#************************************************************************** +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2015. +# Licensed under the Simplified BSD License [see coco/license.txt] +#************************************************************************** + +__author__ = 'tsungyi' + +import sys +PYTHON_VERSION = sys.version_info[0] + +# import both Python-level and C-level symbols of Numpy +# the API uses Numpy to interface C and Python +import numpy as np +cimport numpy as np +from libc.stdlib cimport malloc, free + +# intialized Numpy. must do. 
+np.import_array() + +# import numpy C function +# we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible to memoery management +cdef extern from "numpy/arrayobject.h": + void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) + +# Declare the prototype of the C functions in MaskApi.h +cdef extern from "maskApi.h": + ctypedef unsigned int uint + ctypedef unsigned long siz + ctypedef unsigned char byte + ctypedef double* BB + ctypedef struct RLE: + siz h, + siz w, + siz m, + uint* cnts, + void rlesInit( RLE **R, siz n ) + void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) + void rleDecode( const RLE *R, byte *mask, siz n ) + void rleMerge( const RLE *R, RLE *M, siz n, int intersect ) + void rleArea( const RLE *R, siz n, uint *a ) + void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) + void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) + void rleToBbox( const RLE *R, BB bb, siz n ) + void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) + void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) + char* rleToString( const RLE *R ) + void rleFrString( RLE *R, char *s, siz h, siz w ) + +# python class to wrap RLE array in C +# the class handles the memory allocation and deallocation +cdef class RLEs: + cdef RLE *_R + cdef siz _n + + def __cinit__(self, siz n =0): + rlesInit(&self._R, n) + self._n = n + + # free the RLE array here + def __dealloc__(self): + if self._R is not NULL: + for i in range(self._n): + free(self._R[i].cnts) + free(self._R) + def __getattr__(self, key): + if key == 'n': + return self._n + raise AttributeError(key) + +# python class to wrap Mask array in C +# the class handles the memory allocation and deallocation +cdef class Masks: + cdef byte *_mask + cdef siz _h + cdef siz _w + cdef siz _n + + def __cinit__(self, h, w, n): + self._mask = malloc(h*w*n* sizeof(byte)) + self._h = h + self._w = w + self._n = n + # def __dealloc__(self): + # the memory management of _mask has been passed to 
np.ndarray + # it doesn't need to be freed here + + # called when passing into np.array() and return an np.ndarray in column-major order + def __array__(self): + cdef np.npy_intp shape[1] + shape[0] = self._h*self._w*self._n + # Create a 1D array, and reshape it to fortran/Matlab column-major array + ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') + # The _mask allocated by Masks is now handled by ndarray + PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) + return ndarray + +# internal conversion from Python RLEs object to compressed RLE format +def _toString(RLEs Rs): + cdef siz n = Rs.n + cdef bytes py_string + cdef char* c_string + objs = [] + for i in range(n): + c_string = rleToString( &Rs._R[i] ) + py_string = c_string + objs.append({ + 'size': [Rs._R[i].h, Rs._R[i].w], + 'counts': py_string + }) + free(c_string) + return objs + +# internal conversion from compressed RLE format to Python RLEs object +def _frString(rleObjs): + cdef siz n = len(rleObjs) + Rs = RLEs(n) + cdef bytes py_string + cdef char* c_string + for i, obj in enumerate(rleObjs): + if PYTHON_VERSION == 2: + py_string = str(obj['counts']).encode('utf8') + elif PYTHON_VERSION == 3: + py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] + else: + raise Exception('Python version must be 2 or 3') + c_string = py_string + rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) + return Rs + +# encode mask to RLEs objects +# list of RLE string can be generated by RLEs member function +def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): + h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + cdef RLEs Rs = RLEs(n) + rleEncode(Rs._R,mask.data,h,w,n) + objs = _toString(Rs) + return objs + +# decode mask from compressed list of RLE string or RLEs object +def decode(rleObjs): + cdef RLEs Rs = _frString(rleObjs) + h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + masks = Masks(h, w, n) 
+ rleDecode(Rs._R, masks._mask, n); + return np.array(masks) + +def merge(rleObjs, intersect=0): + cdef RLEs Rs = _frString(rleObjs) + cdef RLEs R = RLEs(1) + rleMerge(Rs._R, R._R, Rs._n, intersect) + obj = _toString(R)[0] + return obj + +def area(rleObjs): + cdef RLEs Rs = _frString(rleObjs) + cdef uint* _a = malloc(Rs._n* sizeof(uint)) + rleArea(Rs._R, Rs._n, _a) + cdef np.npy_intp shape[1] + shape[0] = Rs._n + a = np.array((Rs._n, ), dtype=np.uint8) + a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) + return a + +# iou computation. support function overload (RLEs-RLEs and bbox-bbox). +def iou( dt, gt, pyiscrowd ): + def _preproc(objs): + if len(objs) == 0: + return objs + if type(objs) == np.ndarray: + if len(objs.shape) == 1: + objs = objs.reshape((objs[0], 1)) + # check if it's Nx4 bbox + if not len(objs.shape) == 2 or not objs.shape[1] == 4: + raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + objs = objs.astype(np.double) + elif type(objs) == list: + # check if list is in box format and convert it to np.ndarray + isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + isrle = np.all(np.array([type(obj) == dict for obj in objs])) + if isbox: + objs = np.array(objs, dtype=np.double) + if len(objs.shape) == 1: + objs = objs.reshape((1,objs.shape[0])) + elif isrle: + objs = _frString(objs) + else: + raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') + else: + raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + return objs + def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + def _len(obj): + cdef siz N = 0 + if type(obj) == RLEs: + N = obj.n + elif len(obj)==0: + pass + elif type(obj) == np.ndarray: + N = obj.shape[0] + return N + # convert iscrowd to numpy array + cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) + # simple type checking + cdef siz m, n + dt = _preproc(dt) + gt = _preproc(gt) + m = _len(dt) + n = _len(gt) + if m == 0 or n == 0: + return [] + if not type(dt) == type(gt): + raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') + + # define local variables + cdef double* _iou = 0 + cdef np.npy_intp shape[1] + # check type and assign iou function + if type(dt) == RLEs: + _iouFun = _rleIou + elif type(dt) == np.ndarray: + _iouFun = _bbIou + else: + raise Exception('input data type not allowed.') + _iou = malloc(m*n* sizeof(double)) + iou = np.zeros((m*n, ), dtype=np.double) + shape[0] = m*n + iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + _iouFun(dt, gt, iscrowd, m, n, iou) + return iou.reshape((m,n), order='F') + +def toBbox( rleObjs ): + cdef RLEs Rs = _frString(rleObjs) + cdef siz n = Rs.n + cdef BB _bb = malloc(4*n* sizeof(double)) + rleToBbox( Rs._R, _bb, n ) + cdef np.npy_intp shape[1] + shape[0] = 4*n + bb = np.array((1,4*n), dtype=np.double) + bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) + PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) + return 
bb + +def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): + cdef siz n = bb.shape[0] + Rs = RLEs(n) + rleFrBbox( Rs._R, bb.data, h, w, n ) + objs = _toString(Rs) + return objs + +def frPoly( poly, siz h, siz w ): + cdef np.ndarray[np.double_t, ndim=1] np_poly + n = len(poly) + Rs = RLEs(n) + for i, p in enumerate(poly): + np_poly = np.array(p, dtype=np.double, order='F') + rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) + objs = _toString(Rs) + return objs + +def frUncompressedRLE(ucRles, siz h, siz w): + cdef np.ndarray[np.uint32_t, ndim=1] cnts + cdef RLE R + cdef uint *data + n = len(ucRles) + objs = [] + for i in range(n): + Rs = RLEs(1) + cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + # time for malloc can be saved here but it's fine + data = malloc(len(cnts)* sizeof(uint)) + for j in range(len(cnts)): + data[j] = cnts[j] + R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + Rs._R[0] = R + objs.append(_toString(Rs)[0]) + return objs + +def frPyObjects(pyobj, h, w): + # encode rle from a list of python objects + if type(pyobj) == np.ndarray: + objs = frBbox(pyobj, h, w) + elif type(pyobj) == list and len(pyobj[0]) == 4: + objs = frBbox(pyobj, h, w) + elif type(pyobj) == list and len(pyobj[0]) > 4: + objs = frPoly(pyobj, h, w) + elif type(pyobj) == list and type(pyobj[0]) == dict \ + and 'counts' in pyobj[0] and 'size' in pyobj[0]: + objs = frUncompressedRLE(pyobj, h, w) + # encode rle from single python object + elif type(pyobj) == list and len(pyobj) == 4: + objs = frBbox([pyobj], h, w)[0] + elif type(pyobj) == list and len(pyobj) > 4: + objs = frPoly([pyobj], h, w)[0] + elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: + objs = frUncompressedRLE([pyobj], h, w)[0] + else: + raise Exception('input type is not supported.') + return objs diff --git a/insightface/detection/retinaface/rcnn/pycocotools/coco.py b/insightface/detection/retinaface/rcnn/pycocotools/coco.py new file mode 100644 index 
0000000000000000000000000000000000000000..4f79236526249919be01e687bceabc6629481a72 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/coco.py @@ -0,0 +1,477 @@ +__author__ = 'tylin' +__version__ = '2.0' +# Interface for accessing the Microsoft COCO dataset. + +# Microsoft COCO is a large image dataset designed for object detection, +# segmentation, and caption generation. pycocotools is a Python API that +# assists in loading, parsing and visualizing the annotations in COCO. +# Please visit http://mscoco.org/ for more information on COCO, including +# for the data, paper, and tutorials. The exact format of the annotations +# is also described on the COCO website. For example usage of the pycocotools +# please see pycocotools_demo.ipynb. In addition to this API, please download both +# the COCO images and annotations in order to run the demo. + +# An alternative to using the API is to load the annotations directly +# into Python dictionary +# Using the API provides additional utility functions. Note that this API +# supports both *instance* and *caption* annotations. In the case of +# captions not all functions are defined (e.g. categories are undefined). + +# The following API functions are defined: +# COCO - COCO api class that loads COCO annotation file and prepare data structures. +# decodeMask - Decode binary mask M encoded via run-length encoding. +# encodeMask - Encode binary mask M using run-length encoding. +# getAnnIds - Get ann ids that satisfy given filter conditions. +# getCatIds - Get cat ids that satisfy given filter conditions. +# getImgIds - Get img ids that satisfy given filter conditions. +# loadAnns - Load anns with the specified ids. +# loadCats - Load cats with the specified ids. +# loadImgs - Load imgs with the specified ids. +# annToMask - Convert segmentation in an annotation to binary mask. +# showAnns - Display the specified annotations. +# loadRes - Load algorithm results and create API for accessing them. 
+# download - Download COCO images from mscoco.org server. +# Throughout the API "ann"=annotation, "cat"=category, and "img"=image. +# Help on each functions can be accessed by: "help COCO>function". + +# See also COCO>decodeMask, +# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, +# COCO>getImgIds, COCO>loadAnns, COCO>loadCats, +# COCO>loadImgs, COCO>annToMask, COCO>showAnns + +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. +# Licensed under the Simplified BSD License [see bsd.txt] + +import json +import time +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon +import numpy as np +import copy +import itertools +from . import mask as maskUtils +import os +from collections import defaultdict +import sys +PYTHON_VERSION = sys.version_info[0] +if PYTHON_VERSION == 2: + from urllib import urlretrieve +elif PYTHON_VERSION == 3: + from urllib.request import urlretrieve + + +class COCO: + def __init__(self, annotation_file=None): + """ + Constructor of Microsoft COCO helper class for reading and visualizing annotations. + :param annotation_file (str): location of annotation file + :param image_folder (str): location to the folder that hosts images. 
+ :return: + """ + # load dataset + self.dataset, self.anns, self.cats, self.imgs = dict(), dict(), dict( + ), dict() + self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) + if not annotation_file == None: + print('loading annotations into memory...') + tic = time.time() + dataset = json.load(open(annotation_file, 'r')) + assert type( + dataset + ) == dict, 'annotation file format {} not supported'.format( + type(dataset)) + print('Done (t={:0.2f}s)'.format(time.time() - tic)) + self.dataset = dataset + self.createIndex() + + def createIndex(self): + # create index + print('creating index...') + anns, cats, imgs = {}, {}, {} + imgToAnns, catToImgs = defaultdict(list), defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + imgToAnns[ann['image_id']].append(ann) + anns[ann['id']] = ann + + if 'images' in self.dataset: + for img in self.dataset['images']: + imgs[img['id']] = img + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + catToImgs[ann['category_id']].append(ann['image_id']) + + print('index created!') + + # create class members + self.anns = anns + self.imgToAnns = imgToAnns + self.catToImgs = catToImgs + self.imgs = imgs + self.cats = cats + + def info(self): + """ + Print information about the annotation file. + :return: + """ + for key, value in self.dataset['info'].items(): + print('{}: {}'.format(key, value)) + + def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): + """ + Get ann ids that satisfy given filter conditions. default skips that filter + :param imgIds (int array) : get anns for given imgs + catIds (int array) : get anns for given cats + areaRng (float array) : get anns for given area range (e.g. 
[0 inf]) + iscrowd (boolean) : get anns for given crowd label (False or True) + :return: ids (int array) : integer array of ann ids + """ + imgIds = imgIds if type(imgIds) == list else [imgIds] + catIds = catIds if type(catIds) == list else [catIds] + + if len(imgIds) == len(catIds) == len(areaRng) == 0: + anns = self.dataset['annotations'] + else: + if not len(imgIds) == 0: + lists = [ + self.imgToAnns[imgId] for imgId in imgIds + if imgId in self.imgToAnns + ] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.dataset['annotations'] + anns = anns if len(catIds) == 0 else [ + ann for ann in anns if ann['category_id'] in catIds + ] + anns = anns if len(areaRng) == 0 else [ + ann for ann in anns + if ann['area'] > areaRng[0] and ann['area'] < areaRng[1] + ] + if not iscrowd == None: + ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + else: + ids = [ann['id'] for ann in anns] + return ids + + def getCatIds(self, catNms=[], supNms=[], catIds=[]): + """ + filtering parameters. default skips that filter. 
+ :param catNms (str array) : get cats for given cat names + :param supNms (str array) : get cats for given supercategory names + :param catIds (int array) : get cats for given cat ids + :return: ids (int array) : integer array of cat ids + """ + catNms = catNms if type(catNms) == list else [catNms] + supNms = supNms if type(supNms) == list else [supNms] + catIds = catIds if type(catIds) == list else [catIds] + + if len(catNms) == len(supNms) == len(catIds) == 0: + cats = self.dataset['categories'] + else: + cats = self.dataset['categories'] + cats = cats if len(catNms) == 0 else [ + cat for cat in cats if cat['name'] in catNms + ] + cats = cats if len(supNms) == 0 else [ + cat for cat in cats if cat['supercategory'] in supNms + ] + cats = cats if len(catIds) == 0 else [ + cat for cat in cats if cat['id'] in catIds + ] + ids = [cat['id'] for cat in cats] + return ids + + def getImgIds(self, imgIds=[], catIds=[]): + ''' + Get img ids that satisfy given filter conditions. + :param imgIds (int array) : get imgs for given ids + :param catIds (int array) : get imgs with all given cats + :return: ids (int array) : integer array of img ids + ''' + imgIds = imgIds if type(imgIds) == list else [imgIds] + catIds = catIds if type(catIds) == list else [catIds] + + if len(imgIds) == len(catIds) == 0: + ids = self.imgs.keys() + else: + ids = set(imgIds) + for i, catId in enumerate(catIds): + if i == 0 and len(ids) == 0: + ids = set(self.catToImgs[catId]) + else: + ids &= set(self.catToImgs[catId]) + return list(ids) + + def loadAnns(self, ids=[]): + """ + Load anns with the specified ids. + :param ids (int array) : integer ids specifying anns + :return: anns (object array) : loaded ann objects + """ + if type(ids) == list: + return [self.anns[id] for id in ids] + elif type(ids) == int: + return [self.anns[ids]] + + def loadCats(self, ids=[]): + """ + Load cats with the specified ids. 
+ :param ids (int array) : integer ids specifying cats + :return: cats (object array) : loaded cat objects + """ + if type(ids) == list: + return [self.cats[id] for id in ids] + elif type(ids) == int: + return [self.cats[ids]] + + def loadImgs(self, ids=[]): + """ + Load anns with the specified ids. + :param ids (int array) : integer ids specifying img + :return: imgs (object array) : loaded img objects + """ + if type(ids) == list: + return [self.imgs[id] for id in ids] + elif type(ids) == int: + return [self.imgs[ids]] + + def showAnns(self, anns): + """ + Display the specified annotations. + :param anns (array of object): annotations to display + :return: None + """ + if len(anns) == 0: + return 0 + if 'segmentation' in anns[0] or 'keypoints' in anns[0]: + datasetType = 'instances' + elif 'caption' in anns[0]: + datasetType = 'captions' + else: + raise Exception('datasetType not supported') + if datasetType == 'instances': + ax = plt.gca() + ax.set_autoscale_on(False) + polygons = [] + color = [] + for ann in anns: + c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0] + if 'segmentation' in ann: + if type(ann['segmentation']) == list: + # polygon + for seg in ann['segmentation']: + poly = np.array(seg).reshape( + (int(len(seg) / 2), 2)) + polygons.append(Polygon(poly)) + color.append(c) + else: + # mask + t = self.imgs[ann['image_id']] + if type(ann['segmentation']['counts']) == list: + rle = maskUtils.frPyObjects([ann['segmentation']], + t['height'], + t['width']) + else: + rle = [ann['segmentation']] + m = maskUtils.decode(rle) + img = np.ones((m.shape[0], m.shape[1], 3)) + if ann['iscrowd'] == 1: + color_mask = np.array([2.0, 166.0, 101.0]) / 255 + if ann['iscrowd'] == 0: + color_mask = np.random.random((1, 3)).tolist()[0] + for i in range(3): + img[:, :, i] = color_mask[i] + ax.imshow(np.dstack((img, m * 0.5))) + if 'keypoints' in ann and type(ann['keypoints']) == list: + # turn skeleton into zero-based index + sks = np.array( + 
self.loadCats(ann['category_id'])[0]['skeleton']) - 1 + kp = np.array(ann['keypoints']) + x = kp[0::3] + y = kp[1::3] + v = kp[2::3] + for sk in sks: + if np.all(v[sk] > 0): + plt.plot(x[sk], y[sk], linewidth=3, color=c) + plt.plot(x[v > 0], + y[v > 0], + 'o', + markersize=8, + markerfacecolor=c, + markeredgecolor='k', + markeredgewidth=2) + plt.plot(x[v > 1], + y[v > 1], + 'o', + markersize=8, + markerfacecolor=c, + markeredgecolor=c, + markeredgewidth=2) + p = PatchCollection(polygons, + facecolor=color, + linewidths=0, + alpha=0.4) + ax.add_collection(p) + p = PatchCollection(polygons, + facecolor='none', + edgecolors=color, + linewidths=2) + ax.add_collection(p) + elif datasetType == 'captions': + for ann in anns: + print(ann['caption']) + + def loadRes(self, resFile): + """ + Load result file and return a result api object. + :param resFile (str) : file name of result file + :return: res (obj) : result api object + """ + res = COCO() + res.dataset['images'] = [img for img in self.dataset['images']] + + print('Loading and preparing results...') + tic = time.time() + if type(resFile) == str or type(resFile) == unicode: + anns = json.load(open(resFile)) + elif type(resFile) == np.ndarray: + anns = self.loadNumpyAnnotations(resFile) + else: + anns = resFile + assert type(anns) == list, 'results in not an array of objects' + annsImgIds = [ann['image_id'] for ann in anns] + assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ + 'Results do not correspond to current coco set' + if 'caption' in anns[0]: + imgIds = set([img['id'] for img in res.dataset['images']]) & set( + [ann['image_id'] for ann in anns]) + res.dataset['images'] = [ + img for img in res.dataset['images'] if img['id'] in imgIds + ] + for id, ann in enumerate(anns): + ann['id'] = id + 1 + elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: + res.dataset['categories'] = copy.deepcopy( + self.dataset['categories']) + for id, ann in enumerate(anns): + bb = ann['bbox'] + x1, x2, y1, y2 
= [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] + if not 'segmentation' in ann: + ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann['area'] = bb[2] * bb[3] + ann['id'] = id + 1 + ann['iscrowd'] = 0 + elif 'segmentation' in anns[0]: + res.dataset['categories'] = copy.deepcopy( + self.dataset['categories']) + for id, ann in enumerate(anns): + # now only support compressed RLE format as segmentation results + ann['area'] = maskUtils.area(ann['segmentation']) + if not 'bbox' in ann: + ann['bbox'] = maskUtils.toBbox(ann['segmentation']) + ann['id'] = id + 1 + ann['iscrowd'] = 0 + elif 'keypoints' in anns[0]: + res.dataset['categories'] = copy.deepcopy( + self.dataset['categories']) + for id, ann in enumerate(anns): + s = ann['keypoints'] + x = s[0::3] + y = s[1::3] + x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y) + ann['area'] = (x1 - x0) * (y1 - y0) + ann['id'] = id + 1 + ann['bbox'] = [x0, y0, x1 - x0, y1 - y0] + print('DONE (t={:0.2f}s)'.format(time.time() - tic)) + + res.dataset['annotations'] = anns + res.createIndex() + return res + + def download(self, tarDir=None, imgIds=[]): + ''' + Download COCO images from mscoco.org server. 
+ :param tarDir (str): COCO results directory name + imgIds (list): images to be downloaded + :return: + ''' + if tarDir is None: + print('Please specify target directory') + return -1 + if len(imgIds) == 0: + imgs = self.imgs.values() + else: + imgs = self.loadImgs(imgIds) + N = len(imgs) + if not os.path.exists(tarDir): + os.makedirs(tarDir) + for i, img in enumerate(imgs): + tic = time.time() + fname = os.path.join(tarDir, img['file_name']) + if not os.path.exists(fname): + urlretrieve(img['coco_url'], fname) + print('downloaded {}/{} images (t={:0.1f}s)'.format( + i, N, + time.time() - tic)) + + def loadNumpyAnnotations(self, data): + """ + Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} + :param data (numpy.ndarray) + :return: annotations (python nested list) + """ + print('Converting ndarray to lists...') + assert (type(data) == np.ndarray) + print(data.shape) + assert (data.shape[1] == 7) + N = data.shape[0] + ann = [] + for i in range(N): + if i % 1000000 == 0: + print('{}/{}'.format(i, N)) + ann += [{ + 'image_id': int(data[i, 0]), + 'bbox': [data[i, 1], data[i, 2], data[i, 3], data[i, 4]], + 'score': data[i, 5], + 'category_id': int(data[i, 6]), + }] + return ann + + def annToRLE(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE to RLE. + :return: binary mask (numpy 2D array) + """ + t = self.imgs[ann['image_id']] + h, w = t['height'], t['width'] + segm = ann['segmentation'] + if type(segm) == list: + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(segm, h, w) + rle = maskUtils.merge(rles) + elif type(segm['counts']) == list: + # uncompressed RLE + rle = maskUtils.frPyObjects(segm, h, w) + else: + # rle + rle = ann['segmentation'] + return rle + + def annToMask(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. 
+ :return: binary mask (numpy 2D array) + """ + rle = self.annToRLE(ann) + m = maskUtils.decode(rle) + return m diff --git a/insightface/detection/retinaface/rcnn/pycocotools/cocoeval.py b/insightface/detection/retinaface/rcnn/pycocotools/cocoeval.py new file mode 100644 index 0000000000000000000000000000000000000000..1d3531847d216d5eb905a10d78b78a7ed8a8c776 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/cocoeval.py @@ -0,0 +1,592 @@ +__author__ = 'tsungyi' + +import numpy as np +import datetime +import time +from collections import defaultdict +from .mask import * +import copy + + +class COCOeval: + # Interface for evaluating detection on the Microsoft COCO dataset. + # + # The usage for CocoEval is as follows: + # cocoGt=..., cocoDt=... # load dataset and results + # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object + # E.params.recThrs = ...; # set parameters as desired + # E.evaluate(); # run per image evaluation + # E.accumulate(); # accumulate per image results + # E.summarize(); # display summary metrics of results + # For example usage see evalDemo.m and http://mscoco.org/. + # + # The evaluation parameters are as follows (defaults in brackets): + # imgIds - [all] N img ids to use for evaluation + # catIds - [all] K cat ids to use for evaluation + # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation + # recThrs - [0:.01:1] R=101 recall thresholds for evaluation + # areaRng - [...] A=4 object area ranges for evaluation + # maxDets - [1 10 100] M=3 thresholds on max detections per image + # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' + # iouType replaced the now DEPRECATED useSegm parameter. + # useCats - [1] if true use category labels for evaluation + # Note: if useCats=0 category labels are ignored as in proposal scoring. + # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. 
+ # + # evaluate(): evaluates detections on every image and every category and + # concats the results into the "evalImgs" with fields: + # dtIds - [1xD] id for each of the D detections (dt) + # gtIds - [1xG] id for each of the G ground truths (gt) + # dtMatches - [TxD] matching gt id at each IoU or 0 + # gtMatches - [TxG] matching dt id at each IoU or 0 + # dtScores - [1xD] confidence of each dt + # gtIgnore - [1xG] ignore flag for each gt + # dtIgnore - [TxD] ignore flag for each dt at each IoU + # + # accumulate(): accumulates the per-image, per-category evaluation + # results in "evalImgs" into the dictionary "eval" with fields: + # params - parameters used for evaluation + # date - date evaluation was performed + # counts - [T,R,K,A,M] parameter dimensions (see above) + # precision - [TxRxKxAxM] precision for every evaluation setting + # recall - [TxKxAxM] max recall for every evaluation setting + # Note: precision and recall==-1 for settings with no gt objects. + # + # See also coco, mask, pycocoDemo, pycocoEvalDemo + # + # Microsoft COCO Toolbox. version 2.0 + # Data, paper, and tutorials available at: http://mscoco.org/ + # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. + # Licensed under the Simplified BSD License [see coco/license.txt] + def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): + ''' + Initialize CocoEval using coco APIs for gt and dt + :param cocoGt: coco object with ground truth annotations + :param cocoDt: coco object with detection results + :return: None + ''' + if not iouType: + print('iouType not specified. 
use default iouType segm') + self.cocoGt = cocoGt # ground truth COCO API + self.cocoDt = cocoDt # detections COCO API + self.params = {} # evaluation parameters + self.evalImgs = defaultdict( + list) # per-image per-category evaluation results [KxAxI] elements + self.eval = {} # accumulated evaluation results + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self.params = Params(iouType=iouType) # parameters + self._paramsEval = {} # parameters for evaluation + self.stats = [] # result summarization + self.ious = {} # ious between all gts and dts + if not cocoGt is None: + self.params.imgIds = sorted(cocoGt.getImgIds()) + self.params.catIds = sorted(cocoGt.getCatIds()) + + def _prepare(self): + ''' + Prepare ._gts and ._dts for evaluation based on params + :return: None + ''' + def _toMask(anns, coco): + # modify ann['segmentation'] by reference + for ann in anns: + rle = coco.annToRLE(ann) + ann['segmentation'] = rle + + p = self.params + if p.useCats: + gts = self.cocoGt.loadAnns( + self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + dts = self.cocoDt.loadAnns( + self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + else: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + + # convert ground truth to mask if iouType == 'segm' + if p.iouType == 'segm': + _toMask(gts, self.cocoGt) + _toMask(dts, self.cocoDt) + # set ignore flag + for gt in gts: + gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 + gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] + if p.iouType == 'keypoints': + gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + for gt in gts: + self._gts[gt['image_id'], gt['category_id']].append(gt) + for dt in dts: + self._dts[dt['image_id'], dt['category_id']].append(dt) + self.evalImgs = 
defaultdict( + list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def evaluate(self): + ''' + Run per image evaluation on given images and store results (a list of dict) in self.evalImgs + :return: None + ''' + tic = time.time() + print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if not p.useSegm is None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'. + format(p.iouType)) + print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = {(imgId, catId): computeIoU(imgId, catId) \ + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + self.evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) for catId in catIds + for areaRng in p.areaRng for imgId in p.imgIds + ] + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print('DONE (t={:0.2f}s).'.format(toc - tic)) + + def computeIoU(self, imgId, catId): + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return [] + inds = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt = dt[0:p.maxDets[-1]] + + if p.iouType == 'segm': + g = [g['segmentation'] for g in gt] + 
d = [d['segmentation'] for d in dt] + elif p.iouType == 'bbox': + g = [g['bbox'] for g in gt] + d = [d['bbox'] for d in dt] + else: + raise Exception('unknown iouType for iou computation') + + # compute iou between each dt and gt region + iscrowd = [int(o['iscrowd']) for o in gt] + ious = iou(d, g, iscrowd) + return ious + + def computeOks(self, imgId, catId): + p = self.params + # dimention here should be Nxm + gts = self._gts[imgId, catId] + dts = self._dts[imgId, catId] + inds = np.argsort([-d['score'] for d in dts], kind='mergesort') + dts = [dts[i] for i in inds] + if len(dts) > p.maxDets[-1]: + dts = dts[0:p.maxDets[-1]] + # if len(gts) == 0 and len(dts) == 0: + if len(gts) == 0 or len(dts) == 0: + return [] + ious = np.zeros((len(dts), len(gts))) + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, + .87, .87, .89, .89 + ]) / 10.0 + vars = (sigmas * 2)**2 + k = len(sigmas) + # compute oks between each detection and ground truth object + for j, gt in enumerate(gts): + # create bounds for ignore regions(double the gt bbox) + g = np.array(gt['keypoints']) + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + k1 = np.count_nonzero(vg > 0) + bb = gt['bbox'] + x0 = bb[0] - bb[2] + x1 = bb[0] + bb[2] * 2 + y0 = bb[1] - bb[3] + y1 = bb[1] + bb[3] * 2 + for i, dt in enumerate(dts): + d = np.array(dt['keypoints']) + xd = d[0::3] + yd = d[1::3] + if k1 > 0: + # measure the per-keypoint distance if keypoints visible + dx = xd - xg + dy = yd - yg + else: + # measure minimum distance to keypoints in (x0,y0) & (x1,y1) + z = np.zeros((k)) + dx = np.max((z, x0 - xd), axis=0) + np.max( + (z, xd - x1), axis=0) + dy = np.max((z, y0 - yd), axis=0) + np.max( + (z, yd - y1), axis=0) + e = (dx**2 + dy**2) / vars / (gt['area'] + np.spacing(1)) / 2 + if k1 > 0: + e = e[vg > 0] + ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] + return ious + + def evaluateImg(self, imgId, catId, aRng, maxDet): + ''' + perform evaluation for single category and image + 
:return: dict (single image results) + ''' + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return None + + for g in gt: + if g['ignore'] or (g['area'] < aRng[0] or g['area'] > aRng[1]): + g['_ignore'] = 1 + else: + g['_ignore'] = 0 + + # sort dt highest score first, sort gt ignore last + gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') + gt = [gt[i] for i in gtind] + dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in dtind[0:maxDet]] + iscrowd = [int(o['iscrowd']) for o in gt] + # load computed ious + ious = self.ious[imgId, catId][:, gtind] if len( + self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] + + T = len(p.iouThrs) + G = len(gt) + D = len(dt) + gtm = np.zeros((T, G)) + dtm = np.zeros((T, D)) + gtIg = np.array([g['_ignore'] for g in gt]) + dtIg = np.zeros((T, D)) + if not len(ious) == 0: + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + iou = min([t, 1 - 1e-10]) + m = -1 + for gind, g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind, gind] > 0 and not iscrowd[gind]: + continue + # if dt matched to reg gt, and on ignore gt, stop + if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1: + break + # continue to next gt unless better match made + if ious[dind, gind] < iou: + continue + # if match successful and best so far, store appropriately + iou = ious[dind, gind] + m = gind + # if match made store id of match for both dt and gt + if m == -1: + continue + dtIg[tind, dind] = gtIg[m] + dtm[tind, dind] = gt[m]['id'] + gtm[tind, m] = d['id'] + # set unmatched detections outside of area range to ignore + a = np.array([d['area'] < aRng[0] or d['area'] > aRng[1] + for d in 
dt]).reshape((1, len(dt))) + dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, + 0))) + # store results for given image and category + return { + 'image_id': imgId, + 'category_id': catId, + 'aRng': aRng, + 'maxDet': maxDet, + 'dtIds': [d['id'] for d in dt], + 'gtIds': [g['id'] for g in gt], + 'dtMatches': dtm, + 'gtMatches': gtm, + 'dtScores': [d['score'] for d in dt], + 'gtIgnore': gtIg, + 'dtIgnore': dtIg, + } + + def accumulate(self, p=None): + ''' + Accumulate per image evaluation results and store the result in self.eval + :param p: input params for evaluation + :return: None + ''' + print('Accumulating evaluation results...') + tic = time.time() + if not self.evalImgs: + print('Please run evaluate() first') + # allows input customized parameters + if p is None: + p = self.params + p.catIds = p.catIds if p.useCats == 1 else [-1] + T = len(p.iouThrs) + R = len(p.recThrs) + K = len(p.catIds) if p.useCats else 1 + A = len(p.areaRng) + M = len(p.maxDets) + precision = -np.ones( + (T, R, K, A, M)) # -1 for the precision of absent categories + recall = -np.ones((T, K, A, M)) + + # create dictionary for future indexing + _pe = self._paramsEval + catIds = _pe.catIds if _pe.useCats else [-1] + setK = set(catIds) + setA = set(map(tuple, _pe.areaRng)) + setM = set(_pe.maxDets) + setI = set(_pe.imgIds) + # get inds to evaluate + k_list = [n for n, k in enumerate(p.catIds) if k in setK] + m_list = [m for n, m in enumerate(p.maxDets) if m in setM] + a_list = [ + n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) + if a in setA + ] + i_list = [n for n, i in enumerate(p.imgIds) if i in setI] + I0 = len(_pe.imgIds) + A0 = len(_pe.areaRng) + # retrieve E at each category, area range, and max number of detections + for k, k0 in enumerate(k_list): + Nk = k0 * A0 * I0 + for a, a0 in enumerate(a_list): + Na = a0 * I0 + for m, maxDet in enumerate(m_list): + E = [self.evalImgs[Nk + Na + i] for i in i_list] + E = [e for e in E if not e is None] + if len(E) 
== 0: + continue + dtScores = np.concatenate( + [e['dtScores'][0:maxDet] for e in E]) + + # different sorting method generates slightly different results. + # mergesort is used to be consistent as Matlab implementation. + inds = np.argsort(-dtScores, kind='mergesort') + + dtm = np.concatenate( + [e['dtMatches'][:, 0:maxDet] for e in E], axis=1)[:, + inds] + dtIg = np.concatenate( + [e['dtIgnore'][:, 0:maxDet] for e in E], axis=1)[:, + inds] + gtIg = np.concatenate([e['gtIgnore'] for e in E]) + npig = np.count_nonzero(gtIg == 0) + if npig == 0: + continue + tps = np.logical_and(dtm, np.logical_not(dtIg)) + fps = np.logical_and(np.logical_not(dtm), + np.logical_not(dtIg)) + + tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) + for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): + tp = np.array(tp) + fp = np.array(fp) + nd = len(tp) + rc = tp / npig + pr = tp / (fp + tp + np.spacing(1)) + q = np.zeros((R, )) + + if nd: + recall[t, k, a, m] = rc[-1] + else: + recall[t, k, a, m] = 0 + + # numpy is slow without cython optimization for accessing elements + # use python array gets significant speed improvement + pr = pr.tolist() + q = q.tolist() + + for i in range(nd - 1, 0, -1): + if pr[i] > pr[i - 1]: + pr[i - 1] = pr[i] + + inds = np.searchsorted(rc, p.recThrs, side='left') + try: + for ri, pi in enumerate(inds): + q[ri] = pr[pi] + except: + pass + precision[t, :, k, a, m] = np.array(q) + self.eval = { + 'params': p, + 'counts': [T, R, K, A, M], + 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'precision': precision, + 'recall': recall, + } + toc = time.time() + print('DONE (t={:0.2f}s).'.format(toc - tic)) + + def summarize(self): + ''' + Compute and display summary metrics for evaluation results. 
+ Note this functin can *only* be applied on the default parameter setting + ''' + def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100): + p = self.params + iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}' + titleStr = 'Average Precision' if ap == 1 else 'Average Recall' + typeStr = '(AP)' if ap == 1 else '(AR)' + iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ + if iouThr is None else '{:0.2f}'.format(iouThr) + + aind = [ + i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng + ] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + if ap == 1: + # dimension of precision: [TxRxKxAxM] + s = self.eval['precision'] + # IoU + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, :, aind, mind] + else: + # dimension of recall: [TxKxAxM] + s = self.eval['recall'] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, aind, mind] + if len(s[s > -1]) == 0: + mean_s = -1 + else: + mean_s = np.mean(s[s > -1]) + print( + iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, + mean_s)) + return mean_s + + def _summarizeDets(): + stats = np.zeros((12, )) + stats[0] = _summarize(1) + stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2]) + stats[2] = _summarize(1, + iouThr=.75, + maxDets=self.params.maxDets[2]) + stats[3] = _summarize(1, + areaRng='small', + maxDets=self.params.maxDets[2]) + stats[4] = _summarize(1, + areaRng='medium', + maxDets=self.params.maxDets[2]) + stats[5] = _summarize(1, + areaRng='large', + maxDets=self.params.maxDets[2]) + stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) + stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) + stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) + stats[9] = _summarize(0, + areaRng='small', + maxDets=self.params.maxDets[2]) + stats[10] = _summarize(0, + areaRng='medium', + maxDets=self.params.maxDets[2]) + stats[11] = _summarize(0, + 
areaRng='large', + maxDets=self.params.maxDets[2]) + return stats + + def _summarizeKps(): + stats = np.zeros((10, )) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=.5) + stats[2] = _summarize(1, maxDets=20, iouThr=.75) + stats[3] = _summarize(1, maxDets=20, areaRng='medium') + stats[4] = _summarize(1, maxDets=20, areaRng='large') + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=.5) + stats[7] = _summarize(0, maxDets=20, iouThr=.75) + stats[8] = _summarize(0, maxDets=20, areaRng='medium') + stats[9] = _summarize(0, maxDets=20, areaRng='large') + return stats + + if not self.eval: + raise Exception('Please run accumulate() first') + iouType = self.params.iouType + if iouType == 'segm' or iouType == 'bbox': + summarize = _summarizeDets + elif iouType == 'keypoints': + summarize = _summarizeKps + self.stats = summarize() + + def __str__(self): + self.summarize() + + +class Params: + ''' + Params for coco evaluation api + ''' + def setDetParams(self): + self.imgIds = [] + self.catIds = [] + # np.arange causes trouble. the data point on arange is slightly larger than the true value + self.iouThrs = np.linspace(.5, + 0.95, + np.round((0.95 - .5) / .05) + 1, + endpoint=True) + self.recThrs = np.linspace(.0, + 1.00, + np.round((1.00 - .0) / .01) + 1, + endpoint=True) + self.maxDets = [1, 10, 100] + self.areaRng = [[0**2, 1e5**2], [0**2, 32**2], [32**2, 96**2], + [96**2, 1e5**2]] + self.areaRngLbl = ['all', 'small', 'medium', 'large'] + self.useCats = 1 + + def setKpParams(self): + self.imgIds = [] + self.catIds = [] + # np.arange causes trouble. 
the data point on arange is slightly larger than the true value + self.iouThrs = np.linspace(.5, + 0.95, + np.round((0.95 - .5) / .05) + 1, + endpoint=True) + self.recThrs = np.linspace(.0, + 1.00, + np.round((1.00 - .0) / .01) + 1, + endpoint=True) + self.maxDets = [20] + self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]] + self.areaRngLbl = ['all', 'medium', 'large'] + self.useCats = 1 + + def __init__(self, iouType='segm'): + if iouType == 'segm' or iouType == 'bbox': + self.setDetParams() + elif iouType == 'keypoints': + self.setKpParams() + else: + raise Exception('iouType not supported') + self.iouType = iouType + # useSegm is deprecated + self.useSegm = None diff --git a/insightface/detection/retinaface/rcnn/pycocotools/mask.py b/insightface/detection/retinaface/rcnn/pycocotools/mask.py new file mode 100644 index 0000000000000000000000000000000000000000..3a9c14cf041bdd0ed12a5e25a16ed3573ba4a964 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/mask.py @@ -0,0 +1,107 @@ +__author__ = 'tsungyi' + +from rcnn.pycocotools import _mask + +# Interface for manipulating masks stored in RLE format. +# +# RLE is a simple yet efficient format for storing binary masks. RLE +# first divides a vector (or vectorized image) into a series of piecewise +# constant regions and then for each piece simply stores the length of +# that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would +# be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] +# (note that the odd counts are always the numbers of zeros). Instead of +# storing the counts directly, additional compression is achieved with a +# variable bitrate representation based on a common scheme called LEB128. +# +# Compression is greatest given large piecewise constant regions. +# Specifically, the size of the RLE is proportional to the number of +# *boundaries* in M (or for an image the number of boundaries in the y +# direction). 
Assuming fairly simple shapes, the RLE representation is +# O(sqrt(n)) where n is number of pixels in the object. Hence space usage +# is substantially lower, especially for large simple objects (large n). +# +# Many common operations on masks can be computed directly using the RLE +# (without need for decoding). This includes computations such as area, +# union, intersection, etc. All of these operations are linear in the +# size of the RLE, in other words they are O(sqrt(n)) where n is the area +# of the object. Computing these operations on the original mask is O(n). +# Thus, using the RLE can result in substantial computational savings. +# +# The following API functions are defined: +# encode - Encode binary masks using RLE. +# decode - Decode binary masks encoded via RLE. +# merge - Compute union or intersection of encoded masks. +# iou - Compute intersection over union between masks. +# area - Compute area of encoded masks. +# toBbox - Get bounding boxes surrounding encoded masks. +# frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. +# +# Usage: +# Rs = encode( masks ) +# masks = decode( Rs ) +# R = merge( Rs, intersect=false ) +# o = iou( dt, gt, iscrowd ) +# a = area( Rs ) +# bbs = toBbox( Rs ) +# Rs = frPyObjects( [pyObjects], h, w ) +# +# In the API the following formats are used: +# Rs - [dict] Run-length encoding of binary masks +# R - dict Run-length encoding of binary mask +# masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) +# iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore +# bbs - [nx4] Bounding box(es) stored as [x y w h] +# poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) +# dt,gt - May be either bounding boxes or encoded masks +# Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). +# +# Finally, a note about the intersection over union (iou) computation. 
+# The standard iou of a ground truth (gt) and detected (dt) object is +# iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) +# For "crowd" regions, we use a modified criteria. If a gt object is +# marked as "iscrowd", we allow a dt to match any subregion of the gt. +# Choosing gt' in the crowd gt that best matches the dt can be done using +# gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing +# iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) +# For crowd gt regions we use this modified criteria above for the iou. +# +# To compile run "python setup.py build_ext --inplace" +# Please do not contact us for help with compiling. +# +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2015. +# Licensed under the Simplified BSD License [see coco/license.txt] + +iou = _mask.iou +merge = _mask.merge +frPyObjects = _mask.frPyObjects + + +def encode(bimask): + if len(bimask.shape) == 3: + return _mask.encode(bimask) + elif len(bimask.shape) == 2: + h, w = bimask.shape + return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] + + +def decode(rleObjs): + if type(rleObjs) == list: + return _mask.decode(rleObjs) + else: + return _mask.decode([rleObjs])[:, :, 0] + + +def area(rleObjs): + if type(rleObjs) == list: + return _mask.area(rleObjs) + else: + return _mask.area([rleObjs])[0] + + +def toBbox(rleObjs): + if type(rleObjs) == list: + return _mask.toBbox(rleObjs) + else: + return _mask.toBbox([rleObjs])[0] diff --git a/insightface/detection/retinaface/rcnn/pycocotools/maskApi.c b/insightface/detection/retinaface/rcnn/pycocotools/maskApi.c new file mode 100644 index 0000000000000000000000000000000000000000..85e397918278126ce11f225dc109efbeb8a9394f --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/maskApi.c @@ -0,0 +1,230 @@ +/************************************************************************** +* Microsoft 
COCO Toolbox. version 2.0 +* Data, paper, and tutorials available at: http://mscoco.org/ +* Code written by Piotr Dollar and Tsung-Yi Lin, 2015. +* Licensed under the Simplified BSD License [see coco/license.txt] +**************************************************************************/ +#include "maskApi.h" +#include +#include + +uint umin( uint a, uint b ) { return (ab) ? a : b; } + +void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { + R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); + siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; +} + +void rleFree( RLE *R ) { + free(R->cnts); R->cnts=0; +} + +void rlesInit( RLE **R, siz n ) { + siz i; *R = (RLE*) malloc(sizeof(RLE)*n); + for(i=0; i0 ) { + c=umin(ca,cb); cc+=c; ct=0; + ca-=c; if(!ca && a0) { + crowd=iscrowd!=NULL && iscrowd[g]; + if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } + siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; + ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; + cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; + while( ct>0 ) { + c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; + ca-=c; if(!ca && athr) keep[j]=0; + } + } +} + +void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { + double h, w, i, u, ga, da; siz g, d; int crowd; + for( g=0; gthr) keep[j]=0; + } + } +} + +void rleToBbox( const RLE *R, BB bb, siz n ) { + siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye); + if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } + s = dx>=dy ? 
(double)(ye-ys)/dx : (double)(xe-xs)/dy; + if(dx>=dy) for( d=0; d<=dx; d++ ) { + t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; + } else for( d=0; d<=dy; d++ ) { + t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; + } + } + /* get points along y-boundary and downsample */ + free(x); free(y); k=m; m=0; double xd, yd; + x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); + for( j=1; jw-1 ) continue; + yd=(double)(v[j]h) yd=h; yd=ceil(yd); + x[m]=(int) xd; y[m]=(int) yd; m++; + } + /* compute rle encoding given y-boundary points */ + k=m; a=malloc(sizeof(uint)*(k+1)); + for( j=0; j0) b[m++]=a[j++]; else { + j++; if(jm, p=0; long x; int more; + char *s=malloc(sizeof(char)*m*6); + for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; + while( more ) { + char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; + if(more) c |= 0x20; c+=48; s[p++]=c; + } + } + s[p]=0; return s; +} + +void rleFrString( RLE *R, char *s, siz h, siz w ) { + siz m=0, p=0, k; long x; int more; uint *cnts; + while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; + while( s[p] ) { + x=0; k=0; more=1; + while( more ) { + char c=s[p]-48; x |= (c & 0x1f) << 5*k; + more = c & 0x20; p++; k++; + if(!more && (c & 0x10)) x |= -1 << 5*k; + } + if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; + } + rleInit(R,h,w,m,cnts); free(cnts); +} diff --git a/insightface/detection/retinaface/rcnn/pycocotools/maskApi.h b/insightface/detection/retinaface/rcnn/pycocotools/maskApi.h new file mode 100644 index 0000000000000000000000000000000000000000..ebc7892da38289b459d6be824e1f849878bd4069 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/maskApi.h @@ -0,0 +1,60 @@ +/************************************************************************** +* Microsoft COCO Toolbox. version 2.0 +* Data, paper, and tutorials available at: http://mscoco.org/ +* Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
+* Licensed under the Simplified BSD License [see coco/license.txt] +**************************************************************************/ +#pragma once + +typedef unsigned int uint; +typedef unsigned long siz; +typedef unsigned char byte; +typedef double* BB; +typedef struct { siz h, w, m; uint *cnts; } RLE; + +/* Initialize/destroy RLE. */ +void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); +void rleFree( RLE *R ); + +/* Initialize/destroy RLE array. */ +void rlesInit( RLE **R, siz n ); +void rlesFree( RLE **R, siz n ); + +/* Encode binary masks using RLE. */ +void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); + +/* Decode binary masks encoded via RLE. */ +void rleDecode( const RLE *R, byte *mask, siz n ); + +/* Compute union or intersection of encoded masks. */ +void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); + +/* Compute area of encoded masks. */ +void rleArea( const RLE *R, siz n, uint *a ); + +/* Compute intersection over union between masks. */ +void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); + +/* Compute non-maximum suppression between bounding masks */ +void rleNms( RLE *dt, siz n, uint *keep, double thr ); + +/* Compute intersection over union between bounding boxes. */ +void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); + +/* Compute non-maximum suppression between bounding boxes */ +void bbNms( BB dt, siz n, uint *keep, double thr ); + +/* Get bounding boxes surrounding encoded masks. */ +void rleToBbox( const RLE *R, BB bb, siz n ); + +/* Convert bounding boxes to encoded masks. */ +void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); + +/* Convert polygon to encoded mask. */ +void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); + +/* Get compressed string representation of encoded mask. */ +char* rleToString( const RLE *R ); + +/* Convert from compressed string representation of encoded mask. 
*/ +void rleFrString( RLE *R, char *s, siz h, siz w ); diff --git a/insightface/detection/retinaface/rcnn/pycocotools/setup.py b/insightface/detection/retinaface/rcnn/pycocotools/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..d1e0d2e89c09e226d43e9e5d0adad4e48e34e6af --- /dev/null +++ b/insightface/detection/retinaface/rcnn/pycocotools/setup.py @@ -0,0 +1,18 @@ +from distutils.core import setup +from Cython.Build import cythonize +from distutils.extension import Extension +import numpy as np + +# To compile and install locally run "python setup.py build_ext --inplace" +# To install library to Python site-packages run "python setup.py build_ext install" + +ext_modules = [ + Extension( + '_mask', + sources=['maskApi.c', '_mask.pyx'], + include_dirs=[np.get_include()], + extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], + ) +] + +setup(name='pycocotools', ext_modules=cythonize(ext_modules)) diff --git a/insightface/detection/retinaface/rcnn/sample_config.py b/insightface/detection/retinaface/rcnn/sample_config.py new file mode 100644 index 0000000000000000000000000000000000000000..88fff9f0431b33ddf5f283595cdcae79709e7bba --- /dev/null +++ b/insightface/detection/retinaface/rcnn/sample_config.py @@ -0,0 +1,351 @@ +import numpy as np +from easydict import EasyDict as edict + +config = edict() + +# network related params +config.PIXEL_MEANS = np.array([103.939, 116.779, 123.68]) +config.PIXEL_STDS = np.array([1.0, 1.0, 1.0]) +config.PIXEL_SCALE = 1.0 +config.IMAGE_STRIDE = 0 + +# dataset related params +config.NUM_CLASSES = 2 +config.PRE_SCALES = [(1200, 1600) + ] # first is scale (the shorter side); second is max size +config.SCALES = [(640, 640) + ] # first is scale (the shorter side); second is max size +#config.SCALES = [(800, 800)] # first is scale (the shorter side); second is max size +config.ORIGIN_SCALE = False + +_ratio = (1., ) + +RAC_SSH = { + '32': { + 'SCALES': (32, 16), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, 
+ 'ALLOWED_BORDER': 9999 + }, + '16': { + 'SCALES': (8, 4), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '8': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, +} + +_ratio = (1., 1.5) +RAC_SSH2 = { + '32': { + 'SCALES': (32, 16), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '16': { + 'SCALES': (8, 4), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '8': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, +} + +_ratio = (1., 1.5) +RAC_SSH3 = { + '32': { + 'SCALES': (32, 16), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '16': { + 'SCALES': (8, 4), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '8': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '4': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, +} + +RAC_RETINA = {} +_ratios = (1.0, ) +_ass = 2.0**(1.0 / 3) +_basescale = 1.0 +for _stride in [4, 8, 16, 32, 64]: + key = str(_stride) + value = {'BASE_SIZE': 16, 'RATIOS': _ratios, 'ALLOWED_BORDER': 9999} + scales = [] + for _ in range(3): + scales.append(_basescale) + _basescale *= _ass + value['SCALES'] = tuple(scales) + RAC_RETINA[key] = value + +config.RPN_ANCHOR_CFG = RAC_SSH #default + +config.NET_MODE = 2 +config.HEAD_MODULE = 'SSH' +#config.HEAD_MODULE = 'RF' +config.LR_MODE = 0 +config.LANDMARK_LR_MULT = 2.0 +config.HEAD_FILTER_NUM = 256 +config.CONTEXT_FILTER_RATIO = 1 +config.max_feat_channel = 9999 + +config.USE_CROP = True +config.USE_FPN = True +config.USE_DCN = 0 +config.FACE_LANDMARK = True +config.USE_OCCLUSION = False +config.USE_BLUR = False +config.MORE_SMALL_BOX = True + +config.LAYER_FIX = False + +config.CASCADE = 0 +config.CASCADE_MODE = 1 +#config.CASCADE_CLS_STRIDES = [16,8,4] +#config.CASCADE_BBOX_STRIDES = [64,32] +config.CASCADE_CLS_STRIDES = [64, 32, 
16, 8, 4] +config.CASCADE_BBOX_STRIDES = [64, 32, 16, 8, 4] +#config.CASCADE_BBOX_STRIDES = [64,32,16,8] + +config.HEAD_BOX = False +config.DENSE_ANCHOR = False +config.USE_MAXOUT = 0 +config.SHARE_WEIGHT_BBOX = False +config.SHARE_WEIGHT_LANDMARK = False + +config.RANDOM_FEAT_STRIDE = False +config.NUM_CPU = 4 +config.MIXUP = 0.0 +config.USE_3D = False + +#config.BBOX_MASK_THRESH = 0 +config.COLOR_MODE = 2 +config.COLOR_JITTERING = 0.125 +#config.COLOR_JITTERING = 0 +#config.COLOR_JITTERING = 0.2 + +config.TRAIN = edict() + +config.TRAIN.IMAGE_ALIGN = 0 +config.TRAIN.MIN_BOX_SIZE = 0 +config.BBOX_MASK_THRESH = config.TRAIN.MIN_BOX_SIZE +# R-CNN and RPN +# size of images for each device, 2 for rcnn, 1 for rpn and e2e +config.TRAIN.BATCH_IMAGES = 8 +# e2e changes behavior of anchor loader and metric +config.TRAIN.END2END = True +# group images with similar aspect ratio +config.TRAIN.ASPECT_GROUPING = False + +# RPN anchor loader +# rpn anchors batch size +config.TRAIN.RPN_ENABLE_OHEM = 2 +config.TRAIN.OHEM_MODE = 1 +config.TRAIN.RPN_BATCH_SIZE = 256 +# rpn anchors sampling params +config.TRAIN.RPN_FG_FRACTION = 0.25 +config.TRAIN.RPN_POSITIVE_OVERLAP = 0.5 +config.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 +if config.CASCADE > 0: + config.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 +config.TRAIN.CASCADE_OVERLAP = [0.4, 0.5] +config.TRAIN.RPN_CLOBBER_POSITIVES = False +config.TRAIN.RPN_FORCE_POSITIVE = False +# rpn bounding box regression params +config.TRAIN.BBOX_STDS = (1.0, 1.0, 1.0, 1.0) +config.TRAIN.LANDMARK_STD = 1.0 + +config.TEST = edict() + +# R-CNN testing +# use rpn to generate proposal +config.TEST.HAS_RPN = False +# size of images for each device +config.TEST.BATCH_IMAGES = 1 + +# RPN proposal +config.TEST.CXX_PROPOSAL = True +config.TEST.RPN_NMS_THRESH = 0.3 +config.TEST.RPN_PRE_NMS_TOP_N = 1000 +config.TEST.RPN_POST_NMS_TOP_N = 3000 +#config.TEST.RPN_MIN_SIZE = config.RPN_FEAT_STRIDE +#config.TEST.RPN_MIN_SIZE = [0,0,0] + +# RCNN nms +config.TEST.NMS = 0.3 + 
+config.TEST.SCORE_THRESH = 0.05 +config.TEST.IOU_THRESH = 0.5 + +# network settings +network = edict() + +network.ssh = edict() + +network.mnet = edict() +#network.mnet.pretrained = 'model/mnasnet' +#network.mnet.pretrained = 'model/mobilenetv2_0_5' +#network.mnet.pretrained = 'model/mobilenet_0_5' +#network.mnet.MULTIPLIER = 0.5 +#network.mnet.pretrained = 'model/mobilenet_0_25' +#network.mnet.pretrained_epoch = 0 +#network.mnet.PIXEL_MEANS = np.array([0.406, 0.456, 0.485]) +#network.mnet.PIXEL_STDS = np.array([0.225, 0.224, 0.229]) +#network.mnet.PIXEL_SCALE = 255.0 +network.mnet.FIXED_PARAMS = ['^stage1', '^.*upsampling'] +network.mnet.BATCH_IMAGES = 16 +network.mnet.HEAD_FILTER_NUM = 64 +network.mnet.CONTEXT_FILTER_RATIO = 1 + +network.mnet.PIXEL_MEANS = np.array([0.0, 0.0, 0.0]) +network.mnet.PIXEL_STDS = np.array([1.0, 1.0, 1.0]) +network.mnet.PIXEL_SCALE = 1.0 +#network.mnet.pretrained = 'model/mobilenetfd_0_25' #78 +#network.mnet.pretrained = 'model/mobilenetfd2' #75 +network.mnet.pretrained = 'model/mobilenet025fd0' #78 +#network.mnet.pretrained = 'model/mobilenet025fd1' #75 +#network.mnet.pretrained = 'model/mobilenet025fd2' # +network.mnet.pretrained_epoch = 0 +network.mnet.max_feat_channel = 8888 +network.mnet.COLOR_MODE = 1 +network.mnet.USE_CROP = True +network.mnet.RPN_ANCHOR_CFG = RAC_SSH +network.mnet.LAYER_FIX = True +network.mnet.LANDMARK_LR_MULT = 2.5 + +network.resnet = edict() +#network.resnet.pretrained = 'model/ResNet50_v1d' +#network.resnet.pretrained = 'model/resnet-50' +network.resnet.pretrained = 'model/resnet-152' +#network.resnet.pretrained = 'model/senet154' +#network.resnet.pretrained = 'model/densenet161' +network.resnet.pretrained_epoch = 0 +#network.mnet.PIXEL_MEANS = np.array([103.939, 116.779, 123.68]) +#network.mnet.PIXEL_STDS = np.array([57.375, 57.12, 58.393]) +#network.resnet.PIXEL_MEANS = np.array([0.406, 0.456, 0.485]) +#network.resnet.PIXEL_STDS = np.array([0.225, 0.224, 0.229]) +#network.resnet.PIXEL_SCALE = 255.0 
+network.resnet.lr_step = '1,2,3,4,5,55,68,80' +network.resnet.lr = 0.001 +network.resnet.PIXEL_MEANS = np.array([0.0, 0.0, 0.0]) +network.resnet.PIXEL_STDS = np.array([1.0, 1.0, 1.0]) +network.resnet.PIXEL_SCALE = 1.0 +network.resnet.FIXED_PARAMS = ['^stage1', '^.*upsampling'] +network.resnet.BATCH_IMAGES = 8 +network.resnet.HEAD_FILTER_NUM = 256 +network.resnet.CONTEXT_FILTER_RATIO = 1 +network.resnet.USE_DCN = 2 +network.resnet.RPN_BATCH_SIZE = 256 +network.resnet.RPN_ANCHOR_CFG = RAC_RETINA + +network.resnet.USE_DCN = 0 +network.resnet.pretrained = 'model/resnet-50' +network.resnet.RPN_ANCHOR_CFG = RAC_SSH + +# dataset settings +dataset = edict() + +dataset.widerface = edict() +dataset.widerface.dataset = 'widerface' +dataset.widerface.image_set = 'train' +dataset.widerface.test_image_set = 'val' +dataset.widerface.root_path = 'data' +dataset.widerface.dataset_path = 'data/widerface' +dataset.widerface.NUM_CLASSES = 2 + +dataset.retinaface = edict() +dataset.retinaface.dataset = 'retinaface' +dataset.retinaface.image_set = 'train' +dataset.retinaface.test_image_set = 'val' +dataset.retinaface.root_path = 'data' +dataset.retinaface.dataset_path = 'data/retinaface' +dataset.retinaface.NUM_CLASSES = 2 + +# default settings +default = edict() + +config.FIXED_PARAMS = ['^conv1', '^conv2', '^conv3', '^.*upsampling'] +#config.FIXED_PARAMS = ['^.*upsampling'] +#config.FIXED_PARAMS = ['^conv1', '^conv2', '^conv3'] +#config.FIXED_PARAMS = ['^conv0', '^stage1', 'gamma', 'beta'] #for resnet + +# default network +default.network = 'resnet' +default.pretrained = 'model/resnet-152' +#default.network = 'resnetssh' +default.pretrained_epoch = 0 +# default dataset +default.dataset = 'retinaface' +default.image_set = 'train' +default.test_image_set = 'val' +default.root_path = 'data' +default.dataset_path = 'data/retinaface' +# default training +default.frequent = 20 +default.kvstore = 'device' +# default e2e +default.prefix = 'model/retinaface' +default.end_epoch = 10000 
+default.lr_step = '55,68,80' +default.lr = 0.01 +default.wd = 0.0005 + + +def generate_config(_network, _dataset): + for k, v in network[_network].items(): + if k in config: + config[k] = v + elif k in default: + default[k] = v + if k in config.TRAIN: + config.TRAIN[k] = v + for k, v in dataset[_dataset].items(): + if k in config: + config[k] = v + elif k in default: + default[k] = v + if k in config.TRAIN: + config.TRAIN[k] = v + config.network = _network + config.dataset = _dataset + config.RPN_FEAT_STRIDE = [] + num_anchors = [] + for k in config.RPN_ANCHOR_CFG: + config.RPN_FEAT_STRIDE.append(int(k)) + _num_anchors = len(config.RPN_ANCHOR_CFG[k]['SCALES']) * len( + config.RPN_ANCHOR_CFG[k]['RATIOS']) + if config.DENSE_ANCHOR: + _num_anchors *= 2 + config.RPN_ANCHOR_CFG[k]['NUM_ANCHORS'] = _num_anchors + num_anchors.append(_num_anchors) + config.RPN_FEAT_STRIDE = sorted(config.RPN_FEAT_STRIDE, reverse=True) + for j in range(1, len(num_anchors)): + assert num_anchors[0] == num_anchors[j] + config.NUM_ANCHORS = num_anchors[0] diff --git a/insightface/detection/retinaface/rcnn/symbol/__init__.py b/insightface/detection/retinaface/rcnn/symbol/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ed3e17686f9f1f7cc600189409ac651c359311e --- /dev/null +++ b/insightface/detection/retinaface/rcnn/symbol/__init__.py @@ -0,0 +1,3 @@ +from .symbol_ssh import * +from .symbol_mnet import * +from .symbol_resnet import * diff --git a/insightface/detection/retinaface/rcnn/symbol/pyramidbox.py b/insightface/detection/retinaface/rcnn/symbol/pyramidbox.py new file mode 100644 index 0000000000000000000000000000000000000000..64ae7ce94bdd59b4578a36ab566a06265227d2ee --- /dev/null +++ b/insightface/detection/retinaface/rcnn/symbol/pyramidbox.py @@ -0,0 +1,489 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import six +import paddle.fluid as fluid +from 
paddle.fluid.param_attr import ParamAttr +from paddle.fluid.initializer import Xavier +from paddle.fluid.initializer import Constant +from paddle.fluid.initializer import Bilinear +from paddle.fluid.regularizer import L2Decay + + +def conv_bn(input, + filter, + ksize, + stride, + padding, + act='relu', + bias_attr=False): + conv = fluid.layers.conv2d(input=input, + filter_size=ksize, + num_filters=filter, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=conv, act=act) + + +def conv_block(input, groups, filters, ksizes, strides=None, with_pool=True): + assert len(filters) == groups + assert len(ksizes) == groups + strides = [1] * groups if strides is None else strides + w_attr = ParamAttr(learning_rate=1., initializer=Xavier()) + b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) + conv = input + for i in six.moves.xrange(groups): + conv = fluid.layers.conv2d(input=conv, + num_filters=filters[i], + filter_size=ksizes[i], + stride=strides[i], + padding=(ksizes[i] - 1) // 2, + param_attr=w_attr, + bias_attr=b_attr, + act='relu') + if with_pool: + pool = fluid.layers.pool2d(input=conv, + pool_size=2, + pool_type='max', + pool_stride=2, + ceil_mode=True) + return conv, pool + else: + return conv + + +class PyramidBox(object): + def __init__(self, + data_shape, + num_classes=None, + use_transposed_conv2d=True, + is_infer=False, + sub_network=False): + """ + TODO(qingqing): add comments. + """ + self.data_shape = data_shape + self.min_sizes = [16., 32., 64., 128., 256., 512.] + self.steps = [4., 8., 16., 32., 64., 128.] 
+ self.num_classes = num_classes + self.use_transposed_conv2d = use_transposed_conv2d + self.is_infer = is_infer + self.sub_network = sub_network + + # the base network is VGG with atrous layers + self._input() + self._vgg() + if sub_network: + self._low_level_fpn() + self._cpm_module() + self._pyramidbox() + else: + self._vgg_ssd() + + def feeds(self): + if self.is_infer: + return [self.image] + else: + return [self.image, self.face_box, self.head_box, self.gt_label] + + def _input(self): + self.image = fluid.layers.data(name='image', + shape=self.data_shape, + dtype='float32') + if not self.is_infer: + self.face_box = fluid.layers.data(name='face_box', + shape=[4], + dtype='float32', + lod_level=1) + self.head_box = fluid.layers.data(name='head_box', + shape=[4], + dtype='float32', + lod_level=1) + self.gt_label = fluid.layers.data(name='gt_label', + shape=[1], + dtype='int32', + lod_level=1) + + def _vgg(self): + self.conv1, self.pool1 = conv_block(self.image, 2, [64] * 2, [3] * 2) + self.conv2, self.pool2 = conv_block(self.pool1, 2, [128] * 2, [3] * 2) + + #priorbox min_size is 16 + self.conv3, self.pool3 = conv_block(self.pool2, 3, [256] * 3, [3] * 3) + #priorbox min_size is 32 + self.conv4, self.pool4 = conv_block(self.pool3, 3, [512] * 3, [3] * 3) + #priorbox min_size is 64 + self.conv5, self.pool5 = conv_block(self.pool4, 3, [512] * 3, [3] * 3) + + # fc6 and fc7 in paper, priorbox min_size is 128 + self.conv6 = conv_block(self.pool5, + 2, [1024, 1024], [3, 1], + with_pool=False) + # conv6_1 and conv6_2 in paper, priorbox min_size is 256 + self.conv7 = conv_block(self.conv6, + 2, [256, 512], [1, 3], [1, 2], + with_pool=False) + # conv7_1 and conv7_2 in paper, priorbox mini_size is 512 + self.conv8 = conv_block(self.conv7, + 2, [128, 256], [1, 3], [1, 2], + with_pool=False) + + def _low_level_fpn(self): + """ + Low-level feature pyramid network. 
+ """ + def fpn(up_from, up_to): + ch = up_to.shape[1] + b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) + conv1 = fluid.layers.conv2d(up_from, + ch, + 1, + act='relu', + bias_attr=b_attr) + if self.use_transposed_conv2d: + w_attr = ParamAttr(learning_rate=0., + regularizer=L2Decay(0.), + initializer=Bilinear()) + upsampling = fluid.layers.conv2d_transpose(conv1, + ch, + output_size=None, + filter_size=4, + padding=1, + stride=2, + groups=ch, + param_attr=w_attr, + bias_attr=False, + use_cudnn=True) + else: + upsampling = fluid.layers.resize_bilinear( + conv1, out_shape=up_to.shape[2:]) + + conv2 = fluid.layers.conv2d(up_to, + ch, + 1, + act='relu', + bias_attr=b_attr) + if self.is_infer: + upsampling = fluid.layers.crop(upsampling, shape=conv2) + # eltwise mul + conv_fuse = upsampling * conv2 + return conv_fuse + + self.lfpn2_on_conv5 = fpn(self.conv6, self.conv5) + self.lfpn1_on_conv4 = fpn(self.lfpn2_on_conv5, self.conv4) + self.lfpn0_on_conv3 = fpn(self.lfpn1_on_conv4, self.conv3) + + def _cpm_module(self): + """ + Context-sensitive Prediction Module + """ + def cpm(input): + # residual + branch1 = conv_bn(input, 1024, 1, 1, 0, None) + branch2a = conv_bn(input, 256, 1, 1, 0, act='relu') + branch2b = conv_bn(branch2a, 256, 3, 1, 1, act='relu') + branch2c = conv_bn(branch2b, 1024, 1, 1, 0, None) + sum = branch1 + branch2c + rescomb = fluid.layers.relu(x=sum) + + # ssh + b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) + ssh_1 = fluid.layers.conv2d(rescomb, + 256, + 3, + 1, + 1, + bias_attr=b_attr) + ssh_dimred = fluid.layers.conv2d(rescomb, + 128, + 3, + 1, + 1, + act='relu', + bias_attr=b_attr) + ssh_2 = fluid.layers.conv2d(ssh_dimred, + 128, + 3, + 1, + 1, + bias_attr=b_attr) + ssh_3a = fluid.layers.conv2d(ssh_dimred, + 128, + 3, + 1, + 1, + act='relu', + bias_attr=b_attr) + ssh_3b = fluid.layers.conv2d(ssh_3a, + 128, + 3, + 1, + 1, + bias_attr=b_attr) + + ssh_concat = fluid.layers.concat([ssh_1, ssh_2, ssh_3b], axis=1) + ssh_out = 
fluid.layers.relu(x=ssh_concat) + return ssh_out + + self.ssh_conv3 = cpm(self.lfpn0_on_conv3) + self.ssh_conv4 = cpm(self.lfpn1_on_conv4) + self.ssh_conv5 = cpm(self.lfpn2_on_conv5) + self.ssh_conv6 = cpm(self.conv6) + self.ssh_conv7 = cpm(self.conv7) + self.ssh_conv8 = cpm(self.conv8) + + def _l2_norm_scale(self, input, init_scale=1.0, channel_shared=False): + from paddle.fluid.layer_helper import LayerHelper + helper = LayerHelper("Scale") + l2_norm = fluid.layers.l2_normalize(input, + axis=1) # l2 norm along channel + shape = [1] if channel_shared else [input.shape[1]] + scale = helper.create_parameter( + attr=helper.param_attr, + shape=shape, + dtype=input.dtype, + default_initializer=Constant(init_scale)) + out = fluid.layers.elementwise_mul(x=l2_norm, + y=scale, + axis=-1 if channel_shared else 1) + return out + + def _pyramidbox(self): + """ + Get prior-boxes and pyramid-box + """ + self.ssh_conv3_norm = self._l2_norm_scale(self.ssh_conv3, + init_scale=10.) + self.ssh_conv4_norm = self._l2_norm_scale(self.ssh_conv4, + init_scale=8.) + self.ssh_conv5_norm = self._l2_norm_scale(self.ssh_conv5, + init_scale=5.) 
+ + def permute_and_reshape(input, last_dim): + trans = fluid.layers.transpose(input, perm=[0, 2, 3, 1]) + compile_shape = [ + trans.shape[0], + np.prod(trans.shape[1:]) // last_dim, last_dim + ] + run_shape = fluid.layers.assign( + np.array([0, -1, last_dim]).astype("int32")) + return fluid.layers.reshape(trans, + shape=compile_shape, + actual_shape=run_shape) + + face_locs, face_confs = [], [] + head_locs, head_confs = [], [] + boxes, vars = [], [] + inputs = [ + self.ssh_conv3_norm, self.ssh_conv4_norm, self.ssh_conv5_norm, + self.ssh_conv6, self.ssh_conv7, self.ssh_conv8 + ] + b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) + for i, input in enumerate(inputs): + mbox_loc = fluid.layers.conv2d(input, 8, 3, 1, 1, bias_attr=b_attr) + face_loc, head_loc = fluid.layers.split(mbox_loc, + num_or_sections=2, + dim=1) + face_loc = permute_and_reshape(face_loc, 4) + head_loc = permute_and_reshape(head_loc, 4) + + mbox_conf = fluid.layers.conv2d(input, + 6, + 3, + 1, + 1, + bias_attr=b_attr) + face_conf1, face_conf3, head_conf = fluid.layers.split( + mbox_conf, num_or_sections=[1, 3, 2], dim=1) + face_conf3_maxin = fluid.layers.reduce_max(face_conf3, + dim=1, + keep_dim=True) + face_conf = fluid.layers.concat([face_conf1, face_conf3_maxin], + axis=1) + + face_conf = permute_and_reshape(face_conf, 2) + head_conf = permute_and_reshape(head_conf, 2) + + face_locs.append(face_loc) + face_confs.append(face_conf) + + head_locs.append(head_loc) + head_confs.append(head_conf) + + box, var = fluid.layers.prior_box(input, + self.image, + min_sizes=[self.min_sizes[i]], + steps=[self.steps[i]] * 2, + aspect_ratios=[1.], + clip=False, + flip=True, + offset=0.5) + box = fluid.layers.reshape(box, shape=[-1, 4]) + var = fluid.layers.reshape(var, shape=[-1, 4]) + + boxes.append(box) + vars.append(var) + + self.face_mbox_loc = fluid.layers.concat(face_locs, axis=1) + self.face_mbox_conf = fluid.layers.concat(face_confs, axis=1) + + self.head_mbox_loc = 
fluid.layers.concat(head_locs, axis=1) + self.head_mbox_conf = fluid.layers.concat(head_confs, axis=1) + + self.prior_boxes = fluid.layers.concat(boxes) + self.box_vars = fluid.layers.concat(vars) + + def _vgg_ssd(self): + self.conv3_norm = self._l2_norm_scale(self.conv3, init_scale=10.) + self.conv4_norm = self._l2_norm_scale(self.conv4, init_scale=8.) + self.conv5_norm = self._l2_norm_scale(self.conv5, init_scale=5.) + + def permute_and_reshape(input, last_dim): + trans = fluid.layers.transpose(input, perm=[0, 2, 3, 1]) + compile_shape = [ + trans.shape[0], + np.prod(trans.shape[1:]) // last_dim, last_dim + ] + run_shape = fluid.layers.assign( + np.array([0, -1, last_dim]).astype("int32")) + return fluid.layers.reshape(trans, + shape=compile_shape, + actual_shape=run_shape) + + locs, confs = [], [] + boxes, vars = [], [] + b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) + + # conv3 + mbox_loc = fluid.layers.conv2d(self.conv3_norm, + 4, + 3, + 1, + 1, + bias_attr=b_attr) + loc = permute_and_reshape(mbox_loc, 4) + mbox_conf = fluid.layers.conv2d(self.conv3_norm, + 4, + 3, + 1, + 1, + bias_attr=b_attr) + conf1, conf3 = fluid.layers.split(mbox_conf, + num_or_sections=[1, 3], + dim=1) + conf3_maxin = fluid.layers.reduce_max(conf3, dim=1, keep_dim=True) + conf = fluid.layers.concat([conf1, conf3_maxin], axis=1) + conf = permute_and_reshape(conf, 2) + box, var = fluid.layers.prior_box(self.conv3_norm, + self.image, + min_sizes=[16.], + steps=[4, 4], + aspect_ratios=[1.], + clip=False, + flip=True, + offset=0.5) + box = fluid.layers.reshape(box, shape=[-1, 4]) + var = fluid.layers.reshape(var, shape=[-1, 4]) + + locs.append(loc) + confs.append(conf) + boxes.append(box) + vars.append(var) + + min_sizes = [32., 64., 128., 256., 512.] + steps = [8., 16., 32., 64., 128.] 
+ inputs = [ + self.conv4_norm, self.conv5_norm, self.conv6, self.conv7, + self.conv8 + ] + for i, input in enumerate(inputs): + mbox_loc = fluid.layers.conv2d(input, 4, 3, 1, 1, bias_attr=b_attr) + loc = permute_and_reshape(mbox_loc, 4) + + mbox_conf = fluid.layers.conv2d(input, + 2, + 3, + 1, + 1, + bias_attr=b_attr) + conf = permute_and_reshape(mbox_conf, 2) + box, var = fluid.layers.prior_box(input, + self.image, + min_sizes=[min_sizes[i]], + steps=[steps[i]] * 2, + aspect_ratios=[1.], + clip=False, + flip=True, + offset=0.5) + box = fluid.layers.reshape(box, shape=[-1, 4]) + var = fluid.layers.reshape(var, shape=[-1, 4]) + + locs.append(loc) + confs.append(conf) + boxes.append(box) + vars.append(var) + + self.face_mbox_loc = fluid.layers.concat(locs, axis=1) + self.face_mbox_conf = fluid.layers.concat(confs, axis=1) + self.prior_boxes = fluid.layers.concat(boxes) + self.box_vars = fluid.layers.concat(vars) + + def vgg_ssd_loss(self): + loss = fluid.layers.ssd_loss(self.face_mbox_loc, + self.face_mbox_conf, + self.face_box, + self.gt_label, + self.prior_boxes, + self.box_vars, + overlap_threshold=0.35, + neg_overlap=0.35) + loss = fluid.layers.reduce_sum(loss) + return loss + + def train(self): + face_loss = fluid.layers.ssd_loss(self.face_mbox_loc, + self.face_mbox_conf, + self.face_box, + self.gt_label, + self.prior_boxes, + self.box_vars, + overlap_threshold=0.35, + neg_overlap=0.35) + face_loss.persistable = True + head_loss = fluid.layers.ssd_loss(self.head_mbox_loc, + self.head_mbox_conf, + self.head_box, + self.gt_label, + self.prior_boxes, + self.box_vars, + overlap_threshold=0.35, + neg_overlap=0.35) + head_loss.persistable = True + face_loss = fluid.layers.reduce_sum(face_loss) + face_loss.persistable = True + head_loss = fluid.layers.reduce_sum(head_loss) + head_loss.persistable = True + total_loss = face_loss + head_loss + total_loss.persistable = True + return face_loss, head_loss, total_loss + + def infer(self, main_program=None): + if 
main_program is None: + test_program = fluid.default_main_program().clone(for_test=True) + else: + test_program = main_program.clone(for_test=True) + with fluid.program_guard(test_program): + face_nmsed_out = fluid.layers.detection_output( + self.face_mbox_loc, + self.face_mbox_conf, + self.prior_boxes, + self.box_vars, + nms_threshold=0.3, + nms_top_k=5000, + keep_top_k=750, + score_threshold=0.01) + return test_program, face_nmsed_out diff --git a/insightface/detection/retinaface/rcnn/symbol/symbol_common.py b/insightface/detection/retinaface/rcnn/symbol/symbol_common.py new file mode 100644 index 0000000000000000000000000000000000000000..1343dd963ed4d472b1556ad10bc1a50cf5a86d01 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/symbol/symbol_common.py @@ -0,0 +1,757 @@ +import mxnet as mx +import mxnet.ndarray as nd +import numpy as np +from rcnn.config import config +from rcnn.PY_OP import rpn_fpn_ohem3, cascade_refine + +PREFIX = 'RF' +F1 = 0 +F2 = 0 +_bwm = 1.0 + +def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), bias_wd_mult=0.0, shared_weight=None, shared_bias = None): + if shared_weight is None: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + else: + weight = shared_weight + bias = shared_bias + print('reuse shared var in', name) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias) + return conv + + +def conv_deformable(net, num_filter, num_group=1, act_type='relu', name=''): + if config.USE_DCN == 1: + f = num_group * 18 + conv_offset = mx.symbol.Convolution(name=name + '_conv_offset', + data=net, + num_filter=f, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1)) + net = 
mx.contrib.symbol.DeformableConvolution( + name=name + "_conv", + data=net, + offset=conv_offset, + num_filter=num_filter, + pad=(1, 1), + kernel=(3, 3), + num_deformable_group=num_group, + stride=(1, 1), + no_bias=False) + else: + print('use dcnv2 at', name) + lr_mult = 0.1 + weight_var = mx.sym.Variable(name=name + '_conv2_offset_weight', + init=mx.init.Zero(), + lr_mult=lr_mult) + bias_var = mx.sym.Variable(name=name + '_conv2_offset_bias', + init=mx.init.Zero(), + lr_mult=lr_mult) + conv2_offset = mx.symbol.Convolution(name=name + '_conv2_offset', + data=net, + num_filter=27, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1), + weight=weight_var, + bias=bias_var, + lr_mult=lr_mult) + conv2_offset_t = mx.sym.slice_axis(conv2_offset, + axis=1, + begin=0, + end=18) + conv2_mask = mx.sym.slice_axis(conv2_offset, + axis=1, + begin=18, + end=None) + conv2_mask = 2 * mx.sym.Activation(conv2_mask, act_type='sigmoid') + + conv2 = mx.contrib.symbol.ModulatedDeformableConvolution( + name=name + '_conv2', + data=net, + offset=conv2_offset_t, + mask=conv2_mask, + num_filter=num_filter, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1), + num_deformable_group=num_group, + no_bias=True) + net = conv2 + net = mx.sym.BatchNorm(data=net, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + if len(act_type) > 0: + net = mx.symbol.Activation(data=net, + act_type=act_type, + name=name + '_act') + return net + +def conv_act_layer_dw(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0): + assert kernel[0] == 3 + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, num_group=num_filter, 
name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + if len(act_type) > 0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, separable=False, filter_in = -1): + + if config.USE_DCN > 1 and kernel == (3, 3) and pad == ( + 1, 1) and stride == (1, 1) and not separable: + return conv_deformable(from_layer, + num_filter, + num_group=1, + act_type=act_type, + name=name) + + if separable: + assert kernel[0] > 1 + assert filter_in > 0 + if not separable: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + else: + if filter_in < 0: + filter_in = num_filter + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=filter_in, num_group=filter_in, name="{}_sep".format(name)) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_sep_bn') + conv = mx.symbol.Activation(data=conv, act_type='relu', \ + name="{}_sep_bn_relu".format(name)) + conv = mx.symbol.Convolution(data=conv, kernel=(1,1), pad=(0,0), \ + stride=(1,1), num_filter=num_filter, name="{}".format(name)) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + if len(act_type) 
> 0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + + +def ssh_context_module(body, num_filter, filter_in, name): + conv_dimred = conv_act_layer(body, + name + '_conv1', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + separable=False, + filter_in=filter_in) + conv5x5 = conv_act_layer(conv_dimred, + name + '_conv2', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + separable=False) + conv7x7_1 = conv_act_layer(conv_dimred, + name + '_conv3_1', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + separable=False) + conv7x7 = conv_act_layer(conv7x7_1, + name + '_conv3_2', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + separable=False) + return (conv5x5, conv7x7) + + +def ssh_detection_module(body, num_filter, filter_in, name): + assert num_filter % 4 == 0 + conv3x3 = conv_act_layer(body, + name + '_conv1', + num_filter // 2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + separable=False, + filter_in=filter_in) + #_filter = max(num_filter//4, 16) + _filter = num_filter // 4 + conv5x5, conv7x7 = ssh_context_module(body, _filter, filter_in, + name + '_context') + ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], + dim=1, + name=name + '_concat') + ret = mx.symbol.Activation(data=ret, + act_type='relu', + name=name + '_concat_relu') + out_filter = num_filter // 2 + _filter * 2 + if config.USE_DCN > 0: + ret = conv_deformable(ret, + num_filter=out_filter, + name=name + '_concat_dcn') + return ret + + +#def retina_context_module(body, kernel, num_filter, filter_in, name): +# conv_dimred = conv_act_layer(body, name+'_conv0', +# num_filter, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in) +# conv1 = conv_act_layer(conv_dimred, name+'_conv1', +# num_filter*6, kernel=(1,1), pad=(0,0), stride=(1, 1), 
act_type='relu', separable=False, filter_in = filter_in) +# conv2 = conv_act_layer(conv1, name+'_conv2', +# num_filter*6, kernel=kernel, pad=((kernel[0]-1)//2, (kernel[1]-1)//2), stride=(1, 1), act_type='relu', separable=True, filter_in = num_filter*6) +# conv3 = conv_act_layer(conv2, name+'_conv3', +# num_filter, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False) +# conv3 = conv3 + conv_dimred +# return conv3 + + +def retina_detection_module(body, num_filter, filter_in, name): + assert num_filter % 4 == 0 + conv1 = conv_act_layer(body, + name + '_conv1', + num_filter // 2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + separable=False, + filter_in=filter_in) + conv2 = conv_act_layer(conv1, + name + '_conv2', + num_filter // 2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + separable=False, + filter_in=num_filter // 2) + conv3 = conv_act_layer(conv2, + name + '_conv3', + num_filter // 2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + separable=False, + filter_in=num_filter // 2) + conv4 = conv2 + conv3 + body = mx.sym.concat(*[conv1, conv4], dim=1, name=name + '_concat') + if config.USE_DCN > 0: + body = conv_deformable(body, + num_filter=num_filter, + name=name + '_concat_dcn') + return body + + +def head_module(body, num_filter, filter_in, name): + if config.HEAD_MODULE == 'SSH': + return ssh_detection_module(body, num_filter, filter_in, name) + else: + return retina_detection_module(body, num_filter, filter_in, name) + + +def upsampling(data, num_filter, name): + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(2,2), stride=(2, 2), pad=(0,0), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + 
# name=name) + ret = mx.symbol.UpSampling(data, + scale=2, + sample_type='nearest', + workspace=512, + name=name, + num_args=1) + return ret + + +def get_sym_by_name(name, sym_buffer): + if name in sym_buffer: + return sym_buffer[name] + ret = None + name_key = name[0:1] + name_num = int(name[1:]) + #print('getting', name, name_key, name_num) + if name_key == 'C': + assert name_num % 2 == 0 + bottom = get_sym_by_name('C%d' % (name_num // 2), sym_buffer) + ret = conv_act_layer(bottom, + '%s_C%d' (PREFIX, name_num), + F1, + kernel=(3, 3), + pad=(1, 1), + stride=(2, 2), + act_type='relu', + bias_wd_mult=_bwm) + elif name_key == 'P': + assert name_num % 2 == 0 + assert name_num <= max(config.RPN_FEAT_STRIDE) + lateral = get_sym_by_name('L%d' % (name_num), sym_buffer) + if name_num == max(config.RPN_FEAT_STRIDE) or name_num > 32: + ret = mx.sym.identity(lateral, name='%s_P%d' % (PREFIX, name_num)) + else: + bottom = get_sym_by_name('L%d' % (name_num * 2), sym_buffer) + bottom_up = upsampling(bottom, F1, '%s_U%d' % (PREFIX, name_num)) + if config.USE_CROP: + bottom_up = mx.symbol.Crop(*[bottom_up, lateral]) + aggr = lateral + bottom_up + aggr = conv_act_layer(aggr, + '%s_A%d' % (PREFIX, name_num), + F1, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + ret = mx.sym.identity(aggr, name='%s_P%d' % (PREFIX, name_num)) + elif name_key == 'L': + c = get_sym_by_name('C%d' % (name_num), sym_buffer) + #print('L', name, F1) + ret = conv_act_layer(c, + '%s_L%d' % (PREFIX, name_num), + F1, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + else: + raise RuntimeError('%s is not a valid sym key name' % name) + sym_buffer[name] = ret + return ret + + +def get_sym_conv(data, sym): + all_layers = sym.get_internals() + + isize = 640 + _, out_shape, _ = all_layers.infer_shape(data=(1, 3, isize, isize)) + last_entry = None + c1 = None + c2 = None + c3 = None + c1_name = None + c2_name = None + c3_name = None + 
c1_filter = -1 + c2_filter = -1 + c3_filter = -1 + #print(len(all_layers), len(out_shape)) + #print(all_layers.__class__) + outputs = all_layers.list_outputs() + #print(outputs.__class__, len(outputs)) + count = len(outputs) + stride2name = {} + stride2layer = {} + stride2shape = {} + for i in range(count): + name = outputs[i] + shape = out_shape[i] + print(i, name, count, shape) + if not name.endswith('_output'): + continue + if len(shape) != 4: + continue + assert isize % shape[2] == 0 + if shape[1] > config.max_feat_channel: + break + stride = isize // shape[2] + stride2name[stride] = name + stride2layer[stride] = all_layers[name] + stride2shape[stride] = shape + + strides = sorted(stride2name.keys()) + for stride in strides: + print('stride', stride, stride2name[stride], stride2shape[stride]) + print('F1_F2', F1, F2) + #print('cnames', c1_name, c2_name, c3_name, F1, F2) + _bwm = 1.0 + ret = {} + sym_buffer = {} + for stride in [4, 8, 16, 32]: + sym_buffer['C%d' % stride] = stride2layer[stride] + if not config.USE_FPN: + for stride in config.RPN_FEAT_STRIDE: + name = 'L%d' % stride + ret[stride] = get_sym_by_name(name, sym_buffer) + else: + for stride in config.RPN_FEAT_STRIDE: + name = 'P%d' % stride + ret[stride] = get_sym_by_name(name, sym_buffer) + + return ret + + +def get_out(conv_fpn_feat, + prefix, + stride, + landmark=False, + lr_mult=1.0, + gt_boxes=None): + A = config.NUM_ANCHORS + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + cls_label = mx.symbol.Variable(name='%s_label_stride%d' % (prefix, stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d' % + (prefix, stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d' % + (prefix, stride)) + if landmark: + landmark_target = mx.symbol.Variable( + name='%s_landmark_target_stride%d' % (prefix, 
def get_out(conv_fpn_feat,
            prefix,
            stride,
            landmark=False,
            lr_mult=1.0,
            gt_boxes=None):
    """Build the training heads (cls / bbox / optional landmark losses) for
    one pyramid stride.

    Parameters
    ----------
    conv_fpn_feat : dict mapping stride -> feature symbol (from get_sym_conv)
    prefix : str, name prefix for all variables/losses (e.g. 'face')
    stride : int, the feature stride this head operates on
    landmark : bool, also predict landmark_pred_len-dim landmark offsets
    lr_mult : float, global gradient scale for this head
    gt_boxes : symbol or None, ground-truth boxes consumed by the cascade
        refinement custom op (only used when config.CASCADE > 0)

    Returns a list of loss / BlockGrad symbols to be grouped by the caller.
    """
    A = config.NUM_ANCHORS
    bbox_pred_len = 4
    landmark_pred_len = 10
    if config.USE_BLUR:
        bbox_pred_len = 5   # extra blur-score channel per box
    if config.USE_OCCLUSION:
        landmark_pred_len = 15  # extra occlusion flag per landmark point
    ret_group = []
    num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS']
    # Target/weight inputs are bound by name from the data iterator.
    cls_label = mx.symbol.Variable(name='%s_label_stride%d' % (prefix, stride))
    bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d' %
                                     (prefix, stride))
    bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d' %
                                     (prefix, stride))
    if landmark:
        landmark_target = mx.symbol.Variable(
            name='%s_landmark_target_stride%d' % (prefix, stride))
        landmark_weight = mx.symbol.Variable(
            name='%s_landmark_weight_stride%d' % (prefix, stride))
    conv_feat = conv_fpn_feat[stride]
    # Shared context head feeding all three prediction branches.
    rpn_relu = head_module(conv_feat, F2 * config.CONTEXT_FILTER_RATIO, F1,
                           'rf_head_stride%d' % stride)

    rpn_cls_score = conv_only(rpn_relu,
                              '%s_rpn_cls_score_stride%d' % (prefix, stride),
                              2 * num_anchors,
                              kernel=(1, 1),
                              pad=(0, 0),
                              stride=(1, 1))

    rpn_bbox_pred = conv_only(rpn_relu,
                              '%s_rpn_bbox_pred_stride%d' % (prefix, stride),
                              bbox_pred_len * num_anchors,
                              kernel=(1, 1),
                              pad=(0, 0),
                              stride=(1, 1))

    # Reshape to (batch, 2, anchors*H*W) / (batch, C, anchors*H*W) so the
    # softmax and smooth-L1 ops see one anchor position per column.
    rpn_cls_score_reshape = mx.symbol.Reshape(
        data=rpn_cls_score,
        shape=(0, 2, -1),
        name="%s_rpn_cls_score_reshape_stride%s" % (prefix, stride))

    rpn_bbox_pred_reshape = mx.symbol.Reshape(
        data=rpn_bbox_pred,
        shape=(0, 0, -1),
        name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix, stride))
    if landmark:
        rpn_landmark_pred = conv_only(rpn_relu,
                                      '%s_rpn_landmark_pred_stride%d' %
                                      (prefix, stride),
                                      landmark_pred_len * num_anchors,
                                      kernel=(1, 1),
                                      pad=(0, 0),
                                      stride=(1, 1))
        rpn_landmark_pred_reshape = mx.symbol.Reshape(
            data=rpn_landmark_pred,
            shape=(0, 0, -1),
            name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix, stride))

    if config.TRAIN.RPN_ENABLE_OHEM >= 2:
        # Online hard example mining: custom op re-labels anchors and emits
        # per-anchor weights plus the positive-anchor count.
        label, anchor_weight, pos_count = mx.sym.Custom(
            op_type='rpn_fpn_ohem3',
            stride=int(stride),
            network=config.network,
            dataset=config.dataset,
            prefix=prefix,
            cls_score=rpn_cls_score_reshape,
            labels=cls_label)

        # Broadcast the per-anchor weight across the bbox coordinate dims.
        _bbox_weight = mx.sym.tile(anchor_weight, (1, 1, bbox_pred_len))
        _bbox_weight = _bbox_weight.reshape(
            (0, -1, A * bbox_pred_len)).transpose((0, 2, 1))
        bbox_weight = mx.sym.elemwise_mul(bbox_weight,
                                          _bbox_weight,
                                          name='%s_bbox_weight_mul_stride%s' %
                                          (prefix, stride))

        if landmark:
            _landmark_weight = mx.sym.tile(anchor_weight,
                                           (1, 1, landmark_pred_len))
            _landmark_weight = _landmark_weight.reshape(
                (0, -1, A * landmark_pred_len)).transpose((0, 2, 1))
            landmark_weight = mx.sym.elemwise_mul(
                landmark_weight,
                _landmark_weight,
                name='%s_landmark_weight_mul_stride%s' % (prefix, stride))
    else:
        # NOTE(review): pos_count is only defined in the OHEM branch above,
        # but is used unconditionally below — RPN_ENABLE_OHEM < 2 would raise
        # NameError. Presumably this config is always >= 2 here; confirm.
        label = cls_label
    #cls loss
    rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape,
                                           label=label,
                                           multi_output=True,
                                           normalization='valid',
                                           use_ignore=True,
                                           ignore_label=-1,
                                           grad_scale=lr_mult,
                                           name='%s_rpn_cls_prob_stride%d' %
                                           (prefix, stride))
    ret_group.append(rpn_cls_prob)
    ret_group.append(mx.sym.BlockGrad(label))

    pos_count = mx.symbol.sum(pos_count)
    pos_count = pos_count + 0.001  #avoid zero

    #bbox loss
    bbox_diff = rpn_bbox_pred_reshape - bbox_target
    bbox_diff = bbox_diff * bbox_weight
    rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_' %
                                         (prefix, stride),
                                         scalar=3.0,
                                         data=bbox_diff)
    bbox_lr_mode0 = 0.25 * lr_mult * config.TRAIN.BATCH_IMAGES / config.TRAIN.RPN_BATCH_SIZE
    landmark_lr_mode0 = 0.4 * config.LANDMARK_LR_MULT * bbox_lr_mode0
    if config.LR_MODE == 0:
        # Fixed grad scale derived from batch configuration.
        rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d' %
                                        (prefix, stride),
                                        data=rpn_bbox_loss_,
                                        grad_scale=bbox_lr_mode0)
    else:
        # Normalize by the (smoothed) number of positive anchors instead.
        rpn_bbox_loss_ = mx.symbol.broadcast_div(rpn_bbox_loss_, pos_count)
        rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d' %
                                        (prefix, stride),
                                        data=rpn_bbox_loss_,
                                        grad_scale=0.5 * lr_mult)
    ret_group.append(rpn_bbox_loss)
    ret_group.append(mx.sym.BlockGrad(bbox_weight))

    #landmark loss
    if landmark:
        landmark_diff = rpn_landmark_pred_reshape - landmark_target
        landmark_diff = landmark_diff * landmark_weight
        rpn_landmark_loss_ = mx.symbol.smooth_l1(
            name='%s_rpn_landmark_loss_stride%d_' % (prefix, stride),
            scalar=3.0,
            data=landmark_diff)
        if config.LR_MODE == 0:
            rpn_landmark_loss = mx.sym.MakeLoss(
                name='%s_rpn_landmark_loss_stride%d' % (prefix, stride),
                data=rpn_landmark_loss_,
                grad_scale=landmark_lr_mode0)
        else:
            rpn_landmark_loss_ = mx.symbol.broadcast_div(
                rpn_landmark_loss_, pos_count)
            rpn_landmark_loss = mx.sym.MakeLoss(
                name='%s_rpn_landmark_loss_stride%d' % (prefix, stride),
                data=rpn_landmark_loss_,
                grad_scale=0.2 * config.LANDMARK_LR_MULT * lr_mult)
        ret_group.append(rpn_landmark_loss)
        ret_group.append(mx.sym.BlockGrad(landmark_weight))
    if config.USE_3D:
        from rcnn.PY_OP import rpn_3d_mesh
        pass
    if config.CASCADE > 0:
        # Cascade refinement: re-predict cls/bbox from a configurable body,
        # with targets refined by the 'cascade_refine' custom op.
        if config.CASCADE_MODE == 0:
            body = rpn_relu
        elif config.CASCADE_MODE == 1:
            body = head_module(conv_feat, F2 * config.CONTEXT_FILTER_RATIO, F1,
                               '%s_head_stride%d_cas' % (PREFIX, stride))
        elif config.CASCADE_MODE == 2:
            body = conv_feat + rpn_relu
            body = head_module(body, F2 * config.CONTEXT_FILTER_RATIO, F1,
                               '%s_head_stride%d_cas' % (PREFIX, stride))
        else:
            body = head_module(conv_feat, F2 * config.CONTEXT_FILTER_RATIO, F1,
                               '%s_head_stride%d_cas' % (PREFIX, stride))
            # NOTE(review): rpn_landmark_pred only exists when landmark=True;
            # this branch would raise NameError otherwise — confirm callers.
            body = mx.sym.concat(body,
                                 rpn_cls_score,
                                 rpn_bbox_pred,
                                 rpn_landmark_pred,
                                 dim=1)

        # Stage-0 predictions/labels kept as references for refinement.
        cls_pred_t0 = rpn_cls_score_reshape
        cls_label_raw = cls_label
        cls_label_t0 = label
        bbox_pred_t0 = rpn_bbox_pred_reshape
        bbox_label_t0 = bbox_target
        for casid in range(config.CASCADE):
            cls_pred = conv_only(body,
                                 '%s_rpn_cls_score_stride%d_cas%d' %
                                 (prefix, stride, casid),
                                 2 * num_anchors,
                                 kernel=(1, 1),
                                 pad=(0, 0),
                                 stride=(1, 1))
            rpn_cls_score_reshape = mx.symbol.Reshape(
                data=cls_pred,
                shape=(0, 2, -1),
                name="%s_rpn_cls_score_reshape_stride%s_cas%d" %
                (prefix, stride, casid))

            cls_label, bbox_label, anchor_weight, pos_count = mx.sym.Custom(
                op_type='cascade_refine',
                stride=int(stride),
                network=config.network,
                dataset=config.dataset,
                prefix=prefix,
                cls_label_t0=cls_label_t0,
                cls_pred_t0=cls_pred_t0,
                cls_pred=rpn_cls_score_reshape,
                bbox_pred_t0=bbox_pred_t0,
                bbox_label_t0=bbox_label_t0,
                cls_label_raw=cls_label_raw,
                cas_gt_boxes=gt_boxes)
            if stride in config.CASCADE_CLS_STRIDES:
                rpn_cls_prob = mx.symbol.SoftmaxOutput(
                    data=rpn_cls_score_reshape,
                    label=cls_label,
                    multi_output=True,
                    normalization='valid',
                    use_ignore=True,
                    ignore_label=-1,
                    grad_scale=lr_mult,
                    name='%s_rpn_cls_prob_stride%d_cas%d' %
                    (prefix, stride, casid))
                ret_group.append(rpn_cls_prob)
                ret_group.append(mx.sym.BlockGrad(cls_label))
            if stride in config.CASCADE_BBOX_STRIDES:
                bbox_pred = conv_only(body,
                                      '%s_rpn_bbox_pred_stride%d_cas%d' %
                                      (prefix, stride, casid),
                                      bbox_pred_len * num_anchors,
                                      kernel=(1, 1),
                                      pad=(0, 0),
                                      stride=(1, 1))

                rpn_bbox_pred_reshape = mx.symbol.Reshape(
                    data=bbox_pred,
                    shape=(0, 0, -1),
                    name="%s_rpn_bbox_pred_reshape_stride%s_cas%d" %
                    (prefix, stride, casid))
                _bbox_weight = mx.sym.tile(anchor_weight,
                                           (1, 1, bbox_pred_len))
                _bbox_weight = _bbox_weight.reshape(
                    (0, -1, A * bbox_pred_len)).transpose((0, 2, 1))
                bbox_weight = _bbox_weight
                pos_count = mx.symbol.sum(pos_count)
                pos_count = pos_count + 0.01  #avoid zero
                #bbox loss
                bbox_diff = rpn_bbox_pred_reshape - bbox_label
                bbox_diff = bbox_diff * bbox_weight
                rpn_bbox_loss_ = mx.symbol.smooth_l1(
                    name='%s_rpn_bbox_loss_stride%d_cas%d' %
                    (prefix, stride, casid),
                    scalar=3.0,
                    data=bbox_diff)
                if config.LR_MODE == 0:
                    rpn_bbox_loss = mx.sym.MakeLoss(
                        name='%s_rpn_bbox_loss_stride%d_cas%d' %
                        (prefix, stride, casid),
                        data=rpn_bbox_loss_,
                        grad_scale=bbox_lr_mode0)
                else:
                    rpn_bbox_loss_ = mx.symbol.broadcast_div(
                        rpn_bbox_loss_, pos_count)
                    rpn_bbox_loss = mx.sym.MakeLoss(
                        name='%s_rpn_bbox_loss_stride%d_cas%d' %
                        (prefix, stride, casid),
                        data=rpn_bbox_loss_,
                        grad_scale=0.5 * lr_mult)
                ret_group.append(rpn_bbox_loss)
                ret_group.append(mx.sym.BlockGrad(bbox_weight))

    return ret_group
def get_sym_train(sym):
    """Assemble the full training symbol: shared FPN features plus one
    cls/bbox/landmark head per stride in config.RPN_FEAT_STRIDE, grouped
    into a single mx.sym.Group.

    Side effect: sets the module globals F1/F2 (head filter counts) that the
    feature- and head-building helpers read.
    """
    data = mx.symbol.Variable(name="data")
    global F1, F2
    F1 = config.HEAD_FILTER_NUM
    F2 = F1

    # Shared convolutional trunk + FPN features, keyed by stride.
    conv_fpn_feat = get_sym_conv(data, sym)

    # Ground-truth boxes are only needed by the cascade refinement op.
    gt_boxes = mx.sym.Variable('gt_boxes') if config.CASCADE > 0 else None

    outputs = []
    for stride in config.RPN_FEAT_STRIDE:
        outputs.extend(
            get_out(conv_fpn_feat,
                    'face',
                    stride,
                    config.FACE_LANDMARK,
                    lr_mult=1.0,
                    gt_boxes=gt_boxes))

    return mx.sym.Group(outputs)
import mxnet as mx
import mxnet.ndarray as nd
import mxnet.gluon as gluon
import mxnet.gluon.nn as nn
import mxnet.autograd as ag
import numpy as np
from rcnn.config import config
from rcnn.PY_OP import rpn_fpn_ohem, rpn_fpn_ohem2, rpn_fpn_ohem3

def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \
    stride=(1,1), bias_wd_mult=0.0, shared_weight=None, shared_bias = None):
    """Plain convolution (no BN/activation). If shared_weight/shared_bias are
    given, reuse those variables instead of creating fresh ones."""
    if shared_weight is None:
        weight = mx.symbol.Variable(name="{}_weight".format(name),
            init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'})
        # Bias gets 2x learning rate and a configurable weight-decay mult.
        bias = mx.symbol.Variable(name="{}_bias".format(name),
            init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)})
    else:
        weight = shared_weight
        bias = shared_bias
        print('reuse shared var in', name)
    conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \
        stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias)
    return conv

def conv_deformable(net, num_filter, num_group=1, act_type='relu',name=''):
    """3x3 deformable conv + BN (+ optional activation).
    USE_DCN==1 selects DCNv1; any other value selects modulated DCNv2
    with zero-initialized, low-lr offset/mask prediction."""
    if config.USE_DCN==1:
        f = num_group*18  # 2 offsets per 3x3 tap per deformable group
        conv_offset = mx.symbol.Convolution(name=name+'_conv_offset', data = net,
                num_filter=f, pad=(1, 1), kernel=(3, 3), stride=(1, 1))
        net = mx.contrib.symbol.DeformableConvolution(name=name+"_conv", data=net, offset=conv_offset,
                num_filter=num_filter, pad=(1,1), kernel=(3, 3), num_deformable_group=num_group, stride=(1, 1), no_bias=False)
    else:
        print('use dcnv2 at', name)
        lr_mult = 0.1
        weight_var = mx.sym.Variable(name=name+'_conv2_offset_weight', init=mx.init.Zero(), lr_mult=lr_mult)
        bias_var = mx.sym.Variable(name=name+'_conv2_offset_bias', init=mx.init.Zero(), lr_mult=lr_mult)
        # 27 channels = 18 offsets + 9 modulation-mask logits for 3x3 taps.
        conv2_offset = mx.symbol.Convolution(name=name + '_conv2_offset', data=net, num_filter=27,
                pad=(1, 1), kernel=(3, 3), stride=(1,1), weight=weight_var, bias=bias_var, lr_mult=lr_mult)
        conv2_offset_t = mx.sym.slice_axis(conv2_offset, axis=1, begin=0, end=18)
        conv2_mask = mx.sym.slice_axis(conv2_offset, axis=1, begin=18, end=None)
        # Mask in (0, 2), centered at 1 when logits start at zero.
        conv2_mask = 2 * mx.sym.Activation(conv2_mask, act_type='sigmoid')

        conv2 = mx.contrib.symbol.ModulatedDeformableConvolution(name=name + '_conv2', data=net, offset=conv2_offset_t, mask=conv2_mask,
                num_filter=num_filter, pad=(1, 1), kernel=(3, 3), stride=(1,1),
                num_deformable_group=num_group, no_bias=True)
        net = conv2
    net = mx.sym.BatchNorm(data=net, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn')
    if len(act_type)>0:
        net = mx.symbol.Activation(data=net, act_type=act_type, name=name+'_act')
    return net

def conv_act_layer_dw(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \
    stride=(1,1), act_type="relu", bias_wd_mult=0.0):
    """Depthwise 3x3 conv (num_group == num_filter) + BN + optional act."""
    assert kernel[0]==3
    weight = mx.symbol.Variable(name="{}_weight".format(name),
        init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'})
    bias = mx.symbol.Variable(name="{}_bias".format(name),
        init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)})
    conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \
        stride=stride, num_filter=num_filter, num_group=num_filter, name="{}".format(name), weight=weight, bias=bias)
    conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn')
    if len(act_type)>0:
        relu = mx.symbol.Activation(data=conv, act_type=act_type, \
            name="{}_{}".format(name, act_type))
    else:
        relu = conv
    return relu

def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \
    stride=(1,1), act_type="relu", bias_wd_mult=0.0, separable=False, filter_in = -1):
    """Conv + BN + optional activation. With separable=True builds a
    depthwise 3x3 followed by pointwise 1x1; with USE_DCN>1 a 3x3/s1
    non-separable call is routed to conv_deformable instead."""

    if config.USE_DCN>1 and kernel==(3,3) and pad==(1,1) and stride==(1,1) and not separable:
        return conv_deformable(from_layer, num_filter, num_group=1, act_type = act_type, name=name)

    if separable:
        assert kernel[0]>1
        assert filter_in>0
    if not separable:
        weight = mx.symbol.Variable(name="{}_weight".format(name),
            init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'})
        bias = mx.symbol.Variable(name="{}_bias".format(name),
            init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)})
        conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \
            stride=stride, num_filter=num_filter, name="{}".format(name), weight=weight, bias=bias)
        conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn')
    else:
        if filter_in<0:
            filter_in = num_filter
        conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \
            stride=stride, num_filter=filter_in, num_group=filter_in, name="{}_sep".format(name))
        conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_sep_bn')
        conv = mx.symbol.Activation(data=conv, act_type='relu', \
            name="{}_sep_bn_relu".format(name))
        conv = mx.symbol.Convolution(data=conv, kernel=(1,1), pad=(0,0), \
            stride=(1,1), num_filter=num_filter, name="{}".format(name))
        conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn')
    if len(act_type)>0:
        relu = mx.symbol.Activation(data=conv, act_type=act_type, \
            name="{}_{}".format(name, act_type))
    else:
        relu = conv
    return relu

def ssh_context_module(body, num_filter, filter_in, name):
    """SSH context branch: a shared dim-reduction 3x3 then two towers that
    emulate 5x5 and 7x7 receptive fields with stacked 3x3 convs."""
    conv_dimred = conv_act_layer(body, name+'_conv1',
        num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in)
    conv5x5 = conv_act_layer(conv_dimred, name+'_conv2',
        num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=False)
    conv7x7_1 = conv_act_layer(conv_dimred, name+'_conv3_1',
        num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=False)
    conv7x7 = conv_act_layer(conv7x7_1, name+'_conv3_2',
        num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=False)
    return (conv5x5, conv7x7)


def ssh_detection_module(body, num_filter, filter_in, name):
    """SSH detection module: 3x3 branch concatenated with the two context
    branches (so the output has 2*num_filter channels), then ReLU and an
    optional deformable-conv refinement."""
    conv3x3 = conv_act_layer(body, name+'_conv1',
        num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=False, filter_in=filter_in)
    conv5x5, conv7x7 = ssh_context_module(body, num_filter//2, filter_in, name+'_context')
    ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], dim=1, name = name+'_concat')
    ret = mx.symbol.Activation(data=ret, act_type='relu', name=name+'_concat_relu')
    if config.USE_DCN>0:
        ret = conv_deformable(ret, num_filter = num_filter*2, name = name+'_concat_dcn')
    return ret

def insight_context_module(body, kernel, num_filter, filter_in, name):
    """Inverted-residual context block (MobileNetV2-style): 1x1 reduce,
    1x1 expand (x6), depthwise k x k, 1x1 project, plus skip connection."""
    conv_dimred = conv_act_layer(body, name+'_conv0',
        num_filter, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in)
    conv1 = conv_act_layer(conv_dimred, name+'_conv1',
        num_filter*6, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in)
    conv2 = conv_act_layer(conv1, name+'_conv2',
        num_filter*6, kernel=kernel, pad=((kernel[0]-1)//2, (kernel[1]-1)//2), stride=(1, 1), act_type='relu', separable=True, filter_in = num_filter*6)
    conv3 = conv_act_layer(conv2, name+'_conv3',
        num_filter, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False)
    conv3 = conv3 + conv_dimred  # residual skip over the expansion tower
    return conv3

def insight_detection_module(body, num_filter, filter_in, name):
    """Two insight context blocks (3x3 and 5x5 receptive fields) concatenated;
    optional deformable-conv refinement."""
    conv3x3 = insight_context_module(body, (3,3), num_filter//2, filter_in, name+'_context3x3')
    conv5x5 = insight_context_module(body, (5,5), num_filter//2, filter_in, name+'_context5x5')
    ret = mx.sym.concat(*[conv3x3, conv5x5], dim=1, name = name+'_concat')
    if config.USE_DCN:
        ret = conv_deformable(ret, num_filter = num_filter*2, name = name+'_concat_dcn')
    return ret
num_filter=num_filter, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(2,2), stride=(2, 2), pad=(0,0), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + ret = mx.symbol.UpSampling(data, scale=2, sample_type='nearest', workspace=512, name=name, num_args=1) + return ret + +def get_sym_conv(data, sym): + mm = config.MULTIPLIER + all_layers = sym.get_internals() + #print(all_layers) + ##c1 = all_layers['mobilenetv20_features_linearbottleneck6_relu60_relu6_output'] #96 + #c1 = all_layers['mobilenetv20_features_linearbottleneck5_elemwise_add0_output'] # 16 + ##c2 = all_layers['mobilenetv20_features_linearbottleneck13_relu60_relu6_output'] + #c2 = all_layers['mobilenetv20_features_linearbottleneck12_elemwise_add0_output'] # 48 + ##c3 = all_layers['mobilenetv20_features_linearbottleneck16_batchnorm2_fwd_output'] # 160 + #c3 = all_layers['mobilenetv20_features_linearbottleneck13_batchnorm2_fwd_output'] # 80 + #c1_filter = int(32*mm) + #c2_filter = int(96*mm) + #c3_filter = int(160*mm) + + #c1 = all_layers['mobilenet0_relu10_fwd_output'] + #c2 = all_layers['mobilenet0_relu22_fwd_output'] + #c3 = all_layers['mobilenet0_relu26_fwd_output'] + + #c1 = all_layers['conv_6_relu_output'] + #c2 = all_layers['conv_12_relu_output'] + #c3 = all_layers['conv_14_relu_output'] + #c1_filter = int(256*mm) + #c2_filter = int(512*mm) + #c3_filter = int(1024*mm) + + isize = 640 + _, out_shape, _ = all_layers.infer_shape(data = (1,3,isize,isize)) + last_entry = None + c1 = None + c2 = None + c3 = None + c1_name = None + c2_name = None + c3_name = None + c1_filter = -1 + c2_filter = -1 + c3_filter = -1 + #print(len(all_layers), len(out_shape)) + #print(all_layers.__class__) + outputs = all_layers.list_outputs() + #print(outputs.__class__, len(outputs)) + count = 
len(outputs) + stride2name = {} + stride2layer = {} + for i in range(count): + name = outputs[i] + shape = out_shape[i] + if not name.endswith('_output'): + continue + if len(shape)!=4: + continue + assert isize%shape[2]==0 + stride = isize//shape[2] + stride2name[stride] = name + stride2layer[stride] = all_layers[name] + #print(name, shape) + #if c1 is None and shape[2]==isize//16: + # cname = last_entry[0] + # #print('c1', last_entry) + # c1 = all_layers[cname] + # c1_name = cname + #if c2 is None and shape[2]==isize//32: + # cname = last_entry[0] + # #print('c2', last_entry) + # c2 = all_layers[cname] + # c2_name = cname + #if shape[2]==isize//32: + # c3 = all_layers[name] + # #print('c3', name, shape) + # c3_name = name + + #last_entry = (name, shape) + + #F1 = int(256*mm) + #F2 = int(128*mm) + F1 = int(config.HEAD_FILTER_NUM*mm) + F2 = F1 + if config.SHARE_WEIGHT_BBOX or config.SHARE_WEIGHT_LANDMARK: + F2 = F1 + print('stride2name', stride2name, F1, F2) + #print('cnames', c1_name, c2_name, c3_name, F1, F2) + _bwm = 1.0 + if config.NET_MODE==0: + c1_lateral = conv_act_layer(c1, 'rf_c1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, 'rf_c2_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, num_filter=F2, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = F2, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name='ssh_m2_red_upsampling') + #c2_up = mx.symbol.UpSampling(c2_lateral, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2_lateral, F2, 'rf_c2_red_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral+c2_up + + c1 = conv_act_layer(c1, 'rf_c1_conv', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', 
bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'rf_c1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'rf_c2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'rf_c3_det') + elif config.NET_MODE==1: + c3_lateral = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3_lateral, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3_lateral, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral+c2_up + + c1 = conv_act_layer(c1, 'ssh_m1_conv', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==2: + c0 = stride2layer[4] + c1 = stride2layer[8] + c2 = stride2layer[16] + c3 = stride2layer[32] + c3 = conv_act_layer(c3, 'rf_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 
'rf_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'rf_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'rf_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'rf_c1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'rf_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'rf_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c1_det') #output *2 filters + m2 = ssh_detection_module(c2, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c2_det') # output *2 filters + m3 = ssh_detection_module(c3, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c3_det') + if len(config.RPN_ANCHOR_CFG)==3: + ret = {8: m1, 16:m2, 32: m3} + elif len(config.RPN_ANCHOR_CFG)==1: + ret = {16:m2} + elif len(config.RPN_ANCHOR_CFG)==2: + ret = {8: m1, 16:m2} + elif len(config.RPN_ANCHOR_CFG)==5: + c0_lateral = conv_act_layer(c0, 'rf_c0_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_up = upsampling(c1, F2, 'rf_c1_upsampling') + c1_up = mx.symbol.Crop(*[c1_up, c0_lateral]) + c0 = c0_lateral+c1_up + c0 = conv_act_layer(c0, 'rf_c0_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + c4 = conv_act_layer(c3, 'rf_c4', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + m0 = ssh_detection_module(c0, F2*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c0_det') 
#output *2 filters + m4 = ssh_detection_module(c4, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c4_det') # output *2 filters + ret = {4: m0, 8: m1, 16:m2, 32: m3, 64: m4} + elif len(config.RPN_ANCHOR_CFG)==6: + c0_lateral = conv_act_layer(c0, 'rf_c0_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_up = upsampling(c1, F2, 'rf_c1_upsampling') + c1_up = mx.symbol.Crop(*[c1_up, c0_lateral]) + c0 = c0_lateral+c1_up + c0 = conv_act_layer(c0, 'rf_c0_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + c4 = conv_act_layer(c3, 'rf_c4', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + c5 = conv_act_layer(c4, 'rf_c5', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + m0 = ssh_detection_module(c0, F2*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c0_det') #output *2 filters + m4 = ssh_detection_module(c4, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c4_det') # output *2 filters + m5 = ssh_detection_module(c5, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c5_det') + ret = {4: m0, 8: m1, 16:m2, 32: m3, 64: m4, 128: m5} + + elif config.NET_MODE==3: + assert len(config.RPN_ANCHOR_CFG)==6 + c0 = stride2layer[4] + c1 = stride2layer[8] + c2 = stride2layer[16] + c3 = stride2layer[32] + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c4 = conv_act_layer(c3, 'ssh_c4', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + c5 = conv_act_layer(c4, 'ssh_c5', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + c5_up = upsampling(c5, F2, 'ssh_c5_upsampling') + c4_lateral = c4 + c5_up = mx.symbol.Crop(*[c5_up, c4_lateral]) + c4 = c4_lateral+c5_up + c4 = conv_act_layer(c4, 'ssh_c4_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c4_up = upsampling(c4, F2, 
'ssh_c4_upsampling') + c4_up = mx.symbol.Crop(*[c4_up, c3]) + c3 = c3+c4_up + c3 = conv_act_layer(c3, 'ssh_c3_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m1_det') #output *2 filters + m2 = ssh_detection_module(c2, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m2_det') # output *2 filters + m3 = ssh_detection_module(c3, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m3_det') + c0_lateral = conv_act_layer(c0, 'ssh_c0_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_up = upsampling(c1, F2, 'ssh_c1_upsampling') + c1_up = mx.symbol.Crop(*[c1_up, c0_lateral]) + c0 = c0_lateral+c1_up + c0 = conv_act_layer(c0, 'ssh_c0_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m0 = ssh_detection_module(c0, 
F2*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m0_det') #output *2 filters + m4 = ssh_detection_module(c4, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m4_det') # output *2 filters + m5 = ssh_detection_module(c5, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m5_det') + ret = {4: m0, 8: m1, 16:m2, 32: m3, 64: m4, 128: m5} + elif config.NET_MODE==4: + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2//2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1//2, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1//2, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==5: + c3 = conv_act_layer_dw(c3, 'ssh_c3_lateral_m', + F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + 
#c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2 = conv_act_layer_dw(c2, 'ssh_c2_lateral_m', + F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1 = conv_act_layer_dw(c1, 'ssh_c1_lateral_m', + F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==6: + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = 
c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m1 = insight_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = insight_detection_module(c2, F1, F2, 'ssh_m2_det') + m3 = insight_detection_module(c3, F1, F2, 'ssh_m3_det') + + #return {8: m1, 16:m2, 32: m3} + return ret + +def get_out(conv_fpn_feat, prefix, stride, landmark=False, lr_mult=1.0, shared_vars = None): + A = config.NUM_ANCHORS + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + label = mx.symbol.Variable(name='%s_label_stride%d'%(prefix,stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d'%(prefix,stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d'%(prefix,stride)) + if landmark: + landmark_target = mx.symbol.Variable(name='%s_landmark_target_stride%d'%(prefix,stride)) + landmark_weight = mx.symbol.Variable(name='%s_landmark_weight_stride%d'%(prefix,stride)) + rpn_relu = conv_fpn_feat[stride] + maxout_stat = 0 + if config.USE_MAXOUT>=1 and stride==config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 1 + if config.USE_MAXOUT>=2 and stride!=config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 2 + + + if maxout_stat==0: + rpn_cls_score = conv_only(rpn_relu, 
'%s_rpn_cls_score_stride%d'%(prefix, stride), 2*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[0][0], shared_bias = shared_vars[0][1]) + elif maxout_stat==1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + + rpn_bbox_pred = conv_only(rpn_relu, '%s_rpn_bbox_pred_stride%d'%(prefix,stride), bbox_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[1][0], shared_bias = shared_vars[1][1]) + + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix,stride)) + + rpn_bbox_pred_reshape = mx.symbol.Reshape(data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix,stride)) + if landmark: + rpn_landmark_pred = conv_only(rpn_relu, 
'%s_rpn_landmark_pred_stride%d'%(prefix,stride), landmark_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[2][0], shared_bias = shared_vars[2][1]) + rpn_landmark_pred_reshape = mx.symbol.Reshape(data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix,stride)) + + if config.TRAIN.RPN_ENABLE_OHEM>=2: + label, anchor_weight, valid_count = mx.sym.Custom(op_type='rpn_fpn_ohem3', stride=int(stride), network=config.network, dataset=config.dataset, prefix=prefix, cls_score=rpn_cls_score_reshape, labels = label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1,1,bbox_pred_len)) + _bbox_weight = _bbox_weight.reshape((0, -1, A * bbox_pred_len)).transpose((0,2,1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, _bbox_weight, name='%s_bbox_weight_mul_stride%s'%(prefix,stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, (1,1,landmark_pred_len)) + _landmark_weight = _landmark_weight.reshape((0, -1, A * landmark_pred_len)).transpose((0,2,1)) + landmark_weight = mx.sym.elemwise_mul(landmark_weight, _landmark_weight, name='%s_landmark_weight_mul_stride%s'%(prefix,stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, + grad_scale = lr_mult, + name='%s_rpn_cls_prob_stride%d'%(prefix,stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + valid_count = mx.symbol.mean(valid_count) + valid_count = valid_count + 
0.001 #avoid zero + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape-bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_'%(prefix,stride), scalar=3.0, data=bbox_diff) + rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + #rpn_bbox_loss_ = mx.symbol.broadcast_div(rpn_bbox_loss_, valid_count) + #rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.BATCH_IMAGES*16)) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape-landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = mx.symbol.smooth_l1(name='%s_rpn_landmark_loss_stride%d_'%(prefix,stride), scalar=3.0, data=landmark_diff) + rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=0.5*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + #rpn_landmark_loss_ = mx.symbol.broadcast_div(rpn_landmark_loss_, valid_count) + #rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.BATCH_IMAGES*40)) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + return ret_group + +def get_sym_train(sym): + data = mx.symbol.Variable(name="data") + + # shared convolutional layers + conv_fpn_feat = get_sym_conv(data, sym) + ret_group = [] + shared_vars = [] + if config.SHARE_WEIGHT_BBOX: + assert config.USE_MAXOUT==0 + _name = 'face_rpn_cls_score_share' + shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + shared_bias = 
mx.symbol.Variable(name="{}_bias".format(_name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + shared_vars.append( [shared_weight, shared_bias] ) + _name = 'face_rpn_bbox_pred_share' + shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + shared_vars.append( [shared_weight, shared_bias] ) + else: + shared_vars.append( [None, None] ) + shared_vars.append( [None, None] ) + if config.SHARE_WEIGHT_LANDMARK: + _name = 'face_rpn_landmark_pred_share' + shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + shared_vars.append( [shared_weight, shared_bias] ) + else: + shared_vars.append( [None, None] ) + + for stride in config.RPN_FEAT_STRIDE: + ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0, shared_vars = shared_vars) + ret_group += ret + if config.HEAD_BOX: + assert not config.SHARE_WEIGHT_BBOX and not config.SHARE_WEIGHT_LANDMARK + shared_vars = [ [None, None], [None, None], [None, None] ] + ret = get_out(conv_fpn_feat, 'head', stride, False, lr_mult=0.5, shared_vars = shared_vars) + ret_group += ret + + return mx.sym.Group(ret_group) + + diff --git a/insightface/detection/retinaface/rcnn/symbol/symbol_mnet.py b/insightface/detection/retinaface/rcnn/symbol/symbol_mnet.py new file mode 100644 index 0000000000000000000000000000000000000000..86f65a3eb346d529ae11c26a11e3845a060bb0b6 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/symbol/symbol_mnet.py @@ -0,0 +1,834 @@ +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import 
mxnet.gluon.nn as nn +import mxnet.autograd as ag +import numpy as np +from rcnn.config import config +from rcnn.PY_OP import rpn_fpn_ohem3 +from rcnn.symbol.symbol_common import get_sym_train + + +def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), bias_wd_mult=0.0, shared_weight=None, shared_bias = None): + if shared_weight is None: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + else: + weight = shared_weight + bias = shared_bias + print('reuse shared var in', name) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias) + return conv + +def conv_act_layer_dw(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0): + assert kernel[0] == 3 + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, num_group=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + if len(act_type) > 0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, separable=False, filter_in = -1): + + separable = 
False + if separable: + assert kernel[0] == 3 + if not separable: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + else: + if filter_in < 0: + filter_in = num_filter + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=filter_in, num_group=filter_in, name="{}_sep".format(name)) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_sep_bn') + conv = mx.symbol.Activation(data=conv, act_type='relu', \ + name="{}_sep_bn_relu".format(name)) + conv = mx.symbol.Convolution(data=conv, kernel=(1,1), pad=(0,0), \ + stride=(1,1), num_filter=num_filter, name="{}".format(name)) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + if len(act_type) > 0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + + +def ssh_context_module(body, num_filter, filter_in, name): + conv_dimred = conv_act_layer(body, + name + '_conv1', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + separable=True, + filter_in=filter_in) + conv5x5 = conv_act_layer(conv_dimred, + name + '_conv2', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + separable=True) + conv7x7_1 = conv_act_layer(conv_dimred, + name + '_conv3_1', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + separable=True) + 
conv7x7 = conv_act_layer(conv7x7_1, + name + '_conv3_2', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + separable=True) + return (conv5x5, conv7x7) + + +def ssh_detection_module(body, num_filter, filter_in, name): + conv3x3 = conv_act_layer(body, + name + '_conv1', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + separable=True, + filter_in=filter_in) + conv5x5, conv7x7 = ssh_context_module(body, num_filter // 2, filter_in, + name + '_context') + ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], + dim=1, + name=name + '_concat') + ret = mx.symbol.Activation(data=ret, + act_type='relu', + name=name + '_concat_relu') + return ret + + +def upsampling(data, num_filter, name): + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(2,2), stride=(2, 2), pad=(0,0), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + ret = mx.symbol.UpSampling(data, + scale=2, + sample_type='nearest', + workspace=512, + name=name, + num_args=1) + return ret + + +def get_mnet_conv(data, sym): + mm = config.MULTIPLIER + all_layers = sym.get_internals() + #print(all_layers) + ##c1 = all_layers['mobilenetv20_features_linearbottleneck6_relu60_relu6_output'] #96 + #c1 = all_layers['mobilenetv20_features_linearbottleneck5_elemwise_add0_output'] # 16 + ##c2 = all_layers['mobilenetv20_features_linearbottleneck13_relu60_relu6_output'] + #c2 = all_layers['mobilenetv20_features_linearbottleneck12_elemwise_add0_output'] # 48 + ##c3 = all_layers['mobilenetv20_features_linearbottleneck16_batchnorm2_fwd_output'] # 160 + #c3 = all_layers['mobilenetv20_features_linearbottleneck13_batchnorm2_fwd_output'] # 80 + #c1_filter = int(32*mm) + #c2_filter = 
int(96*mm) + #c3_filter = int(160*mm) + + #c1 = all_layers['mobilenet0_relu10_fwd_output'] + #c2 = all_layers['mobilenet0_relu22_fwd_output'] + #c3 = all_layers['mobilenet0_relu26_fwd_output'] + + #c1 = all_layers['conv_6_relu_output'] + #c2 = all_layers['conv_12_relu_output'] + #c3 = all_layers['conv_14_relu_output'] + #c1_filter = int(256*mm) + #c2_filter = int(512*mm) + #c3_filter = int(1024*mm) + + isize = 640 + _, out_shape, _ = all_layers.infer_shape(data=(1, 3, isize, isize)) + last_entry = None + c1 = None + c2 = None + c3 = None + c1_name = None + c2_name = None + c3_name = None + c1_filter = -1 + c2_filter = -1 + c3_filter = -1 + #print(len(all_layers), len(out_shape)) + #print(all_layers.__class__) + outputs = all_layers.list_outputs() + #print(outputs.__class__, len(outputs)) + count = len(outputs) + for i in range(count): + name = outputs[i] + shape = out_shape[i] + if not name.endswith('_output'): + continue + if len(shape) != 4: + continue + #print(name, shape) + if c1 is None and shape[2] == isize // 16: + cname = last_entry[0] + #print('c1', last_entry) + c1 = all_layers[cname] + c1_name = cname + if c2 is None and shape[2] == isize // 32: + cname = last_entry[0] + #print('c2', last_entry) + c2 = all_layers[cname] + c2_name = cname + if shape[2] == isize // 32: + c3 = all_layers[name] + #print('c3', name, shape) + c3_name = name + + last_entry = (name, shape) + print('cnames', c1_name, c2_name, c3_name) + + F1 = int(256 * mm) + F2 = int(128 * mm) + if config.SHARE_WEIGHT_BBOX or config.SHARE_WEIGHT_LANDMARK: + F2 = F1 + _bwm = 1.0 + if config.NET_MODE == 0: + c1_lateral = conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, + 'ssh_m2_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, num_filter=F2, kernel=(4,4), stride=(2, 
2), pad=(1,1), + # num_group = F2, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name='ssh_m2_red_upsampling') + #c2_up = mx.symbol.UpSampling(c2_lateral, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2_lateral, F2, 'ssh_m2_red_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral + c2_up + + c1 = conv_act_layer(c1, + 'ssh_m1_conv', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 1: + c3_lateral = conv_act_layer(c3, + 'ssh_c3_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3_lateral, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3_lateral, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, + 'ssh_c2_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral + c2_up + + c1 = conv_act_layer(c1, + 'ssh_m1_conv', + F2, + kernel=(3, 
3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 2: + c3 = conv_act_layer(c3, + 'ssh_c3_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, + 'ssh_c2_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral + c2_up + c1 = conv_act_layer(c1, + 'ssh_c1_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 3: + #c3 = conv_act_layer(c3, 'ssh_c3_lateral', + # F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3 = ssh_detection_module(c3, F2 // 2, c3_filter, 'ssh_c3_lateral') + #c3_up = mx.symbol.UpSampling(c3, scale=2, 
sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + #c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + # F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c2_lateral = ssh_detection_module(c2, F2 // 2, c2_filter, + 'ssh_c2_lateral') + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + # F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = ssh_detection_module(c1, F2 // 2, c1_filter, + 'ssh_c1_lateral') + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral + c2_up + c1 = conv_act_layer(c1, + 'ssh_c1_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 4: + c3 = conv_act_layer(c3, + 'ssh_c3_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, + 'ssh_c2_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), 
+ pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral + c2_up + c1 = conv_act_layer(c1, + 'ssh_c1_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2 // 2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1 // 2, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1 // 2, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 5: + c3 = conv_act_layer_dw(c3, + 'ssh_c3_lateral_m', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3 = conv_act_layer(c3, + 'ssh_c3_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2 = conv_act_layer_dw(c2, + 'ssh_c2_lateral_m', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, + 'ssh_c2_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c1 = conv_act_layer_dw(c1, + 'ssh_c1_lateral_m', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c1_lateral = 
conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral + c2_up + c1 = conv_act_layer(c1, + 'ssh_c1_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + + return {8: m1, 16: m2, 32: m3} + + +def get_out(conv_fpn_feat, + prefix, + stride, + landmark=False, + lr_mult=1.0, + shared_vars=None): + A = config.NUM_ANCHORS + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + label = mx.symbol.Variable(name='%s_label_stride%d' % (prefix, stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d' % + (prefix, stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d' % + (prefix, stride)) + if landmark: + landmark_target = mx.symbol.Variable( + name='%s_landmark_target_stride%d' % (prefix, stride)) + landmark_weight = mx.symbol.Variable( + name='%s_landmark_weight_stride%d' % (prefix, stride)) + rpn_relu = conv_fpn_feat[stride] + maxout_stat = 0 + if config.USE_MAXOUT >= 1 and stride == config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 1 + if config.USE_MAXOUT >= 2 and stride != config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 2 + + if maxout_stat == 0: + rpn_cls_score = conv_only(rpn_relu, + '%s_rpn_cls_score_stride%d' % + (prefix, stride), + 2 * num_anchors, + kernel=(1, 1), + pad=(0, 0), + 
stride=(1, 1), + shared_weight=shared_vars[0][0], + shared_bias=shared_vars[0][1]) + elif maxout_stat == 1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_bg' % (prefix, stride, a), + 3, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, + axis=1, + keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_fg' % (prefix, stride, a), + 1, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, + dim=1, + name='%s_rpn_cls_score_stride%d' % + (prefix, stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_bg' % (prefix, stride, a), + 1, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_fg' % (prefix, stride, a), + 3, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, + axis=1, + keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, + dim=1, + name='%s_rpn_cls_score_stride%d' % + (prefix, stride)) + + rpn_bbox_pred = conv_only(rpn_relu, + '%s_rpn_bbox_pred_stride%d' % (prefix, stride), + bbox_pred_len * num_anchors, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + shared_weight=shared_vars[1][0], + shared_bias=shared_vars[1][1]) + + # prepare rpn data + if not config.FBN: + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix, stride)) + else: + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s_pre" % (prefix, stride)) + rpn_cls_score_reshape = 
mx.symbol.BatchNorm( + rpn_cls_score_reshape, + fix_gamma=True, + eps=2e-5, + name="%s_rpn_cls_score_reshape_stride%s" % (prefix, stride)) + + rpn_bbox_pred_reshape = mx.symbol.Reshape( + data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix, stride)) + if landmark: + rpn_landmark_pred = conv_only(rpn_relu, + '%s_rpn_landmark_pred_stride%d' % + (prefix, stride), + landmark_pred_len * num_anchors, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + shared_weight=shared_vars[2][0], + shared_bias=shared_vars[2][1]) + rpn_landmark_pred_reshape = mx.symbol.Reshape( + data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix, stride)) + + if config.TRAIN.RPN_ENABLE_OHEM >= 2: + label, anchor_weight = mx.sym.Custom(op_type='rpn_fpn_ohem3', + stride=int(stride), + network=config.network, + dataset=config.dataset, + prefix=prefix, + cls_score=rpn_cls_score_reshape, + labels=label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1, 1, bbox_pred_len)) + _bbox_weight = _bbox_weight.reshape( + (0, -1, A * bbox_pred_len)).transpose((0, 2, 1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, + _bbox_weight, + name='%s_bbox_weight_mul_stride%s' % + (prefix, stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, + (1, 1, landmark_pred_len)) + _landmark_weight = _landmark_weight.reshape( + (0, -1, A * landmark_pred_len)).transpose((0, 2, 1)) + landmark_weight = mx.sym.elemwise_mul( + landmark_weight, + _landmark_weight, + name='%s_landmark_weight_mul_stride%s' % (prefix, stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + 
#cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', + use_ignore=True, + ignore_label=-1, + grad_scale=lr_mult, + name='%s_rpn_cls_prob_stride%d' % + (prefix, stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape - bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_' % + (prefix, stride), + scalar=3.0, + data=bbox_diff) + rpn_bbox_loss = mx.sym.MakeLoss( + name='%s_rpn_bbox_loss_stride%d' % (prefix, stride), + data=rpn_bbox_loss_, + grad_scale=1.0 * lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape - landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = mx.symbol.smooth_l1( + name='%s_rpn_landmark_loss_stride%d_' % (prefix, stride), + scalar=3.0, + data=landmark_diff) + rpn_landmark_loss = mx.sym.MakeLoss( + name='%s_rpn_landmark_loss_stride%d' % (prefix, stride), + data=rpn_landmark_loss_, + grad_scale=0.5 * lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + return ret_group + + +def get_mnet_train(sym): + return get_sym_train(sym) + #data = mx.symbol.Variable(name="data") + ## shared convolutional layers + #conv_fpn_feat = get_mnet_conv(data, sym) + #ret_group = [] + #shared_vars = [] + #if config.SHARE_WEIGHT_BBOX: + # assert config.USE_MAXOUT==0 + # _name = 'face_rpn_cls_score_share' + # shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + # init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + # shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + # init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': 
str(0.0)}) + # shared_vars.append( [shared_weight, shared_bias] ) + # _name = 'face_rpn_bbox_pred_share' + # shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + # init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + # shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + # init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + # shared_vars.append( [shared_weight, shared_bias] ) + #else: + # shared_vars.append( [None, None] ) + # shared_vars.append( [None, None] ) + #if config.SHARE_WEIGHT_LANDMARK: + # _name = 'face_rpn_landmark_pred_share' + # shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + # init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + # shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + # init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + # shared_vars.append( [shared_weight, shared_bias] ) + #else: + # shared_vars.append( [None, None] ) + + #for stride in config.RPN_FEAT_STRIDE: + # ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0, shared_vars = shared_vars) + # ret_group += ret + # if config.HEAD_BOX: + # ret = get_out(conv_fpn_feat, 'head', stride, False, lr_mult=0.5) + # ret_group += ret + + #return mx.sym.Group(ret_group) diff --git a/insightface/detection/retinaface/rcnn/symbol/symbol_mnet.py.bak b/insightface/detection/retinaface/rcnn/symbol/symbol_mnet.py.bak new file mode 100644 index 0000000000000000000000000000000000000000..899804fe990484dd956ba973f4a12f2e82080814 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/symbol/symbol_mnet.py.bak @@ -0,0 +1,362 @@ +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import numpy as np +from rcnn.config import config +from rcnn.PY_OP import rpn_fpn_ohem, rpn_fpn_ohem2, rpn_fpn_ohem3 + +USE_DCN = False +MM = 1.0 + +def ConvBlock(channels, kernel_size, 
strides, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, kernel_size, strides=strides, padding=1, use_bias=False), + nn.BatchNorm(scale=True), + nn.Activation('relu') + ) + return out + +def Conv1x1(channels, is_linear=False, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, 1, padding=0, use_bias=False), + nn.BatchNorm(scale=True) + ) + if not is_linear: + out.add(nn.Activation('relu')) + return out + +def DWise(channels, strides, kernel_size=3, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, kernel_size, strides=strides, padding=kernel_size // 2, groups=channels, use_bias=False), + nn.BatchNorm(scale=True), + nn.Activation('relu') + ) + return out + +class SepCONV(nn.HybridBlock): + def __init__(self, inp, output, kernel_size, depth_multiplier=1, with_bn=True, **kwargs): + super(SepCONV, self).__init__(**kwargs) + with self.name_scope(): + self.net = nn.HybridSequential() + cn = int(inp*depth_multiplier) + + if output is None: + self.net.add( + nn.Conv2D(in_channels=inp, channels=cn, groups=inp, kernel_size=kernel_size, strides=(1,1), padding=kernel_size // 2 + , use_bias=not with_bn) + ) + else: + self.net.add( + nn.Conv2D(in_channels=inp, channels=cn, groups=inp, kernel_size=kernel_size, strides=(1,1), padding=kernel_size // 2 + , use_bias=False), + nn.BatchNorm(), + nn.Activation('relu'), + nn.Conv2D(in_channels=cn, channels=output, kernel_size=(1,1), strides=(1,1) + , use_bias=not with_bn) + ) + + self.with_bn = with_bn + self.act = nn.Activation('relu') + if with_bn: + self.bn = nn.BatchNorm() + def hybrid_forward(self, F ,x): + x = self.net(x) + if self.with_bn: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + +class ExpandedConv(nn.HybridBlock): + def __init__(self, inp, oup, t, strides, kernel=3, same_shape=True, **kwargs): + super(ExpandedConv, 
self).__init__(**kwargs) + + self.same_shape = same_shape + self.strides = strides + with self.name_scope(): + self.bottleneck = nn.HybridSequential() + self.bottleneck.add( + Conv1x1(inp*t, prefix="expand_"), + DWise(inp*t, self.strides, kernel, prefix="dwise_"), + Conv1x1(oup, is_linear=True, prefix="linear_") + ) + def hybrid_forward(self, F, x): + out = self.bottleneck(x) + if self.strides == 1 and self.same_shape: + out = F.elemwise_add(out, x) + return out + +def ExpandedConvSequence(t, k, inp, oup, repeats, first_strides, **kwargs): + seq = nn.HybridSequential(**kwargs) + with seq.name_scope(): + seq.add(ExpandedConv(inp, oup, t, first_strides, k, same_shape=False)) + curr_inp = oup + for i in range(1, repeats): + seq.add(ExpandedConv(curr_inp, oup, t, 1)) + curr_inp = oup + return seq + +class Mnasnet(nn.HybridBlock): + def __init__(self, multiplier=1.0, **kwargs): + super(Mnasnet, self).__init__(**kwargs) + mm = multiplier + + self.first_oup = 32 + self.interverted_residual_setting = [ + # t, c, n, s, k + [3, int(24*mm), 3, 2, 3, "stage2_"], # -> 56x56 + [3, int(40*mm), 3, 2, 5, "stage3_"], # -> 28x28 + [6, int(80*mm), 3, 2, 5, "stage4_1_"], # -> 14x14 + [6, int(96*mm), 2, 1, 3, "stage4_2_"], # -> 14x14 + [6, int(192*mm), 4, 2, 5, "stage5_1_"], # -> 7x7 + [6, int(320*mm), 1, 1, 3, "stage5_2_"], # -> 7x7 + ] + self.last_channels = 1280 + + with self.name_scope(): + self.features = nn.HybridSequential() + self.features.add(ConvBlock(self.first_oup, 3, 2, prefix="stage1_conv0_")) + self.features.add(SepCONV(self.first_oup, 16, 3, prefix="stage1_sepconv0_")) + inp = 16 + for i, (t, c, n, s, k, prefix) in enumerate(self.interverted_residual_setting): + oup = c + self.features.add(ExpandedConvSequence(t, k, inp, oup, n, s, prefix=prefix)) + inp = oup + + self.features.add(Conv1x1(self.last_channels, prefix="stage5_3_")) + def hybrid_forward(self, F, x): + x = self.features(x) + return x + +def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), 
\ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, dcn=False): + + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + if not dcn: + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias) + else: + assert kernel[0]==3 and kernel[1]==3 + num_group = 1 + f = num_group*18 + offset_weight = mx.symbol.Variable(name="{}_offset_weight".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '1.0'}) + offset_bias = mx.symbol.Variable(name="{}_offset_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv_offset = mx.symbol.Convolution(name=name+'_offset', data = from_layer, weight=offset_weight, bias=offset_bias, + num_filter=f, pad=(1, 1), kernel=(3, 3), stride=(1, 1)) + conv = mx.contrib.symbol.DeformableConvolution(name=name, data=from_layer, offset=conv_offset, weight=weight, bias=bias, + num_filter=num_filter, pad=(1,1), kernel=(3, 3), num_deformable_group=num_group, stride=(1, 1), no_bias=False) + if len(act_type)>0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def ssh_context_module(body, num_filters, name): + conv_dimred = conv_act_layer(body, name+'_conv1', + num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', dcn=False) + conv5x5 = conv_act_layer(conv_dimred, name+'_conv2', + num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', dcn=USE_DCN) + conv7x7_1 = conv_act_layer(conv_dimred, name+'_conv3_1', + num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', dcn=False) + conv7x7 = conv_act_layer(conv7x7_1, name+'_conv3_2', + 
num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', dcn=USE_DCN) + return (conv5x5, conv7x7) + +def ssh_detection_module(body, num_filters, name): + conv3x3 = conv_act_layer(body, name+'_conv1', + num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', dcn=USE_DCN) + conv5x5, conv7x7 = ssh_context_module(body, num_filters//2, name+'_context') + ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], dim=1, name = name+'_concat') + ret = mx.symbol.Activation(data=ret, act_type='relu', name=name+'_concat_relu') + return ret + +def conv_bn(input, filter, ksize, stride, padding, act_type='relu', name=''): + conv = mx.symbol.Convolution(data=input, kernel=(ksize,ksize), pad=(padding,padding), \ + stride=(stride,stride), num_filter=filter, name=name+"_conv") + ret = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if act_type is not None: + ret = mx.symbol.Activation(data=ret, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + return ret + +def cpm(input, name): + # residual + branch1 = conv_bn(input, 1024, 1, 1, 0, act_type=None, name=name+"_branch1") + branch2a = conv_bn(input, 256, 1, 1, 0, act_type='relu', name=name+"_branch2a") + branch2b = conv_bn(branch2a, 256, 3, 1, 1, act_type='relu', name=name+"_branch2b") + branch2c = conv_bn(branch2b, 1024, 1, 1, 0, act_type=None, name=name+"_branch2c") + sum = branch1 + branch2c + rescomb = mx.symbol.Activation(data=sum, act_type='relu', name="%s_relu2"%(name)) + + ssh_out = ssh_detection_module(rescomb, 256, name=name+"_ssh") + return ssh_out + +def get_mnet_conv(data): + mm = MM + net = Mnasnet(mm, prefix="") + body = net(data) + + all_layers = body.get_internals() + #print(all_layers) + c1 = all_layers['stage3_expandedconv2_elemwise_add0_output'] + c2 = all_layers['stage4_2_expandedconv1_elemwise_add0_output'] + #c3 = all_layers['stage5_3_relu0_fwd_output'] + c3 = all_layers['stage5_2_expandedconv0_linear_batchnorm0_fwd_output'] + + F1 = int(256*mm) 
+ F2 = int(128*mm) + _bwm = 1.0 + conv4_128 = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + conv5_128 = conv_act_layer(c2, 'ssh_m2_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, num_filter=F2, kernel=(4,4), stride=(2, 2), pad=(1,1), + num_group = F2, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + name='ssh_m2_red_upsampling') + #conv5_128_up = mx.symbol.UpSampling(conv5_128, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + #conv5_128_up = mx.symbol.Crop(*[conv5_128_up, conv4_128]) + + conv_sum = conv4_128+conv5_128_up + #conv_sum = conv_1x1 + + m1_conv = conv_act_layer(conv_sum, 'ssh_m1_conv', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(m1_conv, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, 'ssh_m3_det') + return {8: m1, 16:m2, 32: m3} + +def get_out(conv_fpn_feat, prefix, stride, landmark=False, lr_mult=1.0): + A = config.NUM_ANCHORS + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + label = mx.symbol.Variable(name='%s_label_stride%d'%(prefix,stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d'%(prefix,stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d'%(prefix,stride)) + if landmark: + landmark_target = mx.symbol.Variable(name='%s_landmark_target_stride%d'%(prefix,stride)) + landmark_weight = mx.symbol.Variable(name='%s_landmark_weight_stride%d'%(prefix,stride)) + rpn_relu = conv_fpn_feat[stride] + maxout_stat = 0 + if config.USE_MAXOUT>=1 and stride==config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 1 + if config.USE_MAXOUT>=2 and stride!=config.RPN_FEAT_STRIDE[-1]: + 
maxout_stat = 2 + + if maxout_stat==0: + rpn_cls_score = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d'%(prefix, stride), 2*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + elif maxout_stat==1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + + rpn_bbox_pred = conv_act_layer(rpn_relu, '%s_rpn_bbox_pred_stride%d'%(prefix,stride), 4*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix,stride)) + rpn_bbox_pred_reshape = mx.symbol.Reshape(data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix,stride)) + if landmark: + rpn_landmark_pred = 
conv_act_layer(rpn_relu, '%s_rpn_landmark_pred_stride%d'%(prefix,stride), 10*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + rpn_landmark_pred_reshape = mx.symbol.Reshape(data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix,stride)) + + if config.TRAIN.RPN_ENABLE_OHEM>=2: + label, anchor_weight = mx.sym.Custom(op_type='rpn_fpn_ohem3', stride=int(stride), network=config.network, dataset=config.dataset, prefix=prefix, cls_score=rpn_cls_score_reshape, labels = label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1,1,4)) + _bbox_weight = _bbox_weight.reshape((0, -1, A * 4)).transpose((0,2,1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, _bbox_weight, name='%s_bbox_weight_mul_stride%s'%(prefix,stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, (1,1,10)) + _landmark_weight = _landmark_weight.reshape((0, -1, A * 10)).transpose((0,2,1)) + landmark_weight = mx.sym.elemwise_mul(landmark_weight, _landmark_weight, name='%s_landmark_weight_mul_stride%s'%(prefix,stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, + grad_scale = lr_mult, + name='%s_rpn_cls_prob_stride%d'%(prefix,stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape-bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_'%(prefix,stride), 
scalar=3.0, data=bbox_diff) + rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape-landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = mx.symbol.smooth_l1(name='%s_rpn_landmark_loss_stride%d_'%(prefix,stride), scalar=3.0, data=landmark_diff) + rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=0.5*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + return ret_group + +def get_mnet_train(): + data = mx.symbol.Variable(name="data") + + # shared convolutional layers + conv_fpn_feat = get_mnet_conv(data) + ret_group = [] + for stride in config.RPN_FEAT_STRIDE: + ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0) + ret_group += ret + if config.HEAD_BOX: + ret = get_out(conv_fpn_feat, 'head', stride, False, lr_mult=1.0) + ret_group += ret + + return mx.sym.Group(ret_group) + + diff --git a/insightface/detection/retinaface/rcnn/symbol/symbol_resnet.py b/insightface/detection/retinaface/rcnn/symbol/symbol_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..7e6312b1b8a7c7804a0ca66c3a82feb853c72706 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/symbol/symbol_resnet.py @@ -0,0 +1,827 @@ +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import numpy as np +from rcnn.config import config +from rcnn.PY_OP import rpn_fpn_ohem3 +from rcnn.symbol.symbol_common import get_sym_train + +def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), 
bias_wd_mult=0.0): + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias) + return conv + +def conv_act_layer_dw(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0): + assert kernel[0] == 3 + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, num_group=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + if len(act_type) > 0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, separable=False, filter_in = -1): + + separable = False + if separable: + assert kernel[0] == 3 + if not separable: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, 
num_filter=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + else: + if filter_in < 0: + filter_in = num_filter + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=filter_in, num_group=filter_in, name="{}_sep".format(name)) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_sep_bn') + conv = mx.symbol.Activation(data=conv, act_type='relu', \ + name="{}_sep_bn_relu".format(name)) + conv = mx.symbol.Convolution(data=conv, kernel=(1,1), pad=(0,0), \ + stride=(1,1), num_filter=num_filter, name="{}".format(name)) + conv = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + if len(act_type) > 0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + + +def ssh_context_module(body, num_filter, filter_in, name): + conv_dimred = conv_act_layer(body, + name + '_conv1', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + separable=True, + filter_in=filter_in) + conv5x5 = conv_act_layer(conv_dimred, + name + '_conv2', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + separable=True) + conv7x7_1 = conv_act_layer(conv_dimred, + name + '_conv3_1', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + separable=True) + conv7x7 = conv_act_layer(conv7x7_1, + name + '_conv3_2', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + separable=True) + return (conv5x5, conv7x7) + + +def conv_deformable(net, num_filter, num_group=1, act_type='relu', name=''): + f = num_group * 18 + conv_offset = mx.symbol.Convolution(name=name + '_conv_offset', + data=net, + num_filter=f, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1)) + net = 
mx.contrib.symbol.DeformableConvolution( + name=name + "_conv", + data=net, + offset=conv_offset, + num_filter=num_filter, + pad=(1, 1), + kernel=(3, 3), + num_deformable_group=num_group, + stride=(1, 1), + no_bias=False) + net = mx.sym.BatchNorm(data=net, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + if len(act_type) > 0: + net = mx.symbol.Activation(data=net, + act_type=act_type, + name=name + '_act') + return net + + +def ssh_detection_module(body, num_filter, filter_in, name): + conv3x3 = conv_act_layer(body, + name + '_conv1', + num_filter, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + separable=True, + filter_in=filter_in) + conv5x5, conv7x7 = ssh_context_module(body, num_filter // 2, filter_in, + name + '_context') + ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], + dim=1, + name=name + '_concat') + ret = mx.symbol.Activation(data=ret, + act_type='relu', + name=name + '_concat_relu') + if config.USE_DCN == 1: + ret = conv_deformable(ret, + num_filter=num_filter * 2, + name=name + '_concat_dcn') + elif config.USE_DCN == 2: + ret = conv_deformable2(ret, + num_filter=num_filter * 2, + name=name + '_concat_dcn') + return ret + + +def get_resnet_conv(data, sym): + all_layers = sym.get_internals() + isize = 640 + _, out_shape, _ = all_layers.infer_shape(data=(1, 3, isize, isize)) + last_entry = None + c1 = None + c2 = None + c3 = None + #print(len(all_layers), len(out_shape)) + #print(all_layers.__class__) + outputs = all_layers.list_outputs() + #print(outputs.__class__, len(outputs)) + count = len(outputs) + for i in range(count): + name = outputs[i] + shape = out_shape[i] + if not name.endswith('_output'): + continue + if len(shape) != 4: + continue + print(name, shape) + if c1 is None and shape[2] == isize // 16: + cname = last_entry[0] + print('c1', last_entry) + c1 = all_layers[cname] + if c2 is None and shape[2] == isize // 32: + cname = last_entry[0] + print('c2', last_entry) + c2 = all_layers[cname] + if 
shape[2] == isize // 32: + c3 = all_layers[name] + print('c3', name, shape) + + last_entry = (name, shape) + + c1_filter = -1 + c2_filter = -1 + c3_filter = -1 + + F1 = 256 + F2 = 256 + _bwm = 1.0 + if config.NET_MODE == 0: + c1_lateral = conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, + 'ssh_m2_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, num_filter=F2, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = F2, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name='ssh_m2_red_upsampling') + c2_up = mx.symbol.UpSampling(c2_lateral, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_m2_red_up', + num_args=1) + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral + c2_up + + c1 = conv_act_layer(c1, + 'ssh_m1_conv', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 1: + c3_lateral = conv_act_layer(c3, + 'ssh_c3_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.UpSampling(c3_lateral, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_c3_up', + num_args=1) + c2_lateral = conv_act_layer(c2, + 'ssh_c2_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + 
bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c2_up = mx.symbol.UpSampling(c2, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_m2_red_up', + num_args=1) + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral + c2_up + + c1 = conv_act_layer(c1, + 'ssh_m1_conv', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 2: + n1 = ssh_detection_module(c1, F2, F2, 'ssh_n1_det') + n2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_n2_det') + n3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_n3_det') + c3 = conv_act_layer(c3, + 'ssh_c3_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.UpSampling(c3, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_c3_up', + num_args=1) + c2_lateral = conv_act_layer(c2, + 'ssh_c2_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c2_up = mx.symbol.UpSampling(c2, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_m2_red_up', + num_args=1) + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral + c2_up + c1 = conv_act_layer(c1, 
+ 'ssh_c1_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 3: + #c3 = conv_act_layer(c3, 'ssh_c3_lateral', + # F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3 = ssh_detection_module(c3, F2 // 2, c3_filter, 'ssh_c3_lateral') + c3_up = mx.symbol.UpSampling(c3, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_c3_up', + num_args=1) + #c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + # F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c2_lateral = ssh_detection_module(c2, F2 // 2, c2_filter, + 'ssh_c2_lateral') + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + #c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + # F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = ssh_detection_module(c1, F2 // 2, c1_filter, + 'ssh_c1_lateral') + c2_up = mx.symbol.UpSampling(c2, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_m2_red_up', + num_args=1) + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral + c2_up + c1 = conv_act_layer(c1, + 'ssh_c1_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 4: + c3 = conv_act_layer(c3, + 'ssh_c3_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + 
act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.UpSampling(c3, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_c3_up', + num_args=1) + c2_lateral = conv_act_layer(c2, + 'ssh_c2_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c2_up = mx.symbol.UpSampling(c2, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_m2_red_up', + num_args=1) + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral + c2_up + c1 = conv_act_layer(c1, + 'ssh_c1_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2 // 2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1 // 2, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1 // 2, c3_filter, 'ssh_m3_det') + elif config.NET_MODE == 5: + c3 = conv_act_layer_dw(c3, + 'ssh_c3_lateral_m', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3 = conv_act_layer(c3, + 'ssh_c3_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = mx.symbol.UpSampling(c3, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_c3_up', + num_args=1) + c2 = conv_act_layer_dw(c2, + 'ssh_c2_lateral_m', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, + 'ssh_c2_lateral', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c3_up = 
mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral + c3_up + c2 = conv_act_layer(c2, + 'ssh_c2_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c1 = conv_act_layer_dw(c1, + 'ssh_c1_lateral_m', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + c2_up = mx.symbol.UpSampling(c2, + scale=2, + sample_type='nearest', + workspace=512, + name='ssh_m2_red_up', + num_args=1) + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral + c2_up + c1 = conv_act_layer(c1, + 'ssh_c1_aggr', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + + return {8: m1, 16: m2, 32: m3}, {8: n1, 16: n2, 32: n3} + + +def get_out(conv_fpn_feat, prefix, stride, landmark=False, lr_mult=1.0): + A = config.NUM_ANCHORS + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + label = mx.symbol.Variable(name='%s_label_stride%d' % (prefix, stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d' % + (prefix, stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d' % + (prefix, stride)) + if landmark: + landmark_target = mx.symbol.Variable( + name='%s_landmark_target_stride%d' % (prefix, stride)) + landmark_weight = mx.symbol.Variable( + name='%s_landmark_weight_stride%d' % (prefix, stride)) + rpn_relu = conv_fpn_feat[stride] + maxout_stat = 0 + if config.USE_MAXOUT >= 1 and 
stride == config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 1 + if config.USE_MAXOUT >= 2 and stride != config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 2 + + if maxout_stat == 0: + rpn_cls_score = conv_only(rpn_relu, + '%s_rpn_cls_score_stride%d' % + (prefix, stride), + 2 * num_anchors, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + elif maxout_stat == 1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_bg' % (prefix, stride, a), + 3, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, + axis=1, + keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_fg' % (prefix, stride, a), + 1, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, + dim=1, + name='%s_rpn_cls_score_stride%d' % + (prefix, stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_bg' % (prefix, stride, a), + 1, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_fg' % (prefix, stride, a), + 3, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, + axis=1, + keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, + dim=1, + name='%s_rpn_cls_score_stride%d' % + (prefix, stride)) + + rpn_bbox_pred = conv_only(rpn_relu, + '%s_rpn_bbox_pred_stride%d' % (prefix, stride), + bbox_pred_len * num_anchors, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + + # prepare rpn data + if not config.FBN: + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix, stride)) + else: + 
rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s_pre" % (prefix, stride)) + rpn_cls_score_reshape = mx.symbol.BatchNorm( + rpn_cls_score_reshape, + fix_gamma=True, + eps=2e-5, + name="%s_rpn_cls_score_reshape_stride%s" % (prefix, stride)) + + rpn_bbox_pred_reshape = mx.symbol.Reshape( + data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix, stride)) + if landmark: + rpn_landmark_pred = conv_only(rpn_relu, + '%s_rpn_landmark_pred_stride%d' % + (prefix, stride), + landmark_pred_len * num_anchors, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1)) + rpn_landmark_pred_reshape = mx.symbol.Reshape( + data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix, stride)) + + if config.TRAIN.RPN_ENABLE_OHEM >= 2: + label, anchor_weight = mx.sym.Custom(op_type='rpn_fpn_ohem3', + stride=int(stride), + network=config.network, + dataset=config.dataset, + prefix=prefix, + cls_score=rpn_cls_score_reshape, + labels=label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1, 1, bbox_pred_len)) + _bbox_weight = _bbox_weight.reshape( + (0, -1, A * bbox_pred_len)).transpose((0, 2, 1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, + _bbox_weight, + name='%s_bbox_weight_mul_stride%s' % + (prefix, stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, + (1, 1, landmark_pred_len)) + _landmark_weight = _landmark_weight.reshape( + (0, -1, A * landmark_pred_len)).transpose((0, 2, 1)) + landmark_weight = mx.sym.elemwise_mul( + landmark_weight, + _landmark_weight, + name='%s_landmark_weight_mul_stride%s' % (prefix, stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), 
cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', + use_ignore=True, + ignore_label=-1, + grad_scale=lr_mult, + name='%s_rpn_cls_prob_stride%d' % + (prefix, stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape - bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_' % + (prefix, stride), + scalar=3.0, + data=bbox_diff) + rpn_bbox_loss = mx.sym.MakeLoss( + name='%s_rpn_bbox_loss_stride%d' % (prefix, stride), + data=rpn_bbox_loss_, + grad_scale=1.0 * lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape - landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = mx.symbol.smooth_l1( + name='%s_rpn_landmark_loss_stride%d_' % (prefix, stride), + scalar=3.0, + data=landmark_diff) + rpn_landmark_loss = mx.sym.MakeLoss( + name='%s_rpn_landmark_loss_stride%d' % (prefix, stride), + data=rpn_landmark_loss_, + grad_scale=0.5 * lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + return ret_group + + +def get_resnet_train(sym): + return get_sym_train(sym) + #data = mx.symbol.Variable(name="data") + ## shared convolutional layers + #conv_fpn_feat, conv_fpn_feat2 = get_resnet_conv(data, sym) + #ret_group = [] + #for stride in config.RPN_FEAT_STRIDE: + # ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0) + # ret_group += ret + # if config.HEAD_BOX: + # ret = get_out(conv_fpn_feat2, 'head', stride, False, lr_mult=1.0) + # ret_group += ret + + 
#return mx.sym.Group(ret_group) diff --git a/insightface/detection/retinaface/rcnn/symbol/symbol_ssh.py b/insightface/detection/retinaface/rcnn/symbol/symbol_ssh.py new file mode 100644 index 0000000000000000000000000000000000000000..f8be30d0a78fabf15473be505ca27777414753cb --- /dev/null +++ b/insightface/detection/retinaface/rcnn/symbol/symbol_ssh.py @@ -0,0 +1,725 @@ +import mxnet as mx +import numpy as np +from rcnn.config import config +from rcnn.PY_OP import rpn_fpn_ohem3 +FPN = False +USE_DCN = False + +def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, dcn=False): + + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), + attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + if not dcn: + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias) + else: + assert kernel[0] == 3 and kernel[1] == 3 + num_group = 1 + f = num_group * 18 + offset_weight = mx.symbol.Variable( + name="{}_offset_weight".format(name), + init=mx.init.Constant(0.0), + attr={'__lr_mult__': '1.0'}) + offset_bias = mx.symbol.Variable(name="{}_offset_bias".format(name), + init=mx.init.Constant(0.0), + attr={ + '__lr_mult__': '2.0', + '__wd_mult__': str(bias_wd_mult) + }) + conv_offset = mx.symbol.Convolution(name=name + '_offset', + data=from_layer, + weight=offset_weight, + bias=offset_bias, + num_filter=f, + pad=(1, 1), + kernel=(3, 3), + stride=(1, 1)) + conv = mx.contrib.symbol.DeformableConvolution( + name=name, + data=from_layer, + offset=conv_offset, + weight=weight, + bias=bias, + num_filter=num_filter, + pad=(1, 1), + kernel=(3, 3), + num_deformable_group=num_group, + stride=(1, 1), + no_bias=False) + if len(act_type) > 0: + relu = 
mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + + +def ssh_context_module(body, num_filters, name): + conv_dimred = conv_act_layer(body, + name + '_conv1', + num_filters, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + dcn=False) + conv5x5 = conv_act_layer(conv_dimred, + name + '_conv2', + num_filters, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + dcn=USE_DCN) + conv7x7_1 = conv_act_layer(conv_dimred, + name + '_conv3_1', + num_filters, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + dcn=False) + conv7x7 = conv_act_layer(conv7x7_1, + name + '_conv3_2', + num_filters, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + dcn=USE_DCN) + return (conv5x5, conv7x7) + + +def ssh_detection_module(body, num_filters, name): + conv3x3 = conv_act_layer(body, + name + '_conv1', + num_filters, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='', + dcn=USE_DCN) + conv5x5, conv7x7 = ssh_context_module(body, num_filters // 2, + name + '_context') + ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], + dim=1, + name=name + '_concat') + ret = mx.symbol.Activation(data=ret, + act_type='relu', + name=name + '_concat_relu') + return ret + + +def conv_bn(input, filter, ksize, stride, padding, act_type='relu', name=''): + conv = mx.symbol.Convolution(data=input, kernel=(ksize,ksize), pad=(padding,padding), \ + stride=(stride,stride), num_filter=filter, name=name+"_conv") + ret = mx.sym.BatchNorm(data=conv, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name=name + '_bn') + if act_type is not None: + ret = mx.symbol.Activation(data=ret, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + return ret + + +def cpm(input, name): + # residual + branch1 = conv_bn(input, + 1024, + 1, + 1, + 0, + act_type=None, + name=name + "_branch1") + branch2a = conv_bn(input, + 256, + 1, + 1, + 0, + act_type='relu', + name=name + 
"_branch2a") + branch2b = conv_bn(branch2a, + 256, + 3, + 1, + 1, + act_type='relu', + name=name + "_branch2b") + branch2c = conv_bn(branch2b, + 1024, + 1, + 1, + 0, + act_type=None, + name=name + "_branch2c") + sum = branch1 + branch2c + rescomb = mx.symbol.Activation(data=sum, + act_type='relu', + name="%s_relu2" % (name)) + + ssh_out = ssh_detection_module(rescomb, 256, name=name + "_ssh") + return ssh_out + + +def get_feat_down(conv_feat): + #P5 = mx.symbol.Convolution(data=conv_feat[0], kernel=(1, 1), num_filter=256, name="P5_lateral") + P5 = conv_act_layer(conv_feat[0], + 'P5_lateral', + 256, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu') + + # P5 2x upsampling + C4 = P4 + P5_up = mx.symbol.UpSampling(P5, + scale=2, + sample_type='nearest', + workspace=512, + name='P5_upsampling', + num_args=1) + #P4_la = mx.symbol.Convolution(data=conv_feat[1], kernel=(1, 1), num_filter=256, name="P4_lateral") + P4_la = conv_act_layer(conv_feat[1], + 'P4_lateral', + 256, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu') + P5_clip = mx.symbol.Crop(*[P5_up, P4_la], name="P4_clip") + P4 = mx.sym.ElementWiseSum(*[P5_clip, P4_la], name="P4_sum") + #P4 = mx.symbol.Convolution(data=P4, kernel=(3, 3), pad=(1, 1), num_filter=256, name="P4_aggregate") + P4 = conv_act_layer(P4, + 'P4_aggregate', + 256, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + + # P4 2x upsampling + C3 = P3 + P4_up = mx.symbol.UpSampling(P4, + scale=2, + sample_type='nearest', + workspace=512, + name='P4_upsampling', + num_args=1) + #P3_la = mx.symbol.Convolution(data=conv_feat[2], kernel=(1, 1), num_filter=256, name="P3_lateral") + P3_la = conv_act_layer(conv_feat[2], + 'P3_lateral', + 256, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu') + P4_clip = mx.symbol.Crop(*[P4_up, P3_la], name="P3_clip") + P3 = mx.sym.ElementWiseSum(*[P4_clip, P3_la], name="P3_sum") + #P3 = mx.symbol.Convolution(data=P3, kernel=(3, 3), pad=(1, 1), 
num_filter=256, name="P3_aggregate") + P3 = conv_act_layer(P3, + 'P3_aggregate', + 256, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + + return P3, P4, P5 + + +def get_ssh_conv(data): + """ + shared convolutional layers + :param data: Symbol + :return: Symbol + """ + # group 1 + #conv1_1 = mx.symbol.Convolution( + # data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, workspace=2048, name="conv1_1") + #relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") + relu1_1 = conv_act_layer(data, + 'conv1_1', + 64, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + #conv1_2 = mx.symbol.Convolution( + # data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, workspace=2048, name="conv1_2") + #relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") + relu1_2 = conv_act_layer(relu1_1, + 'conv1_2', + 64, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + pool1 = mx.symbol.Pooling(data=relu1_2, + pool_type="max", + kernel=(2, 2), + stride=(2, 2), + name="pool1") + # group 2 + #conv2_1 = mx.symbol.Convolution( + # data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, workspace=2048, name="conv2_1") + #relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") + relu2_1 = conv_act_layer(pool1, + 'conv2_1', + 128, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + #conv2_2 = mx.symbol.Convolution( + # data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, workspace=2048, name="conv2_2") + #relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") + relu2_2 = conv_act_layer(relu2_1, + 'conv2_2', + 128, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + pool2 = mx.symbol.Pooling(data=relu2_2, + pool_type="max", + kernel=(2, 2), + stride=(2, 2), + name="pool2") + # group 3 + #conv3_1 = mx.symbol.Convolution( + # data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, workspace=2048, name="conv3_1") + 
#relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") + relu3_1 = conv_act_layer(pool2, + 'conv3_1', + 256, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + #conv3_2 = mx.symbol.Convolution( + # data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, workspace=2048, name="conv3_2") + #relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") + relu3_2 = conv_act_layer(relu3_1, + 'conv3_2', + 256, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + #conv3_3 = mx.symbol.Convolution( + # data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, workspace=2048, name="conv3_3") + #relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") + relu3_3 = conv_act_layer(relu3_2, + 'conv3_3', + 256, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + pool3 = mx.symbol.Pooling(data=relu3_3, + pool_type="max", + kernel=(2, 2), + stride=(2, 2), + name="pool3") + # group 4 + #conv4_1 = mx.symbol.Convolution( + # data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv4_1") + #relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") + relu4_1 = conv_act_layer(pool3, + 'conv4_1', + 512, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + #conv4_2 = mx.symbol.Convolution( + # data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv4_2") + #relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") + relu4_2 = conv_act_layer(relu4_1, + 'conv4_2', + 512, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + #conv4_3 = mx.symbol.Convolution( + # data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv4_3") + #relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") + relu4_3 = conv_act_layer(relu4_2, + 'conv4_3', + 512, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + pool4 = 
mx.symbol.Pooling(data=relu4_3, + pool_type="max", + kernel=(2, 2), + stride=(2, 2), + name="pool4") + # group 5 + #conv5_1 = mx.symbol.Convolution( + # data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv5_1") + #relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") + relu5_1 = conv_act_layer(pool4, + 'conv5_1', + 512, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + #conv5_2 = mx.symbol.Convolution( + # data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv5_2") + #relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") + relu5_2 = conv_act_layer(relu5_1, + 'conv5_2', + 512, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + #conv5_3 = mx.symbol.Convolution( + # data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv5_3") + #relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") + relu5_3 = conv_act_layer(relu5_2, + 'conv5_3', + 512, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu') + m3_pool = mx.sym.Pooling(data=relu5_3, + kernel=(2, 2), + stride=(2, 2), + pad=(0, 0), + pool_type='max') + if config.SSH_MODE <= 5: + #if FPN: + # relu4_3, relu5_3, m3_pool = get_feat_down([m3_pool, relu5_3, relu4_3]) + + F1 = 256 + F2 = 128 + if config.SSH_MODE == 1: + F2 = 256 + _bwm = 1.0 + conv4_128 = conv_act_layer(relu4_3, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + conv5_128 = conv_act_layer(relu5_3, + 'ssh_m2_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, + num_filter=F2, + kernel=(4, 4), + stride=(2, 2), + pad=(1, 1), + num_group=F2, + no_bias=True, + attr={ + '__lr_mult__': '0.0', + '__wd_mult__': '0.0' + }, + name='ssh_m2_red_upsampling') + #conv5_128_up = 
mx.symbol.UpSampling(conv5_128, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + #conv5_128_up = mx.symbol.Crop(*[conv5_128_up, conv4_128]) + + conv_sum = conv4_128 + conv5_128_up + #conv_sum = conv_1x1 + + m1_conv = conv_act_layer(conv_sum, + 'ssh_m1_conv', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + m1 = ssh_detection_module(m1_conv, F2, 'ssh_m1_det') + m2 = ssh_detection_module(relu5_3, F1, 'ssh_m2_det') + m3 = ssh_detection_module(m3_pool, F1, 'ssh_m3_det') + return {8: m1, 16: m2, 32: m3} + else: + F1 = 256 + F2 = 256 + _bwm = 1.0 + conv4_128 = conv_act_layer(relu4_3, + 'ssh_m1_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + conv5_128 = conv_act_layer(relu5_3, + 'ssh_m2_red_conv', + F2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, + num_filter=F2, + kernel=(4, 4), + stride=(2, 2), + pad=(1, 1), + num_group=F2, + no_bias=True, + attr={ + '__lr_mult__': '0.0', + '__wd_mult__': '0.0' + }, + name='ssh_m2_red_upsampling') + #conv5_128_up = mx.symbol.UpSampling(conv5_128, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + #conv5_128_up = mx.symbol.Crop(*[conv5_128_up, conv4_128]) + + conv_sum = conv4_128 + conv5_128_up + m1_conv = conv_act_layer(conv_sum, + 'ssh_m1_conv', + F2, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + act_type='relu', + bias_wd_mult=_bwm) + m1 = cpm(m1_conv, 'ssh_m1_det') + m2 = cpm(relu5_3, 'ssh_m2_det') + m3 = cpm(m3_pool, 'ssh_m3_det') + return {8: m1, 16: m2, 32: m3} + + +def get_out(conv_fpn_feat, prefix, stride, landmark=False, lr_mult=1.0): + A = config.NUM_ANCHORS + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + 
label = mx.symbol.Variable(name='%s_label_stride%d' % (prefix, stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d' % + (prefix, stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d' % + (prefix, stride)) + if landmark: + landmark_target = mx.symbol.Variable( + name='%s_landmark_target_stride%d' % (prefix, stride)) + landmark_weight = mx.symbol.Variable( + name='%s_landmark_weight_stride%d' % (prefix, stride)) + rpn_relu = conv_fpn_feat[stride] + maxout_stat = 0 + if config.USE_MAXOUT >= 1 and stride == config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 1 + if config.USE_MAXOUT >= 2 and stride != config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 2 + + if maxout_stat == 0: + rpn_cls_score = conv_act_layer(rpn_relu, + '%s_rpn_cls_score_stride%d' % + (prefix, stride), + 2 * num_anchors, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='') + elif maxout_stat == 1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_act_layer( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_bg' % (prefix, stride, a), + 3, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='') + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, + axis=1, + keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_act_layer( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_fg' % (prefix, stride, a), + 1, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='') + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, + dim=1, + name='%s_rpn_cls_score_stride%d' % + (prefix, stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_act_layer( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_bg' % (prefix, stride, a), + 1, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='') + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_act_layer( + rpn_relu, + '%s_rpn_cls_score_stride%d_anchor%d_fg' % (prefix, stride, a), + 3, + kernel=(1, 1), + 
pad=(0, 0), + stride=(1, 1), + act_type='') + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, + axis=1, + keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, + dim=1, + name='%s_rpn_cls_score_stride%d' % + (prefix, stride)) + + rpn_bbox_pred = conv_act_layer(rpn_relu, + '%s_rpn_bbox_pred_stride%d' % + (prefix, stride), + 4 * num_anchors, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='') + + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix, stride)) + rpn_bbox_pred_reshape = mx.symbol.Reshape( + data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix, stride)) + if landmark: + rpn_landmark_pred = conv_act_layer(rpn_relu, + '%s_rpn_landmark_pred_stride%d' % + (prefix, stride), + 10 * num_anchors, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + act_type='') + rpn_landmark_pred_reshape = mx.symbol.Reshape( + data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix, stride)) + + if config.TRAIN.RPN_ENABLE_OHEM >= 2: + label, anchor_weight = mx.sym.Custom(op_type='rpn_fpn_ohem3', + stride=int(stride), + network=config.network, + dataset=config.dataset, + prefix=prefix, + cls_score=rpn_cls_score_reshape, + labels=label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1, 1, 4)) + _bbox_weight = _bbox_weight.reshape((0, -1, A * 4)).transpose( + (0, 2, 1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, + _bbox_weight, + name='%s_bbox_weight_mul_stride%s' % + (prefix, stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, (1, 1, 10)) + _landmark_weight = _landmark_weight.reshape( + (0, -1, A * 10)).transpose((0, 2, 1)) + landmark_weight = mx.sym.elemwise_mul( + landmark_weight, + _landmark_weight, + name='%s_landmark_weight_mul_stride%s' % (prefix, stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = 
mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', + use_ignore=True, + ignore_label=-1, + grad_scale=lr_mult, + name='%s_rpn_cls_prob_stride%d' % + (prefix, stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape - bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_' % + (prefix, stride), + scalar=3.0, + data=bbox_diff) + rpn_bbox_loss = mx.sym.MakeLoss( + name='%s_rpn_bbox_loss_stride%d' % (prefix, stride), + data=rpn_bbox_loss_, + grad_scale=1.0 * lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape - landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = mx.symbol.smooth_l1( + name='%s_rpn_landmark_loss_stride%d_' % (prefix, stride), + scalar=3.0, + data=landmark_diff) + rpn_landmark_loss = mx.sym.MakeLoss( + name='%s_rpn_landmark_loss_stride%d' % (prefix, stride), + data=rpn_landmark_loss_, + grad_scale=0.5 * lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + return ret_group + + +def get_ssh_train(): + """ + Region Proposal Network with VGG + :return: Symbol + """ + data = mx.symbol.Variable(name="data") + + # shared convolutional layers + conv_fpn_feat = get_ssh_conv(data) + ret_group = [] + for stride in 
config.RPN_FEAT_STRIDE: + ret = get_out(conv_fpn_feat, + 'face', + stride, + config.FACE_LANDMARK, + lr_mult=1.0) + ret_group += ret + if config.HEAD_BOX: + ret = get_out(conv_fpn_feat, 'head', stride, False, lr_mult=1.0) + ret_group += ret + + return mx.sym.Group(ret_group) diff --git a/insightface/detection/retinaface/rcnn/tools/__init__.py b/insightface/detection/retinaface/rcnn/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/detection/retinaface/rcnn/tools/reeval.py b/insightface/detection/retinaface/rcnn/tools/reeval.py new file mode 100644 index 0000000000000000000000000000000000000000..0903a06b41cd055b2567568004a67a22b5af35bb --- /dev/null +++ b/insightface/detection/retinaface/rcnn/tools/reeval.py @@ -0,0 +1,68 @@ +import argparse +try: + import cPickle as pickle +except ImportError: + import pickle +import os +import mxnet as mx + +from ..logger import logger +from ..config import config, default, generate_config +from ..dataset import * + + +def reeval(args): + # load imdb + imdb = eval(args.dataset)(args.image_set, args.root_path, + args.dataset_path) + + # load detection results + cache_file = os.path.join(imdb.cache_path, imdb.name, 'detections.pkl') + with open(cache_file) as f: + detections = pickle.load(f) + + # eval + imdb.evaluate_detections(detections) + + +def parse_args(): + parser = argparse.ArgumentParser(description='imdb test') + # general + parser.add_argument('--network', + help='network name', + default=default.network, + type=str) + parser.add_argument('--dataset', + help='dataset name', + default=default.dataset, + type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', + help='image_set name', + default=default.image_set, + type=str) + parser.add_argument('--root_path', + help='output data folder', + default=default.root_path, + type=str) + 
parser.add_argument('--dataset_path', + help='dataset path', + default=default.dataset_path, + type=str) + # other + parser.add_argument('--no_shuffle', + help='disable random shuffle', + action='store_true') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + logger.info('Called with argument: %s' % args) + reeval(args) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/retinaface/rcnn/tools/test_rcnn.py b/insightface/detection/retinaface/rcnn/tools/test_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..d571ed60fb517ee820616dac3d1cc60c215bf613 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/tools/test_rcnn.py @@ -0,0 +1,161 @@ +import argparse +import pprint +import mxnet as mx + +from ..logger import logger +from ..config import config, default, generate_config +from ..symbol import * +from ..dataset import * +from ..core.loader import TestLoader +from ..core.tester import Predictor, pred_eval +from ..utils.load_model import load_param + + +def test_rcnn(network, dataset, image_set, root_path, dataset_path, ctx, + prefix, epoch, vis, shuffle, has_rpn, proposal, thresh): + # set config + if has_rpn: + config.TEST.HAS_RPN = True + + # print config + logger.info(pprint.pformat(config)) + + # load symbol and testing data + if has_rpn: + sym = eval('get_' + network + '_test')(num_classes=config.NUM_CLASSES, + num_anchors=config.NUM_ANCHORS) + imdb = eval(dataset)(image_set, root_path, dataset_path) + roidb = imdb.gt_roidb() + else: + sym = eval('get_' + network + + '_rcnn_test')(num_classes=config.NUM_CLASSES) + imdb = eval(dataset)(image_set, root_path, dataset_path) + gt_roidb = imdb.gt_roidb() + roidb = eval('imdb.' 
+ proposal + '_roidb')(gt_roidb) + + # get test data iter + test_data = TestLoader(roidb, + batch_size=1, + shuffle=shuffle, + has_rpn=has_rpn) + + # load model + arg_params, aux_params = load_param(prefix, + epoch, + convert=True, + ctx=ctx, + process=True) + + # infer shape + data_shape_dict = dict(test_data.provide_data) + arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + + # check parameters + for k in sym.list_arguments(): + if k in data_shape_dict or 'label' in k: + continue + assert k in arg_params, k + ' not initialized' + assert arg_params[k].shape == arg_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + for k in sym.list_auxiliary_states(): + assert k in aux_params, k + ' not initialized' + assert aux_params[k].shape == aux_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + # decide maximum shape + data_names = [k[0] for k in test_data.provide_data] + label_names = None + max_data_shape = [('data', (1, 3, max([v[0] for v in config.SCALES]), + max([v[1] for v in config.SCALES])))] + if not has_rpn: + max_data_shape.append( + ('rois', (1, config.TEST.PROPOSAL_POST_NMS_TOP_N + 30, 5))) + + # create predictor + predictor = Predictor(sym, + data_names, + label_names, + context=ctx, + max_data_shapes=max_data_shape, + provide_data=test_data.provide_data, + provide_label=test_data.provide_label, + arg_params=arg_params, + aux_params=aux_params) + + # start detection + pred_eval(predictor, test_data, imdb, vis=vis, thresh=thresh) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') + # general + parser.add_argument('--network', + help='network name', + default=default.network, + type=str) + 
parser.add_argument('--dataset', + help='dataset name', + default=default.dataset, + type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', + help='image_set name', + default=default.test_image_set, + type=str) + parser.add_argument('--root_path', + help='output data folder', + default=default.root_path, + type=str) + parser.add_argument('--dataset_path', + help='dataset path', + default=default.dataset_path, + type=str) + # testing + parser.add_argument('--prefix', + help='model to test with', + default=default.rcnn_prefix, + type=str) + parser.add_argument('--epoch', + help='model to test with', + default=default.rcnn_epoch, + type=int) + parser.add_argument('--gpu', + help='GPU device to test with', + default=0, + type=int) + # rcnn + parser.add_argument('--vis', + help='turn on visualization', + action='store_true') + parser.add_argument('--thresh', + help='valid detection threshold', + default=1e-3, + type=float) + parser.add_argument('--shuffle', + help='shuffle data on visualization', + action='store_true') + parser.add_argument('--has_rpn', + help='generate proposals on the fly', + action='store_true') + parser.add_argument('--proposal', + help='can be ss for selective search or rpn', + default='rpn', + type=str) + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + logger.info('Called with argument: %s' % args) + ctx = mx.gpu(args.gpu) + test_rcnn(args.network, args.dataset, args.image_set, args.root_path, + args.dataset_path, ctx, args.prefix, args.epoch, args.vis, + args.shuffle, args.has_rpn, args.proposal, args.thresh) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/retinaface/rcnn/tools/test_rpn.py b/insightface/detection/retinaface/rcnn/tools/test_rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..e7704068321e3267d3439da00aaf60033fe77922 --- /dev/null +++ 
b/insightface/detection/retinaface/rcnn/tools/test_rpn.py @@ -0,0 +1,160 @@ +import argparse +import pprint +import mxnet as mx + +from ..logger import logger +from ..config import config, default, generate_config +from ..symbol import * +from ..dataset import * +from ..core.loader import TestLoader +from ..core.tester import Predictor, generate_proposals, test_proposals +from ..utils.load_model import load_param + + +def test_rpn(network, + dataset, + image_set, + root_path, + dataset_path, + ctx, + prefix, + epoch, + vis, + shuffle, + thresh, + test_output=False): + # rpn generate proposal config + config.TEST.HAS_RPN = True + + # print config + logger.info(pprint.pformat(config)) + + # load symbol + sym = eval('get_' + network + '_rpn_test')() + + # load dataset and prepare imdb for training + imdb = eval(dataset)(image_set, root_path, dataset_path) + roidb = imdb.gt_roidb() + test_data = TestLoader(roidb, + batch_size=1, + shuffle=shuffle, + has_rpn=True, + withlabel=True) + + # load model + arg_params, aux_params = load_param(prefix, epoch, convert=True, ctx=ctx) + + # infer shape + data_shape_dict = dict(test_data.provide_data) + arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + + # check parameters + for k in sym.list_arguments(): + if k in data_shape_dict or 'label' in k: + continue + assert k in arg_params, k + ' not initialized' + assert arg_params[k].shape == arg_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + for k in sym.list_auxiliary_states(): + assert k in aux_params, k + ' not initialized' + assert aux_params[k].shape == aux_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + # decide maximum shape + data_names = [k[0] for k in 
test_data.provide_data] + label_names = None if test_data.provide_label is None else [ + k[0] for k in test_data.provide_label + ] + max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]), + max([v[1] for v in config.SCALES])))] + + # create predictor + predictor = Predictor(sym, + data_names, + label_names, + context=ctx, + max_data_shapes=max_data_shape, + provide_data=test_data.provide_data, + provide_label=test_data.provide_label, + arg_params=arg_params, + aux_params=aux_params) + + # start testing + if not test_output: + imdb_boxes = generate_proposals(predictor, + test_data, + imdb, + vis=vis, + thresh=thresh) + imdb.evaluate_recall(roidb, candidate_boxes=imdb_boxes) + else: + test_proposals(predictor, test_data, imdb, roidb, vis=vis) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Test a Region Proposal Network') + # general + parser.add_argument('--network', + help='network name', + default=default.network, + type=str) + parser.add_argument('--dataset', + help='dataset name', + default=default.dataset, + type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', + help='image_set name', + default=default.test_image_set, + type=str) + parser.add_argument('--root_path', + help='output data folder', + default=default.root_path, + type=str) + parser.add_argument('--dataset_path', + help='dataset path', + default=default.dataset_path, + type=str) + # testing + parser.add_argument('--prefix', + help='model to test with', + default=default.rpn_prefix, + type=str) + parser.add_argument('--epoch', + help='model to test with', + default=default.rpn_epoch, + type=int) + # rpn + parser.add_argument('--gpu', + help='GPU device to test with', + default=0, + type=int) + parser.add_argument('--vis', + help='turn on visualization', + action='store_true') + parser.add_argument('--thresh', + help='rpn proposal threshold', + default=0, + type=float) + 
parser.add_argument('--shuffle', + help='shuffle data on visualization', + action='store_true') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + logger.info('Called with argument: %s' % args) + ctx = mx.gpu(args.gpu) + test_rpn(args.network, args.dataset, args.image_set, args.root_path, + args.dataset_path, ctx, args.prefix, args.epoch, args.vis, + args.shuffle, args.thresh) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/retinaface/rcnn/tools/train_rcnn.py b/insightface/detection/retinaface/rcnn/tools/train_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..167a3b0b1f3678fdb332c132f1d423a8bc5270ac --- /dev/null +++ b/insightface/detection/retinaface/rcnn/tools/train_rcnn.py @@ -0,0 +1,286 @@ +import argparse +import pprint +import mxnet as mx + +from ..logger import logger +from ..config import config, default, generate_config +from ..symbol import * +from ..core import callback, metric +from ..core.loader import ROIIter +from ..core.module import MutableModule +from ..processing.bbox_regression import add_bbox_regression_targets +from ..utils.load_data import load_proposal_roidb, merge_roidb, filter_roidb +from ..utils.load_model import load_param + + +def train_rcnn(network, dataset, image_set, root_path, dataset_path, frequent, + kvstore, work_load_list, no_flip, no_shuffle, resume, ctx, + pretrained, epoch, prefix, begin_epoch, end_epoch, train_shared, + lr, lr_step, proposal): + # set up config + config.TRAIN.BATCH_IMAGES = 2 + config.TRAIN.BATCH_ROIS = 128 + if proposal == 'ss': + config.TRAIN.BG_THRESH_LO = 0.1 # reproduce Fast R-CNN + + # load symbol + sym = eval('get_' + network + '_rcnn')(num_classes=config.NUM_CLASSES) + + # setup multi-gpu + batch_size = len(ctx) + input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size + + # print config + logger.info(pprint.pformat(config)) + + # load dataset and prepare imdb for training + image_sets = [iset for iset in 
image_set.split('+')] + roidbs = [ + load_proposal_roidb(dataset, + image_set, + root_path, + dataset_path, + proposal=proposal, + append_gt=True, + flip=not no_flip) for image_set in image_sets + ] + roidb = merge_roidb(roidbs) + roidb = filter_roidb(roidb) + means, stds = add_bbox_regression_targets(roidb) + + # load training data + train_data = ROIIter(roidb, + batch_size=input_batch_size, + shuffle=not no_shuffle, + ctx=ctx, + work_load_list=work_load_list, + aspect_grouping=config.TRAIN.ASPECT_GROUPING) + + # infer max shape + max_data_shape = [('data', (input_batch_size, 3, + max([v[0] for v in config.SCALES]), + max([v[1] for v in config.SCALES])))] + logger.info('providing maximum shape %s' % max_data_shape) + + # infer shape + data_shape_dict = dict(train_data.provide_data + train_data.provide_label) + arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + out_shape_dict = dict(zip(sym.list_outputs(), out_shape)) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + logger.info('output shape %s' % pprint.pformat(out_shape_dict)) + + # load and initialize params + if resume: + arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) + else: + arg_params, aux_params = load_param(pretrained, epoch, convert=True) + arg_params['cls_score_weight'] = mx.random.normal( + 0, 0.01, shape=arg_shape_dict['cls_score_weight']) + arg_params['cls_score_bias'] = mx.nd.zeros( + shape=arg_shape_dict['cls_score_bias']) + arg_params['bbox_pred_weight'] = mx.random.normal( + 0, 0.001, shape=arg_shape_dict['bbox_pred_weight']) + arg_params['bbox_pred_bias'] = mx.nd.zeros( + shape=arg_shape_dict['bbox_pred_bias']) + + # check parameter shapes + for k in sym.list_arguments(): + if k in data_shape_dict: + continue + assert k in arg_params, k + ' not initialized' + assert arg_params[k].shape == arg_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + 
str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + for k in sym.list_auxiliary_states(): + assert k in aux_params, k + ' not initialized' + assert aux_params[k].shape == aux_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + # prepare training + # create solver + data_names = [k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + if train_shared: + fixed_param_prefix = config.FIXED_PARAMS_SHARED + else: + fixed_param_prefix = config.FIXED_PARAMS + mod = MutableModule(sym, + data_names=data_names, + label_names=label_names, + logger=logger, + context=ctx, + work_load_list=work_load_list, + max_data_shapes=max_data_shape, + fixed_param_prefix=fixed_param_prefix) + + # decide training params + # metric + eval_metric = metric.RCNNAccMetric() + cls_metric = metric.RCNNLogLossMetric() + bbox_metric = metric.RCNNL1LossMetric() + eval_metrics = mx.metric.CompositeEvalMetric() + for child_metric in [eval_metric, cls_metric, bbox_metric]: + eval_metrics.add(child_metric) + # callback + batch_end_callback = mx.callback.Speedometer(train_data.batch_size, + frequent=frequent, + auto_reset=False) + epoch_end_callback = callback.do_checkpoint(prefix, means, stds) + # decide learning rate + base_lr = lr + lr_factor = 0.1 + lr_epoch = [int(epoch) for epoch in lr_step.split(',')] + lr_epoch_diff = [ + epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch + ] + lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff))) + lr_iters = [ + int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff + ] + logger.info('lr %f lr_epoch_diff %s lr_iters %s' % + (lr, lr_epoch_diff, lr_iters)) + lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) + # optimizer + optimizer_params = { + 'momentum': 0.9, + 'wd': 0.0005, + 'learning_rate': lr, + 'lr_scheduler': lr_scheduler, + 'rescale_grad': (1.0 / batch_size), + 
'clip_gradient': 5 + } + + # train + mod.fit(train_data, + eval_metric=eval_metrics, + epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, + kvstore=kvstore, + optimizer='sgd', + optimizer_params=optimizer_params, + arg_params=arg_params, + aux_params=aux_params, + begin_epoch=begin_epoch, + num_epoch=end_epoch) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a Fast R-CNN Network') + # general + parser.add_argument('--network', + help='network name', + default=default.network, + type=str) + parser.add_argument('--dataset', + help='dataset name', + default=default.dataset, + type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', + help='image_set name', + default=default.image_set, + type=str) + parser.add_argument('--root_path', + help='output data folder', + default=default.root_path, + type=str) + parser.add_argument('--dataset_path', + help='dataset path', + default=default.dataset_path, + type=str) + # training + parser.add_argument('--frequent', + help='frequency of logging', + default=default.frequent, + type=int) + parser.add_argument('--kvstore', + help='the kv-store type', + default=default.kvstore, + type=str) + parser.add_argument('--work_load_list', + help='work load for different devices', + default=None, + type=list) + parser.add_argument('--no_flip', + help='disable flip images', + action='store_true') + parser.add_argument('--no_shuffle', + help='disable random shuffle', + action='store_true') + parser.add_argument('--resume', + help='continue training', + action='store_true') + # rcnn + parser.add_argument('--gpus', + help='GPU device to train with', + default='0', + type=str) + parser.add_argument('--pretrained', + help='pretrained model prefix', + default=default.pretrained, + type=str) + parser.add_argument('--pretrained_epoch', + help='pretrained model epoch', + default=default.pretrained_epoch, + type=int) + 
parser.add_argument('--prefix', + help='new model prefix', + default=default.rcnn_prefix, + type=str) + parser.add_argument('--begin_epoch', + help='begin epoch of training', + default=0, + type=int) + parser.add_argument('--end_epoch', + help='end epoch of training', + default=default.rcnn_epoch, + type=int) + parser.add_argument('--lr', + help='base learning rate', + default=default.rcnn_lr, + type=float) + parser.add_argument('--lr_step', + help='learning rate steps (in epoch)', + default=default.rcnn_lr_step, + type=str) + parser.add_argument('--train_shared', + help='second round train shared params', + action='store_true') + parser.add_argument('--proposal', + help='can be ss for selective search or rpn', + default='rpn', + type=str) + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + logger.info('Called with argument: %s' % args) + ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] + train_rcnn(args.network, + args.dataset, + args.image_set, + args.root_path, + args.dataset_path, + args.frequent, + args.kvstore, + args.work_load_list, + args.no_flip, + args.no_shuffle, + args.resume, + ctx, + args.pretrained, + args.pretrained_epoch, + args.prefix, + args.begin_epoch, + args.end_epoch, + train_shared=args.train_shared, + lr=args.lr, + lr_step=args.lr_step, + proposal=args.proposal) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/retinaface/rcnn/tools/train_rpn.py b/insightface/detection/retinaface/rcnn/tools/train_rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..9d600809341fad4f859a5b703cbdafaaac0df2f6 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/tools/train_rpn.py @@ -0,0 +1,300 @@ +import argparse +import logging +import pprint +import mxnet as mx + +from ..config import config, default, generate_config +from ..symbol import * +from ..core import callback, metric +from ..core.loader import AnchorLoaderFPN +from ..core.module import MutableModule +from 
..utils.load_data import load_gt_roidb, merge_roidb, filter_roidb +from ..utils.load_model import load_param + + +def train_rpn(network, dataset, image_set, root_path, dataset_path, frequent, + kvstore, work_load_list, no_flip, no_shuffle, resume, ctx, + pretrained, epoch, prefix, begin_epoch, end_epoch, train_shared, + lr, lr_step): + # set up logger + logging.basicConfig() + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + # setup config + assert config.TRAIN.BATCH_IMAGES == 1 + + # load symbol + sym = eval('get_' + network + '_rpn')() + feat_sym = [] + for stride in config.RPN_FEAT_STRIDE: + feat_sym.append(sym.get_internals()['rpn_cls_score_stride%s_output' % + stride]) + + # setup multi-gpu + batch_size = len(ctx) + input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size + + # print config + pprint.pprint(config) + + # load dataset and prepare imdb for training + image_sets = [iset for iset in image_set.split('+')] + roidbs = [ + load_gt_roidb(dataset, + image_set, + root_path, + dataset_path, + flip=not no_flip) for image_set in image_sets + ] + roidb = merge_roidb(roidbs) + roidb = filter_roidb(roidb) + + # load training data + #train_data = AnchorLoaderFPN(feat_sym, roidb, batch_size=input_batch_size, shuffle=not no_shuffle, + # ctx=ctx, work_load_list=work_load_list, + # feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES, + # anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING, + # allowed_border=9999) + train_data = AnchorLoaderFPN(feat_sym, + roidb, + batch_size=input_batch_size, + shuffle=not no_shuffle, + ctx=ctx, + work_load_list=work_load_list) + + # infer max shape + max_data_shape = [('data', (input_batch_size, 3, + max([v[0] for v in config.SCALES]), + max([v[1] for v in config.SCALES])))] + max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) + print 'providing maximum shape', max_data_shape, max_label_shape + + # infer shape + data_shape_dict = 
dict(train_data.provide_data + train_data.provide_label) + arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + out_shape_dict = zip(sym.list_outputs(), out_shape) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + print 'output shape' + pprint.pprint(out_shape_dict) + + # load and initialize params + if resume: + arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) + else: + arg_params, aux_params = load_param(pretrained, epoch, convert=True) + init = mx.init.Xavier(factor_type="in", + rnd_type='gaussian', + magnitude=2) + init_internal = mx.init.Normal(sigma=0.01) + for k in sym.list_arguments(): + if k in data_shape_dict: + continue + if k not in arg_params: + print 'init', k + arg_params[k] = mx.nd.zeros(shape=arg_shape_dict[k]) + if not k.endswith('bias'): + init_internal(k, arg_params[k]) + + for k in sym.list_auxiliary_states(): + if k not in aux_params: + print 'init', k + aux_params[k] = mx.nd.zeros(shape=aux_shape_dict[k]) + init(k, aux_params[k]) + + # check parameter shapes + for k in sym.list_arguments(): + if k in data_shape_dict: + continue + assert k in arg_params, k + ' not initialized' + assert arg_params[k].shape == arg_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + for k in sym.list_auxiliary_states(): + assert k in aux_params, k + ' not initialized' + assert aux_params[k].shape == aux_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + # create solver + data_names = [k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + if train_shared: + fixed_param_prefix = config.FIXED_PARAMS_SHARED + else: + fixed_param_prefix = config.FIXED_PARAMS + mod = MutableModule(sym, + data_names=data_names, + label_names=label_names, + 
logger=logger, + context=ctx, + work_load_list=work_load_list, + max_data_shapes=max_data_shape, + max_label_shapes=max_label_shape, + fixed_param_prefix=fixed_param_prefix) + + # decide training params + # metric + eval_metric = metric.RPNAccMetric() + cls_metric = metric.RPNLogLossMetric() + bbox_metric = metric.RPNL1LossMetric() + eval_metrics = mx.metric.CompositeEvalMetric() + for child_metric in [eval_metric, cls_metric, bbox_metric]: + eval_metrics.add(child_metric) + # callback + batch_end_callback = [] + batch_end_callback.append( + mx.callback.Speedometer(train_data.batch_size, frequent=frequent)) + epoch_end_callback = mx.callback.do_checkpoint(prefix) + # decide learning rate + base_lr = lr + lr_factor = 0.1 + lr_epoch = [int(epoch) for epoch in lr_step.split(',')] + lr_epoch_diff = [ + epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch + ] + lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff))) + lr_iters = [ + int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff + ] + print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters + lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) + # optimizer + optimizer_params = { + 'momentum': 0.9, + 'wd': 0.0001, + 'learning_rate': lr, + 'lr_scheduler': lr_scheduler, + 'rescale_grad': (1.0 / batch_size), + 'clip_gradient': 5 + } + + # train + mod.fit(train_data, + eval_metric=eval_metrics, + epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, + kvstore=kvstore, + optimizer='sgd', + optimizer_params=optimizer_params, + arg_params=arg_params, + aux_params=aux_params, + begin_epoch=begin_epoch, + num_epoch=end_epoch) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Train a Region Proposal Network') + # general + parser.add_argument('--network', + help='network name', + default=default.network, + type=str) + parser.add_argument('--dataset', + help='dataset name', + default=default.dataset, + type=str) 
+ args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', + help='image_set name', + default=default.image_set, + type=str) + parser.add_argument('--root_path', + help='output data folder', + default=default.root_path, + type=str) + parser.add_argument('--dataset_path', + help='dataset path', + default=default.dataset_path, + type=str) + # training + parser.add_argument('--frequent', + help='frequency of logging', + default=default.frequent, + type=int) + parser.add_argument('--kvstore', + help='the kv-store type', + default=default.kvstore, + type=str) + parser.add_argument('--work_load_list', + help='work load for different devices', + default=None, + type=list) + parser.add_argument('--no_flip', + help='disable flip images', + action='store_true') + parser.add_argument('--no_shuffle', + help='disable random shuffle', + action='store_true') + parser.add_argument('--resume', + help='continue training', + action='store_true') + # rpn + parser.add_argument('--gpus', + help='GPU device to train with', + default='0', + type=str) + parser.add_argument('--pretrained', + help='pretrained model prefix', + default=default.pretrained, + type=str) + parser.add_argument('--pretrained_epoch', + help='pretrained model epoch', + default=default.pretrained_epoch, + type=int) + parser.add_argument('--prefix', + help='new model prefix', + default=default.rpn_prefix, + type=str) + parser.add_argument('--begin_epoch', + help='begin epoch of training', + default=0, + type=int) + parser.add_argument('--end_epoch', + help='end epoch of training', + default=default.rpn_epoch, + type=int) + parser.add_argument('--lr', + help='base learning rate', + default=default.rpn_lr, + type=float) + parser.add_argument('--lr_step', + help='learning rate steps (in epoch)', + default=default.rpn_lr_step, + type=str) + parser.add_argument('--train_shared', + help='second round train shared params', + action='store_true') + args = 
parser.parse_args() + return args + + +def main(): + args = parse_args() + print 'Called with argument:', args + ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] + train_rpn(args.network, + args.dataset, + args.image_set, + args.root_path, + args.dataset_path, + args.frequent, + args.kvstore, + args.work_load_list, + args.no_flip, + args.no_shuffle, + args.resume, + ctx, + args.pretrained, + args.pretrained_epoch, + args.prefix, + args.begin_epoch, + args.end_epoch, + train_shared=args.train_shared, + lr=args.lr, + lr_step=args.lr_step) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/retinaface/rcnn/utils/__init__.py b/insightface/detection/retinaface/rcnn/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/detection/retinaface/rcnn/utils/combine_model.py b/insightface/detection/retinaface/rcnn/utils/combine_model.py new file mode 100644 index 0000000000000000000000000000000000000000..824efefdd1fcfc437d716b1d6b3747ba5b7feca1 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/utils/combine_model.py @@ -0,0 +1,22 @@ +from .load_model import load_checkpoint +from .save_model import save_checkpoint + + +def combine_model(prefix1, epoch1, prefix2, epoch2, prefix_out, epoch_out): + args1, auxs1 = load_checkpoint(prefix1, epoch1) + args2, auxs2 = load_checkpoint(prefix2, epoch2) + arg_names = args1.keys() + args2.keys() + aux_names = auxs1.keys() + auxs2.keys() + args = dict() + for arg in arg_names: + if arg in args1: + args[arg] = args1[arg] + else: + args[arg] = args2[arg] + auxs = dict() + for aux in aux_names: + if aux in auxs1: + auxs[aux] = auxs1[aux] + else: + auxs[aux] = auxs2[aux] + save_checkpoint(prefix_out, epoch_out, args, auxs) diff --git a/insightface/detection/retinaface/rcnn/utils/load_data.py b/insightface/detection/retinaface/rcnn/utils/load_data.py new file mode 100644 index 
# insightface/detection/retinaface/rcnn/utils/load_data.py
# Helpers to load, merge and filter region-of-interest databases (roidbs)
# used when training the RetinaFace detector.
import numpy as np
from ..logger import logger
from ..config import config
from ..dataset import *


def load_gt_roidb(dataset_name,
                  image_set_name,
                  root_path,
                  dataset_path,
                  flip=False):
    """Load the ground-truth roidb for one dataset/image-set.

    :param dataset_name: name of a dataset class exported by ``..dataset``
        (resolved via ``eval``)
    :param image_set_name: image set identifier passed to the dataset class
    :param root_path: output data folder
    :param dataset_path: dataset folder
    :param flip: when True, also append horizontally flipped copies of images
    :return: list of roidb entry dicts
    """
    # NOTE(review): eval() resolves a class name from ..dataset; this assumes
    # dataset_name comes from trusted internal config, never from user input.
    imdb = eval(dataset_name)(image_set_name, root_path, dataset_path)
    roidb = imdb.gt_roidb()
    # Use the package logger (consistent with filter_roidb below) instead of
    # a bare print so output respects the configured logging setup.
    logger.info('roidb size %d', len(roidb))
    if flip:
        roidb = imdb.append_flipped_images(roidb)
        logger.info('flipped roidb size %d', len(roidb))
    return roidb


def load_proposal_roidb(dataset_name,
                        image_set_name,
                        root_path,
                        dataset_path,
                        proposal='rpn',
                        append_gt=True,
                        flip=False):
    """Load a proposal roidb (ground truth appended when training).

    :param dataset_name: name of a dataset class exported by ``..dataset``
    :param image_set_name: image set identifier
    :param root_path: output data folder
    :param dataset_path: dataset folder
    :param proposal: proposal source, e.g. 'rpn' or 'ss'; selects the
        ``imdb.<proposal>_roidb`` method
    :param append_gt: append ground-truth boxes to the proposals
    :param flip: when True, also append horizontally flipped copies of images
    :return: list of roidb entry dicts
    """
    imdb = eval(dataset_name)(image_set_name, root_path, dataset_path)
    gt_roidb = imdb.gt_roidb()
    # Dispatch to imdb.rpn_roidb / imdb.ss_roidb etc. by proposal name.
    roidb = eval('imdb.' + proposal + '_roidb')(gt_roidb, append_gt)
    if flip:
        roidb = imdb.append_flipped_images(roidb)
    return roidb


def merge_roidb(roidbs):
    """Concatenate several roidbs (lists of entries) into one list.

    :param roidbs: non-empty sequence of roidb lists
    :return: a new list containing all entries in order
    """
    # Copy the first roidb so the caller's list is not mutated in place
    # (the original implementation extended roidbs[0] directly).
    roidb = list(roidbs[0])
    for r in roidbs[1:]:
        roidb.extend(r)
    return roidb


def filter_roidb(roidb):
    """Remove roidb entries without any usable (fg or bg) rois.

    :param roidb: list of roidb entry dicts, each with a 'max_overlaps'
        numpy array of per-roi overlap values
    :return: filtered list of entries
    """
    def is_valid(entry):
        """Valid images have at least 1 foreground or background roi."""
        overlaps = entry['max_overlaps']
        fg_inds = np.where(overlaps >= config.TRAIN.FG_THRESH)[0]
        bg_inds = np.where((overlaps < config.TRAIN.BG_THRESH_HI)
                           & (overlaps >= config.TRAIN.BG_THRESH_LO))[0]
        valid = len(fg_inds) > 0 or len(bg_inds) > 0
        #valid = len(fg_inds) > 0
        return valid

    num = len(roidb)
    filtered_roidb = [entry for entry in roidb if is_valid(entry)]
    num_after = len(filtered_roidb)
    logger.info('load data: filtered %d roidb entries: %d -> %d' %
                (num - num_after, num, num_after))

    return filtered_roidb
a/insightface/detection/retinaface/rcnn/utils/load_model.py b/insightface/detection/retinaface/rcnn/utils/load_model.py new file mode 100644 index 0000000000000000000000000000000000000000..6f8354869b666f3d60660aaa6d171227df0e32a1 --- /dev/null +++ b/insightface/detection/retinaface/rcnn/utils/load_model.py @@ -0,0 +1,59 @@ +import mxnet as mx + + +def load_checkpoint(prefix, epoch): + """ + Load model checkpoint from file. + :param prefix: Prefix of model name. + :param epoch: Epoch number of model we would like to load. + :return: (arg_params, aux_params) + arg_params : dict of str to NDArray + Model parameter, dict of name to NDArray of net's weights. + aux_params : dict of str to NDArray + Model parameter, dict of name to NDArray of net's auxiliary states. + """ + save_dict = mx.nd.load('%s-%04d.params' % (prefix, epoch)) + arg_params = {} + aux_params = {} + for k, v in save_dict.items(): + tp, name = k.split(':', 1) + if tp == 'arg': + arg_params[name] = v + if tp == 'aux': + aux_params[name] = v + return arg_params, aux_params + + +def convert_context(params, ctx): + """ + :param params: dict of str to NDArray + :param ctx: the context to convert to + :return: dict of str of NDArray with context ctx + """ + new_params = dict() + for k, v in params.items(): + new_params[k] = v.as_in_context(ctx) + return new_params + + +def load_param(prefix, epoch, convert=False, ctx=None, process=False): + """ + wrapper for load checkpoint + :param prefix: Prefix of model name. + :param epoch: Epoch number of model we would like to load. + :param convert: reference model should be converted to GPU NDArray first + :param ctx: if convert then ctx must be designated. 
+ :param process: model should drop any test + :return: (arg_params, aux_params) + """ + arg_params, aux_params = load_checkpoint(prefix, epoch) + if convert: + if ctx is None: + ctx = mx.cpu() + arg_params = convert_context(arg_params, ctx) + aux_params = convert_context(aux_params, ctx) + if process: + tests = [k for k in arg_params.keys() if '_test' in k] + for test in tests: + arg_params[test.replace('_test', '')] = arg_params.pop(test) + return arg_params, aux_params diff --git a/insightface/detection/retinaface/rcnn/utils/save_model.py b/insightface/detection/retinaface/rcnn/utils/save_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9c595f4dba77478a369301e369c1916e5c453f8a --- /dev/null +++ b/insightface/detection/retinaface/rcnn/utils/save_model.py @@ -0,0 +1,18 @@ +import mxnet as mx + + +def save_checkpoint(prefix, epoch, arg_params, aux_params): + """Checkpoint the model data into file. + :param prefix: Prefix of model name. + :param epoch: The epoch number of the model. + :param arg_params: dict of str to NDArray + Model parameter, dict of name to NDArray of net's weights. + :param aux_params: dict of str to NDArray + Model parameter, dict of name to NDArray of net's auxiliary states. + :return: None + prefix-epoch.params will be saved for parameters. 
+ """ + save_dict = {('arg:%s' % k): v for k, v in arg_params.items()} + save_dict.update({('aux:%s' % k): v for k, v in aux_params.items()}) + param_name = '%s-%04d.params' % (prefix, epoch) + mx.nd.save(param_name, save_dict) diff --git a/insightface/detection/retinaface/retinaface.py b/insightface/detection/retinaface/retinaface.py new file mode 100644 index 0000000000000000000000000000000000000000..5b274e3da8b54f3bce949e74ffeb18f18f2db1ce --- /dev/null +++ b/insightface/detection/retinaface/retinaface.py @@ -0,0 +1,839 @@ +from __future__ import print_function +import sys +import os +import datetime +import time +import numpy as np +import mxnet as mx +from mxnet import ndarray as nd +import cv2 +#from rcnn import config +from rcnn.logger import logger +#from rcnn.processing.bbox_transform import nonlinear_pred, clip_boxes, landmark_pred +from rcnn.processing.bbox_transform import clip_boxes +from rcnn.processing.generate_anchor import generate_anchors_fpn, anchors_plane +from rcnn.processing.nms import gpu_nms_wrapper, cpu_nms_wrapper +from rcnn.processing.bbox_transform import bbox_overlaps + + +class RetinaFace: + def __init__(self, + prefix, + epoch, + ctx_id=0, + network='net3', + nms=0.4, + nocrop=False, + decay4=0.5, + vote=False): + self.ctx_id = ctx_id + self.network = network + self.decay4 = decay4 + self.nms_threshold = nms + self.vote = vote + self.nocrop = nocrop + self.debug = False + self.fpn_keys = [] + self.anchor_cfg = None + pixel_means = [0.0, 0.0, 0.0] + pixel_stds = [1.0, 1.0, 1.0] + pixel_scale = 1.0 + self.preprocess = False + _ratio = (1., ) + fmc = 3 + if network == 'ssh' or network == 'vgg': + pixel_means = [103.939, 116.779, 123.68] + self.preprocess = True + elif network == 'net3': + _ratio = (1., ) + elif network == 'net3a': + _ratio = (1., 1.5) + elif network == 'net6': #like pyramidbox or s3fd + fmc = 6 + elif network == 'net5': #retinaface + fmc = 5 + elif network == 'net5a': + fmc = 5 + _ratio = (1., 1.5) + elif network == 
'net4': + fmc = 4 + elif network == 'net4a': + fmc = 4 + _ratio = (1., 1.5) + elif network == 'x5': + fmc = 5 + pixel_means = [103.52, 116.28, 123.675] + pixel_stds = [57.375, 57.12, 58.395] + elif network == 'x3': + fmc = 3 + pixel_means = [103.52, 116.28, 123.675] + pixel_stds = [57.375, 57.12, 58.395] + elif network == 'x3a': + fmc = 3 + _ratio = (1., 1.5) + pixel_means = [103.52, 116.28, 123.675] + pixel_stds = [57.375, 57.12, 58.395] + else: + assert False, 'network setting error %s' % network + + if fmc == 3: + self._feat_stride_fpn = [32, 16, 8] + self.anchor_cfg = { + '32': { + 'SCALES': (32, 16), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '16': { + 'SCALES': (8, 4), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '8': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + } + elif fmc == 4: + self._feat_stride_fpn = [32, 16, 8, 4] + self.anchor_cfg = { + '32': { + 'SCALES': (32, 16), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '16': { + 'SCALES': (8, 4), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '8': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '4': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + } + elif fmc == 6: + self._feat_stride_fpn = [128, 64, 32, 16, 8, 4] + self.anchor_cfg = { + '128': { + 'SCALES': (32, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '64': { + 'SCALES': (16, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '32': { + 'SCALES': (8, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '16': { + 'SCALES': (4, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '8': { + 'SCALES': (2, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '4': { + 'SCALES': (1, ), + 'BASE_SIZE': 16, + 
'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + } + elif fmc == 5: + self._feat_stride_fpn = [64, 32, 16, 8, 4] + self.anchor_cfg = {} + _ass = 2.0**(1.0 / 3) + _basescale = 1.0 + for _stride in [4, 8, 16, 32, 64]: + key = str(_stride) + value = { + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + } + scales = [] + for _ in range(3): + scales.append(_basescale) + _basescale *= _ass + value['SCALES'] = tuple(scales) + self.anchor_cfg[key] = value + + print(self._feat_stride_fpn, self.anchor_cfg) + + for s in self._feat_stride_fpn: + self.fpn_keys.append('stride%s' % s) + + dense_anchor = False + #self._anchors_fpn = dict(zip(self.fpn_keys, generate_anchors_fpn(base_size=fpn_base_size, scales=self._scales, ratios=self._ratios))) + self._anchors_fpn = dict( + zip( + self.fpn_keys, + generate_anchors_fpn(dense_anchor=dense_anchor, + cfg=self.anchor_cfg))) + for k in self._anchors_fpn: + v = self._anchors_fpn[k].astype(np.float32) + self._anchors_fpn[k] = v + + self._num_anchors = dict( + zip(self.fpn_keys, + [anchors.shape[0] for anchors in self._anchors_fpn.values()])) + #self._bbox_pred = nonlinear_pred + #self._landmark_pred = landmark_pred + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + if self.ctx_id >= 0: + self.ctx = mx.gpu(self.ctx_id) + self.nms = gpu_nms_wrapper(self.nms_threshold, self.ctx_id) + else: + self.ctx = mx.cpu() + self.nms = cpu_nms_wrapper(self.nms_threshold) + self.pixel_means = np.array(pixel_means, dtype=np.float32) + self.pixel_stds = np.array(pixel_stds, dtype=np.float32) + self.pixel_scale = float(pixel_scale) + print('means', self.pixel_means) + self.use_landmarks = False + if len(sym) // len(self._feat_stride_fpn) >= 3: + self.use_landmarks = True + print('use_landmarks', self.use_landmarks) + self.cascade = 0 + if float(len(sym)) // len(self._feat_stride_fpn) > 3.0: + self.cascade = 1 + print('cascade', self.cascade) + #self.bbox_stds = [0.1, 0.1, 0.2, 0.2] + #self.landmark_std = 0.1 + 
self.bbox_stds = [1.0, 1.0, 1.0, 1.0] + self.landmark_std = 1.0 + + if self.debug: + c = len(sym) // len(self._feat_stride_fpn) + sym = sym[(c * 0):] + self._feat_stride_fpn = [32, 16, 8] + print('sym size:', len(sym)) + + image_size = (640, 640) + self.model = mx.mod.Module(symbol=sym, + context=self.ctx, + label_names=None) + self.model.bind(data_shapes=[('data', (1, 3, image_size[0], + image_size[1]))], + for_training=False) + self.model.set_params(arg_params, aux_params) + + def get_input(self, img): + im = img.astype(np.float32) + im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1])) + for i in range(3): + im_tensor[ + 0, + i, :, :] = (im[:, :, 2 - i] / self.pixel_scale - + self.pixel_means[2 - i]) / self.pixel_stds[2 - i] + #if self.debug: + # timeb = datetime.datetime.now() + # diff = timeb - timea + # print('X2 uses', diff.total_seconds(), 'seconds') + data = nd.array(im_tensor) + return data + + def detect(self, img, threshold=0.5, scales=[1.0], do_flip=False): + #print('in_detect', threshold, scales, do_flip, do_nms) + proposals_list = [] + scores_list = [] + landmarks_list = [] + strides_list = [] + timea = datetime.datetime.now() + flips = [0] + if do_flip: + flips = [0, 1] + + imgs = [img] + if isinstance(img, list): + imgs = img + for img in imgs: + for im_scale in scales: + for flip in flips: + if im_scale != 1.0: + im = cv2.resize(img, + None, + None, + fx=im_scale, + fy=im_scale, + interpolation=cv2.INTER_LINEAR) + else: + im = img.copy() + if flip: + im = im[:, ::-1, :] + if self.nocrop: + if im.shape[0] % 32 == 0: + h = im.shape[0] + else: + h = (im.shape[0] // 32 + 1) * 32 + if im.shape[1] % 32 == 0: + w = im.shape[1] + else: + w = (im.shape[1] // 32 + 1) * 32 + _im = np.zeros((h, w, 3), dtype=np.float32) + _im[0:im.shape[0], 0:im.shape[1], :] = im + im = _im + else: + im = im.astype(np.float32) + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('X1 uses', diff.total_seconds(), 'seconds') + 
#self.model.bind(data_shapes=[('data', (1, 3, image_size[0], image_size[1]))], for_training=False) + #im_info = [im.shape[0], im.shape[1], im_scale] + im_info = [im.shape[0], im.shape[1]] + im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1])) + for i in range(3): + im_tensor[0, i, :, :] = ( + im[:, :, 2 - i] / self.pixel_scale - + self.pixel_means[2 - i]) / self.pixel_stds[2 - i] + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('X2 uses', diff.total_seconds(), 'seconds') + data = nd.array(im_tensor) + db = mx.io.DataBatch(data=(data, ), + provide_data=[('data', data.shape)]) + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('X3 uses', diff.total_seconds(), 'seconds') + self.model.forward(db, is_train=False) + net_out = self.model.get_outputs() + #post_nms_topN = self._rpn_post_nms_top_n + #min_size_dict = self._rpn_min_size_fpn + + sym_idx = 0 + + for _idx, s in enumerate(self._feat_stride_fpn): + #if len(scales)>1 and s==32 and im_scale==scales[-1]: + # continue + _key = 'stride%s' % s + stride = int(s) + is_cascade = False + if self.cascade: + is_cascade = True + #if self.vote and stride==4 and len(scales)>2 and (im_scale==scales[0]): + # continue + #print('getting', im_scale, stride, idx, len(net_out), data.shape, file=sys.stderr) + scores = net_out[sym_idx].asnumpy() + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('A uses', diff.total_seconds(), 'seconds') + #print(scores.shape) + #print('scores',stride, scores.shape, file=sys.stderr) + scores = scores[:, self._num_anchors['stride%s' % + s]:, :, :] + + bbox_deltas = net_out[sym_idx + 1].asnumpy() + + #if DEBUG: + # print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) + # print 'scale: {}'.format(im_info[2]) + + #_height, _width = int(im_info[0] / stride), int(im_info[1] / stride) + height, width = bbox_deltas.shape[ + 2], bbox_deltas.shape[3] + + A = self._num_anchors['stride%s' % s] + K = height * width + 
anchors_fpn = self._anchors_fpn['stride%s' % s] + anchors = anchors_plane(height, width, stride, + anchors_fpn) + #print((height, width), (_height, _width), anchors.shape, bbox_deltas.shape, scores.shape, file=sys.stderr) + anchors = anchors.reshape((K * A, 4)) + #print('num_anchors', self._num_anchors['stride%s'%s], file=sys.stderr) + #print('HW', (height, width), file=sys.stderr) + #print('anchors_fpn', anchors_fpn.shape, file=sys.stderr) + #print('anchors', anchors.shape, file=sys.stderr) + #print('bbox_deltas', bbox_deltas.shape, file=sys.stderr) + #print('scores', scores.shape, file=sys.stderr) + + #scores = self._clip_pad(scores, (height, width)) + scores = scores.transpose((0, 2, 3, 1)).reshape( + (-1, 1)) + + #print('pre', bbox_deltas.shape, height, width) + #bbox_deltas = self._clip_pad(bbox_deltas, (height, width)) + #print('after', bbox_deltas.shape, height, width) + bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)) + bbox_pred_len = bbox_deltas.shape[3] // A + #print(bbox_deltas.shape) + bbox_deltas = bbox_deltas.reshape((-1, bbox_pred_len)) + bbox_deltas[:, + 0::4] = bbox_deltas[:, 0:: + 4] * self.bbox_stds[0] + bbox_deltas[:, + 1::4] = bbox_deltas[:, 1:: + 4] * self.bbox_stds[1] + bbox_deltas[:, + 2::4] = bbox_deltas[:, 2:: + 4] * self.bbox_stds[2] + bbox_deltas[:, + 3::4] = bbox_deltas[:, 3:: + 4] * self.bbox_stds[3] + proposals = self.bbox_pred(anchors, bbox_deltas) + + #print(anchors.shape, bbox_deltas.shape, A, K, file=sys.stderr) + if is_cascade: + cascade_sym_num = 0 + cls_cascade = False + bbox_cascade = False + __idx = [3, 4] + if not self.use_landmarks: + __idx = [2, 3] + for diff_idx in __idx: + if sym_idx + diff_idx >= len(net_out): + break + body = net_out[sym_idx + diff_idx].asnumpy() + if body.shape[1] // A == 2: #cls branch + if cls_cascade or bbox_cascade: + break + else: + cascade_scores = body[:, self. 
+ _num_anchors[ + 'stride%s' % + s]:, :, :] + cascade_scores = cascade_scores.transpose( + (0, 2, 3, 1)).reshape((-1, 1)) + #scores = (scores+cascade_scores)/2.0 + scores = cascade_scores #TODO? + cascade_sym_num += 1 + cls_cascade = True + #print('find cascade cls at stride', stride) + elif body.shape[1] // A == 4: #bbox branch + cascade_deltas = body.transpose( + (0, 2, 3, 1)).reshape( + (-1, bbox_pred_len)) + cascade_deltas[:, 0:: + 4] = cascade_deltas[:, 0:: + 4] * self.bbox_stds[ + 0] + cascade_deltas[:, 1:: + 4] = cascade_deltas[:, 1:: + 4] * self.bbox_stds[ + 1] + cascade_deltas[:, 2:: + 4] = cascade_deltas[:, 2:: + 4] * self.bbox_stds[ + 2] + cascade_deltas[:, 3:: + 4] = cascade_deltas[:, 3:: + 4] * self.bbox_stds[ + 3] + proposals = self.bbox_pred( + proposals, cascade_deltas) + cascade_sym_num += 1 + bbox_cascade = True + #print('find cascade bbox at stride', stride) + + proposals = clip_boxes(proposals, im_info[:2]) + + #if self.vote: + # if im_scale>1.0: + # keep = self._filter_boxes2(proposals, 160*im_scale, -1) + # else: + # keep = self._filter_boxes2(proposals, -1, 100*im_scale) + # if stride==4: + # keep = self._filter_boxes2(proposals, 12*im_scale, -1) + # proposals = proposals[keep, :] + # scores = scores[keep] + + #keep = self._filter_boxes(proposals, min_size_dict['stride%s'%s] * im_info[2]) + #proposals = proposals[keep, :] + #scores = scores[keep] + #print('333', proposals.shape) + if stride == 4 and self.decay4 < 1.0: + scores *= self.decay4 + + scores_ravel = scores.ravel() + #print('__shapes', proposals.shape, scores_ravel.shape) + #print('max score', np.max(scores_ravel)) + order = np.where(scores_ravel >= threshold)[0] + #_scores = scores_ravel[order] + #_order = _scores.argsort()[::-1] + #order = order[_order] + proposals = proposals[order, :] + scores = scores[order] + if flip: + oldx1 = proposals[:, 0].copy() + oldx2 = proposals[:, 2].copy() + proposals[:, 0] = im.shape[1] - oldx2 - 1 + proposals[:, 2] = im.shape[1] - oldx1 - 1 + + 
proposals[:, 0:4] /= im_scale + + proposals_list.append(proposals) + scores_list.append(scores) + if self.nms_threshold < 0.0: + _strides = np.empty(shape=(scores.shape), + dtype=np.float32) + _strides.fill(stride) + strides_list.append(_strides) + + if not self.vote and self.use_landmarks: + landmark_deltas = net_out[sym_idx + 2].asnumpy() + #landmark_deltas = self._clip_pad(landmark_deltas, (height, width)) + landmark_pred_len = landmark_deltas.shape[1] // A + landmark_deltas = landmark_deltas.transpose( + (0, 2, 3, 1)).reshape( + (-1, 5, landmark_pred_len // 5)) + landmark_deltas *= self.landmark_std + #print(landmark_deltas.shape, landmark_deltas) + landmarks = self.landmark_pred( + anchors, landmark_deltas) + landmarks = landmarks[order, :] + + if flip: + landmarks[:, :, + 0] = im.shape[1] - landmarks[:, :, + 0] - 1 + #for a in range(5): + # oldx1 = landmarks[:, a].copy() + # landmarks[:,a] = im.shape[1] - oldx1 - 1 + order = [1, 0, 2, 4, 3] + flandmarks = landmarks.copy() + for idx, a in enumerate(order): + flandmarks[:, idx, :] = landmarks[:, a, :] + #flandmarks[:, idx*2] = landmarks[:,a*2] + #flandmarks[:, idx*2+1] = landmarks[:,a*2+1] + landmarks = flandmarks + landmarks[:, :, 0:2] /= im_scale + #landmarks /= im_scale + #landmarks = landmarks.reshape( (-1, landmark_pred_len) ) + landmarks_list.append(landmarks) + #proposals = np.hstack((proposals, landmarks)) + if self.use_landmarks: + sym_idx += 3 + else: + sym_idx += 2 + if is_cascade: + sym_idx += cascade_sym_num + + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('B uses', diff.total_seconds(), 'seconds') + proposals = np.vstack(proposals_list) + landmarks = None + if proposals.shape[0] == 0: + if self.use_landmarks: + landmarks = np.zeros((0, 5, 2)) + if self.nms_threshold < 0.0: + return np.zeros((0, 6)), landmarks + else: + return np.zeros((0, 5)), landmarks + scores = np.vstack(scores_list) + #print('shapes', proposals.shape, scores.shape) + scores_ravel = 
scores.ravel() + order = scores_ravel.argsort()[::-1] + #if config.TEST.SCORE_THRESH>0.0: + # _count = np.sum(scores_ravel>config.TEST.SCORE_THRESH) + # order = order[:_count] + proposals = proposals[order, :] + scores = scores[order] + if self.nms_threshold < 0.0: + strides = np.vstack(strides_list) + strides = strides[order] + if not self.vote and self.use_landmarks: + landmarks = np.vstack(landmarks_list) + landmarks = landmarks[order].astype(np.float32, copy=False) + + if self.nms_threshold > 0.0: + pre_det = np.hstack((proposals[:, 0:4], scores)).astype(np.float32, + copy=False) + if not self.vote: + keep = self.nms(pre_det) + det = np.hstack((pre_det, proposals[:, 4:])) + det = det[keep, :] + if self.use_landmarks: + landmarks = landmarks[keep] + else: + det = np.hstack((pre_det, proposals[:, 4:])) + det = self.bbox_vote(det) + elif self.nms_threshold < 0.0: + det = np.hstack( + (proposals[:, 0:4], scores, strides)).astype(np.float32, + copy=False) + else: + det = np.hstack((proposals[:, 0:4], scores)).astype(np.float32, + copy=False) + + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('C uses', diff.total_seconds(), 'seconds') + return det, landmarks + + def detect_center(self, img, threshold=0.5, scales=[1.0], do_flip=False): + det, landmarks = self.detect(img, threshold, scales, do_flip) + if det.shape[0] == 0: + return None, None + bindex = 0 + if det.shape[0] > 1: + img_size = np.asarray(img.shape)[0:2] + bounding_box_size = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img_size / 2 + offsets = np.vstack([(det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0]]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + bindex = np.argmax(bounding_box_size - offset_dist_squared * + 2.0) # some extra weight on the centering + bbox = det[bindex, :] + landmark = landmarks[bindex, :, :] + return bbox, landmark + + @staticmethod + def check_large_pose(landmark, bbox): + 
assert landmark.shape == (5, 2) + assert len(bbox) == 4 + + def get_theta(base, x, y): + vx = x - base + vy = y - base + vx[1] *= -1 + vy[1] *= -1 + tx = np.arctan2(vx[1], vx[0]) + ty = np.arctan2(vy[1], vy[0]) + d = ty - tx + d = np.degrees(d) + #print(vx, tx, vy, ty, d) + #if d<-1.*math.pi: + # d+=2*math.pi + #elif d>math.pi: + # d-=2*math.pi + if d < -180.0: + d += 360. + elif d > 180.0: + d -= 360.0 + return d + + landmark = landmark.astype(np.float32) + + theta1 = get_theta(landmark[0], landmark[3], landmark[2]) + theta2 = get_theta(landmark[1], landmark[2], landmark[4]) + #print(va, vb, theta2) + theta3 = get_theta(landmark[0], landmark[2], landmark[1]) + theta4 = get_theta(landmark[1], landmark[0], landmark[2]) + theta5 = get_theta(landmark[3], landmark[4], landmark[2]) + theta6 = get_theta(landmark[4], landmark[2], landmark[3]) + theta7 = get_theta(landmark[3], landmark[2], landmark[0]) + theta8 = get_theta(landmark[4], landmark[1], landmark[2]) + #print(theta1, theta2, theta3, theta4, theta5, theta6, theta7, theta8) + left_score = 0.0 + right_score = 0.0 + up_score = 0.0 + down_score = 0.0 + if theta1 <= 0.0: + left_score = 10.0 + elif theta2 <= 0.0: + right_score = 10.0 + else: + left_score = theta2 / theta1 + right_score = theta1 / theta2 + if theta3 <= 10.0 or theta4 <= 10.0: + up_score = 10.0 + else: + up_score = max(theta1 / theta3, theta2 / theta4) + if theta5 <= 10.0 or theta6 <= 10.0: + down_score = 10.0 + else: + down_score = max(theta7 / theta5, theta8 / theta6) + mleft = (landmark[0][0] + landmark[3][0]) / 2 + mright = (landmark[1][0] + landmark[4][0]) / 2 + box_center = ((bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2) + ret = 0 + if left_score >= 3.0: + ret = 1 + if ret == 0 and left_score >= 2.0: + if mright <= box_center[0]: + ret = 1 + if ret == 0 and right_score >= 3.0: + ret = 2 + if ret == 0 and right_score >= 2.0: + if mleft >= box_center[0]: + ret = 2 + if ret == 0 and up_score >= 2.0: + ret = 3 + if ret == 0 and down_score >= 5.0: + 
ret = 4 + return ret, left_score, right_score, up_score, down_score + + @staticmethod + def _filter_boxes(boxes, min_size): + """ Remove all boxes with any side smaller than min_size """ + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + keep = np.where((ws >= min_size) & (hs >= min_size))[0] + return keep + + @staticmethod + def _filter_boxes2(boxes, max_size, min_size): + """ Remove all boxes with any side smaller than min_size """ + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + if max_size > 0: + keep = np.where(np.minimum(ws, hs) < max_size)[0] + elif min_size > 0: + keep = np.where(np.maximum(ws, hs) > min_size)[0] + return keep + + @staticmethod + def _clip_pad(tensor, pad_shape): + """ + Clip boxes of the pad area. + :param tensor: [n, c, H, W] + :param pad_shape: [h, w] + :return: [n, c, h, w] + """ + H, W = tensor.shape[2:] + h, w = pad_shape + + if h < H or w < W: + tensor = tensor[:, :, :h, :w].copy() + + return tensor + + @staticmethod + def bbox_pred(boxes, box_deltas): + """ + Transform the set of class-agnostic boxes into class-specific boxes + by applying the predicted offsets (box_deltas) + :param boxes: !important [N 4] + :param box_deltas: [N, 4 * num_classes] + :return: [N 4 * num_classes] + """ + if boxes.shape[0] == 0: + return np.zeros((0, box_deltas.shape[1])) + + boxes = boxes.astype(np.float, copy=False) + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) + ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) + + dx = box_deltas[:, 0:1] + dy = box_deltas[:, 1:2] + dw = box_deltas[:, 2:3] + dh = box_deltas[:, 3:4] + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(box_deltas.shape) + # x1 + pred_boxes[:, 0:1] = pred_ctr_x 
- 0.5 * (pred_w - 1.0) + # y1 + pred_boxes[:, 1:2] = pred_ctr_y - 0.5 * (pred_h - 1.0) + # x2 + pred_boxes[:, 2:3] = pred_ctr_x + 0.5 * (pred_w - 1.0) + # y2 + pred_boxes[:, 3:4] = pred_ctr_y + 0.5 * (pred_h - 1.0) + + if box_deltas.shape[1] > 4: + pred_boxes[:, 4:] = box_deltas[:, 4:] + + return pred_boxes + + @staticmethod + def landmark_pred(boxes, landmark_deltas): + if boxes.shape[0] == 0: + return np.zeros((0, landmark_deltas.shape[1])) + boxes = boxes.astype(np.float, copy=False) + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) + ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) + pred = landmark_deltas.copy() + for i in range(5): + pred[:, i, 0] = landmark_deltas[:, i, 0] * widths + ctr_x + pred[:, i, 1] = landmark_deltas[:, i, 1] * heights + ctr_y + return pred + #preds = [] + #for i in range(landmark_deltas.shape[1]): + # if i%2==0: + # pred = (landmark_deltas[:,i]*widths + ctr_x) + # else: + # pred = (landmark_deltas[:,i]*heights + ctr_y) + # preds.append(pred) + #preds = np.vstack(preds).transpose() + #return preds + + def bbox_vote(self, det): + #order = det[:, 4].ravel().argsort()[::-1] + #det = det[order, :] + if det.shape[0] == 0: + return np.zeros((0, 5)) + #dets = np.array([[10, 10, 20, 20, 0.002]]) + #det = np.empty(shape=[0, 5]) + dets = None + while det.shape[0] > 0: + if dets is not None and dets.shape[0] >= 750: + break + # IOU + area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1) + xx1 = np.maximum(det[0, 0], det[:, 0]) + yy1 = np.maximum(det[0, 1], det[:, 1]) + xx2 = np.minimum(det[0, 2], det[:, 2]) + yy2 = np.minimum(det[0, 3], det[:, 3]) + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + o = inter / (area[0] + area[:] - inter) + + # nms + merge_index = np.where(o >= self.nms_threshold)[0] + det_accu = det[merge_index, :] + det = np.delete(det, merge_index, 0) + if merge_index.shape[0] <= 1: + if det.shape[0] 
== 0: + try: + dets = np.row_stack((dets, det_accu)) + except: + dets = det_accu + continue + det_accu[:, + 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], + (1, 4)) + max_score = np.max(det_accu[:, 4]) + det_accu_sum = np.zeros((1, 5)) + det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], axis=0) / np.sum( + det_accu[:, -1:]) + det_accu_sum[:, 4] = max_score + if dets is None: + dets = det_accu_sum + else: + dets = np.row_stack((dets, det_accu_sum)) + dets = dets[0:750, :] + return dets diff --git a/insightface/detection/retinaface/test.py b/insightface/detection/retinaface/test.py new file mode 100644 index 0000000000000000000000000000000000000000..b88c82ba61bbc7744b3e137bd6a74c4f94b53442 --- /dev/null +++ b/insightface/detection/retinaface/test.py @@ -0,0 +1,63 @@ +import cv2 +import sys +import numpy as np +import datetime +import os +import glob +from retinaface import RetinaFace + +thresh = 0.8 +scales = [1024, 1980] + +count = 1 + +gpuid = 0 +detector = RetinaFace('./model/R50', 0, gpuid, 'net3') + +img = cv2.imread('t1.jpg') +print(img.shape) +im_shape = img.shape +target_size = scales[0] +max_size = scales[1] +im_size_min = np.min(im_shape[0:2]) +im_size_max = np.max(im_shape[0:2]) +#im_scale = 1.0 +#if im_size_min>target_size or im_size_max>max_size: +im_scale = float(target_size) / float(im_size_min) +# prevent bigger axis from being more than max_size: +if np.round(im_scale * im_size_max) > max_size: + im_scale = float(max_size) / float(im_size_max) + +print('im_scale', im_scale) + +scales = [im_scale] +flip = False + +for c in range(count): + faces, landmarks = detector.detect(img, + thresh, + scales=scales, + do_flip=flip) + print(c, faces.shape, landmarks.shape) + +if faces is not None: + print('find', faces.shape[0], 'faces') + for i in range(faces.shape[0]): + #print('score', faces[i][4]) + box = faces[i].astype(np.int) + #color = (255,0,0) + color = (0, 0, 255) + cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), color, 2) + if landmarks is 
not None: + landmark5 = landmarks[i].astype(np.int) + #print(landmark.shape) + for l in range(landmark5.shape[0]): + color = (0, 0, 255) + if l == 0 or l == 3: + color = (0, 255, 0) + cv2.circle(img, (landmark5[l][0], landmark5[l][1]), 1, color, + 2) + + filename = './detector_test.jpg' + print('writing', filename) + cv2.imwrite(filename, img) diff --git a/insightface/detection/retinaface/test_widerface.py b/insightface/detection/retinaface/test_widerface.py new file mode 100644 index 0000000000000000000000000000000000000000..78c2f83d90696b852880546ba8f8f68949ada9ac --- /dev/null +++ b/insightface/detection/retinaface/test_widerface.py @@ -0,0 +1,259 @@ +from __future__ import print_function + +import argparse +import sys +import os +import time +import numpy as np +import mxnet as mx +from mxnet import ndarray as nd +import cv2 +from rcnn.logger import logger +#from rcnn.config import config, default, generate_config +#from rcnn.tools.test_rcnn import test_rcnn +#from rcnn.tools.test_rpn import test_rpn +from rcnn.processing.bbox_transform import nonlinear_pred, clip_boxes, landmark_pred +from rcnn.processing.generate_anchor import generate_anchors_fpn, anchors_plane +from rcnn.processing.nms import gpu_nms_wrapper +from rcnn.processing.bbox_transform import bbox_overlaps +from rcnn.dataset import retinaface +from retinaface import RetinaFace + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Test widerface by retinaface detector') + # general + parser.add_argument('--network', + help='network name', + default='net3', + type=str) + parser.add_argument('--dataset', + help='dataset name', + default='retinaface', + type=str) + parser.add_argument('--image-set', + help='image_set name', + default='val', + type=str) + parser.add_argument('--root-path', + help='output data folder', + default='./data', + type=str) + parser.add_argument('--dataset-path', + help='dataset path', + default='./data/retinaface', + type=str) + parser.add_argument('--gpu', 
+ help='GPU device to test with', + default=0, + type=int) + # testing + parser.add_argument('--prefix', + help='model to test with', + default='', + type=str) + parser.add_argument('--epoch', + help='model to test with', + default=0, + type=int) + parser.add_argument('--output', + help='output folder', + default='./wout', + type=str) + parser.add_argument('--nocrop', help='', action='store_true') + parser.add_argument('--thresh', + help='valid detection threshold', + default=0.02, + type=float) + parser.add_argument('--mode', + help='test mode, 0 for fast, 1 for accurate', + default=1, + type=int) + #parser.add_argument('--pyramid', help='enable pyramid test', action='store_true') + #parser.add_argument('--bbox-vote', help='', action='store_true') + parser.add_argument('--part', help='', default=0, type=int) + parser.add_argument('--parts', help='', default=1, type=int) + args = parser.parse_args() + return args + + +detector = None +args = None +imgid = -1 + + +def get_boxes(roi, pyramid): + global imgid + im = cv2.imread(roi['image']) + do_flip = False + if not pyramid: + target_size = 1200 + max_size = 1600 + #do_flip = True + target_size = 1504 + max_size = 2000 + target_size = 1600 + max_size = 2150 + im_shape = im.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + im_scale = float(target_size) / float(im_size_min) + # prevent bigger axis from being more than max_size: + if np.round(im_scale * im_size_max) > max_size: + im_scale = float(max_size) / float(im_size_max) + scales = [im_scale] + else: + do_flip = True + #TEST_SCALES = [500, 800, 1200, 1600] + TEST_SCALES = [500, 800, 1100, 1400, 1700] + target_size = 800 + max_size = 1200 + im_shape = im.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + im_scale = float(target_size) / float(im_size_min) + # prevent bigger axis from being more than max_size: + if np.round(im_scale * im_size_max) > max_size: + im_scale = float(max_size) / 
float(im_size_max) + scales = [ + float(scale) / target_size * im_scale for scale in TEST_SCALES + ] + boxes, landmarks = detector.detect(im, + threshold=args.thresh, + scales=scales, + do_flip=do_flip) + #print(boxes.shape, landmarks.shape) + if imgid >= 0 and imgid < 100: + font = cv2.FONT_HERSHEY_SIMPLEX + for i in range(boxes.shape[0]): + box = boxes[i] + ibox = box[0:4].copy().astype(np.int) + cv2.rectangle(im, (ibox[0], ibox[1]), (ibox[2], ibox[3]), + (255, 0, 0), 2) + #print('box', ibox) + #if len(ibox)>5: + # for l in range(5): + # pp = (ibox[5+l*2], ibox[6+l*2]) + # cv2.circle(im, (pp[0], pp[1]), 1, (0, 0, 255), 1) + blur = box[5] + k = "%.3f" % blur + cv2.putText(im, k, (ibox[0] + 2, ibox[1] + 14), font, 0.6, + (0, 255, 0), 2) + #landmarks = box[6:21].reshape( (5,3) ) + if landmarks is not None: + for l in range(5): + color = (0, 255, 0) + landmark = landmarks[i][l] + pp = (int(landmark[0]), int(landmark[1])) + if landmark[2] - 0.5 < 0.0: + color = (0, 0, 255) + cv2.circle(im, (pp[0], pp[1]), 1, color, 2) + filename = './testimages/%d.jpg' % imgid + cv2.imwrite(filename, im) + print(filename, 'wrote') + imgid += 1 + + return boxes + + +def test(args): + print('test with', args) + global detector + output_folder = args.output + if not os.path.exists(output_folder): + os.mkdir(output_folder) + detector = RetinaFace(args.prefix, + args.epoch, + args.gpu, + network=args.network, + nocrop=args.nocrop, + vote=args.bbox_vote) + imdb = eval(args.dataset)(args.image_set, args.root_path, + args.dataset_path) + roidb = imdb.gt_roidb() + gt_overlaps = np.zeros(0) + overall = [0.0, 0.0] + gt_max = np.array((0.0, 0.0)) + num_pos = 0 + print('roidb size', len(roidb)) + + for i in range(len(roidb)): + if i % args.parts != args.part: + continue + #if i%10==0: + # print('processing', i, file=sys.stderr) + roi = roidb[i] + boxes = get_boxes(roi, args.pyramid) + if 'boxes' in roi: + gt_boxes = roi['boxes'].copy() + gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0] + + 1) * 
(gt_boxes[:, 3] - gt_boxes[:, 1] + 1) + num_pos += gt_boxes.shape[0] + + overlaps = bbox_overlaps(boxes.astype(np.float), + gt_boxes.astype(np.float)) + #print(im_info, gt_boxes.shape, boxes.shape, overlaps.shape, file=sys.stderr) + + _gt_overlaps = np.zeros((gt_boxes.shape[0])) + + if boxes.shape[0] > 0: + _gt_overlaps = overlaps.max(axis=0) + #print('max_overlaps', _gt_overlaps, file=sys.stderr) + for j in range(len(_gt_overlaps)): + if _gt_overlaps[j] > 0.5: + continue + #print(j, 'failed', gt_boxes[j], 'max_overlap:', _gt_overlaps[j], file=sys.stderr) + + # append recorded IoU coverage level + found = (_gt_overlaps > 0.5).sum() + recall = found / float(gt_boxes.shape[0]) + #print('recall', _recall, gt_boxes.shape[0], boxes.shape[0], gt_areas, 'num:', i, file=sys.stderr) + overall[0] += found + overall[1] += gt_boxes.shape[0] + #gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) + #_recall = (gt_overlaps >= threshold).sum() / float(num_pos) + recall_all = float(overall[0]) / overall[1] + #print('recall_all', _recall, file=sys.stderr) + print('[%d]' % i, + 'recall', + recall, (gt_boxes.shape[0], boxes.shape[0]), + 'all:', + recall_all, + file=sys.stderr) + else: + print('[%d]' % i, 'detect %d faces' % boxes.shape[0]) + + _vec = roidb[i]['image'].split('/') + out_dir = os.path.join(output_folder, _vec[-2]) + if not os.path.exists(out_dir): + os.mkdir(out_dir) + out_file = os.path.join(out_dir, _vec[-1].replace('jpg', 'txt')) + with open(out_file, 'w') as f: + name = '/'.join(roidb[i]['image'].split('/')[-2:]) + f.write("%s\n" % (name)) + f.write("%d\n" % (boxes.shape[0])) + for b in range(boxes.shape[0]): + box = boxes[b] + f.write( + "%d %d %d %d %g \n" % + (box[0], box[1], box[2] - box[0], box[3] - box[1], box[4])) + + +def main(): + global args + args = parse_args() + args.pyramid = False + args.bbox_vote = False + if args.mode == 1: + args.pyramid = True + args.bbox_vote = True + elif args.mode == 2: + args.pyramid = True + args.bbox_vote = False + 
logger.info('Called with argument: %s' % args) + test(args) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/retinaface/train.py b/insightface/detection/retinaface/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5fbcc70b9f288e0affcc756bf895beeb3ad9886f --- /dev/null +++ b/insightface/detection/retinaface/train.py @@ -0,0 +1,502 @@ +from __future__ import print_function +import sys +import argparse +import os +import pprint +import re +import mxnet as mx +import numpy as np +from mxnet.module import Module +import mxnet.optimizer as optimizer + +from rcnn.logger import logger +from rcnn.config import config, default, generate_config +from rcnn.symbol import * +from rcnn.core import callback, metric +from rcnn.core.loader import CropLoader, CropLoader2 +from rcnn.core.module import MutableModule +from rcnn.utils.load_data import load_gt_roidb, merge_roidb, filter_roidb +from rcnn.utils.load_model import load_param + + +def get_fixed_params(symbol, fixed_param): + if not config.LAYER_FIX: + return [] + fixed_param_names = [] + #for name in symbol.list_arguments(): + # for f in fixed_param: + # if re.match(f, name): + # fixed_param_names.append(name) + #pre = 'mobilenetv20_features_linearbottleneck' + idx = 0 + for name in symbol.list_arguments(): + #print(idx, name) + if idx < 7 and name != 'data': + fixed_param_names.append(name) + #elif name.startswith('stage1_'): + # fixed_param_names.append(name) + if name.find('upsampling') >= 0: + fixed_param_names.append(name) + + idx += 1 + return fixed_param_names + + +def train_net(args, + ctx, + pretrained, + epoch, + prefix, + begin_epoch, + end_epoch, + lr=0.001, + lr_step='5'): + # setup config + #init_config() + #print(config) + # setup multi-gpu + + input_batch_size = config.TRAIN.BATCH_IMAGES * len(ctx) + + # print config + logger.info(pprint.pformat(config)) + + # load dataset and prepare imdb for training + image_sets = [iset for iset in 
args.image_set.split('+')] + roidbs = [ + load_gt_roidb(args.dataset, + image_set, + args.root_path, + args.dataset_path, + flip=not args.no_flip) for image_set in image_sets + ] + #roidb = merge_roidb(roidbs) + #roidb = filter_roidb(roidb) + roidb = roidbs[0] + + # load symbol + #sym = eval('get_' + args.network + '_train')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS) + #feat_sym = sym.get_internals()['rpn_cls_score_output'] + #train_data = AnchorLoader(feat_sym, roidb, batch_size=input_batch_size, shuffle=not args.no_shuffle, + # ctx=ctx, work_load_list=args.work_load_list, + # feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES, + # anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING) + + # load and initialize params + sym = None + if len(pretrained) == 0: + arg_params = {} + aux_params = {} + else: + logger.info('loading %s,%d' % (pretrained, epoch)) + sym, arg_params, aux_params = mx.model.load_checkpoint( + pretrained, epoch) + #arg_params, aux_params = load_param(pretrained, epoch, convert=True) + #for k in ['rpn_conv_3x3', 'rpn_cls_score', 'rpn_bbox_pred', 'cls_score', 'bbox_pred']: + # _k = k+"_weight" + # if _k in arg_shape_dict: + # v = 0.001 if _k.startswith('bbox_') else 0.01 + # arg_params[_k] = mx.random.normal(0, v, shape=arg_shape_dict[_k]) + # print('init %s with normal %.5f'%(_k,v)) + # _k = k+"_bias" + # if _k in arg_shape_dict: + # arg_params[_k] = mx.nd.zeros(shape=arg_shape_dict[_k]) + # print('init %s with zero'%(_k)) + + sym = eval('get_' + args.network + '_train')(sym) + #print(sym.get_internals()) + feat_sym = [] + for stride in config.RPN_FEAT_STRIDE: + feat_sym.append( + sym.get_internals()['face_rpn_cls_score_stride%s_output' % stride]) + + train_data = CropLoader(feat_sym, + roidb, + batch_size=input_batch_size, + shuffle=not args.no_shuffle, + ctx=ctx, + work_load_list=args.work_load_list) + + # infer max shape + max_data_shape = [('data', (1, 3, max([v[1] for v in 
config.SCALES]), + max([v[1] for v in config.SCALES])))] + #max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] + max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) + max_data_shape.append(('gt_boxes', (1, roidb[0]['max_num_boxes'], 5))) + logger.info('providing maximum shape %s %s' % + (max_data_shape, max_label_shape)) + + # infer shape + data_shape_dict = dict(train_data.provide_data + train_data.provide_label) + arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + out_shape_dict = dict(zip(sym.list_outputs(), out_shape)) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + logger.info('output shape %s' % pprint.pformat(out_shape_dict)) + + for k in arg_shape_dict: + v = arg_shape_dict[k] + if k.find('upsampling') >= 0: + print('initializing upsampling_weight', k) + arg_params[k] = mx.nd.zeros(shape=v) + init = mx.init.Initializer() + init._init_bilinear(k, arg_params[k]) + #print(args[k]) + + # check parameter shapes + #for k in sym.list_arguments(): + # if k in data_shape_dict: + # continue + # assert k in arg_params, k + ' not initialized' + # assert arg_params[k].shape == arg_shape_dict[k], \ + # 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + #for k in sym.list_auxiliary_states(): + # assert k in aux_params, k + ' not initialized' + # assert aux_params[k].shape == aux_shape_dict[k], \ + # 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + fixed_param_prefix = config.FIXED_PARAMS + # create solver + data_names = [k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + fixed_param_names = get_fixed_params(sym, fixed_param_prefix) + print('fixed', fixed_param_names, file=sys.stderr) + mod = Module(sym, + 
data_names=data_names, + label_names=label_names, + logger=logger, + context=ctx, + work_load_list=args.work_load_list, + fixed_param_names=fixed_param_names) + + # metric + eval_metrics = mx.metric.CompositeEvalMetric() + mid = 0 + for m in range(len(config.RPN_FEAT_STRIDE)): + stride = config.RPN_FEAT_STRIDE[m] + #mid = m*MSTEP + _metric = metric.RPNAccMetric(pred_idx=mid, + label_idx=mid + 1, + name='RPNAcc_s%s' % stride) + eval_metrics.add(_metric) + mid += 2 + #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1) + #eval_metrics.add(_metric) + + _metric = metric.RPNL1LossMetric(loss_idx=mid, + weight_idx=mid + 1, + name='RPNL1Loss_s%s' % stride) + eval_metrics.add(_metric) + mid += 2 + if config.FACE_LANDMARK: + _metric = metric.RPNL1LossMetric(loss_idx=mid, + weight_idx=mid + 1, + name='RPNLandMarkL1Loss_s%s' % + stride) + eval_metrics.add(_metric) + mid += 2 + if config.HEAD_BOX: + _metric = metric.RPNAccMetric(pred_idx=mid, + label_idx=mid + 1, + name='RPNAcc_head_s%s' % stride) + eval_metrics.add(_metric) + mid += 2 + #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1) + #eval_metrics.add(_metric) + + _metric = metric.RPNL1LossMetric(loss_idx=mid, + weight_idx=mid + 1, + name='RPNL1Loss_head_s%s' % + stride) + eval_metrics.add(_metric) + mid += 2 + if config.CASCADE > 0: + for _idx in range(config.CASCADE): + if stride in config.CASCADE_CLS_STRIDES: + _metric = metric.RPNAccMetric(pred_idx=mid, + label_idx=mid + 1, + name='RPNAccCAS%d_s%s' % + (_idx, stride)) + eval_metrics.add(_metric) + mid += 2 + if stride in config.CASCADE_BBOX_STRIDES: + _metric = metric.RPNL1LossMetric( + loss_idx=mid, + weight_idx=mid + 1, + name='RPNL1LossCAS%d_s%s' % (_idx, stride)) + eval_metrics.add(_metric) + mid += 2 + + # callback + #means = np.tile(np.array(config.TRAIN.BBOX_MEANS), config.NUM_CLASSES) + #stds = np.tile(np.array(config.TRAIN.BBOX_STDS), config.NUM_CLASSES) + #epoch_end_callback = callback.do_checkpoint(prefix, means, stds) + 
epoch_end_callback = None + # decide learning rate + #base_lr = lr + #lr_factor = 0.1 + #lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) + + lr_epoch = [int(epoch) for epoch in lr_step.split(',')] + lr_epoch_diff = [ + epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch + ] + lr_iters = [ + int(epoch * len(roidb) / input_batch_size) for epoch in lr_epoch_diff + ] + iter_per_epoch = int(len(roidb) / input_batch_size) + + lr_steps = [] + if len(lr_iters) == 5: + factors = [0.5, 0.5, 0.4, 0.1, 0.1] + for i in range(5): + lr_steps.append((lr_iters[i], factors[i])) + elif len(lr_iters) == 8: #warmup + for li in lr_iters[0:5]: + lr_steps.append((li, 1.5849)) + for li in lr_iters[5:]: + lr_steps.append((li, 0.1)) + else: + for li in lr_iters: + lr_steps.append((li, 0.1)) + #lr_steps = [ (10,0.1), (20, 0.1) ] #XXX + + end_epoch = 10000 + logger.info('lr %f lr_epoch_diff %s lr_steps %s' % + (lr, lr_epoch_diff, lr_steps)) + # optimizer + opt = optimizer.SGD(learning_rate=lr, + momentum=0.9, + wd=args.wd, + rescale_grad=1.0 / len(ctx), + clip_gradient=None) + initializer = mx.init.Xavier() + #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style + + train_data = mx.io.PrefetchingIter(train_data) + + _cb = mx.callback.Speedometer(train_data.batch_size, + frequent=args.frequent, + auto_reset=False) + global_step = [0] + + def save_model(epoch): + arg, aux = mod.get_params() + all_layers = mod.symbol.get_internals() + outs = [] + for stride in config.RPN_FEAT_STRIDE: + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + if config.CASCADE > 0: + _name = 'face_rpn_cls_score_stride%d_output' % (stride) + cls_pred = all_layers[_name] + cls_pred = mx.symbol.Reshape(data=cls_pred, + shape=(0, 2, -1, 0)) + + cls_pred = mx.symbol.SoftmaxActivation(data=cls_pred, + mode="channel") + cls_pred = mx.symbol.Reshape(data=cls_pred, + shape=(0, 2 * num_anchors, -1, 0)) + outs.append(cls_pred) + _name = 
'face_rpn_bbox_pred_stride%d_output' % stride + rpn_bbox_pred = all_layers[_name] + outs.append(rpn_bbox_pred) + if config.FACE_LANDMARK: + _name = 'face_rpn_landmark_pred_stride%d_output' % stride + rpn_landmark_pred = all_layers[_name] + outs.append(rpn_landmark_pred) + for casid in range(config.CASCADE): + if stride in config.CASCADE_CLS_STRIDES: + _name = 'face_rpn_cls_score_stride%d_cas%d_output' % ( + stride, casid) + cls_pred = all_layers[_name] + cls_pred = mx.symbol.Reshape(data=cls_pred, + shape=(0, 2, -1, 0)) + cls_pred = mx.symbol.SoftmaxActivation(data=cls_pred, + mode="channel") + cls_pred = mx.symbol.Reshape(data=cls_pred, + shape=(0, 2 * num_anchors, + -1, 0)) + outs.append(cls_pred) + if stride in config.CASCADE_BBOX_STRIDES: + _name = 'face_rpn_bbox_pred_stride%d_cas%d_output' % ( + stride, casid) + bbox_pred = all_layers[_name] + outs.append(bbox_pred) + else: + _name = 'face_rpn_cls_score_stride%d_output' % stride + rpn_cls_score = all_layers[_name] + + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, + shape=(0, 2, -1, 0), + name="face_rpn_cls_score_reshape_stride%d" % stride) + + rpn_cls_prob = mx.symbol.SoftmaxActivation( + data=rpn_cls_score_reshape, + mode="channel", + name="face_rpn_cls_prob_stride%d" % stride) + rpn_cls_prob_reshape = mx.symbol.Reshape( + data=rpn_cls_prob, + shape=(0, 2 * num_anchors, -1, 0), + name='face_rpn_cls_prob_reshape_stride%d' % stride) + _name = 'face_rpn_bbox_pred_stride%d_output' % stride + rpn_bbox_pred = all_layers[_name] + outs.append(rpn_cls_prob_reshape) + outs.append(rpn_bbox_pred) + if config.FACE_LANDMARK: + _name = 'face_rpn_landmark_pred_stride%d_output' % stride + rpn_landmark_pred = all_layers[_name] + outs.append(rpn_landmark_pred) + _sym = mx.sym.Group(outs) + mx.model.save_checkpoint(prefix, epoch, _sym, arg, aux) + + def _batch_callback(param): + #global global_step + _cb(param) + global_step[0] += 1 + mbatch = global_step[0] + for step in lr_steps: + if 
def _parse_work_load_list(value):
    # Fix: the original used `type=list`, which makes argparse split the
    # argument string into single characters (e.g. "0.5" -> ['0','.','5']).
    # Parse an explicit comma-separated list of numbers instead.
    return [float(v) for v in value.split(',')]


def parse_args():
    """Build and parse the command-line arguments for RetinaFace training.

    Returns the parsed ``argparse.Namespace``. ``--network``/``--dataset``
    are parsed first because ``generate_config`` must run before the
    remaining defaults (taken from ``default``) are valid.
    """
    parser = argparse.ArgumentParser(description='Train RetinaFace')
    # general
    parser.add_argument('--network',
                        help='network name',
                        default=default.network,
                        type=str)
    parser.add_argument('--dataset',
                        help='dataset name',
                        default=default.dataset,
                        type=str)
    # network/dataset decide which config values get generated, so parse
    # them first and regenerate the global config before adding the rest.
    args, rest = parser.parse_known_args()
    generate_config(args.network, args.dataset)
    parser.add_argument('--image_set',
                        help='image_set name',
                        default=default.image_set,
                        type=str)
    parser.add_argument('--root_path',
                        help='output data folder',
                        default=default.root_path,
                        type=str)
    parser.add_argument('--dataset_path',
                        help='dataset path',
                        default=default.dataset_path,
                        type=str)
    # training
    parser.add_argument('--frequent',
                        help='frequency of logging',
                        default=default.frequent,
                        type=int)
    parser.add_argument('--kvstore',
                        help='the kv-store type',
                        default=default.kvstore,
                        type=str)
    parser.add_argument('--work_load_list',
                        help='work load for different devices '
                        '(comma-separated numbers)',
                        default=None,
                        type=_parse_work_load_list)
    parser.add_argument('--no_flip',
                        help='disable flip images',
                        action='store_true')
    parser.add_argument('--no_shuffle',
                        help='disable random shuffle',
                        action='store_true')
    # e2e
    parser.add_argument('--pretrained',
                        help='pretrained model prefix',
                        default=default.pretrained,
                        type=str)
    parser.add_argument('--pretrained_epoch',
                        help='pretrained model epoch',
                        default=default.pretrained_epoch,
                        type=int)
    parser.add_argument('--prefix',
                        help='new model prefix',
                        default=default.prefix,
                        type=str)
    parser.add_argument('--begin_epoch',
                        help='begin epoch of training, use with resume',
                        default=0,
                        type=int)
    parser.add_argument('--end_epoch',
                        help='end epoch of training',
                        default=default.end_epoch,
                        type=int)
    parser.add_argument('--lr',
                        help='base learning rate',
                        default=default.lr,
                        type=float)
    parser.add_argument('--lr_step',
                        help='learning rate steps (in epoch)',
                        default=default.lr_step,
                        type=str)
    parser.add_argument('--wd',
                        help='weight decay',
                        default=default.wd,
                        type=float)
    args = parser.parse_args()
    return args


def main():
    """Entry point: choose contexts from CUDA_VISIBLE_DEVICES and train."""
    args = parse_args()
    logger.info('Called with argument: %s' % args)
    ctx = []
    # Robustness fix: os.environ['CUDA_VISIBLE_DEVICES'] raised KeyError
    # whenever the variable was unset; fall back to CPU instead.
    cvd = os.environ.get('CUDA_VISIBLE_DEVICES', '').strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    train_net(args,
              ctx,
              args.pretrained,
              args.pretrained_epoch,
              args.prefix,
              args.begin_epoch,
              args.end_epoch,
              lr=args.lr,
              lr_step=args.lr_step)


if __name__ == '__main__':
    main()
def compute_assign_targets(rois, threshold):
    """Assign each ROI to one RCNN feature stride by its box size.

    :param rois: [num_rois, 4] boxes as (x1, y1, x2, y2)
    :param threshold: per-stride (upper, lower) bounds on sqrt(box area),
        aligned with config.RCNN_FEAT_STRIDE
    :return: [num_rois] uint8 array, the assigned stride for each ROI
    """
    rois_area = np.sqrt(
        (rois[:, 2] - rois[:, 0] + 1) * (rois[:, 3] - rois[:, 1] + 1))
    num_rois = np.shape(rois)[0]
    assign_levels = np.zeros(num_rois, dtype=np.uint8)
    for i, stride in enumerate(config.RCNN_FEAT_STRIDE):
        thd = threshold[i]
        # sqrt-area in the half-open interval [lower, upper) maps the ROI
        # onto this stride
        idx = np.logical_and(thd[1] <= rois_area, rois_area < thd[0])
        assign_levels[idx] = stride

    assert 0 not in assign_levels, "All rois should assign to specify levels."
    return assign_levels


def add_assign_targets(roidb):
    """
    given roidb, add ['assign_levels']
    :param roidb: roidb to be processed. must have gone through imdb.prepare_roidb
    """
    # Fix: `print 'add assign targets'` was Python 2 statement syntax and is
    # a SyntaxError under Python 3.
    print('add assign targets')
    assert len(roidb) > 0
    assert 'boxes' in roidb[0]

    # (upper, lower) sqrt-area bounds, one pair per RCNN feature stride
    area_threshold = [[np.inf, 448], [448, 224], [224, 112], [112, 0]]

    assert len(config.RCNN_FEAT_STRIDE) == len(area_threshold)

    num_images = len(roidb)
    for im_i in range(num_images):
        rois = roidb[im_i]['boxes']
        roidb[im_i]['assign_levels'] = compute_assign_targets(
            rois, area_threshold)
def compute_bbox_regression_targets(rois, overlaps, labels):
    """
    given rois, overlaps, gt labels, compute bounding box regression targets
    :param rois: roidb[i]['boxes'] k * 4
    :param overlaps: roidb[i]['max_overlaps'] k * 1
    :param labels: roidb[i]['max_classes'] k * 1
    :return: targets[i][class, dx, dy, dw, dh] k * 5
    """
    # Ensure ROIs are floats. Fix: `np.float` was removed from modern NumPy;
    # the builtin float is the same dtype.
    rois = rois.astype(float, copy=False)

    # Sanity check
    if len(rois) != len(overlaps):
        logger.warning('bbox regression: len(rois) != len(overlaps)')

    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    if len(gt_inds) == 0:
        logger.warning('bbox regression: len(gt_inds) == 0')

    # Indices of examples for which we try to make predictions
    ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI
    ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :])

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]

    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
    targets[ex_inds, 0] = labels[ex_inds]
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
    return targets


def add_bbox_regression_targets(roidb):
    """
    given roidb, add ['bbox_targets'] and normalize bounding box regression targets
    :param roidb: roidb to be processed. must have gone through imdb.prepare_roidb
    :return: means, std variances of targets
    """
    logger.info('bbox regression: add bounding box regression targets')
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0]

    num_images = len(roidb)
    num_classes = roidb[0]['gt_overlaps'].shape[1]
    for im_i in range(num_images):
        rois = roidb[im_i]['boxes']
        max_overlaps = roidb[im_i]['max_overlaps']
        max_classes = roidb[im_i]['max_classes']
        roidb[im_i]['bbox_targets'] = compute_bbox_regression_targets(
            rois, max_overlaps, max_classes)

    if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED:
        # use fixed / precomputed means and stds instead of empirical values
        means = np.tile(np.array(config.TRAIN.BBOX_MEANS), (num_classes, 1))
        stds = np.tile(np.array(config.TRAIN.BBOX_STDS), (num_classes, 1))
    else:
        # compute mean, std values per class (class 0 = background, skipped)
        class_counts = np.zeros((num_classes, 1)) + 1e-14
        sums = np.zeros((num_classes, 4))
        squared_sums = np.zeros((num_classes, 4))
        for im_i in range(num_images):
            targets = roidb[im_i]['bbox_targets']
            for cls in range(1, num_classes):
                cls_indexes = np.where(targets[:, 0] == cls)[0]
                if cls_indexes.size > 0:
                    class_counts[cls] += cls_indexes.size
                    sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0)
                    squared_sums[cls, :] += (targets[cls_indexes,
                                                     1:]**2).sum(axis=0)

        means = sums / class_counts
        # var(x) = E(x^2) - E(x)^2
        stds = np.sqrt(squared_sums / class_counts - means**2)

    # normalized targets
    for im_i in range(num_images):
        targets = roidb[im_i]['bbox_targets']
        for cls in range(1, num_classes):
            cls_indexes = np.where(targets[:, 0] == cls)[0]
            roidb[im_i]['bbox_targets'][cls_indexes, 1:] -= means[cls, :]
            roidb[im_i]['bbox_targets'][cls_indexes, 1:] /= stds[cls, :]

    return means.ravel(), stds.ravel()


def expand_bbox_regression_targets(bbox_targets_data, num_classes):
    """
    expand from 5 to 4 * num_classes; only the right class has non-zero bbox regression targets
    :param bbox_targets_data: [k * 5]
    :param num_classes: number of classes
    :return: bbox target processed [k * 4 num_classes]
        bbox_weights ! only foreground boxes have bbox regression computation!
    """
    classes = bbox_targets_data[:, 0]
    bbox_targets = np.zeros((classes.size, 4 * num_classes), dtype=np.float32)
    bbox_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    indexes = np.where(classes > 0)[0]
    for index in indexes:
        cls = classes[index]
        start = int(4 * cls)
        end = start + 4
        bbox_targets[index, start:end] = bbox_targets_data[index, 1:]
        bbox_weights[index, start:end] = config.TRAIN.BBOX_WEIGHTS
    return bbox_targets, bbox_weights


def compute_mask_and_label(ex_rois, ex_labels, seg, flipped):
    """
    crop a 28x28 binary mask for each foreground ROI from the instance-seg map
    :param ex_rois: foreground rois k * 4
    :param ex_labels: foreground roi labels k * 1
    :param seg: path of the instance segmentation image
    :param flipped: whether the source image was horizontally flipped
    :return: (mask targets k * 28 * 28, mask labels k * 1)
    """
    im = Image.open(seg)
    pixel = list(im.getdata())
    ins_seg = np.array(pixel).reshape([im.size[1], im.size[0]])
    if flipped:
        ins_seg = ins_seg[:, ::-1]
    rois = ex_rois
    n_rois = ex_rois.shape[0]
    label = ex_labels
    class_id = config.CLASS_ID
    mask_target = np.zeros((n_rois, 28, 28), dtype=np.int8)
    mask_label = np.zeros((n_rois), dtype=np.int8)
    for n in range(n_rois):
        target = ins_seg[int(rois[n, 1]):int(rois[n, 3]),
                         int(rois[n, 0]):int(rois[n, 2])]
        ids = np.unique(target)
        ins_id = 0
        max_count = 0
        # instance ids appear to encode their class as floor(id / 1000);
        # among candidates of the ROI's class, keep the one whose full-image
        # extent has the highest IoU with this ROI
        for cand_id in ids:  # renamed from `id`, which shadowed the builtin
            if math.floor(cand_id / 1000) == class_id[int(label[int(n)])]:
                px = np.where(ins_seg == int(cand_id))
                x_min = np.min(px[1])
                y_min = np.min(px[0])
                x_max = np.max(px[1])
                y_max = np.max(px[0])
                x1 = max(rois[n, 0], x_min)
                y1 = max(rois[n, 1], y_min)
                x2 = min(rois[n, 2], x_max)
                y2 = min(rois[n, 3], y_max)
                iou = (x2 - x1) * (y2 - y1)
                iou = iou / ((rois[n, 2] - rois[n, 0]) *
                             (rois[n, 3] - rois[n, 1]) + (x_max - x_min) *
                             (y_max - y_min) - iou)
                if iou > max_count:
                    ins_id = cand_id
                    max_count = iou

        if max_count == 0:
            continue
        # binarize the winning instance and resize to the fixed 28x28 target
        mask = np.zeros(target.shape)
        idx = np.where(target == ins_id)
        mask[idx] = 1
        mask = cv2.resize(mask, (28, 28), interpolation=cv2.INTER_NEAREST)

        mask_target[n] = mask
        mask_label[n] = label[int(n)]
    return mask_target, mask_label


def compute_bbox_mask_targets_and_label(rois, overlaps, labels, seg, flipped):
    """
    given rois, overlaps, gt labels, seg, compute bounding box mask targets
    :param rois: roidb[i]['boxes'] k * 4
    :param overlaps: roidb[i]['max_overlaps'] k * 1
    :param labels: roidb[i]['max_classes'] k * 1
    :return: (mask targets, mask labels, foreground roi indices)
    """
    # Ensure ROIs are floats (np.float removed from modern NumPy)
    rois = rois.astype(float, copy=False)

    # Sanity check — Python 3 fix: these were Python 2 print statements
    if len(rois) != len(overlaps):
        print('bbox regression: this should not happen')

    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    if len(gt_inds) == 0:
        print('something wrong : zero ground truth rois')
    # Indices of examples for which we try to make predictions
    ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI
    ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :])

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    gt_rois = rois[gt_inds[gt_assignment], :]  # NOTE(review): unused below
    ex_rois = rois[ex_inds, :]

    mask_targets, mask_label = compute_mask_and_label(ex_rois,
                                                      labels[ex_inds], seg,
                                                      flipped)
    return mask_targets, mask_label, ex_inds


def add_mask_targets(roidb):
    """
    given roidb, add ['mask_targets'], ['mask_labels'] and ['mask_inds']
    :param roidb: roidb to be processed. must have gone through imdb.prepare_roidb
    """
    # Python 3 fix: the Py2-only `Queue` module is `queue` on Python 3.
    import queue

    print('add bounding box mask targets')
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0]

    num_images = len(roidb)

    # Multi threads processing: a shared work queue drained by 10 workers
    im_quene = queue.Queue(maxsize=0)
    for im_i in range(num_images):
        im_quene.put(im_i)

    def process():
        while not im_quene.empty():
            im_i = im_quene.get()
            print("-----process img {}".format(im_i))
            rois = roidb[im_i]['boxes']
            max_overlaps = roidb[im_i]['max_overlaps']
            max_classes = roidb[im_i]['max_classes']
            ins_seg = roidb[im_i]['ins_seg']
            flipped = roidb[im_i]['flipped']
            roidb[im_i]['mask_targets'], roidb[im_i]['mask_labels'], roidb[im_i]['mask_inds'] = \
                compute_bbox_mask_targets_and_label(rois, max_overlaps, max_classes, ins_seg, flipped)

    threads = [threading.Thread(target=process, args=()) for i in range(10)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
def bbox_overlaps(boxes, query_boxes):
    """IoU overlaps between `boxes` [n, 4] and `query_boxes` [k, 4] via the
    compiled Cython kernel. Returns an [n, k] overlap matrix."""
    return bbox_overlaps_cython(boxes, query_boxes)


def bbox_overlaps_py(boxes, query_boxes):
    """
    determine overlaps between boxes and query_boxes (pure-python fallback)
    :param boxes: n * 4 bounding boxes
    :param query_boxes: k * 4 bounding boxes
    :return: overlaps: n * k overlaps
    """
    n_ = boxes.shape[0]
    k_ = query_boxes.shape[0]
    # Fix: `np.float` was removed from modern NumPy; builtin float is the
    # same dtype (float64).
    overlaps = np.zeros((n_, k_), dtype=float)
    for k in range(k_):
        query_box_area = (query_boxes[k, 2] - query_boxes[k, 0] +
                          1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        for n in range(n_):
            # +1 pixel convention: boxes are inclusive of both endpoints
            iw = min(boxes[n, 2], query_boxes[k, 2]) - max(
                boxes[n, 0], query_boxes[k, 0]) + 1
            if iw > 0:
                ih = min(boxes[n, 3], query_boxes[k, 3]) - max(
                    boxes[n, 1], query_boxes[k, 1]) + 1
                if ih > 0:
                    box_area = (boxes[n, 2] - boxes[n, 0] +
                                1) * (boxes[n, 3] - boxes[n, 1] + 1)
                    all_area = float(box_area + query_box_area - iw * ih)
                    overlaps[n, k] = iw * ih / all_area
    return overlaps


def clip_boxes(boxes, im_shape):
    """
    Clip boxes to image boundaries (in place).
    :param boxes: [N, 4 * num_classes]
    :param im_shape: tuple of 2, (height, width)
    :return: [N, 4 * num_classes]
    """
    # x1 >= 0
    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
    # y1 >= 0
    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
    # x2 < im_shape[1]
    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
    # y2 < im_shape[0]
    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
    return boxes


def nonlinear_transform(ex_rois, gt_rois):
    """
    compute bounding box regression targets from ex_rois to gt_rois
    :param ex_rois: [N, 4]
    :param gt_rois: [N, 4] (extra columns beyond 4 are ignored)
    :return: [N, 4] (dx, dy, dw, dh)
    """
    assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number'

    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
    ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0)
    ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0)

    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
    gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0)
    gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0)

    # epsilon guards against zero-width/height proposals
    targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14)
    targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14)
    targets_dw = np.log(gt_widths / ex_widths)
    targets_dh = np.log(gt_heights / ex_heights)

    if gt_rois.shape[1] <= 4:
        targets = np.vstack(
            (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
        return targets
    else:
        targets = [targets_dx, targets_dy, targets_dw, targets_dh]
        #if config.USE_BLUR:
        #  for i in range(4, gt_rois.shape[1]):
        #    t = gt_rois[:,i]
        #    targets.append(t)
        targets = np.vstack(targets).transpose()
        return targets


def landmark_transform(ex_rois, gt_rois):
    """Landmark regression targets: per-point offsets normalized by the
    anchor's width/height. The third per-point column (visibility/occlusion)
    is skipped."""
    assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number'

    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
    ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0)
    ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0)

    targets = []
    for i in range(gt_rois.shape[1]):
        for j in range(gt_rois.shape[2]):
            #if not config.USE_OCCLUSION and j==2:
            #  continue
            if j == 2:
                continue
            if j == 0:  #w
                target = (gt_rois[:, i, j] - ex_ctr_x) / (ex_widths + 1e-14)
            elif j == 1:  #h
                target = (gt_rois[:, i, j] - ex_ctr_y) / (ex_heights + 1e-14)
            else:  #visibile
                target = gt_rois[:, i, j]
            targets.append(target)

    targets = np.vstack(targets).transpose()
    return targets


def nonlinear_pred(boxes, box_deltas):
    """
    Transform the set of class-agnostic boxes into class-specific boxes
    by applying the predicted offsets (box_deltas)
    :param boxes: !important [N 4]
    :param box_deltas: [N, 4 * num_classes]
    :return: [N 4 * num_classes]
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, box_deltas.shape[1]))

    # Fix: np.float removed from modern NumPy
    boxes = boxes.astype(float, copy=False)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
    ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)

    dx = box_deltas[:, 0::4]
    dy = box_deltas[:, 1::4]
    dw = box_deltas[:, 2::4]
    dh = box_deltas[:, 3::4]

    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]

    pred_boxes = np.zeros(box_deltas.shape)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * (pred_w - 1.0)
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * (pred_h - 1.0)
    # x2
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * (pred_w - 1.0)
    # y2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * (pred_h - 1.0)

    return pred_boxes


def landmark_pred(boxes, landmark_deltas):
    """Inverse of landmark_transform: recover absolute landmark coordinates
    from normalized per-point deltas."""
    if boxes.shape[0] == 0:
        return np.zeros((0, landmark_deltas.shape[1]))
    # Fix: np.float removed from modern NumPy
    boxes = boxes.astype(float, copy=False)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
    ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)
    preds = []
    for i in range(landmark_deltas.shape[1]):
        if i % 2 == 0:
            pred = (landmark_deltas[:, i] * widths + ctr_x)
        else:
            pred = (landmark_deltas[:, i] * heights + ctr_y)
        preds.append(pred)
    preds = np.vstack(preds).transpose()
    return preds


def iou_transform(ex_rois, gt_rois):
    """ return bbox targets, IoU loss uses gt_rois as gt """
    assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number'
    return gt_rois


def iou_pred(boxes, box_deltas):
    """
    Transform the set of class-agnostic boxes into class-specific boxes
    by applying the predicted offsets (box_deltas)
    :param boxes: !important [N 4]
    :param box_deltas: [N, 4 * num_classes]
    :return: [N 4 * num_classes]
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, box_deltas.shape[1]))

    # Fix: np.float removed from modern NumPy
    boxes = boxes.astype(float, copy=False)
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    dx1 = box_deltas[:, 0::4]
    dy1 = box_deltas[:, 1::4]
    dx2 = box_deltas[:, 2::4]
    dy2 = box_deltas[:, 3::4]

    pred_boxes = np.zeros(box_deltas.shape)
    # x1
    pred_boxes[:, 0::4] = dx1 + x1[:, np.newaxis]
    # y1
    pred_boxes[:, 1::4] = dy1 + y1[:, np.newaxis]
    # x2
    pred_boxes[:, 2::4] = dx2 + x2[:, np.newaxis]
    # y2
    pred_boxes[:, 3::4] = dy2 + y2[:, np.newaxis]

    return pred_boxes


# define bbox_transform and bbox_pred
bbox_transform = nonlinear_transform
bbox_pred = nonlinear_pred
np +from ..cython.bbox import bbox_overlaps_cython +<<<<<<< HEAD +======= +from ..config import config +>>>>>>> eb555e492a6b6f2004d64ae5be07f2d823b92bd6 + + +def bbox_overlaps(boxes, query_boxes): + return bbox_overlaps_cython(boxes, query_boxes) + + +def bbox_overlaps_py(boxes, query_boxes): + """ + determine overlaps between boxes and query_boxes + :param boxes: n * 4 bounding boxes + :param query_boxes: k * 4 bounding boxes + :return: overlaps: n * k overlaps + """ + n_ = boxes.shape[0] + k_ = query_boxes.shape[0] + overlaps = np.zeros((n_, k_), dtype=np.float) + for k in range(k_): + query_box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1) + for n in range(n_): + iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + 1 + if iw > 0: + ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + 1 + if ih > 0: + box_area = (boxes[n, 2] - boxes[n, 0] + 1) * (boxes[n, 3] - boxes[n, 1] + 1) + all_area = float(box_area + query_box_area - iw * ih) + overlaps[n, k] = iw * ih / all_area + return overlaps + + +def clip_boxes(boxes, im_shape): + """ + Clip boxes to image boundaries. 
+ :param boxes: [N, 4* num_classes] + :param im_shape: tuple of 2 + :return: [N, 4* num_classes] + """ + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + + +def nonlinear_transform(ex_rois, gt_rois): + """ + compute bounding box regression targets from ex_rois to gt_rois + :param ex_rois: [N, 4] + :param gt_rois: [N, 4] + :return: [N, 4] + """ + assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number' + + ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 + ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 + ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0) + ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0) + + gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 + gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 + gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0) + gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0) + + targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14) + targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14) + targets_dw = np.log(gt_widths / ex_widths) + targets_dh = np.log(gt_heights / ex_heights) + + if gt_rois.shape[1]<=4: + targets = np.vstack( + (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() + return targets + else: + targets = [targets_dx, targets_dy, targets_dw, targets_dh] + #if config.USE_BLUR: + # for i in range(4, gt_rois.shape[1]): + # t = gt_rois[:,i] + # targets.append(t) + targets = np.vstack(targets).transpose() + return targets + +<<<<<<< HEAD +def landmark_transform(ex_rois, gt_rois): + from rcnn.config import config + + assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number' + assert gt_rois.shape[1]==5 or gt_rois.shape[1]==6 +======= +def 
landmark_transform(ex_rois, gt_landmarks): + + assert ex_rois.shape[0] == gt_landmarks.shape[0], 'inconsistent rois number' +>>>>>>> eb555e492a6b6f2004d64ae5be07f2d823b92bd6 + + ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 + ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 + ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0) + ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0) + + ex_widths = ex_widths[:,np.newaxis] + ex_heights = ex_heights[:,np.newaxis] + ex_ctr_x = ex_ctr_x[:,np.newaxis] + ex_ctr_y = ex_ctr_y[:,np.newaxis] + + targets = np.zeros_like(gt_landmarks) + targets[:,:,0] = (gt_landmarks[:,:,0] - ex_ctr_x) / (ex_widths + 1e-14) + targets[:,:,1] = (gt_landmarks[:,:,1] - ex_ctr_y) / (ex_heights + 1e-14) + if config.dim_landmark==3: + targets[:,:,2] = gt_landmarks[:,:,2] / 100.0 + targets = targets.reshape( (targets.shape[0], -1) ) + + +<<<<<<< HEAD + targets = [] + for i in range(gt_rois.shape[1]): + for j in range(config.dim_landmark): + #if not config.USE_OCCLUSION and j==2: + # continue + if i<5: + if j==0: #w + target = (gt_rois[:,i,j] - ex_ctr_x) / (ex_widths + 1e-14) + elif j==1: #h + target = (gt_rois[:,i,j] - ex_ctr_y) / (ex_heights + 1e-14) + else: #visibile + target = gt_rois[:,i,j] + targets.append(target) + else: #may be pose + target = gt_rois[:,i,j] + targets.append(target) + + + targets = np.vstack(targets).transpose() +======= + #targets = [] + #for i in range(gt_landmarks.shape[1]): + # for j in range(config.dim_landmark): + # #if not config.USE_OCCLUSION and j==2: + # # continue + # #if j==2: + # # continue + # if j==0: #w + # target = (gt_rois[:,i,j] - ex_ctr_x) / (ex_widths + 1e-14) + # elif j==1: #h + # target = (gt_rois[:,i,j] - ex_ctr_y) / (ex_heights + 1e-14) + # else: #visibile + # target = gt_rois[:,i,j] / 100.0 + # targets.append(target) + + + #targets = np.vstack(targets).transpose() +>>>>>>> eb555e492a6b6f2004d64ae5be07f2d823b92bd6 + return targets + + +def nonlinear_pred(boxes, box_deltas): + """ + Transform the set of 
class-agnostic boxes into class-specific boxes + by applying the predicted offsets (box_deltas) + :param boxes: !important [N 4] + :param box_deltas: [N, 4 * num_classes] + :return: [N 4 * num_classes] + """ + if boxes.shape[0] == 0: + return np.zeros((0, box_deltas.shape[1])) + + boxes = boxes.astype(np.float, copy=False) + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) + ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) + + dx = box_deltas[:, 0::4] + dy = box_deltas[:, 1::4] + dw = box_deltas[:, 2::4] + dh = box_deltas[:, 3::4] + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(box_deltas.shape) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * (pred_w - 1.0) + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * (pred_h - 1.0) + # x2 + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * (pred_w - 1.0) + # y2 + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * (pred_h - 1.0) + + return pred_boxes + +def landmark_pred(boxes, landmark_deltas): + if boxes.shape[0] == 0: + return np.zeros((0, landmark_deltas.shape[1])) + boxes = boxes.astype(np.float, copy=False) + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) + ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) + preds = [] + for i in range(landmark_deltas.shape[1]): + if i==0: + pred = (landmark_deltas[:,i]*widths + ctr_x) + elif i==1: + pred = (landmark_deltas[:,i]*heights + ctr_y) + else: + pred = landmark_deltas[:,i] + preds.append(pred) + preds = np.vstack(preds).transpose() + return preds + +def iou_transform(ex_rois, gt_rois): + """ return bbox targets, IoU loss uses gt_rois as gt """ + assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number' + return gt_rois + + +def 
iou_pred(boxes, box_deltas): + """ + Transform the set of class-agnostic boxes into class-specific boxes + by applying the predicted offsets (box_deltas) + :param boxes: !important [N 4] + :param box_deltas: [N, 4 * num_classes] + :return: [N 4 * num_classes] + """ + if boxes.shape[0] == 0: + return np.zeros((0, box_deltas.shape[1])) + + boxes = boxes.astype(np.float, copy=False) + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + dx1 = box_deltas[:, 0::4] + dy1 = box_deltas[:, 1::4] + dx2 = box_deltas[:, 2::4] + dy2 = box_deltas[:, 3::4] + + pred_boxes = np.zeros(box_deltas.shape) + # x1 + pred_boxes[:, 0::4] = dx1 + x1[:, np.newaxis] + # y1 + pred_boxes[:, 1::4] = dy1 + y1[:, np.newaxis] + # x2 + pred_boxes[:, 2::4] = dx2 + x2[:, np.newaxis] + # y2 + pred_boxes[:, 3::4] = dy2 + y2[:, np.newaxis] + + return pred_boxes + + +# define bbox_transform and bbox_pred +bbox_transform = nonlinear_transform +bbox_pred = nonlinear_pred diff --git a/insightface/detection/retinaface_anticov/rcnn/processing/generate_anchor.py b/insightface/detection/retinaface_anticov/rcnn/processing/generate_anchor.py new file mode 100644 index 0000000000000000000000000000000000000000..83c5ada2a635186d31911cf1faa7df0e6a65d7ff --- /dev/null +++ b/insightface/detection/retinaface_anticov/rcnn/processing/generate_anchor.py @@ -0,0 +1,135 @@ +""" +Generate base anchors on index 0 +""" +from __future__ import print_function +import sys +from builtins import range +import numpy as np +from ..cython.anchors import anchors_cython +#from ..config import config + + +def anchors_plane(feat_h, feat_w, stride, base_anchor): + return anchors_cython(feat_h, feat_w, stride, base_anchor) + + +def generate_anchors(base_size=16, + ratios=[0.5, 1, 2], + scales=2**np.arange(3, 6), + stride=16, + dense_anchor=False): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, 15, 15) window. 
+ """ + + base_anchor = np.array([1, 1, base_size, base_size]) - 1 + ratio_anchors = _ratio_enum(base_anchor, ratios) + anchors = np.vstack([ + _scale_enum(ratio_anchors[i, :], scales) + for i in range(ratio_anchors.shape[0]) + ]) + if dense_anchor: + assert stride % 2 == 0 + anchors2 = anchors.copy() + anchors2[:, :] += int(stride / 2) + anchors = np.vstack((anchors, anchors2)) + #print('GA',base_anchor.shape, ratio_anchors.shape, anchors.shape) + return anchors + + +#def generate_anchors_fpn(base_size=[64,32,16,8,4], ratios=[0.5, 1, 2], scales=8): +# """ +# Generate anchor (reference) windows by enumerating aspect ratios X +# scales wrt a reference (0, 0, 15, 15) window. +# """ +# anchors = [] +# _ratios = ratios.reshape( (len(base_size), -1) ) +# _scales = scales.reshape( (len(base_size), -1) ) +# for i,bs in enumerate(base_size): +# __ratios = _ratios[i] +# __scales = _scales[i] +# #print('anchors_fpn', bs, __ratios, __scales, file=sys.stderr) +# r = generate_anchors(bs, __ratios, __scales) +# #print('anchors_fpn', r.shape, file=sys.stderr) +# anchors.append(r) +# return anchors + + +def generate_anchors_fpn(dense_anchor=False, cfg=None): + #assert(False) + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, 15, 15) window. 
+ """ + if cfg is None: + from ..config import config + cfg = config.RPN_ANCHOR_CFG + RPN_FEAT_STRIDE = [] + for k in cfg: + RPN_FEAT_STRIDE.append(int(k)) + RPN_FEAT_STRIDE = sorted(RPN_FEAT_STRIDE, reverse=True) + anchors = [] + for k in RPN_FEAT_STRIDE: + v = cfg[str(k)] + bs = v['BASE_SIZE'] + __ratios = np.array(v['RATIOS']) + __scales = np.array(v['SCALES']) + stride = int(k) + #print('anchors_fpn', bs, __ratios, __scales, file=sys.stderr) + r = generate_anchors(bs, __ratios, __scales, stride, dense_anchor) + #print('anchors_fpn', r.shape, file=sys.stderr) + anchors.append(r) + + return anchors + + +def _whctrs(anchor): + """ + Return width, height, x center, and y center for an anchor (window). + """ + + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """ + Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack((x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1))) + return anchors + + +def _ratio_enum(anchor, ratios): + """ + Enumerate a set of anchors for each aspect ratio wrt an anchor. + """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + +def _scale_enum(anchor, scales): + """ + Enumerate a set of anchors for each scale wrt an anchor. 
+ """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors diff --git a/insightface/detection/retinaface_anticov/rcnn/processing/nms.py b/insightface/detection/retinaface_anticov/rcnn/processing/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..b32d92d0ff738f7ad4f8ecc180ec04423a9a0a73 --- /dev/null +++ b/insightface/detection/retinaface_anticov/rcnn/processing/nms.py @@ -0,0 +1,67 @@ +import numpy as np +from ..cython.cpu_nms import cpu_nms +try: + from ..cython.gpu_nms import gpu_nms +except ImportError: + gpu_nms = None + + +def py_nms_wrapper(thresh): + def _nms(dets): + return nms(dets, thresh) + + return _nms + + +def cpu_nms_wrapper(thresh): + def _nms(dets): + return cpu_nms(dets, thresh) + + return _nms + + +def gpu_nms_wrapper(thresh, device_id): + def _nms(dets): + return gpu_nms(dets, thresh, device_id) + + if gpu_nms is not None: + return _nms + else: + return cpu_nms_wrapper(thresh) + + +def nms(dets, thresh): + """ + greedily select boxes with high confidence and overlap with current maximum <= thresh + rule out overlap >= thresh + :param dets: [[x1, y1, x2, y2 score]] + :param thresh: retain overlap < thresh + :return: indexes to keep + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep diff --git a/insightface/detection/retinaface_anticov/retinaface_cov.py 
b/insightface/detection/retinaface_anticov/retinaface_cov.py new file mode 100644 index 0000000000000000000000000000000000000000..a4d1c1ac497dc00d05ad63fbe1c265b39ea26473 --- /dev/null +++ b/insightface/detection/retinaface_anticov/retinaface_cov.py @@ -0,0 +1,752 @@ +from __future__ import print_function +import sys +import os +import datetime +import time +import numpy as np +import mxnet as mx +from mxnet import ndarray as nd +import cv2 +#from rcnn import config +#from rcnn.processing.bbox_transform import nonlinear_pred, clip_boxes, landmark_pred +from rcnn.processing.bbox_transform import clip_boxes +from rcnn.processing.generate_anchor import generate_anchors_fpn, anchors_plane +from rcnn.processing.nms import gpu_nms_wrapper, cpu_nms_wrapper +from rcnn.processing.bbox_transform import bbox_overlaps + + +class RetinaFaceCoV: + def __init__(self, + prefix, + epoch, + ctx_id=0, + network='net3', + nms=0.4, + nocrop=False): + self.ctx_id = ctx_id + self.network = network + self.nms_threshold = nms + self.nocrop = nocrop + self.debug = False + self.fpn_keys = [] + self.anchor_cfg = None + pixel_means = [0.0, 0.0, 0.0] + pixel_stds = [1.0, 1.0, 1.0] + pixel_scale = 1.0 + self.bbox_stds = [1.0, 1.0, 1.0, 1.0] + self.landmark_std = 1.0 + self.preprocess = False + _ratio = (1., ) + fmc = 3 + if network == 'ssh' or network == 'vgg': + pixel_means = [103.939, 116.779, 123.68] + self.preprocess = True + elif network == 'net3': + _ratio = (1., ) + elif network == 'net3l': + _ratio = (1., ) + self.landmark_std = 0.2 + elif network == 'net3a': + _ratio = (1., 1.5) + elif network == 'net6': #like pyramidbox or s3fd + fmc = 6 + elif network == 'net5': #retinaface + fmc = 5 + elif network == 'net5a': + fmc = 5 + _ratio = (1., 1.5) + elif network == 'net4': + fmc = 4 + elif network == 'net4a': + fmc = 4 + _ratio = (1., 1.5) + elif network == 'x5': + fmc = 5 + pixel_means = [103.52, 116.28, 123.675] + pixel_stds = [57.375, 57.12, 58.395] + elif network == 'x3': + fmc = 3 + 
pixel_means = [103.52, 116.28, 123.675] + pixel_stds = [57.375, 57.12, 58.395] + elif network == 'x3a': + fmc = 3 + _ratio = (1., 1.5) + pixel_means = [103.52, 116.28, 123.675] + pixel_stds = [57.375, 57.12, 58.395] + else: + assert False, 'network setting error %s' % network + + if fmc == 3: + self._feat_stride_fpn = [32, 16, 8] + self.anchor_cfg = { + '32': { + 'SCALES': (32, 16), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '16': { + 'SCALES': (8, 4), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '8': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + } + elif fmc == 4: + self._feat_stride_fpn = [32, 16, 8, 4] + self.anchor_cfg = { + '32': { + 'SCALES': (32, 16), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '16': { + 'SCALES': (8, 4), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '8': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '4': { + 'SCALES': (2, 1), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + } + elif fmc == 6: + self._feat_stride_fpn = [128, 64, 32, 16, 8, 4] + self.anchor_cfg = { + '128': { + 'SCALES': (32, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '64': { + 'SCALES': (16, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '32': { + 'SCALES': (8, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '16': { + 'SCALES': (4, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '8': { + 'SCALES': (2, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + '4': { + 'SCALES': (1, ), + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + }, + } + elif fmc == 5: + self._feat_stride_fpn = [64, 32, 16, 8, 4] + self.anchor_cfg = {} + _ass = 2.0**(1.0 / 3) + _basescale = 1.0 + for _stride in [4, 8, 16, 32, 64]: + key = 
str(_stride) + value = { + 'BASE_SIZE': 16, + 'RATIOS': _ratio, + 'ALLOWED_BORDER': 9999 + } + scales = [] + for _ in range(3): + scales.append(_basescale) + _basescale *= _ass + value['SCALES'] = tuple(scales) + self.anchor_cfg[key] = value + + #print(self._feat_stride_fpn, self.anchor_cfg) + + for s in self._feat_stride_fpn: + self.fpn_keys.append('stride%s' % s) + + dense_anchor = False + #self._anchors_fpn = dict(zip(self.fpn_keys, generate_anchors_fpn(base_size=fpn_base_size, scales=self._scales, ratios=self._ratios))) + self._anchors_fpn = dict( + zip( + self.fpn_keys, + generate_anchors_fpn(dense_anchor=dense_anchor, + cfg=self.anchor_cfg))) + for k in self._anchors_fpn: + v = self._anchors_fpn[k].astype(np.float32) + self._anchors_fpn[k] = v + + self._num_anchors = dict( + zip(self.fpn_keys, + [anchors.shape[0] for anchors in self._anchors_fpn.values()])) + #self._bbox_pred = nonlinear_pred + #self._landmark_pred = landmark_pred + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + if self.ctx_id >= 0: + self.ctx = mx.gpu(self.ctx_id) + self.nms = gpu_nms_wrapper(self.nms_threshold, self.ctx_id) + else: + self.ctx = mx.cpu() + self.nms = cpu_nms_wrapper(self.nms_threshold) + self.pixel_means = np.array(pixel_means, dtype=np.float32) + self.pixel_stds = np.array(pixel_stds, dtype=np.float32) + self.pixel_scale = float(pixel_scale) + #print('means', self.pixel_means) + self.use_landmarks = True + #print('use_landmarks', self.use_landmarks) + self.cascade = 0 + + if self.debug: + c = len(sym) // len(self._feat_stride_fpn) + sym = sym[(c * 0):] + self._feat_stride_fpn = [32, 16, 8] + #print('sym size:', len(sym)) + + image_size = (640, 640) + self.model = mx.mod.Module(symbol=sym, + context=self.ctx, + label_names=None) + self.model.bind(data_shapes=[('data', (1, 3, image_size[0], + image_size[1]))], + for_training=False) + self.model.set_params(arg_params, aux_params) + + def get_input(self, img): + im = img.astype(np.float32) + im_tensor = 
np.zeros((1, 3, im.shape[0], im.shape[1])) + for i in range(3): + im_tensor[ + 0, + i, :, :] = (im[:, :, 2 - i] / self.pixel_scale - + self.pixel_means[2 - i]) / self.pixel_stds[2 - i] + #if self.debug: + # timeb = datetime.datetime.now() + # diff = timeb - timea + # print('X2 uses', diff.total_seconds(), 'seconds') + data = nd.array(im_tensor) + return data + + def detect(self, img, threshold=0.5, scales=[1.0], do_flip=False): + #print('in_detect', threshold, scales, do_flip, do_nms) + proposals_list = [] + scores_list = [] + mask_scores_list = [] + landmarks_list = [] + strides_list = [] + timea = datetime.datetime.now() + flips = [0] + if do_flip: + flips = [0, 1] + + imgs = [img] + if isinstance(img, list): + imgs = img + for img in imgs: + for im_scale in scales: + for flip in flips: + if im_scale != 1.0: + im = cv2.resize(img, + None, + None, + fx=im_scale, + fy=im_scale, + interpolation=cv2.INTER_LINEAR) + else: + im = img.copy() + if flip: + im = im[:, ::-1, :] + if self.nocrop: + if im.shape[0] % 32 == 0: + h = im.shape[0] + else: + h = (im.shape[0] // 32 + 1) * 32 + if im.shape[1] % 32 == 0: + w = im.shape[1] + else: + w = (im.shape[1] // 32 + 1) * 32 + _im = np.zeros((h, w, 3), dtype=np.float32) + _im[0:im.shape[0], 0:im.shape[1], :] = im + im = _im + else: + im = im.astype(np.float32) + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('X1 uses', diff.total_seconds(), 'seconds') + #self.model.bind(data_shapes=[('data', (1, 3, image_size[0], image_size[1]))], for_training=False) + #im_info = [im.shape[0], im.shape[1], im_scale] + im_info = [im.shape[0], im.shape[1]] + im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1])) + for i in range(3): + im_tensor[0, i, :, :] = ( + im[:, :, 2 - i] / self.pixel_scale - + self.pixel_means[2 - i]) / self.pixel_stds[2 - i] + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('X2 uses', diff.total_seconds(), 'seconds') + data = nd.array(im_tensor) + db = 
mx.io.DataBatch(data=(data, ), + provide_data=[('data', data.shape)]) + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('X3 uses', diff.total_seconds(), 'seconds') + self.model.forward(db, is_train=False) + net_out = self.model.get_outputs() + #post_nms_topN = self._rpn_post_nms_top_n + #min_size_dict = self._rpn_min_size_fpn + + sym_idx = 0 + + for _idx, s in enumerate(self._feat_stride_fpn): + #if len(scales)>1 and s==32 and im_scale==scales[-1]: + # continue + _key = 'stride%s' % s + stride = int(s) + is_cascade = False + #if self.vote and stride==4 and len(scales)>2 and (im_scale==scales[0]): + # continue + #print('getting', im_scale, stride, idx, len(net_out), data.shape, file=sys.stderr) + scores = net_out[sym_idx].asnumpy() + type_scores = net_out[sym_idx + 3].asnumpy() + print(scores.shape, type_scores.shape) + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('A uses', diff.total_seconds(), 'seconds') + A = self._num_anchors['stride%s' % s] + #print(scores.shape) + #print('scores',stride, scores.shape, file=sys.stderr) + scores = scores[:, A:, :, :] + mask_scores = type_scores[:, A * 2:, :, :] #x, A, x, x + + bbox_deltas = net_out[sym_idx + 1].asnumpy() + + #if DEBUG: + # print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) + # print 'scale: {}'.format(im_info[2]) + + #_height, _width = int(im_info[0] / stride), int(im_info[1] / stride) + height, width = bbox_deltas.shape[ + 2], bbox_deltas.shape[3] + + K = height * width + anchors_fpn = self._anchors_fpn['stride%s' % s] + anchors = anchors_plane(height, width, stride, + anchors_fpn) + #print((height, width), (_height, _width), anchors.shape, bbox_deltas.shape, scores.shape, file=sys.stderr) + anchors = anchors.reshape((K * A, 4)) + #print('num_anchors', self._num_anchors['stride%s'%s], file=sys.stderr) + #print('HW', (height, width), file=sys.stderr) + #print('anchors_fpn', anchors_fpn.shape, file=sys.stderr) + #print('anchors', 
anchors.shape, file=sys.stderr) + #print('bbox_deltas', bbox_deltas.shape, file=sys.stderr) + #print('scores', scores.shape, file=sys.stderr) + + #scores = self._clip_pad(scores, (height, width)) + scores = scores.transpose((0, 2, 3, 1)).reshape( + (-1, 1)) + mask_scores = mask_scores.transpose( + (0, 2, 3, 1)).reshape((-1, 1)) + + #print('pre', bbox_deltas.shape, height, width) + #bbox_deltas = self._clip_pad(bbox_deltas, (height, width)) + #print('after', bbox_deltas.shape, height, width) + bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)) + bbox_pred_len = bbox_deltas.shape[3] // A + #print(bbox_deltas.shape) + bbox_deltas = bbox_deltas.reshape((-1, bbox_pred_len)) + bbox_deltas[:, + 0::4] = bbox_deltas[:, 0:: + 4] * self.bbox_stds[0] + bbox_deltas[:, + 1::4] = bbox_deltas[:, 1:: + 4] * self.bbox_stds[1] + bbox_deltas[:, + 2::4] = bbox_deltas[:, 2:: + 4] * self.bbox_stds[2] + bbox_deltas[:, + 3::4] = bbox_deltas[:, 3:: + 4] * self.bbox_stds[3] + proposals = self.bbox_pred(anchors, bbox_deltas) + + proposals = clip_boxes(proposals, im_info[:2]) + + #if self.vote: + # if im_scale>1.0: + # keep = self._filter_boxes2(proposals, 160*im_scale, -1) + # else: + # keep = self._filter_boxes2(proposals, -1, 100*im_scale) + # if stride==4: + # keep = self._filter_boxes2(proposals, 12*im_scale, -1) + # proposals = proposals[keep, :] + # scores = scores[keep] + + #keep = self._filter_boxes(proposals, min_size_dict['stride%s'%s] * im_info[2]) + #proposals = proposals[keep, :] + #scores = scores[keep] + #print('333', proposals.shape) + if stride == 4 and self.decay4 < 1.0: + scores *= self.decay4 + + scores_ravel = scores.ravel() + #mask_scores_ravel = mask_scores.ravel() + #print('__shapes', proposals.shape, scores_ravel.shape) + #print('max score', np.max(scores_ravel)) + order = np.where(scores_ravel >= threshold)[0] + #_scores = scores_ravel[order] + #_order = _scores.argsort()[::-1] + #order = order[_order] + proposals = proposals[order, :] + scores = scores[order] + 
mask_scores = mask_scores[order] + if flip: + oldx1 = proposals[:, 0].copy() + oldx2 = proposals[:, 2].copy() + proposals[:, 0] = im.shape[1] - oldx2 - 1 + proposals[:, 2] = im.shape[1] - oldx1 - 1 + + proposals[:, 0:4] /= im_scale + + proposals_list.append(proposals) + scores_list.append(scores) + mask_scores_list.append(mask_scores) + + landmark_deltas = net_out[sym_idx + 2].asnumpy() + #landmark_deltas = self._clip_pad(landmark_deltas, (height, width)) + landmark_pred_len = landmark_deltas.shape[1] // A + landmark_deltas = landmark_deltas.transpose( + (0, 2, 3, 1)).reshape( + (-1, 5, landmark_pred_len // 5)) + landmark_deltas *= self.landmark_std + #print(landmark_deltas.shape, landmark_deltas) + landmarks = self.landmark_pred(anchors, + landmark_deltas) + landmarks = landmarks[order, :] + + if flip: + landmarks[:, :, + 0] = im.shape[1] - landmarks[:, :, 0] - 1 + #for a in range(5): + # oldx1 = landmarks[:, a].copy() + # landmarks[:,a] = im.shape[1] - oldx1 - 1 + order = [1, 0, 2, 4, 3] + flandmarks = landmarks.copy() + for idx, a in enumerate(order): + flandmarks[:, idx, :] = landmarks[:, a, :] + #flandmarks[:, idx*2] = landmarks[:,a*2] + #flandmarks[:, idx*2+1] = landmarks[:,a*2+1] + landmarks = flandmarks + landmarks[:, :, 0:2] /= im_scale + #landmarks /= im_scale + #landmarks = landmarks.reshape( (-1, landmark_pred_len) ) + landmarks_list.append(landmarks) + #proposals = np.hstack((proposals, landmarks)) + sym_idx += 4 + + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('B uses', diff.total_seconds(), 'seconds') + proposals = np.vstack(proposals_list) + landmarks = None + if proposals.shape[0] == 0: + landmarks = np.zeros((0, 5, 2)) + return np.zeros((0, 6)), landmarks + scores = np.vstack(scores_list) + mask_scores = np.vstack(mask_scores_list) + #print('shapes', proposals.shape, scores.shape) + scores_ravel = scores.ravel() + order = scores_ravel.argsort()[::-1] + #if config.TEST.SCORE_THRESH>0.0: + # _count = 
np.sum(scores_ravel>config.TEST.SCORE_THRESH) + # order = order[:_count] + proposals = proposals[order, :] + scores = scores[order] + mask_scores = mask_scores[order] + landmarks = np.vstack(landmarks_list) + landmarks = landmarks[order].astype(np.float32, copy=False) + + pre_det = np.hstack((proposals[:, 0:4], scores)).astype(np.float32, + copy=False) + keep = self.nms(pre_det) + det = np.hstack((pre_det, mask_scores)) + det = det[keep, :] + landmarks = landmarks[keep] + + if self.debug: + timeb = datetime.datetime.now() + diff = timeb - timea + print('C uses', diff.total_seconds(), 'seconds') + return det, landmarks + + def detect_center(self, img, threshold=0.5, scales=[1.0], do_flip=False): + det, landmarks = self.detect(img, threshold, scales, do_flip) + if det.shape[0] == 0: + return None, None + bindex = 0 + if det.shape[0] > 1: + img_size = np.asarray(img.shape)[0:2] + bounding_box_size = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img_size / 2 + offsets = np.vstack([(det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0]]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + bindex = np.argmax(bounding_box_size - offset_dist_squared * + 2.0) # some extra weight on the centering + bbox = det[bindex, :] + landmark = landmarks[bindex, :, :] + return bbox, landmark + + @staticmethod + def check_large_pose(landmark, bbox): + assert landmark.shape == (5, 2) + assert len(bbox) == 4 + + def get_theta(base, x, y): + vx = x - base + vy = y - base + vx[1] *= -1 + vy[1] *= -1 + tx = np.arctan2(vx[1], vx[0]) + ty = np.arctan2(vy[1], vy[0]) + d = ty - tx + d = np.degrees(d) + #print(vx, tx, vy, ty, d) + #if d<-1.*math.pi: + # d+=2*math.pi + #elif d>math.pi: + # d-=2*math.pi + if d < -180.0: + d += 360. 
+ elif d > 180.0: + d -= 360.0 + return d + + landmark = landmark.astype(np.float32) + + theta1 = get_theta(landmark[0], landmark[3], landmark[2]) + theta2 = get_theta(landmark[1], landmark[2], landmark[4]) + #print(va, vb, theta2) + theta3 = get_theta(landmark[0], landmark[2], landmark[1]) + theta4 = get_theta(landmark[1], landmark[0], landmark[2]) + theta5 = get_theta(landmark[3], landmark[4], landmark[2]) + theta6 = get_theta(landmark[4], landmark[2], landmark[3]) + theta7 = get_theta(landmark[3], landmark[2], landmark[0]) + theta8 = get_theta(landmark[4], landmark[1], landmark[2]) + #print(theta1, theta2, theta3, theta4, theta5, theta6, theta7, theta8) + left_score = 0.0 + right_score = 0.0 + up_score = 0.0 + down_score = 0.0 + if theta1 <= 0.0: + left_score = 10.0 + elif theta2 <= 0.0: + right_score = 10.0 + else: + left_score = theta2 / theta1 + right_score = theta1 / theta2 + if theta3 <= 10.0 or theta4 <= 10.0: + up_score = 10.0 + else: + up_score = max(theta1 / theta3, theta2 / theta4) + if theta5 <= 10.0 or theta6 <= 10.0: + down_score = 10.0 + else: + down_score = max(theta7 / theta5, theta8 / theta6) + mleft = (landmark[0][0] + landmark[3][0]) / 2 + mright = (landmark[1][0] + landmark[4][0]) / 2 + box_center = ((bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2) + ret = 0 + if left_score >= 3.0: + ret = 1 + if ret == 0 and left_score >= 2.0: + if mright <= box_center[0]: + ret = 1 + if ret == 0 and right_score >= 3.0: + ret = 2 + if ret == 0 and right_score >= 2.0: + if mleft >= box_center[0]: + ret = 2 + if ret == 0 and up_score >= 2.0: + ret = 3 + if ret == 0 and down_score >= 5.0: + ret = 4 + return ret, left_score, right_score, up_score, down_score + + @staticmethod + def _filter_boxes(boxes, min_size): + """ Remove all boxes with any side smaller than min_size """ + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + keep = np.where((ws >= min_size) & (hs >= min_size))[0] + return keep + + @staticmethod + def 
_filter_boxes2(boxes, max_size, min_size): + """ Remove all boxes with any side smaller than min_size """ + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + if max_size > 0: + keep = np.where(np.minimum(ws, hs) < max_size)[0] + elif min_size > 0: + keep = np.where(np.maximum(ws, hs) > min_size)[0] + return keep + + @staticmethod + def _clip_pad(tensor, pad_shape): + """ + Clip boxes of the pad area. + :param tensor: [n, c, H, W] + :param pad_shape: [h, w] + :return: [n, c, h, w] + """ + H, W = tensor.shape[2:] + h, w = pad_shape + + if h < H or w < W: + tensor = tensor[:, :, :h, :w].copy() + + return tensor + + @staticmethod + def bbox_pred(boxes, box_deltas): + """ + Transform the set of class-agnostic boxes into class-specific boxes + by applying the predicted offsets (box_deltas) + :param boxes: !important [N 4] + :param box_deltas: [N, 4 * num_classes] + :return: [N 4 * num_classes] + """ + if boxes.shape[0] == 0: + return np.zeros((0, box_deltas.shape[1])) + + boxes = boxes.astype(np.float, copy=False) + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) + ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) + + dx = box_deltas[:, 0:1] + dy = box_deltas[:, 1:2] + dw = box_deltas[:, 2:3] + dh = box_deltas[:, 3:4] + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(box_deltas.shape) + # x1 + pred_boxes[:, 0:1] = pred_ctr_x - 0.5 * (pred_w - 1.0) + # y1 + pred_boxes[:, 1:2] = pred_ctr_y - 0.5 * (pred_h - 1.0) + # x2 + pred_boxes[:, 2:3] = pred_ctr_x + 0.5 * (pred_w - 1.0) + # y2 + pred_boxes[:, 3:4] = pred_ctr_y + 0.5 * (pred_h - 1.0) + + if box_deltas.shape[1] > 4: + pred_boxes[:, 4:] = box_deltas[:, 4:] + + return pred_boxes + + @staticmethod + def landmark_pred(boxes, 
landmark_deltas): + if boxes.shape[0] == 0: + return np.zeros((0, landmark_deltas.shape[1])) + boxes = boxes.astype(np.float, copy=False) + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) + ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) + pred = landmark_deltas.copy() + for i in range(5): + pred[:, i, 0] = landmark_deltas[:, i, 0] * widths + ctr_x + pred[:, i, 1] = landmark_deltas[:, i, 1] * heights + ctr_y + return pred + #preds = [] + #for i in range(landmark_deltas.shape[1]): + # if i%2==0: + # pred = (landmark_deltas[:,i]*widths + ctr_x) + # else: + # pred = (landmark_deltas[:,i]*heights + ctr_y) + # preds.append(pred) + #preds = np.vstack(preds).transpose() + #return preds + + def vote(self, det): + #order = det[:, 4].ravel().argsort()[::-1] + #det = det[order, :] + if det.shape[0] == 0: + return np.zeros((0, 5)) + #dets = np.array([[10, 10, 20, 20, 0.002]]) + #det = np.empty(shape=[0, 5]) + dets = None + while det.shape[0] > 0: + if dets is not None and dets.shape[0] >= 750: + break + # IOU + area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1) + xx1 = np.maximum(det[0, 0], det[:, 0]) + yy1 = np.maximum(det[0, 1], det[:, 1]) + xx2 = np.minimum(det[0, 2], det[:, 2]) + yy2 = np.minimum(det[0, 3], det[:, 3]) + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + o = inter / (area[0] + area[:] - inter) + + # nms + merge_index = np.where(o >= self.nms_threshold)[0] + det_accu = det[merge_index, :] + det = np.delete(det, merge_index, 0) + if merge_index.shape[0] <= 1: + if det.shape[0] == 0: + try: + dets = np.row_stack((dets, det_accu)) + except: + dets = det_accu + continue + det_accu[:, + 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], + (1, 4)) + max_score = np.max(det_accu[:, 4]) + det_accu_sum = np.zeros((1, 5)) + det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], axis=0) / np.sum( + det_accu[:, -1:]) + det_accu_sum[:, 4] = 
max_score + if dets is None: + dets = det_accu_sum + else: + dets = np.row_stack((dets, det_accu_sum)) + dets = dets[0:750, :] + return dets diff --git a/insightface/detection/retinaface_anticov/test.py b/insightface/detection/retinaface_anticov/test.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c282b6db3cf699ab11e416b23c13c05bd0745b --- /dev/null +++ b/insightface/detection/retinaface_anticov/test.py @@ -0,0 +1,66 @@ +import cv2 +import sys +import numpy as np +import datetime +import os +import glob +from retinaface_cov import RetinaFaceCoV + +thresh = 0.8 +mask_thresh = 0.2 +scales = [640, 1080] + +count = 1 + +gpuid = 0 +#detector = RetinaFaceCoV('./model/mnet_cov1', 0, gpuid, 'net3') +detector = RetinaFaceCoV('./model/mnet_cov2', 0, gpuid, 'net3l') + +img = cv2.imread('n1.jpg') +print(img.shape) +im_shape = img.shape +target_size = scales[0] +max_size = scales[1] +im_size_min = np.min(im_shape[0:2]) +im_size_max = np.max(im_shape[0:2]) +#im_scale = 1.0 +#if im_size_min>target_size or im_size_max>max_size: +im_scale = float(target_size) / float(im_size_min) +# prevent bigger axis from being more than max_size: +if np.round(im_scale * im_size_max) > max_size: + im_scale = float(max_size) / float(im_size_max) + +print('im_scale', im_scale) + +scales = [im_scale] +flip = False + +for c in range(count): + faces, landmarks = detector.detect(img, + thresh, + scales=scales, + do_flip=flip) + +if faces is not None: + print('find', faces.shape[0], 'faces') + for i in range(faces.shape[0]): + #print('score', faces[i][4]) + face = faces[i] + box = face[0:4].astype(np.int) + mask = face[5] + print(i, box, mask) + #color = (255,0,0) + if mask >= mask_thresh: + color = (0, 0, 255) + else: + color = (0, 255, 0) + cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), color, 2) + landmark5 = landmarks[i].astype(np.int) + #print(landmark.shape) + for l in range(landmark5.shape[0]): + color = (255, 0, 0) + cv2.circle(img, (landmark5[l][0], 
landmark5[l][1]), 1, color, 2) + + filename = './cov_test.jpg' + print('writing', filename) + cv2.imwrite(filename, img) diff --git a/insightface/detection/scrfd/LICENSE b/insightface/detection/scrfd/LICENSE new file mode 100755 index 0000000000000000000000000000000000000000..04adf5cbc620ad190547b092fa449e36df5f7bf4 --- /dev/null +++ b/insightface/detection/scrfd/LICENSE @@ -0,0 +1,203 @@ +Copyright 2018-2019 Open-MMLab. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018-2019 Open-MMLab. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/insightface/detection/scrfd/README.md b/insightface/detection/scrfd/README.md new file mode 100755 index 0000000000000000000000000000000000000000..43c26f8fed521b1190cc7ea83b0303c326806254 --- /dev/null +++ b/insightface/detection/scrfd/README.md @@ -0,0 +1,183 @@ +## Introduction + +SCRFD is an efficient high-accuracy face detection approach which was initially described in [Arxiv](https://arxiv.org/abs/2105.04714), and accepted by ICLR-2022. + +Try out the Gradio Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/hysts/insightface-SCRFD) + +prcurve + +## Performance + +Precision, flops and infer time are all evaluated on **VGA resolution**.
+ +#### ResNet family + +| Method | Backbone | Easy | Medium | Hard | \#Params(M) | \#Flops(G) | Infer(ms) | +| ------------------- | --------------- | ----- | ------ | ----- | ----------- | ---------- | --------- | +| DSFD (CVPR19) | ResNet152 | 94.29 | 91.47 | 71.39 | 120.06 | 259.55 | 55.6 | +| RetinaFace (CVPR20) | ResNet50 | 94.92 | 91.90 | 64.17 | 29.50 | 37.59 | 21.7 | +| HAMBox (CVPR20) | ResNet50 | 95.27 | 93.76 | 76.75 | 30.24 | 43.28 | 25.9 | +| TinaFace (Arxiv20) | ResNet50 | 95.61 | 94.25 | 81.43 | 37.98 | 172.95 | 38.9 | +| - | - | - | - | - | - | - | - | +| ResNet-34GF | ResNet50 | 95.64 | 94.22 | 84.02 | 24.81 | 34.16 | 11.8 | +| **SCRFD-34GF** | Bottleneck Res | 96.06 | 94.92 | 85.29 | 9.80 | 34.13 | 11.7 | +| ResNet-10GF | ResNet34x0.5 | 94.69 | 92.90 | 80.42 | 6.85 | 10.18 | 6.3 | +| **SCRFD-10GF** | Basic Res | 95.16 | 93.87 | 83.05 | 3.86 | 9.98 | 4.9 | +| ResNet-2.5GF | ResNet34x0.25 | 93.21 | 91.11 | 74.47 | 1.62 | 2.57 | 5.4 | +| **SCRFD-2.5GF** | Basic Res | 93.78 | 92.16 | 77.87 | 0.67 | 2.53 | 4.2 | + + +#### Mobile family + +| Method | Backbone | Easy | Medium | Hard | \#Params(M) | \#Flops(G) | Infer(ms) | +| ------------------- | --------------- | ----- | ------ | ----- | ----------- | ---------- | --------- | +| RetinaFace (CVPR20) | MobileNet0.25 | 87.78 | 81.16 | 47.32 | 0.44 | 0.802 | 7.9 | +| FaceBoxes (IJCB17) | - | 76.17 | 57.17 | 24.18 | 1.01 | 0.275 | 2.5 | +| - | - | - | - | - | - | - | - | +| MobileNet-0.5GF | MobileNetx0.25 | 90.38 | 87.05 | 66.68 | 0.37 | 0.507 | 3.7 | +| **SCRFD-0.5GF** | Depth-wise Conv | 90.57 | 88.12 | 68.51 | 0.57 | 0.508 | 3.6 | + + +**X64 CPU Performance of SCRFD-0.5GF:** + +| Test-Input-Size | CPU Single-Thread | Easy | Medium | Hard | +| ----------------------- | ----------------- | ----- | ------ | ----- | +| Original-Size(scale1.0) | - | 90.91 | 89.49 | 82.03 | +| 640x480 | 28.3ms | 90.57 | 88.12 | 68.51 | +| 320x240 | 11.4ms | - | - | - | + +*precision and infer time are evaluated on AMD Ryzen 
9 3950X, using the simple PyTorch CPU inference by setting `OMP_NUM_THREADS=1` (no mkldnn).* + +## Installation + +Please refer to [mmdetection](https://github.com/open-mmlab/mmdetection/blob/master/docs/en/get_started.md#installation) for installation. + + 1. Install [mmcv](https://github.com/open-mmlab/mmcv). (mmcv-full==1.2.6 and 1.3.3 was tested) + 2. Install build requirements and then install mmdet. + ``` + pip install -r requirements/build.txt + pip install -v -e . # or "python setup.py develop" + ``` + +## Data preparation + +### WIDERFace: + 1. Download WIDERFace datasets and put it under `data/retinaface`. + 2. Download annotation files from [gdrive](https://drive.google.com/file/d/1UW3KoApOhusyqSHX96yEDRYiNkd3Iv3Z/view?usp=sharing) and put them under `data/retinaface/` + + ``` + data/retinaface/ + train/ + images/ + labelv2.txt + val/ + images/ + labelv2.txt + gt/ + *.mat + + ``` + + +#### Annotation Format + +*please refer to labelv2.txt for detail* + +For each image: + ``` + # image_width image_height + bbox_x1 bbox_y1 bbox_x2 bbox_y2 (*N) + ... + ... + # image_width image_height + bbox_x1 bbox_y1 bbox_x2 bbox_y2 (*N) + ... + ... + ``` +Keypoints can be ignored if there is bbox annotation only. + + +## Training + +Example training command, with 4 GPUs: +``` +CUDA_VISIBLE_DEVICES="0,1,2,3" PORT=29701 bash ./tools/dist_train.sh ./configs/scrfd/scrfd_1g.py 4 +``` + +## WIDERFace Evaluation + +We use a pure python evaluation script without Matlab. 
+ +``` +GPU=0 +GROUP=scrfd +TASK=scrfd_2.5g +CUDA_VISIBLE_DEVICES="$GPU" python -u tools/test_widerface.py ./configs/"$GROUP"/"$TASK".py ./work_dirs/"$TASK"/model.pth --mode 0 --out wouts +``` + + +## Pretrained-Models + +| Name | Easy | Medium | Hard | FLOPs | Params(M) | Infer(ms) | Link | +| :------------: | ----- | ------ | ----- | ----- | --------- | --------- | ------------------------------------------------------------ | +| SCRFD_500M | 90.57 | 88.12 | 68.51 | 500M | 0.57 | 3.6 | [download](https://1drv.ms/u/s!AswpsDO2toNKqyYWxScdiTITY4TQ?e=DjXof9) | +| SCRFD_1G | 92.38 | 90.57 | 74.80 | 1G | 0.64 | 4.1 | [download](https://1drv.ms/u/s!AswpsDO2toNKqyPVLI44ahNBsOMR?e=esPrBL) | +| SCRFD_2.5G | 93.78 | 92.16 | 77.87 | 2.5G | 0.67 | 4.2 | [download](https://1drv.ms/u/s!AswpsDO2toNKqyTIXnzB1ujPq4th?e=5t1VNv) | +| SCRFD_10G | 95.16 | 93.87 | 83.05 | 10G | 3.86 | 4.9 | [download](https://1drv.ms/u/s!AswpsDO2toNKqyUKwTiwXv2kaa8o?e=umfepO) | +| SCRFD_34G | 96.06 | 94.92 | 85.29 | 34G | 9.80 | 11.7 | [download](https://1drv.ms/u/s!AswpsDO2toNKqyKZwFebVlmlOvzz?e=V2rqUy) | +| SCRFD_500M_KPS | 90.97 | 88.44 | 69.49 | 500M | 0.57 | 3.6 | [download](https://1drv.ms/u/s!AswpsDO2toNKri_NDM0GIkPpkE2f?e=JkebJo) | +| SCRFD_2.5G_KPS | 93.80 | 92.02 | 77.13 | 2.5G | 0.82 | 4.3 | [download](https://1drv.ms/u/s!AswpsDO2toNKqyGlhxnCg3smyQqX?e=A6Hufm) | +| SCRFD_10G_KPS | 95.40 | 94.01 | 82.80 | 10G | 4.23 | 5.0 | [download](https://1drv.ms/u/s!AswpsDO2toNKqycsF19UbaCWaLWx?e=F6i5Vm) | + +mAP, FLOPs and inference latency are all evaluated on VGA resolution. +``_KPS`` means the model includes 5 keypoints prediction. + +## Convert to ONNX + +Please refer to `tools/scrfd2onnx.py` + +Generated onnx model can accept dynamic input as default. + +You can also set specific input shape by pass ``--shape 640 640``, then output onnx model can be optimized by onnx-simplifier. + + +## Inference + +Please refer to `tools/scrfd.py` which uses onnxruntime to do inference. 
+ +## Network Search + +For the two-step search as described in the paper, we target hard-set mAP when selecting the best candidate models. + +We provide an example for searching SCRFD-2.5GF in this repo as below. + +1. For searching backbones: + + ``` + python search_tools/generate_configs_2.5g.py --mode 1 + ``` + Where ``mode==1`` means searching the backbone only. For other parameters, please check the code. +2. After step 1 is done, there will be ``configs/scrfdgen2.5g/scrfdgen2.5g_1.py`` to ``configs/scrfdgen2.5g/scrfdgen2.5g_64.py`` if ``num_configs`` is set to 64. +3. Do training for every generated config for 80 epochs, please check ``search_tools/search_train.sh`` +4. Test WIDERFace precision for every generated config, using ``search_tools/search_test.sh``. +5. Select the most accurate config as the base template (assume the 10-th config is the best), then do the overall network search. + ``` + python search_tools/generate_configs_2.5g.py --mode 2 --template 10 + ``` +6. Test these newly generated configs again and select the most accurate one(s). + + +## Acknowledgments + +We thank [nihui](https://github.com/nihui) for the excellent [mobile-phone demo](https://github.com/nihui/ncnn-android-scrfd). + +## Demo + +1. [ncnn-android-scrfd](https://github.com/nihui/ncnn-android-scrfd) +2. [scrfd-MNN C++](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/mnn/cv/mnn_scrfd.cpp) +3. [scrfd-TNN C++](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/tnn/cv/tnn_scrfd.cpp) +4. [scrfd-NCNN C++](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/ncnn/cv/ncnn_scrfd.cpp) +5. [scrfd-ONNXRuntime C++](https://github.com/DefTruth/lite.ai.toolkit/blob/main/lite/ort/cv/scrfd.cpp) +6. [TensorRT Python](https://github.com/SthPhoenix/InsightFace-REST/blob/master/src/api_trt/modules/model_zoo/detectors/scrfd.py) +7. [Modelscope demo for rotated face](https://modelscope.cn/models/damo/cv_resnet_facedetection_scrfd10gkps/summary) +8.
[Modelscope demo for card detection](https://modelscope.cn/models/damo/cv_resnet_carddetection_scrfd34gkps/summary) diff --git a/insightface/detection/scrfd/configs/_base_/datasets/cityscapes_detection.py b/insightface/detection/scrfd/configs/_base_/datasets/cityscapes_detection.py new file mode 100755 index 0000000000000000000000000000000000000000..156aca02588a96a4e279de2e647864b0739e476d --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/cityscapes_detection.py @@ -0,0 +1,55 @@ +dataset_type = 'CityscapesDataset' +data_root = 'data/cityscapes/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 1024), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=8, + dataset=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_train.json', + img_prefix=data_root + 'leftImg8bit/train/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + img_prefix=data_root + 'leftImg8bit/val/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root 
+ + 'annotations/instancesonly_filtered_gtFine_test.json', + img_prefix=data_root + 'leftImg8bit/test/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='bbox') diff --git a/insightface/detection/scrfd/configs/_base_/datasets/cityscapes_instance.py b/insightface/detection/scrfd/configs/_base_/datasets/cityscapes_instance.py new file mode 100755 index 0000000000000000000000000000000000000000..3c5472aab09acdd5efa2cee206d94824f06058f9 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/cityscapes_instance.py @@ -0,0 +1,55 @@ +dataset_type = 'CityscapesDataset' +data_root = 'data/cityscapes/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', img_scale=[(2048, 800), (2048, 1024)], keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 1024), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=8, + dataset=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_train.json', + img_prefix=data_root + 'leftImg8bit/train/', + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + img_prefix=data_root + 
'leftImg8bit/val/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_test.json', + img_prefix=data_root + 'leftImg8bit/test/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/insightface/detection/scrfd/configs/_base_/datasets/coco_detection.py b/insightface/detection/scrfd/configs/_base_/datasets/coco_detection.py new file mode 100755 index 0000000000000000000000000000000000000000..09a75c404687223c71dcdf0abc7af827f2e498a6 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/coco_detection.py @@ -0,0 +1,48 @@ +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + 
ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='bbox') diff --git a/insightface/detection/scrfd/configs/_base_/datasets/coco_instance.py b/insightface/detection/scrfd/configs/_base_/datasets/coco_instance.py new file mode 100755 index 0000000000000000000000000000000000000000..f6ea4f4562a8118275a444879a884717b55caa15 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,48 @@ +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', 
+ pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/insightface/detection/scrfd/configs/_base_/datasets/coco_instance_semantic.py b/insightface/detection/scrfd/configs/_base_/datasets/coco_instance_semantic.py new file mode 100755 index 0000000000000000000000000000000000000000..f7c072ec92731af85952840128f6527bc799913a --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/coco_instance_semantic.py @@ -0,0 +1,53 @@ +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SegRescale', scale_factor=1 / 8), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + seg_prefix=data_root + 'stuffthingmaps/train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + 
ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/insightface/detection/scrfd/configs/_base_/datasets/deepfashion.py b/insightface/detection/scrfd/configs/_base_/datasets/deepfashion.py new file mode 100755 index 0000000000000000000000000000000000000000..308b4b2ac4d9e3516ba4a57e9d3b6af91e97f24b --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/deepfashion.py @@ -0,0 +1,53 @@ +# dataset settings +dataset_type = 'DeepFashionDataset' +data_root = 'data/DeepFashion/In-shop/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(750, 1101), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(750, 1101), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=1, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json', + img_prefix=data_root + 'Img/', + pipeline=train_pipeline, + data_root=data_root), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/DeepFashion_segmentation_query.json', + img_prefix=data_root + 'Img/', + pipeline=test_pipeline, + data_root=data_root), + test=dict( + type=dataset_type, + 
ann_file=data_root + + 'annotations/DeepFashion_segmentation_gallery.json', + img_prefix=data_root + 'Img/', + pipeline=test_pipeline, + data_root=data_root)) +evaluation = dict(interval=5, metric=['bbox', 'segm']) diff --git a/insightface/detection/scrfd/configs/_base_/datasets/lvis_v0.5_instance.py b/insightface/detection/scrfd/configs/_base_/datasets/lvis_v0.5_instance.py new file mode 100755 index 0000000000000000000000000000000000000000..f3da861d6df05b8da58f361815892a416987a927 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/lvis_v0.5_instance.py @@ -0,0 +1,23 @@ +_base_ = 'coco_instance.py' +dataset_type = 'LVISV05Dataset' +data_root = 'data/lvis_v0.5/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + _delete_=True, + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v0.5_train.json', + img_prefix=data_root + 'train2017/')), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v0.5_val.json', + img_prefix=data_root + 'val2017/'), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v0.5_val.json', + img_prefix=data_root + 'val2017/')) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/insightface/detection/scrfd/configs/_base_/datasets/lvis_v1_instance.py b/insightface/detection/scrfd/configs/_base_/datasets/lvis_v1_instance.py new file mode 100755 index 0000000000000000000000000000000000000000..e8c5d1b14594a6ea38b215635686c04995338ed7 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/lvis_v1_instance.py @@ -0,0 +1,23 @@ +_base_ = 'coco_instance.py' +dataset_type = 'LVISV1Dataset' +data_root = 'data/lvis_v1/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + _delete_=True, + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_train.json', + img_prefix=data_root)), 
+ val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_val.json', + img_prefix=data_root), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/lvis_v1_val.json', + img_prefix=data_root)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/insightface/detection/scrfd/configs/_base_/datasets/retinaface.py b/insightface/detection/scrfd/configs/_base_/datasets/retinaface.py new file mode 100755 index 0000000000000000000000000000000000000000..4c5b3696115727fef9a28f60ad11941faa959cd9 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/retinaface.py @@ -0,0 +1,85 @@ +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = data_root+'train/' +val_root = data_root+'val/' +#img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict(type='LoadAnnotations', with_bbox=True), + #dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + #dict(type='RandomFlip', flip_ratio=0.5), + #dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32), + #dict(type='DefaultFormatBundle'), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_keypoints', 'gt_labels']), + + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict(type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0]), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_keypointss']), + 
#dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore']), +] +test_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict( + # type='MultiScaleFlipAug', + # img_scale=(1333, 800), + # flip=False, + # transforms=[ + # dict(type='Resize', keep_ratio=True), + # dict(type='RandomFlip'), + # dict(type='Normalize', **img_norm_cfg), + # dict(type='Pad', size_divisor=32), + # dict(type='ImageToTensor', keys=['img']), + # dict(type='Collect', keys=['img']), + # ]) + + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1100, 1650), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32, pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=train_root + 'labelv2.txt', + img_prefix=train_root+ 'images/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + ) +#evaluation = dict(interval=1, metric='bbox') +evaluation = dict(interval=10, metric='mAP') diff --git a/insightface/detection/scrfd/configs/_base_/datasets/voc0712.py b/insightface/detection/scrfd/configs/_base_/datasets/voc0712.py new file mode 100755 index 0000000000000000000000000000000000000000..ae09acdd5c9580217815300abbad9f08b71b37ed --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/voc0712.py @@ -0,0 +1,55 @@ +# dataset settings +dataset_type = 'VOCDataset' +data_root = 'data/VOCdevkit/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + 
dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1000, 600), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1000, 600), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file=[ + data_root + 'VOC2007/ImageSets/Main/trainval.txt', + data_root + 'VOC2012/ImageSets/Main/trainval.txt' + ], + img_prefix=[data_root + 'VOC2007/', data_root + 'VOC2012/'], + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', + img_prefix=data_root + 'VOC2007/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt', + img_prefix=data_root + 'VOC2007/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='mAP') diff --git a/insightface/detection/scrfd/configs/_base_/datasets/wider_face.py b/insightface/detection/scrfd/configs/_base_/datasets/wider_face.py new file mode 100755 index 0000000000000000000000000000000000000000..d1d649be42bca2955fb56a784fe80bcc2fdce4e1 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/datasets/wider_face.py @@ -0,0 +1,63 @@ +# dataset settings +dataset_type = 'WIDERFaceDataset' +data_root = 'data/WIDERFace/' +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) 
+train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(300, 300), keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(300, 300), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=60, + workers_per_gpu=2, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + ann_file=data_root + 'train.txt', + img_prefix=data_root + 'WIDER_train/', + min_size=17, + pipeline=train_pipeline)), + val=dict( + type=dataset_type, + ann_file=data_root + 'val.txt', + img_prefix=data_root + 'WIDER_val/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'val.txt', + img_prefix=data_root + 'WIDER_val/', + pipeline=test_pipeline)) diff --git a/insightface/detection/scrfd/configs/_base_/default_runtime.py b/insightface/detection/scrfd/configs/_base_/default_runtime.py new file mode 100755 index 0000000000000000000000000000000000000000..594de8dcc99b9e4fc0208f327a05910a95a1793c --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/default_runtime.py @@ -0,0 +1,14 @@ +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + 
hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/insightface/detection/scrfd/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/insightface/detection/scrfd/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100755 index 0000000000000000000000000000000000000000..f90b78cef38815b004175d94eee023d3b5ef5e25 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,200 @@ +# model settings +model = dict( + type='CascadeRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + 
reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + 
neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5)) diff --git a/insightface/detection/scrfd/configs/_base_/models/cascade_rcnn_r50_fpn.py b/insightface/detection/scrfd/configs/_base_/models/cascade_rcnn_r50_fpn.py new file mode 100755 index 0000000000000000000000000000000000000000..303276b845fecd041d093e240046de08b6016638 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/cascade_rcnn_r50_fpn.py @@ -0,0 +1,183 @@ +# model settings +model = dict( + type='CascadeRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + 
feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ])) +# model training and testing settings 
+train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False) + ]) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) diff --git a/insightface/detection/scrfd/configs/_base_/models/fast_rcnn_r50_fpn.py b/insightface/detection/scrfd/configs/_base_/models/fast_rcnn_r50_fpn.py new file mode 100755 index 0000000000000000000000000000000000000000..b8d9570deeaaf0cf42b0e16619a1dfc22d38ae5d --- /dev/null +++ 
b/insightface/detection/scrfd/configs/_base_/models/fast_rcnn_r50_fpn.py @@ -0,0 +1,62 @@ +# model settings +model = dict( + type='FastRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) diff --git a/insightface/detection/scrfd/configs/_base_/models/faster_rcnn_r50_caffe_c4.py b/insightface/detection/scrfd/configs/_base_/models/faster_rcnn_r50_caffe_c4.py new file mode 100755 index 0000000000000000000000000000000000000000..5a381636382bdd82dc7650e199ef26a3602513e3 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/faster_rcnn_r50_caffe_c4.py @@ -0,0 +1,116 @@ +# model settings 
+norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + pretrained='open-mmlab://detectron2/resnet50_caffe', + backbone=dict( + type='ResNet', + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe'), + rpn_head=dict( + type='RPNHead', + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + shared_head=dict( + type='ResLayer', + depth=50, + stage=3, + stride=2, + dilation=1, + style='caffe', + norm_cfg=norm_cfg, + norm_eval=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=1024, + featmap_strides=[16]), + bbox_head=dict( + type='BBoxHead', + with_avg_pool=True, + roi_feat_size=7, + in_channels=2048, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=12000, + 
nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=6000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) diff --git a/insightface/detection/scrfd/configs/_base_/models/faster_rcnn_r50_caffe_dc5.py b/insightface/detection/scrfd/configs/_base_/models/faster_rcnn_r50_caffe_dc5.py new file mode 100755 index 0000000000000000000000000000000000000000..5b4e4c3d663d84cc124c2389c53a3026dbdc451f --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/faster_rcnn_r50_caffe_dc5.py @@ -0,0 +1,107 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + pretrained='open-mmlab://detectron2/resnet50_caffe', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + strides=(1, 2, 2, 1), + dilations=(1, 1, 1, 2), + out_indices=(3, ), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe'), + rpn_head=dict( + type='RPNHead', + in_channels=2048, + feat_channels=2048, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + 
out_channels=2048, + featmap_strides=[16]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=2048, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=12000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=6000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) diff --git a/insightface/detection/scrfd/configs/_base_/models/faster_rcnn_r50_fpn.py b/insightface/detection/scrfd/configs/_base_/models/faster_rcnn_r50_fpn.py new file mode 100755 index 0000000000000000000000000000000000000000..338a5c6b604d4bfe316ad35ab51d6b997f74ba9e --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/faster_rcnn_r50_fpn.py @@ -0,0 +1,111 @@ +model = dict( + type='FasterRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + 
type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + 
match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) +) diff --git a/insightface/detection/scrfd/configs/_base_/models/mask_rcnn_r50_caffe_c4.py b/insightface/detection/scrfd/configs/_base_/models/mask_rcnn_r50_caffe_c4.py new file mode 100755 index 0000000000000000000000000000000000000000..b9b29b0b99de34caadd1d906b1b9367659524c89 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/mask_rcnn_r50_caffe_c4.py @@ -0,0 +1,127 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='MaskRCNN', + pretrained='open-mmlab://detectron2/resnet50_caffe', + backbone=dict( + type='ResNet', + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe'), + rpn_head=dict( + type='RPNHead', + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + shared_head=dict( + type='ResLayer', + depth=50, + stage=3, + stride=2, + dilation=1, + style='caffe', + norm_cfg=norm_cfg, + norm_eval=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + 
roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=1024, + featmap_strides=[16]), + bbox_head=dict( + type='BBoxHead', + with_avg_pool=True, + roi_feat_size=7, + in_channels=2048, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=None, + mask_head=dict( + type='FCNMaskHead', + num_convs=0, + in_channels=2048, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=12000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=14, + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=6000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5)) diff --git a/insightface/detection/scrfd/configs/_base_/models/mask_rcnn_r50_fpn.py b/insightface/detection/scrfd/configs/_base_/models/mask_rcnn_r50_fpn.py new 
file mode 100755 index 0000000000000000000000000000000000000000..4472bd0a80d7426278cbb05ab4be9bf411eaef0f --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,124 @@ +# model settings +model = dict( + type='MaskRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + 
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5)) diff --git a/insightface/detection/scrfd/configs/_base_/models/retinanet_r50_fpn.py b/insightface/detection/scrfd/configs/_base_/models/retinanet_r50_fpn.py new file mode 100755 index 0000000000000000000000000000000000000000..a08b14f60992a8a5c00c668b37eb9a4dbf0ac7a3 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/retinanet_r50_fpn.py @@ -0,0 +1,60 @@ +# model settings +model = dict( + type='RetinaNet', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + bbox_head=dict( + 
type='RetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))) +# training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) diff --git a/insightface/detection/scrfd/configs/_base_/models/rpn_r50_caffe_c4.py b/insightface/detection/scrfd/configs/_base_/models/rpn_r50_caffe_c4.py new file mode 100755 index 0000000000000000000000000000000000000000..bd5d665e0331711adfb2cb3eeea113ed4762e5db --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/rpn_r50_caffe_c4.py @@ -0,0 +1,58 @@ +# model settings +model = dict( + type='RPN', + pretrained='open-mmlab://detectron2/resnet50_caffe', + backbone=dict( + type='ResNet', + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=None, + rpn_head=dict( + type='RPNHead', + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + 
loss_bbox=dict(type='L1Loss', loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=12000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0)) diff --git a/insightface/detection/scrfd/configs/_base_/models/rpn_r50_fpn.py b/insightface/detection/scrfd/configs/_base_/models/rpn_r50_fpn.py new file mode 100755 index 0000000000000000000000000000000000000000..13e96191deb243d1f625d99ac85bf17503f1f8a8 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/rpn_r50_fpn.py @@ -0,0 +1,60 @@ +# model settings +model = dict( + type='RPN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, 
+ neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0)) diff --git a/insightface/detection/scrfd/configs/_base_/models/ssd300.py b/insightface/detection/scrfd/configs/_base_/models/ssd300.py new file mode 100755 index 0000000000000000000000000000000000000000..ee7cf3adc8aaced804031196c3901f90b0b0d140 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/models/ssd300.py @@ -0,0 +1,49 @@ +# model settings +input_size = 300 +model = dict( + type='SingleStageDetector', + pretrained='open-mmlab://vgg16_caffe', + backbone=dict( + type='SSDVGG', + input_size=input_size, + depth=16, + with_last_pool=False, + ceil_mode=True, + out_indices=(3, 4), + out_feature_indices=(22, 34), + l2_norm_scale=20), + neck=None, + bbox_head=dict( + type='SSDHead', + in_channels=(512, 1024, 512, 256, 256, 256), + num_classes=80, + anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + input_size=input_size, + basesize_ratio_range=(0.15, 0.9), + strides=[8, 16, 32, 64, 100, 300], + ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]))) +cudnn_benchmark = True +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0., + ignore_iof_thr=-1, + gt_max_assign_all=False), + smoothl1_beta=1., + allowed_border=-1, + pos_weight=-1, + neg_pos_ratio=3, + debug=False) +test_cfg = dict( + nms=dict(type='nms', iou_threshold=0.45), + min_bbox_size=0, + score_thr=0.02, + max_per_img=200) diff --git a/insightface/detection/scrfd/configs/_base_/schedules/schedule_1x.py b/insightface/detection/scrfd/configs/_base_/schedules/schedule_1x.py new file mode 100755 index 
0000000000000000000000000000000000000000..12694c87aa0a9fedd9badd4aff2b23280403f15f --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/schedules/schedule_1x.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 11]) +total_epochs = 12 diff --git a/insightface/detection/scrfd/configs/_base_/schedules/schedule_20e.py b/insightface/detection/scrfd/configs/_base_/schedules/schedule_20e.py new file mode 100755 index 0000000000000000000000000000000000000000..0559030c24ed097d86918bbd589a6a12f8dd8bd5 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/schedules/schedule_20e.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[16, 19]) +total_epochs = 20 diff --git a/insightface/detection/scrfd/configs/_base_/schedules/schedule_2x.py b/insightface/detection/scrfd/configs/_base_/schedules/schedule_2x.py new file mode 100755 index 0000000000000000000000000000000000000000..e34095ff2b5ffdb1f9ba07380a6948504715e3d8 --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/schedules/schedule_2x.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[16, 22]) +total_epochs = 24 diff --git a/insightface/detection/scrfd/configs/_base_/schedules/schedule_retinaface_sgd.py b/insightface/detection/scrfd/configs/_base_/schedules/schedule_retinaface_sgd.py new file mode 100755 index 
0000000000000000000000000000000000000000..8512bc5d82a2808bf54c12b7dabc8925f3a3b07e --- /dev/null +++ b/insightface/detection/scrfd/configs/_base_/schedules/schedule_retinaface_sgd.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[55, 68]) +total_epochs = 80 diff --git a/insightface/detection/scrfd/configs/scrfd/base_1.3g.py b/insightface/detection/scrfd/configs/scrfd/base_1.3g.py new file mode 100755 index 0000000000000000000000000000000000000000..9a28b937c6e4665fdb49642c52d1698def99124f --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/base_1.3g.py @@ -0,0 +1,185 @@ +_base_ = [ + #'../_base_/datasets/retinaface.py', + '../_base_/schedules/schedule_retinaface_sgd.py', '../_base_/default_runtime.py' +] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = data_root+'train/' +val_root = data_root+'val/' +#img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict(type='LoadAnnotations', with_bbox=True), + #dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + #dict(type='RandomFlip', flip_ratio=0.5), + #dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32), + #dict(type='DefaultFormatBundle'), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_keypoints', 'gt_labels']), + + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + #dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0]), + dict(type='Resize', 
img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_keypointss']), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore']), +] +test_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict( + # type='MultiScaleFlipAug', + # img_scale=(1333, 800), + # flip=False, + # transforms=[ + # dict(type='Resize', keep_ratio=True), + # dict(type='RandomFlip'), + # dict(type='Normalize', **img_norm_cfg), + # dict(type='Pad', size_divisor=32), + # dict(type='ImageToTensor', keys=['img']), + # dict(type='Collect', keys=['img']), + # ]) + + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + #img_scale=(1100, 1650), + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32, pad_val=0), + dict(type='Pad', size=(640,640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=train_root + 'labelv2.txt', + #ann_file=train_root + 'label_wo.txt', + img_prefix=train_root+ 'images/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + ) +model = dict( + type='SCRFD', + #pretrained='torchvision://resnet50', + backbone=dict( + type='ResNetV1e', + depth=18, + base_channels=16, + 
num_stages=4, + out_indices=(0, 1, 2, 3), + #frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + #norm_eval=True, + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + #in_channels=[64, 128, 256, 512], + in_channels=[16, 32, 64, 128], + out_channels=32, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=32, + stacked_convs=2, + feat_channels=64, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share = True, + strides_share = True, + scale_mode = 2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + #loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + loss_dfl=False, + reg_max=8, + #loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps = False, + loss_kps=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), + ) + ) +# training and testing settings +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + #assigner=dict( + # type='MaxIoUAssigner', + # pos_iou_thr=0.5, + # neg_iou_thr=0.3, + # min_pos_iou=0.5, + # ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + #nms_pre=1000, + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +epoch_multi = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*epoch_multi, 68*epoch_multi]) +total_epochs = 80*epoch_multi +#checkpoint_config = dict(interval=1) +checkpoint_config = dict(interval=80) +log_config = dict( + interval=100, + 
hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=80, metric='mAP') + + + + diff --git a/insightface/detection/scrfd/configs/scrfd/base_10g.py b/insightface/detection/scrfd/configs/scrfd/base_10g.py new file mode 100755 index 0000000000000000000000000000000000000000..c857c98fc19174641db72bbf04f49fc9cd706ef0 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/base_10g.py @@ -0,0 +1,191 @@ +_base_ = [ + #'../_base_/datasets/retinaface.py', + '../_base_/schedules/schedule_retinaface_sgd.py', '../_base_/default_runtime.py' +] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = data_root+'train/' +val_root = data_root+'val/' +#img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict(type='LoadAnnotations', with_bbox=True), + #dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + #dict(type='RandomFlip', flip_ratio=0.5), + #dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32), + #dict(type='DefaultFormatBundle'), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_keypoints', 'gt_labels']), + + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict(type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_keypointss']), + 
#dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore']), +] +test_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict( + # type='MultiScaleFlipAug', + # img_scale=(1333, 800), + # flip=False, + # transforms=[ + # dict(type='Resize', keep_ratio=True), + # dict(type='RandomFlip'), + # dict(type='Normalize', **img_norm_cfg), + # dict(type='Pad', size_divisor=32), + # dict(type='ImageToTensor', keys=['img']), + # dict(type='Collect', keys=['img']), + # ]) + + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + #img_scale=(1100, 1650), + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32, pad_val=0), + dict(type='Pad', size=(640,640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=train_root + 'labelv2.txt', + #ann_file=train_root + 'label_wo.txt', + img_prefix=train_root+ 'images/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + ) + + +model = dict( + type='SCRFD', + #pretrained='torchvision://resnet50', + backbone=dict( + type='ResNetV1e', + #depth=0, + #block_cfg=dict( + # block='BasicBlock', + # stage_blocks=(3, 4, 2, 3), + # stage_planes=[24, 40, 40, 96]), + #base_channels=24, + depth=34, + base_channels=32, + num_stages=4, + out_indices=(0, 1, 2, 3), + #frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + #norm_eval=True, + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + #in_channels=[24, 40, 40, 96], + in_channels=[32, 64, 128, 
256], + out_channels=128, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=128, + stacked_convs=2, + feat_channels=160, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + cls_reg_share=True, + strides_share=True, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + #loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + loss_dfl=False, + reg_max=8, + #loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps = False, + loss_kps=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), + ) + ) +# training and testing settings +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + #assigner=dict( + # type='MaxIoUAssigner', + # pos_iou_thr=0.5, + # neg_iou_thr=0.3, + # min_pos_iou=0.5, + # ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + #nms_pre=1000, + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +epoch_multi = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*epoch_multi, 68*epoch_multi]) +total_epochs = 80*epoch_multi +#checkpoint_config = dict(interval=1) +checkpoint_config = dict(interval=total_epochs) +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=80, metric='mAP') + + + diff --git a/insightface/detection/scrfd/configs/scrfd/base_1g.py b/insightface/detection/scrfd/configs/scrfd/base_1g.py new file mode 100755 index 
0000000000000000000000000000000000000000..55ec81770ca7b3a54bc55bfa1bab08df6eb50daf --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/base_1g.py @@ -0,0 +1,185 @@ +_base_ = [ + #'../_base_/datasets/retinaface.py', + '../_base_/schedules/schedule_retinaface_sgd.py', '../_base_/default_runtime.py' +] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = data_root+'train/' +val_root = data_root+'val/' +#img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict(type='LoadAnnotations', with_bbox=True), + #dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + #dict(type='RandomFlip', flip_ratio=0.5), + #dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32), + #dict(type='DefaultFormatBundle'), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_keypoints', 'gt_labels']), + + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict(type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_keypointss']), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore']), +] +test_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict( + # type='MultiScaleFlipAug', + # img_scale=(1333, 800), + # flip=False, + # transforms=[ + # dict(type='Resize', keep_ratio=True), + # 
dict(type='RandomFlip'), + # dict(type='Normalize', **img_norm_cfg), + # dict(type='Pad', size_divisor=32), + # dict(type='ImageToTensor', keys=['img']), + # dict(type='Collect', keys=['img']), + # ]) + + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=(640,640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=train_root + 'labelv2.txt', + #ann_file=train_root + 'label_wo.txt', + img_prefix=train_root+ 'images/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + ) +model = dict( + type='SCRFD', + #pretrained='torchvision://resnet50', + backbone=dict( + type='MobileNetV1', + block_cfg=dict( + stage_blocks=(3, 3, 7, 3), stage_planes=[16, 16, 32, 64, 128, + 256]), + #frozen_stages=1, + #norm_cfg=dict(type='BN', requires_grad=True), + #norm_eval=True, + #norm_eval=False, + #style='pytorch', + ), + neck=dict( + type='PAFPN', + #in_channels=[64, 128, 256, 512], + in_channels=[32, 64, 128, 256], + out_channels=64, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=64, + stacked_convs=2, + feat_channels=128, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share = True, + strides_share = True, + dw_conv = True, + scale_mode = 2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + 
base_sizes = [16, 64, 256], + #octave_base_scale=3, + #scales_per_octave=1, + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + #loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + loss_dfl=False, + reg_max=8, + #loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps = False, + loss_kps=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), + ) + ) +# training and testing settings +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + #assigner=dict( + # type='MaxIoUAssigner', + # pos_iou_thr=0.5, + # neg_iou_thr=0.3, + # min_pos_iou=0.5, + # ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + #nms_pre=1000, + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +epoch_multi = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*epoch_multi, 68*epoch_multi]) +total_epochs = 80*epoch_multi +#checkpoint_config = dict(interval=1) +checkpoint_config = dict(interval=80) +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=80, metric='mAP') + + + diff --git a/insightface/detection/scrfd/configs/scrfd/base_2.5g.py b/insightface/detection/scrfd/configs/scrfd/base_2.5g.py new file mode 100755 index 0000000000000000000000000000000000000000..d37d56acca2515114df3743dd9975c9fe5d08bee --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/base_2.5g.py @@ -0,0 +1,185 @@ +_base_ = [ + #'../_base_/datasets/retinaface.py', + '../_base_/schedules/schedule_retinaface_sgd.py', '../_base_/default_runtime.py' +] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 
data_root+'train/' +val_root = data_root+'val/' +#img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict(type='LoadAnnotations', with_bbox=True), + #dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + #dict(type='RandomFlip', flip_ratio=0.5), + #dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32), + #dict(type='DefaultFormatBundle'), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_keypoints', 'gt_labels']), + + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + #dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_keypointss']), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore']), +] +test_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict( + # type='MultiScaleFlipAug', + # img_scale=(1333, 800), + # flip=False, + # transforms=[ + # dict(type='Resize', keep_ratio=True), + # dict(type='RandomFlip'), + # dict(type='Normalize', **img_norm_cfg), + # dict(type='Pad', size_divisor=32), + # dict(type='ImageToTensor', keys=['img']), + # dict(type='Collect', keys=['img']), + # ]) + + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + #img_scale=(1100, 1650), + img_scale=(640, 640), + flip=False, + 
transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32, pad_val=0), + dict(type='Pad', size=(640,640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=train_root + 'labelv2.txt', + #ann_file=train_root + 'label_wo.txt', + img_prefix=train_root+ 'images/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + ) +model = dict( + type='SCRFD', + #pretrained='torchvision://resnet50', + backbone=dict( + type='ResNetV1e', + depth=34, + base_channels=16, + num_stages=4, + out_indices=(0, 1, 2, 3), + #frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + #norm_eval=True, + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + #in_channels=[64, 128, 256, 512], + in_channels=[16, 32, 64, 128], + out_channels=48, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=48, + stacked_convs=2, + feat_channels=96, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share = True, + strides_share = True, + scale_mode = 2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + #loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + loss_dfl=False, + reg_max=8, + #loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + 
loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps = False, + loss_kps=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), + ) + ) +# training and testing settings +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + #assigner=dict( + # type='MaxIoUAssigner', + # pos_iou_thr=0.5, + # neg_iou_thr=0.3, + # min_pos_iou=0.5, + # ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + #nms_pre=1000, + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0005) +epoch_multi = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*epoch_multi, 68*epoch_multi]) +total_epochs = 80*epoch_multi +#checkpoint_config = dict(interval=1) +checkpoint_config = dict(interval=80) +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=80, metric='mAP') + + + + diff --git a/insightface/detection/scrfd/configs/scrfd/base_34g.py b/insightface/detection/scrfd/configs/scrfd/base_34g.py new file mode 100755 index 0000000000000000000000000000000000000000..da807cd1433586559a30e091833e7ddfbc5ad914 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/base_34g.py @@ -0,0 +1,185 @@ +_base_ = [ + #'../_base_/datasets/retinaface.py', + '../_base_/schedules/schedule_retinaface_sgd.py', '../_base_/default_runtime.py' +] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = data_root+'train/' +val_root = data_root+'val/' +#img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict(type='LoadAnnotations', with_bbox=True), + 
#dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + #dict(type='RandomFlip', flip_ratio=0.5), + #dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32), + #dict(type='DefaultFormatBundle'), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_keypoints', 'gt_labels']), + + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict(type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_keypointss']), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore']), +] +test_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict( + # type='MultiScaleFlipAug', + # img_scale=(1333, 800), + # flip=False, + # transforms=[ + # dict(type='Resize', keep_ratio=True), + # dict(type='RandomFlip'), + # dict(type='Normalize', **img_norm_cfg), + # dict(type='Pad', size_divisor=32), + # dict(type='ImageToTensor', keys=['img']), + # dict(type='Collect', keys=['img']), + # ]) + + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32, pad_val=0), + dict(type='Pad', size=(640,640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=train_root + 
'labelv2.txt', + #ann_file=train_root + 'label_wo.txt', + img_prefix=train_root+ 'images/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + ) +model = dict( + type='SCRFD', + #pretrained='torchvision://resnet50', + backbone=dict( + type='ResNetV1e', + depth=56, + base_channels=64, + num_stages=4, + out_indices=(0, 1, 2, 3), + #frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + #norm_eval=True, + norm_eval=False, + style='pytorch'), + + #backbone=dict(type='Res2Net', depth=50, scales=4, base_width=26), + neck=dict( + type='PAFPN', + in_channels=[64, 512, 1024, 2048], + #in_channels=[16, 32, 64, 128], + #in_channels=[32, 64, 128, 256], + out_channels=128, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=128, + stacked_convs=2, + feat_channels=256, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + cls_reg_share=True, + strides_share=True, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + #loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + loss_dfl=False, + reg_max=8, + #loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps = False, + loss_kps=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), + ) + ) +# training and testing settings +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + #assigner=dict( + # type='MaxIoUAssigner', + # pos_iou_thr=0.5, + # neg_iou_thr=0.3, + # min_pos_iou=0.5, + # ignore_iof_thr=-1), + 
allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + #nms_pre=1000, + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0005) +epoch_multi = 5 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*epoch_multi, 68*epoch_multi]) +total_epochs = 80*epoch_multi +#checkpoint_config = dict(interval=1) +checkpoint_config = dict(interval=total_epochs) +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=80, metric='mAP') + + + diff --git a/insightface/detection/scrfd/configs/scrfd/base_500m.py b/insightface/detection/scrfd/configs/scrfd/base_500m.py new file mode 100755 index 0000000000000000000000000000000000000000..dd83b2876e3733af018be3608f403774c96e2f6f --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/base_500m.py @@ -0,0 +1,185 @@ +_base_ = [ + #'../_base_/datasets/retinaface.py', + '../_base_/schedules/schedule_retinaface_sgd.py', '../_base_/default_runtime.py' +] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = data_root+'train/' +val_root = data_root+'val/' +#img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict(type='LoadAnnotations', with_bbox=True), + #dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + #dict(type='RandomFlip', flip_ratio=0.5), + #dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32), + #dict(type='DefaultFormatBundle'), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_keypoints', 'gt_labels']), + + dict(type='LoadImageFromFile', to_float32=True), + 
dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict(type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_keypointss']), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore']), +] +test_pipeline = [ + #dict(type='LoadImageFromFile'), + #dict( + # type='MultiScaleFlipAug', + # img_scale=(1333, 800), + # flip=False, + # transforms=[ + # dict(type='Resize', keep_ratio=True), + # dict(type='RandomFlip'), + # dict(type='Normalize', **img_norm_cfg), + # dict(type='Pad', size_divisor=32), + # dict(type='ImageToTensor', keys=['img']), + # dict(type='Collect', keys=['img']), + # ]) + + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=(640,640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=train_root + 'labelv2.txt', + #ann_file=train_root + 'label_wo.txt', + img_prefix=train_root+ 'images/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + ) +model = dict( + type='SCRFD', 
+ #pretrained='torchvision://resnet50', + backbone=dict( + type='MobileNetV1', + block_cfg=dict( + stage_blocks=(2, 2, 6, 3), stage_planes=[8, 16, 32, 64, 128, + 256]), + #frozen_stages=1, + #norm_cfg=dict(type='BN', requires_grad=True), + #norm_eval=True, + #norm_eval=False, + #style='pytorch', + ), + neck=dict( + type='PAFPN', + #in_channels=[64, 128, 256, 512], + in_channels=[32, 64, 128, 256], + out_channels=32, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=32, + stacked_convs=2, + feat_channels=80, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share = True, + strides_share = True, + dw_conv = True, + scale_mode = 2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + #octave_base_scale=3, + #scales_per_octave=1, + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + #loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + loss_dfl=False, + reg_max=8, + #loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps = False, + loss_kps=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), + ) + ) +# training and testing settings +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + #assigner=dict( + # type='MaxIoUAssigner', + # pos_iou_thr=0.5, + # neg_iou_thr=0.3, + # min_pos_iou=0.5, + # ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + #nms_pre=1000, + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +epoch_multi = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + 
warmup_ratio=0.001, + step=[55*epoch_multi, 68*epoch_multi]) +total_epochs = 80*epoch_multi +#checkpoint_config = dict(interval=1) +checkpoint_config = dict(interval=80) +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=80, metric='mAP') + + + diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_1.3g.py b/insightface/detection/scrfd/configs/scrfd/scrfd_1.3g.py new file mode 100755 index 0000000000000000000000000000000000000000..41c7be2392f71a14c868b56d24538edd3396b612 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_1.3g.py @@ -0,0 +1,235 @@ +lr_mult = 8 +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=total_epochs) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + 
type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', 
keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='ResNetV1e', + depth=0, + block_cfg=dict( + block='BasicBlock', + stage_blocks=(3, 5, 3, 2), + stage_planes=[16, 32, 40, 72]), + base_channels=16, + num_stages=4, + out_indices=(0, 1, 2, 3), + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + in_channels=[16, 32, 40, 72], + out_channels=24, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=24, + stacked_convs=2, + feat_channels=48, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share=True, + strides_share=True, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=False, + 
loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9, mode=0), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = dict(interval=80, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_10g.py b/insightface/detection/scrfd/configs/scrfd/scrfd_10g.py new file mode 100755 index 0000000000000000000000000000000000000000..ad8cb60c060aebc7e2f5497e5d02d29d2825b77c --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_10g.py @@ -0,0 +1,235 @@ +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_mult = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=80) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0], + bbox_clip_border=False), + 
dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=3, + train=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + 
val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='ResNetV1e', + depth=0, + block_cfg=dict( + block='BasicBlock', + stage_blocks=(3, 4, 2, 3), + stage_planes=[56, 88, 88, 224]), + base_channels=56, + num_stages=4, + out_indices=(0, 1, 2, 3), + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + in_channels=[56, 88, 88, 224], + out_channels=56, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=56, + stacked_convs=3, + feat_channels=80, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share=True, + strides_share=True, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + 
ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=False, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = dict(interval=80, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_10g_bnkps.py b/insightface/detection/scrfd/configs/scrfd/scrfd_10g_bnkps.py new file mode 100755 index 0000000000000000000000000000000000000000..668da0d244448bef4c61d833b792eaa0e749b983 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_10g_bnkps.py @@ -0,0 +1,235 @@ +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_mult = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=80) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 
128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=3, + train=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + 
saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='ResNetV1e', + depth=0, + block_cfg=dict( + block='BasicBlock', + stage_blocks=(3, 4, 2, 3), + stage_planes=[56, 88, 88, 224]), + base_channels=56, + num_stages=4, + out_indices=(0, 1, 2, 3), + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + in_channels=[56, 88, 88, 224], + out_channels=56, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + 
type='SCRFDHead', + num_classes=1, + in_channels=56, + stacked_convs=3, + feat_channels=80, + norm_cfg=dict(type='BN', requires_grad=True), + #norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share=True, + strides_share=False, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=True, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = dict(interval=80, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_1g.py b/insightface/detection/scrfd/configs/scrfd/scrfd_1g.py new file mode 100755 index 0000000000000000000000000000000000000000..8a739744975f9f7fbfff60bd5b46e435f86686fd --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_1g.py @@ -0,0 +1,217 @@ +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_mult = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=80*4) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = 
dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ]), + 
dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='MobileNetV1', + block_cfg=dict( + stage_blocks=(3, 2, 1, 5), + stage_planes=[32, 48, 48, 160, 216, 312])), + neck=dict( + type='PAFPN', + in_channels=[48, 160, 216, 312], + out_channels=24, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + 
bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=24, + stacked_convs=2, + feat_channels=96, + norm_cfg=dict(type='GN', num_groups=8, requires_grad=True), + cls_reg_share=True, + strides_share=True, + dw_conv=True, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[1, 2], + base_sizes=[16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=False, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = dict(interval=80*2, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_1gbn.py b/insightface/detection/scrfd/configs/scrfd/scrfd_1gbn.py new file mode 100755 index 0000000000000000000000000000000000000000..2082624ea40295317d54a4493b231d2ad12648a9 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_1gbn.py @@ -0,0 +1,218 @@ +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_mult = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=80*4) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') 
+log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ]), + dict(type='Resize', img_scale=(640, 
640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='MobileNetV1', + block_cfg=dict( + stage_blocks=(3, 2, 1, 5), + stage_planes=[32, 48, 48, 160, 216, 312])), + neck=dict( + type='PAFPN', + in_channels=[48, 160, 216, 312], + out_channels=24, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + 
num_classes=1, + in_channels=24, + stacked_convs=2, + feat_channels=96, + #norm_cfg=dict(type='GN', num_groups=8, requires_grad=True), + norm_cfg=dict(type='BN', requires_grad=True), + cls_reg_share=True, + strides_share=False, + dw_conv=True, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[1, 2], + base_sizes=[16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=False, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = dict(interval=80*2, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_2.5g.py b/insightface/detection/scrfd/configs/scrfd/scrfd_2.5g.py new file mode 100755 index 0000000000000000000000000000000000000000..a0c6a3b9f723b0d5dc8c46885057213ea6b2de98 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_2.5g.py @@ -0,0 +1,235 @@ +lr_mult = 8 +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=total_epochs) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = 
dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 
0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='ResNetV1e', + depth=0, + block_cfg=dict( + block='BasicBlock', + stage_blocks=(3, 5, 3, 2), + stage_planes=[24, 48, 48, 80]), + base_channels=24, + 
num_stages=4, + out_indices=(0, 1, 2, 3), + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + in_channels=[24, 48, 48, 80], + out_channels=24, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=24, + stacked_convs=2, + feat_channels=64, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share=True, + strides_share=True, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=False, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9, mode=0), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = dict(interval=80, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_2.5g_bnkps.py b/insightface/detection/scrfd/configs/scrfd/scrfd_2.5g_bnkps.py new file mode 100755 index 0000000000000000000000000000000000000000..7cbed91a772700e317c5850d532b0b63a3ce20f9 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_2.5g_bnkps.py @@ -0,0 +1,235 @@ +lr_mult = 8 +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = 
dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=total_epochs) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + 
type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 
640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='ResNetV1e', + depth=0, + block_cfg=dict( + block='BasicBlock', + stage_blocks=(3, 5, 3, 2), + stage_planes=[24, 48, 48, 80]), + base_channels=24, + num_stages=4, + out_indices=(0, 1, 2, 3), + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + in_channels=[24, 48, 48, 80], + out_channels=24, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=24, + stacked_convs=2, + feat_channels=64, + norm_cfg=dict(type='BN', requires_grad=True), + #norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share=True, + strides_share=False, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=True, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9, mode=0), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = dict(interval=80, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_34g.py b/insightface/detection/scrfd/configs/scrfd/scrfd_34g.py new file mode 
100755 index 0000000000000000000000000000000000000000..8420101371bb31611fcac245b378d58775e4f9a6 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_34g.py @@ -0,0 +1,225 @@ +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_mult = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=80) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + 
to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=3, + train=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + 
dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='ResNetV1e', + depth=0, + block_cfg=dict( + block='Bottleneck', + stage_blocks=(17, 16, 2, 8), + stage_planes=[56, 56, 144, 184]), + base_channels=56, + num_stages=4, + out_indices=(0, 1, 2, 3), + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + in_channels=[224, 224, 576, 736], + out_channels=128, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=128, + stacked_convs=2, + feat_channels=256, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + cls_reg_share=True, + strides_share=True, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[1, 2], + base_sizes=[16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=False, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = 
dict(interval=80, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_500m.py b/insightface/detection/scrfd/configs/scrfd/scrfd_500m.py new file mode 100755 index 0000000000000000000000000000000000000000..4c75a53774df5059d09d8bd3af783ec6716a27aa --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_500m.py @@ -0,0 +1,218 @@ +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_mult = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=80) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ 
+ dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=3, + train=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + 
dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='MobileNetV1', + block_cfg=dict( + stage_blocks=(2, 3, 2, 6), stage_planes=[16, 16, 40, 72, 152, + 288])), + neck=dict( + type='PAFPN', + in_channels=[40, 72, 152, 288], + out_channels=16, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=16, + stacked_convs=2, + feat_channels=64, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share=True, + strides_share=True, + dw_conv=True, + scale_mode=2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[1, 2], + base_sizes=[16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=False, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = 
dict(interval=80, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd/scrfd_500m_bnkps.py b/insightface/detection/scrfd/configs/scrfd/scrfd_500m_bnkps.py new file mode 100755 index 0000000000000000000000000000000000000000..6fdeab39e32681885989de3eb45e7d5416801fef --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd/scrfd_500m_bnkps.py @@ -0,0 +1,218 @@ +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_mult = 8 +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=80) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = 'data/retinaface/train/' +val_root = 'data/retinaface/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + 
flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=16, + workers_per_gpu=4, + train=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/train/labelv2.txt', + img_prefix='data/retinaface/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/retinaface/val/labelv2.txt', + 
img_prefix='data/retinaface/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='MobileNetV1', + block_cfg=dict( + stage_blocks=(2, 3, 2, 6), stage_planes=[16, 16, 40, 72, 152, + 288])), + neck=dict( + type='PAFPN', + in_channels=[40, 72, 152, 288], + out_channels=16, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=16, + stacked_convs=2, + feat_channels=64, + norm_cfg=dict(type='BN', requires_grad=True), + #norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share=True, + strides_share=False, + dw_conv=True, + scale_mode=0, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[1, 2], + base_sizes=[16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=True, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', 
iou_threshold=0.45), + max_per_img=-1) +epoch_multi = 1 +evaluation = dict(interval=80, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfd_crowdhuman/scrfd_crowdhuman_2.5g_bnkps.py b/insightface/detection/scrfd/configs/scrfd_crowdhuman/scrfd_crowdhuman_2.5g_bnkps.py new file mode 100755 index 0000000000000000000000000000000000000000..61f63c339314f4d1cc3a4beffb5bf2da93b4ebf9 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfd_crowdhuman/scrfd_crowdhuman_2.5g_bnkps.py @@ -0,0 +1,189 @@ +lr_mult = 4 +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2500, + warmup_ratio=0.001, + step=[55*lr_mult, 68*lr_mult]) +total_epochs = 80*lr_mult +checkpoint_config = dict(interval=total_epochs) +log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/crowdhuman/' +train_root = 'data/crowdhuman/train/' +val_root = 'data/crowdhuman/val/' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type='RetinaFaceDataset', + ann_file='data/crowdhuman/train/label_fullbody.txt', + img_prefix='data/crowdhuman/train/images/', + pipeline=[ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict( + type='RandomSquareCrop', + crop_choice=[ + 0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0 + ], + bbox_clip_border=False), + dict( + type='Resize', + img_scale=(640, 640), + keep_ratio=False, + bbox_clip_border=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 
1.5), + hue_delta=18), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', + 'gt_keypointss' + ]) + ]), + val=dict( + type='RetinaFaceDataset', + ann_file='data/crowdhuman/val/label_fullbody.txt', + img_prefix='data/crowdhuman/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + #dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ]), + test=dict( + type='RetinaFaceDataset', + ann_file='data/crowdhuman/val/label_fullbody.txt', + img_prefix='data/crowdhuman/val/images/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + #dict(type='RandomFlip', flip_ratio=0.0), + dict( + type='Normalize', + mean=[127.5, 127.5, 127.5], + std=[128.0, 128.0, 128.0], + to_rgb=True), + dict(type='Pad', size=(640, 640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) + ])) +model = dict( + type='SCRFD', + backbone=dict( + type='ResNetV1e', + depth=0, + block_cfg=dict( + block='BasicBlock', + stage_blocks=(3, 5, 3, 2), + stage_planes=[24, 48, 48, 80]), + base_channels=24, + num_stages=4, + out_indices=(0, 1, 2, 3), + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + in_channels=[24, 48, 48, 80], + out_channels=24, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='SCRFDHead', + 
num_classes=1, + in_channels=24, + stacked_convs=2, + feat_channels=64, + norm_cfg=dict(type='BN', requires_grad=True), + #norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share=True, + strides_share=False, + scale_mode=2, + #anchor_generator=dict( + # type='AnchorGenerator', + # ratios=[1.0], + # scales = [1,2], + # base_sizes = [16, 64, 256], + # strides=[8, 16, 32]), + anchor_generator=dict( + type='AnchorGenerator', + ratios=[2.0], + scales = [3], + base_sizes = [8, 16, 32, 64, 128], + strides=[8, 16, 32, 64, 128]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=False, + reg_max=8, + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps=True, + loss_kps=dict( + type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1))) +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9, mode=0), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +evaluation = dict(interval=40, metric='mAP') diff --git a/insightface/detection/scrfd/configs/scrfdgen2.5g/scrfdgen2.5g_0.py b/insightface/detection/scrfd/configs/scrfdgen2.5g/scrfdgen2.5g_0.py new file mode 100755 index 0000000000000000000000000000000000000000..b1052eff58da20bd0bfca9f2558fd5891cb99e59 --- /dev/null +++ b/insightface/detection/scrfd/configs/scrfdgen2.5g/scrfdgen2.5g_0.py @@ -0,0 +1,167 @@ +_base_ = [ + #'../_base_/datasets/retinaface.py', + '../_base_/schedules/schedule_retinaface_sgd.py', '../_base_/default_runtime.py' +] +dataset_type = 'RetinaFaceDataset' +data_root = 'data/retinaface/' +train_root = data_root+'train/' +val_root = 
data_root+'val/' +#img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True) +train_pipeline = [ + + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True), + dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]), + #dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0]), + dict(type='Resize', img_scale=(640, 640), keep_ratio=False), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_keypointss']), + #dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore']), +] +test_pipeline = [ + + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + #img_scale=(1100, 1650), + img_scale=(640, 640), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.0), + dict(type='Normalize', **img_norm_cfg), + #dict(type='Pad', size_divisor=32, pad_val=0), + dict(type='Pad', size=(640,640), pad_val=0), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=train_root + 'labelv2.txt', + #ann_file=train_root + 'label_wo.txt', + img_prefix=train_root+ 'images/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=val_root + 'labelv2.txt', + img_prefix=val_root+ 'images/', + 
pipeline=test_pipeline), + ) +model = dict( + type='SCRFD', + backbone=dict( + type='ResNetV1e', + depth=0, #refer to depth-34 + block_cfg=dict( + block='BasicBlock', + stage_blocks=(3, 4, 6, 3), + stage_planes=[16, 32, 64, 128]), + base_channels=16, + num_stages=4, + out_indices=(0, 1, 2, 3), + #frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + #norm_eval=True, + norm_eval=False, + style='pytorch'), + neck=dict( + type='PAFPN', + #in_channels=[64, 128, 256, 512], + in_channels=[16, 32, 64, 128], + out_channels=48, + start_level=1, + add_extra_convs='on_output', + num_outs=3), + bbox_head=dict( + type='SCRFDHead', + num_classes=1, + in_channels=48, + stacked_convs=2, + feat_channels=96, + #norm_cfg=dict(type='BN', requires_grad=True), + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True), + cls_reg_share = True, + strides_share = True, + scale_mode = 2, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales = [1,2], + base_sizes = [16, 64, 256], + strides=[8, 16, 32]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + #loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + loss_dfl=False, + reg_max=8, + #loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_bbox=dict(type='DIoULoss', loss_weight=2.0), + use_kps = False, + loss_kps=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), + ) + ) +# training and testing settings +train_cfg = dict( + assigner=dict(type='ATSSAssigner', topk=9), + #assigner=dict( + # type='MaxIoUAssigner', + # pos_iou_thr=0.5, + # neg_iou_thr=0.3, + # min_pos_iou=0.5, + # ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + #nms_pre=1000, + nms_pre=-1, + min_bbox_size=0, + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=-1) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +epoch_multi = 1 +lr_config = dict( + policy='step', + 
warmup='linear', + warmup_iters=1500, + warmup_ratio=0.001, + step=[55*epoch_multi, 68*epoch_multi]) +total_epochs = 80*epoch_multi +#checkpoint_config = dict(interval=1) +checkpoint_config = dict(interval=80) +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=80, metric='mAP') + + + + diff --git a/insightface/detection/scrfd/demo/image_demo.py b/insightface/detection/scrfd/demo/image_demo.py new file mode 100755 index 0000000000000000000000000000000000000000..5fbf93d3b6f20149b9c4ab924890be202ab34946 --- /dev/null +++ b/insightface/detection/scrfd/demo/image_demo.py @@ -0,0 +1,26 @@ +from argparse import ArgumentParser + +from mmdet.apis import inference_detector, init_detector, show_result_pyplot + + +def main(): + parser = ArgumentParser() + parser.add_argument('img', help='Image file') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='bbox score threshold') + args = parser.parse_args() + + # build the model from a config file and a checkpoint file + model = init_detector(args.config, args.checkpoint, device=args.device) + # test a single image + result = inference_detector(model, args.img) + # show the results + show_result_pyplot(model, args.img, result, score_thr=args.score_thr) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/scrfd/demo/webcam_demo.py b/insightface/detection/scrfd/demo/webcam_demo.py new file mode 100755 index 0000000000000000000000000000000000000000..5bded14ff6c3ca633ba6af1843d5a32a433f2e06 --- /dev/null +++ b/insightface/detection/scrfd/demo/webcam_demo.py @@ -0,0 +1,46 @@ +import argparse + +import cv2 +import torch + +from mmdet.apis import inference_detector, init_detector + + +def parse_args(): + 
parser = argparse.ArgumentParser(description='MMDetection webcam demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--camera-id', type=int, default=0, help='camera device id') + parser.add_argument( + '--score-thr', type=float, default=0.5, help='bbox score threshold') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + device = torch.device(args.device) + + model = init_detector(args.config, args.checkpoint, device=device) + + camera = cv2.VideoCapture(args.camera_id) + + print('Press "Esc", "q" or "Q" to exit.') + while True: + ret_val, img = camera.read() + result = inference_detector(model, img) + + ch = cv2.waitKey(1) + if ch == 27 or ch == ord('q') or ch == ord('Q'): + break + + model.show_result( + img, result, score_thr=args.score_thr, wait_time=1, show=True) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/scrfd/mmdet/__init__.py b/insightface/detection/scrfd/mmdet/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..5a8e4bd0e4efcb196c79cfec91943563b60e9dd9 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/__init__.py @@ -0,0 +1,29 @@ +import mmcv + +from .version import __version__, short_version + + +def digit_version(version_str): + digit_version = [] + for x in version_str.split('.'): + if x.isdigit(): + digit_version.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + digit_version.append(int(patch_version[0]) - 1) + digit_version.append(int(patch_version[1])) + return digit_version + + +mmcv_minimum_version = '1.1.5' +mmcv_maximum_version = '1.4' +mmcv_version = digit_version(mmcv.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version <= digit_version(mmcv_maximum_version)), \ + 
f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' + +__all__ = ['__version__', 'short_version'] + diff --git a/insightface/detection/scrfd/mmdet/apis/__init__.py b/insightface/detection/scrfd/mmdet/apis/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..1d8035b74877fdeccaa41cbc10a9f1f9924eac85 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/apis/__init__.py @@ -0,0 +1,10 @@ +from .inference import (async_inference_detector, inference_detector, + init_detector, show_result_pyplot) +from .test import multi_gpu_test, single_gpu_test +from .train import get_root_logger, set_random_seed, train_detector + +__all__ = [ + 'get_root_logger', 'set_random_seed', 'train_detector', 'init_detector', + 'async_inference_detector', 'inference_detector', 'show_result_pyplot', + 'multi_gpu_test', 'single_gpu_test' +] diff --git a/insightface/detection/scrfd/mmdet/apis/inference.py b/insightface/detection/scrfd/mmdet/apis/inference.py new file mode 100755 index 0000000000000000000000000000000000000000..6fa19cd585a509b5647bd4b2bb81383288dc49cb --- /dev/null +++ b/insightface/detection/scrfd/mmdet/apis/inference.py @@ -0,0 +1,187 @@ +import warnings + +import matplotlib.pyplot as plt +import mmcv +import numpy as np +import torch +from mmcv.ops import RoIPool +from mmcv.parallel import collate, scatter +from mmcv.runner import load_checkpoint + +from mmdet.core import get_classes +from mmdet.datasets.pipelines import Compose +from mmdet.models import build_detector + + +def init_detector(config, checkpoint=None, device='cuda:0', cfg_options=None): + """Initialize a detector from config file. + + Args: + config (str or :obj:`mmcv.Config`): Config file path or the config + object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + cfg_options (dict): Options to override some settings in the used + config. 
+ + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, str): + config = mmcv.Config.fromfile(config) + elif not isinstance(config, mmcv.Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if cfg_options is not None: + config.merge_from_dict(cfg_options) + config.model.pretrained = None + model = build_detector(config.model, test_cfg=config.test_cfg) + if checkpoint is not None: + map_loc = 'cpu' if device == 'cpu' else None + checkpoint = load_checkpoint(model, checkpoint, map_location=map_loc) + if 'CLASSES' in checkpoint['meta']: + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + warnings.simplefilter('once') + warnings.warn('Class names are not saved in the checkpoint\'s ' + 'meta data, use COCO classes by default.') + model.CLASSES = get_classes('coco') + model.cfg = config # save the config in the model for convenience + model.to(device) + model.eval() + return model + + +class LoadImage(object): + """A simple pipeline to load image.""" + + def __call__(self, results): + """Call function to load images into results. + + Args: + results (dict): A result dict contains the file name + of the image to be read. + + Returns: + dict: ``results`` will be returned containing loaded image. + """ + if isinstance(results['img'], str): + results['filename'] = results['img'] + results['ori_filename'] = results['img'] + else: + results['filename'] = None + results['ori_filename'] = None + img = mmcv.imread(results['img']) + results['img'] = img + results['img_fields'] = ['img'] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + return results + + +def inference_detector(model, img): + """Inference image(s) with the detector. + + Args: + model (nn.Module): The loaded detector. + imgs (str/ndarray or list[str/ndarray]): Either image files or loaded + images. + + Returns: + If imgs is a str, a generator will be returned, otherwise return the + detection results directly. 
+ """ + cfg = model.cfg + device = next(model.parameters()).device # model device + # prepare data + if isinstance(img, np.ndarray): + # directly add img + data = dict(img=img) + cfg = cfg.copy() + # set loading pipeline type + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + else: + # add information into dict + data = dict(img_info=dict(filename=img), img_prefix=None) + # build the data pipeline + test_pipeline = Compose(cfg.data.test.pipeline) + data = test_pipeline(data) + data = collate([data], samples_per_gpu=1) + if next(model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [device])[0] + else: + for m in model.modules(): + assert not isinstance( + m, RoIPool + ), 'CPU inference with RoIPool is not supported currently.' + # just get the actual data from DataContainer + data['img_metas'] = data['img_metas'][0].data + + # forward the model + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data)[0] + return result + + +async def async_inference_detector(model, img): + """Async inference image(s) with the detector. + + Args: + model (nn.Module): The loaded detector. + img (str | ndarray): Either image files or loaded images. + + Returns: + Awaitable detection results. 
+ """ + cfg = model.cfg + device = next(model.parameters()).device # model device + # prepare data + if isinstance(img, np.ndarray): + # directly add img + data = dict(img=img) + cfg = cfg.copy() + # set loading pipeline type + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + else: + # add information into dict + data = dict(img_info=dict(filename=img), img_prefix=None) + # build the data pipeline + test_pipeline = Compose(cfg.data.test.pipeline) + data = test_pipeline(data) + data = scatter(collate([data], samples_per_gpu=1), [device])[0] + + # We don't restore `torch.is_grad_enabled()` value during concurrent + # inference since execution can overlap + torch.set_grad_enabled(False) + result = await model.aforward_test(rescale=True, **data) + return result + + +def show_result_pyplot(model, + img, + result, + score_thr=0.3, + fig_size=(15, 10), + title='result', + block=True): + """Visualize the detection results on the image. + + Args: + model (nn.Module): The loaded detector. + img (str or np.ndarray): Image filename or loaded image. + result (tuple[list] or list): The detection result, can be either + (bbox, segm) or just bbox. + score_thr (float): The threshold to visualize the bboxes and masks. + fig_size (tuple): Figure size of the pyplot figure. + title (str): Title of the pyplot figure. + block (bool): Whether to block GUI. 
+ """ + if hasattr(model, 'module'): + model = model.module + img = model.show_result(img, result, score_thr=score_thr, show=False) + plt.figure(figsize=fig_size) + plt.imshow(mmcv.bgr2rgb(img)) + plt.title(title) + plt.tight_layout() + plt.show(block=block) diff --git a/insightface/detection/scrfd/mmdet/apis/test.py b/insightface/detection/scrfd/mmdet/apis/test.py new file mode 100755 index 0000000000000000000000000000000000000000..e54b1b8c24efc448972c31ee5da63041d7f97a47 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/apis/test.py @@ -0,0 +1,190 @@ +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import mmcv +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.runner import get_dist_info + +from mmdet.core import encode_mask_results + + +def single_gpu_test(model, + data_loader, + show=False, + out_dir=None, + show_score_thr=0.3): + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + + batch_size = len(result) + if show or out_dir: + if batch_size == 1 and isinstance(data['img'][0], torch.Tensor): + img_tensor = data['img'][0] + else: + img_tensor = data['img'][0].data[0] + img_metas = data['img_metas'][0].data[0] + imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) + assert len(imgs) == len(img_metas) + + for i, (img, img_meta) in enumerate(zip(imgs, img_metas)): + h, w, _ = img_meta['img_shape'] + img_show = img[:h, :w, :] + + ori_h, ori_w = img_meta['ori_shape'][:-1] + img_show = mmcv.imresize(img_show, (ori_w, ori_h)) + + if out_dir: + out_file = osp.join(out_dir, img_meta['ori_filename']) + else: + out_file = None + + model.module.show_result( + img_show, + result[i], + show=show, + out_file=out_file, + score_thr=show_score_thr) + + # encode mask results + if isinstance(result[0], tuple): 
+ result = [(bbox_results, encode_mask_results(mask_results)) + for bbox_results, mask_results in result] + results.extend(result) + + for _ in range(batch_size): + prog_bar.update() + return results + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. 
+ for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + # encode mask results + if isinstance(result[0], tuple): + result = [(bbox_results, encode_mask_results(mask_results)) + for bbox_results, mask_results in result] + results.extend(result) + + if rank == 0: + batch_size = len(result) + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + rank, world_size = get_dist_info() + # dump result part to tensor with 
pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_list.append( + pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results diff --git a/insightface/detection/scrfd/mmdet/apis/train.py b/insightface/detection/scrfd/mmdet/apis/train.py new file mode 100755 index 0000000000000000000000000000000000000000..ad17a5379888028c793aed8837d90e3644e4e13f --- /dev/null +++ b/insightface/detection/scrfd/mmdet/apis/train.py @@ -0,0 +1,150 @@ +import random + +import numpy as np +import torch +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, build_optimizer) +from mmcv.utils import build_from_cfg + +from mmdet.core import DistEvalHook, EvalHook +from mmdet.datasets import (build_dataloader, build_dataset, + replace_ImageToTensor) +from mmdet.utils import get_root_logger + + +def set_random_seed(seed, deterministic=False): + """Set random seed. + + Args: + seed (int): Seed to be used. 
+ deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Default: False. + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + if 'imgs_per_gpu' in cfg.data: + logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + # build runner + optimizer = build_optimizer(model, 
cfg.optimizer) + runner = EpochBasedRunner( + model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta) + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + if distributed: + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) + if val_samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + eval_hook = DistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + 
runner.register_hook(hook, priority=priority) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow, cfg.total_epochs) diff --git a/insightface/detection/scrfd/mmdet/core/__init__.py b/insightface/detection/scrfd/mmdet/core/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..b075369096b284f0112cb37e19d6e2d50878b60f --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/__init__.py @@ -0,0 +1,8 @@ +from .anchor import * # noqa: F401, F403 +from .bbox import * # noqa: F401, F403 +from .evaluation import * # noqa: F401, F403 +from .export import * # noqa: F401, F403 +from .fp16 import * # noqa: F401, F403 +from .mask import * # noqa: F401, F403 +from .post_processing import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 diff --git a/insightface/detection/scrfd/mmdet/core/anchor/__init__.py b/insightface/detection/scrfd/mmdet/core/anchor/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..5838ff3eefb03bc83928fa13848cea9ff8647827 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/anchor/__init__.py @@ -0,0 +1,11 @@ +from .anchor_generator import (AnchorGenerator, LegacyAnchorGenerator, + YOLOAnchorGenerator) +from .builder import ANCHOR_GENERATORS, build_anchor_generator +from .point_generator import PointGenerator +from .utils import anchor_inside_flags, calc_region, images_to_levels + +__all__ = [ + 'AnchorGenerator', 'LegacyAnchorGenerator', 'anchor_inside_flags', + 'PointGenerator', 'images_to_levels', 'calc_region', + 'build_anchor_generator', 'ANCHOR_GENERATORS', 'YOLOAnchorGenerator' +] diff --git a/insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py b/insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py new file mode 100755 index 0000000000000000000000000000000000000000..29b5ed04b95081b4145d2bd2272dbcda30be00d8 --- /dev/null +++ 
b/insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py @@ -0,0 +1,728 @@ +import mmcv +import numpy as np +import torch +from torch.nn.modules.utils import _pair + +from .builder import ANCHOR_GENERATORS + + +@ANCHOR_GENERATORS.register_module() +class AnchorGenerator(object): + """Standard anchor generator for 2D anchor-based detectors. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int] | None): Anchor scales for anchors in a single level. + It cannot be set at the same time if `octave_base_scale` and + `scales_per_octave` are set. + base_sizes (list[int] | None): The basic sizes + of anchors in multiple levels. + If None is given, strides will be used as base_sizes. + (If strides are non square, the shortest stride is taken.) + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int): The base scale of octave. + scales_per_octave (int): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float, float]] | None): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. If a list of tuple of + float is given, they will be used to shift the centers of anchors. + center_offset (float): The offset of center in proportion to anchors' + width and height. By default it is 0 in V2.0. 
+ + Examples: + >>> from mmdet.core import AnchorGenerator + >>> self = AnchorGenerator([16], [1.], [1.], [9]) + >>> all_anchors = self.grid_anchors([(2, 2)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]])] + >>> self = AnchorGenerator([16, 32], [1.], [1.], [9, 18]) + >>> all_anchors = self.grid_anchors([(2, 2), (1, 1)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]]), \ + tensor([[-9., -9., 9., 9.]])] + """ + + def __init__(self, + strides, + ratios, + scales=None, + base_sizes=None, + scale_major=True, + octave_base_scale=None, + scales_per_octave=None, + centers=None, + center_offset=0.): + # check center and center_offset + if center_offset != 0: + assert centers is None, 'center cannot be set when center_offset' \ + f'!=0, {centers} is given.' 
+ if not (0 <= center_offset <= 1): + raise ValueError('center_offset should be in range [0, 1], ' + f'{center_offset} is given.') + if centers is not None: + assert len(centers) == len(strides), \ + 'The number of strides should be the same as centers, got ' \ + f'{strides} and {centers}' + + # calculate base sizes of anchors + self.strides = [_pair(stride) for stride in strides] + self.base_sizes = [min(stride) for stride in self.strides + ] if base_sizes is None else base_sizes + assert len(self.base_sizes) == len(self.strides), \ + 'The number of strides should be the same as base sizes, got ' \ + f'{self.strides} and {self.base_sizes}' + + # calculate scales of anchors + assert ((octave_base_scale is not None + and scales_per_octave is not None) ^ (scales is not None)), \ + 'scales and octave_base_scale with scales_per_octave cannot' \ + ' be set at the same time' + if scales is not None: + self.scales = torch.Tensor(scales) + elif octave_base_scale is not None and scales_per_octave is not None: + octave_scales = np.array( + [2**(i / scales_per_octave) for i in range(scales_per_octave)]) + scales = octave_scales * octave_base_scale + self.scales = torch.Tensor(scales) + else: + raise ValueError('Either scales or octave_base_scale with ' + 'scales_per_octave should be set') + + self.octave_base_scale = octave_base_scale + self.scales_per_octave = scales_per_octave + self.ratios = torch.Tensor(ratios) + self.scale_major = scale_major + self.centers = centers + self.center_offset = center_offset + self.base_anchors = self.gen_base_anchors() + + @property + def num_base_anchors(self): + """list[int]: total number of base anchors in a feature grid""" + return [base_anchors.size(0) for base_anchors in self.base_anchors] + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + def gen_base_anchors(self): + """Generate base anchors. 
+ + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. + """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors( + base_size, + scales=self.scales, + ratios=self.ratios, + center=center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, + base_size, + scales, + ratios, + center=None): + """Generate base anchors of a single level. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between between the height + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * w + y_center = self.center_offset * h + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * ws, y_center - 0.5 * hs, x_center + 0.5 * ws, + y_center + 0.5 * hs + ] + base_anchors = torch.stack(base_anchors, dim=-1) + + return base_anchors + + def _meshgrid(self, x, y, row_major=True): + """Generate mesh grid of x and y. + + Args: + x (torch.Tensor): Grids of x dimension. + y (torch.Tensor): Grids of y dimension. + row_major (bool, optional): Whether to return y grids first. 
+ Defaults to True. + + Returns: + tuple[torch.Tensor]: The mesh grids of x and y. + """ + xx = x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_anchors(self, featmap_sizes, device='cuda'): + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + device (str): Device where the anchors will be put on. + + Return: + list[torch.Tensor]: Anchors in multiple feature levels. \ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature level, \ + num_base_anchors is the number of anchors for that level. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_anchors( + self.base_anchors[i].to(device), + featmap_sizes[i], + self.strides[i], + device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_anchors(self, + base_anchors, + featmap_size, + stride=(16, 16), + device='cuda'): + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_anchors``. + + Args: + base_anchors (torch.Tensor): The base anchors of a feature grid. + featmap_size (tuple[int]): Size of the feature maps. + stride (tuple[int], optional): Stride of the feature map in order + (w, h). Defaults to (16, 16). + device (str, optional): Device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. 
+ """ + feat_h, feat_w = featmap_size + # convert Tensor to int, so that we can covert to ONNX correctlly + feat_h = int(feat_h) + feat_w = int(feat_w) + shift_x = torch.arange(0, feat_w, device=device) * stride[0] + shift_y = torch.arange(0, feat_h, device=device) * stride[1] + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + shifts = shifts.type_as(base_anchors) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + return all_anchors + + def valid_flags(self, featmap_sizes, pad_shape, device='cuda'): + """Generate valid flags of anchors in multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels. + pad_shape (tuple): The padded shape of the image. + device (str): Device where the anchors will be put on. + + Return: + list(torch.Tensor): Valid flags of anchors in multiple levels. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + anchor_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / anchor_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / anchor_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), + (valid_feat_h, valid_feat_w), + self.num_base_anchors[i], + device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, + featmap_size, + valid_size, + num_base_anchors, + device='cuda'): + """Generate the valid flags of anchor in a single feature map. 
+ + Args: + featmap_size (tuple[int]): The size of feature maps. + valid_size (tuple[int]): The valid size of the feature maps. + num_base_anchors (int): The number of base anchors. + device (str, optional): Device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each anchor in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = valid[:, None].expand(valid.size(0), + num_base_anchors).contiguous().view(-1) + return valid + + def __repr__(self): + """str: a string that describes the module""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}strides={self.strides},\n' + repr_str += f'{indent_str}ratios={self.ratios},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}base_sizes={self.base_sizes},\n' + repr_str += f'{indent_str}scale_major={self.scale_major},\n' + repr_str += f'{indent_str}octave_base_scale=' + repr_str += f'{self.octave_base_scale},\n' + repr_str += f'{indent_str}scales_per_octave=' + repr_str += f'{self.scales_per_octave},\n' + repr_str += f'{indent_str}num_levels={self.num_levels}\n' + repr_str += f'{indent_str}centers={self.centers},\n' + repr_str += f'{indent_str}center_offset={self.center_offset})' + return repr_str + + +@ANCHOR_GENERATORS.register_module() +class SSDAnchorGenerator(AnchorGenerator): + """Anchor generator for SSD. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. 
+ basesize_ratio_range (tuple(float)): Ratio range of anchors. + input_size (int): Size of feature map, 300 for SSD300, + 512 for SSD512. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. It is always set to be False in SSD. + """ + + def __init__(self, + strides, + ratios, + basesize_ratio_range, + input_size=300, + scale_major=True): + assert len(strides) == len(ratios) + assert mmcv.is_tuple_of(basesize_ratio_range, float) + + self.strides = [_pair(stride) for stride in strides] + self.input_size = input_size + self.centers = [(stride[0] / 2., stride[1] / 2.) + for stride in self.strides] + self.basesize_ratio_range = basesize_ratio_range + + # calculate anchor ratios and sizes + min_ratio, max_ratio = basesize_ratio_range + min_ratio = int(min_ratio * 100) + max_ratio = int(max_ratio * 100) + step = int(np.floor(max_ratio - min_ratio) / (self.num_levels - 2)) + min_sizes = [] + max_sizes = [] + for ratio in range(int(min_ratio), int(max_ratio) + 1, step): + min_sizes.append(int(self.input_size * ratio / 100)) + max_sizes.append(int(self.input_size * (ratio + step) / 100)) + if self.input_size == 300: + if basesize_ratio_range[0] == 0.15: # SSD300 COCO + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + elif basesize_ratio_range[0] == 0.2: # SSD300 VOC + min_sizes.insert(0, int(self.input_size * 10 / 100)) + max_sizes.insert(0, int(self.input_size * 20 / 100)) + else: + raise ValueError( + 'basesize_ratio_range[0] should be either 0.15' + 'or 0.2 when input_size is 300, got ' + f'{basesize_ratio_range[0]}.') + elif self.input_size == 512: + if basesize_ratio_range[0] == 0.1: # SSD512 COCO + min_sizes.insert(0, int(self.input_size * 4 / 100)) + max_sizes.insert(0, int(self.input_size * 10 / 100)) + elif basesize_ratio_range[0] == 0.15: # SSD512 VOC + min_sizes.insert(0, int(self.input_size * 7 / 100)) + 
max_sizes.insert(0, int(self.input_size * 15 / 100)) + else: + raise ValueError('basesize_ratio_range[0] should be either 0.1' + 'or 0.15 when input_size is 512, got' + f' {basesize_ratio_range[0]}.') + else: + raise ValueError('Only support 300 or 512 in SSDAnchorGenerator' + f', got {self.input_size}.') + + anchor_ratios = [] + anchor_scales = [] + for k in range(len(self.strides)): + scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])] + anchor_ratio = [1.] + for r in ratios[k]: + anchor_ratio += [1 / r, r] # 4 or 6 ratio + anchor_ratios.append(torch.Tensor(anchor_ratio)) + anchor_scales.append(torch.Tensor(scales)) + + self.base_sizes = min_sizes + self.scales = anchor_scales + self.ratios = anchor_ratios + self.scale_major = scale_major + self.center_offset = 0 + self.base_anchors = self.gen_base_anchors() + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. + """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + base_anchors = self.gen_single_level_base_anchors( + base_size, + scales=self.scales[i], + ratios=self.ratios[i], + center=self.centers[i]) + indices = list(range(len(self.ratios[i]))) + indices.insert(1, len(indices)) + base_anchors = torch.index_select(base_anchors, 0, + torch.LongTensor(indices)) + multi_level_base_anchors.append(base_anchors) + return multi_level_base_anchors + + def __repr__(self): + """str: a string that describes the module""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}strides={self.strides},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}scale_major={self.scale_major},\n' + repr_str += f'{indent_str}input_size={self.input_size},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}ratios={self.ratios},\n' + repr_str += f'{indent_str}num_levels={self.num_levels},\n' + repr_str 
+= f'{indent_str}base_sizes={self.base_sizes},\n' + repr_str += f'{indent_str}basesize_ratio_range=' + repr_str += f'{self.basesize_ratio_range})' + return repr_str + + +@ANCHOR_GENERATORS.register_module() +class LegacyAnchorGenerator(AnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + Note: + Difference to the V2.0 anchor generator: + + 1. The center offset of V1.x anchors are set to be 0.5 rather than 0. + 2. The width/height are minused by 1 when calculating the anchors' \ + centers and corners to meet the V1.x coordinate system. + 3. The anchors' corners are quantized. + + Args: + strides (list[int] | list[tuple[int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int] | None): Anchor scales for anchors in a single level. + It cannot be set at the same time if `octave_base_scale` and + `scales_per_octave` are set. + base_sizes (list[int]): The basic sizes of anchors in multiple levels. + If None is given, strides will be used to generate base_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int): The base scale of octave. + scales_per_octave (int): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float, float]] | None): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. It a list of float + is given, this list will be used to shift the centers of anchors. + center_offset (float): The offset of center in propotion to anchors' + width and height. By default it is 0.5 in V2.0 but it should be 0.5 + in v1.x models. 
+ + Examples: + >>> from mmdet.core import LegacyAnchorGenerator + >>> self = LegacyAnchorGenerator( + >>> [16], [1.], [1.], [9], center_offset=0.5) + >>> all_anchors = self.grid_anchors(((2, 2),), device='cpu') + >>> print(all_anchors) + [tensor([[ 0., 0., 8., 8.], + [16., 0., 24., 8.], + [ 0., 16., 8., 24.], + [16., 16., 24., 24.]])] + """ + + def gen_single_level_base_anchors(self, + base_size, + scales, + ratios, + center=None): + """Generate base anchors of a single level. + + Note: + The width/height of anchors are minused by 1 when calculating \ + the centers and corners to meet the V1.x coordinate system. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between between the height. + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature map. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * (w - 1) + y_center = self.center_offset * (h - 1) + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * (ws - 1), y_center - 0.5 * (hs - 1), + x_center + 0.5 * (ws - 1), y_center + 0.5 * (hs - 1) + ] + base_anchors = torch.stack(base_anchors, dim=-1).round() + + return base_anchors + + +@ANCHOR_GENERATORS.register_module() +class LegacySSDAnchorGenerator(SSDAnchorGenerator, LegacyAnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. 
+ + The difference between `LegacySSDAnchorGenerator` and `SSDAnchorGenerator` + can be found in `LegacyAnchorGenerator`. + """ + + def __init__(self, + strides, + ratios, + basesize_ratio_range, + input_size=300, + scale_major=True): + super(LegacySSDAnchorGenerator, + self).__init__(strides, ratios, basesize_ratio_range, input_size, + scale_major) + self.centers = [((stride - 1) / 2., (stride - 1) / 2.) + for stride in strides] + self.base_anchors = self.gen_base_anchors() + + +@ANCHOR_GENERATORS.register_module() +class YOLOAnchorGenerator(AnchorGenerator): + """Anchor generator for YOLO. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + base_sizes (list[list[tuple[int, int]]]): The basic sizes + of anchors in multiple levels. + """ + + def __init__(self, strides, base_sizes): + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2., stride[1] / 2.) + for stride in self.strides] + self.base_sizes = [] + num_anchor_per_level = len(base_sizes[0]) + for base_sizes_per_level in base_sizes: + assert num_anchor_per_level == len(base_sizes_per_level) + self.base_sizes.append( + [_pair(base_size) for base_size in base_sizes_per_level]) + self.base_anchors = self.gen_base_anchors() + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.base_sizes) + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. 
+ """ + multi_level_base_anchors = [] + for i, base_sizes_per_level in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors(base_sizes_per_level, + center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, base_sizes_per_level, center=None): + """Generate base anchors of a single level. + + Args: + base_sizes_per_level (list[tuple[int, int]]): Basic sizes of + anchors. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + x_center, y_center = center + base_anchors = [] + for base_size in base_sizes_per_level: + w, h = base_size + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchor = torch.Tensor([ + x_center - 0.5 * w, y_center - 0.5 * h, x_center + 0.5 * w, + y_center + 0.5 * h + ]) + base_anchors.append(base_anchor) + base_anchors = torch.stack(base_anchors, dim=0) + + return base_anchors + + def responsible_flags(self, featmap_sizes, gt_bboxes, device='cuda'): + """Generate responsible anchor flags of grid cells in multiple scales. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in multiple + feature levels. + gt_bboxes (Tensor): Ground truth boxes, shape (n, 4). + device (str): Device where the anchors will be put on. 
+ + Return: + list(torch.Tensor): responsible flags of anchors in multiple level + """ + assert self.num_levels == len(featmap_sizes) + multi_level_responsible_flags = [] + for i in range(self.num_levels): + anchor_stride = self.strides[i] + flags = self.single_level_responsible_flags( + featmap_sizes[i], + gt_bboxes, + anchor_stride, + self.num_base_anchors[i], + device=device) + multi_level_responsible_flags.append(flags) + return multi_level_responsible_flags + + def single_level_responsible_flags(self, + featmap_size, + gt_bboxes, + stride, + num_base_anchors, + device='cuda'): + """Generate the responsible flags of anchor in a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps. + gt_bboxes (Tensor): Ground truth boxes, shape (n, 4). + stride (tuple(int)): stride of current level + num_base_anchors (int): The number of base anchors. + device (str, optional): Device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each anchor in a single level \ + feature map. 
+ """ + feat_h, feat_w = featmap_size + gt_bboxes_cx = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5).to(device) + gt_bboxes_cy = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5).to(device) + gt_bboxes_grid_x = torch.floor(gt_bboxes_cx / stride[0]).long() + gt_bboxes_grid_y = torch.floor(gt_bboxes_cy / stride[1]).long() + + # row major indexing + gt_bboxes_grid_idx = gt_bboxes_grid_y * feat_w + gt_bboxes_grid_x + + responsible_grid = torch.zeros( + feat_h * feat_w, dtype=torch.uint8, device=device) + responsible_grid[gt_bboxes_grid_idx] = 1 + + responsible_grid = responsible_grid[:, None].expand( + responsible_grid.size(0), num_base_anchors).contiguous().view(-1) + return responsible_grid diff --git a/insightface/detection/scrfd/mmdet/core/anchor/builder.py b/insightface/detection/scrfd/mmdet/core/anchor/builder.py new file mode 100755 index 0000000000000000000000000000000000000000..d79b448ebca9f2b21d455046623172c48c5c3ef0 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/anchor/builder.py @@ -0,0 +1,7 @@ +from mmcv.utils import Registry, build_from_cfg + +ANCHOR_GENERATORS = Registry('Anchor generator') + + +def build_anchor_generator(cfg, default_args=None): + return build_from_cfg(cfg, ANCHOR_GENERATORS, default_args) diff --git a/insightface/detection/scrfd/mmdet/core/anchor/point_generator.py b/insightface/detection/scrfd/mmdet/core/anchor/point_generator.py new file mode 100755 index 0000000000000000000000000000000000000000..e6fbd988c317992c092c68c827dc4c53223b4a4a --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/anchor/point_generator.py @@ -0,0 +1,37 @@ +import torch + +from .builder import ANCHOR_GENERATORS + + +@ANCHOR_GENERATORS.register_module() +class PointGenerator(object): + + def _meshgrid(self, x, y, row_major=True): + xx = x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_points(self, featmap_size, stride=16, device='cuda'): + feat_h, feat_w = 
featmap_size + shift_x = torch.arange(0., feat_w, device=device) * stride + shift_y = torch.arange(0., feat_h, device=device) * stride + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + stride = shift_x.new_full((shift_xx.shape[0], ), stride) + shifts = torch.stack([shift_xx, shift_yy, stride], dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, featmap_size, valid_size, device='cuda'): + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid diff --git a/insightface/detection/scrfd/mmdet/core/anchor/utils.py b/insightface/detection/scrfd/mmdet/core/anchor/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..ab9b53f37f7be1f52fe63c5e53df64ac1303b9e0 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/anchor/utils.py @@ -0,0 +1,71 @@ +import torch + + +def images_to_levels(target, num_levels): + """Convert targets by image to targets by feature level. + + [target_img0, target_img1] -> [target_level0, target_level1, ...] + """ + target = torch.stack(target, 0) + level_targets = [] + start = 0 + for n in num_levels: + end = start + n + # level_targets.append(target[:, start:end].squeeze(0)) + level_targets.append(target[:, start:end]) + start = end + return level_targets + + +def anchor_inside_flags(flat_anchors, + valid_flags, + img_shape, + allowed_border=0): + """Check whether the anchors are inside the border. + + Args: + flat_anchors (torch.Tensor): Flatten anchors, shape (n, 4). + valid_flags (torch.Tensor): An existing valid flags of anchors. + img_shape (tuple(int)): Shape of current image. 
+ allowed_border (int, optional): The border to allow the valid anchor. + Defaults to 0. + + Returns: + torch.Tensor: Flags indicating whether the anchors are inside a \ + valid range. + """ + img_h, img_w = img_shape[:2] + if allowed_border >= 0: + inside_flags = valid_flags & \ + (flat_anchors[:, 0] >= -allowed_border) & \ + (flat_anchors[:, 1] >= -allowed_border) & \ + (flat_anchors[:, 2] < img_w + allowed_border) & \ + (flat_anchors[:, 3] < img_h + allowed_border) + else: + inside_flags = valid_flags + return inside_flags + + +def calc_region(bbox, ratio, featmap_size=None): + """Calculate a proportional bbox region. + + The bbox center are fixed and the new h' and w' is h * ratio and w * ratio. + + Args: + bbox (Tensor): Bboxes to calculate regions, shape (n, 4). + ratio (float): Ratio of the output region. + featmap_size (tuple): Feature map size used for clipping the boundary. + + Returns: + tuple: x1, y1, x2, y2 + """ + x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long() + y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long() + x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long() + y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long() + if featmap_size is not None: + x1 = x1.clamp(min=0, max=featmap_size[1]) + y1 = y1.clamp(min=0, max=featmap_size[0]) + x2 = x2.clamp(min=0, max=featmap_size[1]) + y2 = y2.clamp(min=0, max=featmap_size[0]) + return (x1, y1, x2, y2) diff --git a/insightface/detection/scrfd/mmdet/core/bbox/__init__.py b/insightface/detection/scrfd/mmdet/core/bbox/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..28a1b9f9559f429bd463847586b28bf18070acd0 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/__init__.py @@ -0,0 +1,27 @@ +from .assigners import (AssignResult, BaseAssigner, CenterRegionAssigner, + MaxIoUAssigner) +from .builder import build_assigner, build_bbox_coder, build_sampler +from .coder import (BaseBBoxCoder, DeltaXYWHBBoxCoder, 
PseudoBBoxCoder, + TBLRBBoxCoder) +from .iou_calculators import BboxOverlaps2D, bbox_overlaps +from .samplers import (BaseSampler, CombinedSampler, + InstanceBalancedPosSampler, IoUBalancedNegSampler, + OHEMSampler, PseudoSampler, RandomSampler, + SamplingResult, ScoreHLRSampler) +from .transforms import (bbox2distance, bbox2result, bbox2roi, kps2distance, + bbox_cxcywh_to_xyxy, bbox_flip, bbox_mapping, + bbox_mapping_back, bbox_rescale, bbox_xyxy_to_cxcywh, + distance2bbox, distance2kps, roi2bbox) + +__all__ = [ + 'bbox_overlaps', 'BboxOverlaps2D', 'BaseAssigner', 'MaxIoUAssigner', + 'AssignResult', 'BaseSampler', 'PseudoSampler', 'RandomSampler', + 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', + 'OHEMSampler', 'SamplingResult', 'ScoreHLRSampler', 'build_assigner', + 'build_sampler', 'bbox_flip', 'bbox_mapping', 'bbox_mapping_back', + 'bbox2roi', 'roi2bbox', 'bbox2result', + 'distance2bbox', 'bbox2distance', 'distance2kps', 'kps2distance', + 'build_bbox_coder', 'BaseBBoxCoder', 'PseudoBBoxCoder', + 'DeltaXYWHBBoxCoder', 'TBLRBBoxCoder', 'CenterRegionAssigner', + 'bbox_rescale', 'bbox_cxcywh_to_xyxy', 'bbox_xyxy_to_cxcywh' +] diff --git a/insightface/detection/scrfd/mmdet/core/bbox/assigners/__init__.py b/insightface/detection/scrfd/mmdet/core/bbox/assigners/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..b8f0f48d8cfab09ae68ab2797f8ce0a5b8de0f12 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/assigners/__init__.py @@ -0,0 +1,15 @@ +from .approx_max_iou_assigner import ApproxMaxIoUAssigner +from .assign_result import AssignResult +from .atss_assigner import ATSSAssigner +from .base_assigner import BaseAssigner +from .center_region_assigner import CenterRegionAssigner +from .grid_assigner import GridAssigner +from .hungarian_assigner import HungarianAssigner +from .max_iou_assigner import MaxIoUAssigner +from .point_assigner import PointAssigner + +__all__ = [ + 'BaseAssigner', 
'MaxIoUAssigner', 'ApproxMaxIoUAssigner', 'AssignResult', + 'PointAssigner', 'ATSSAssigner', 'CenterRegionAssigner', 'GridAssigner', + 'HungarianAssigner' +] diff --git a/insightface/detection/scrfd/mmdet/core/bbox/assigners/approx_max_iou_assigner.py b/insightface/detection/scrfd/mmdet/core/bbox/assigners/approx_max_iou_assigner.py new file mode 100755 index 0000000000000000000000000000000000000000..6d07656d173744426795c81c14c6bcdb4e63a406 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/assigners/approx_max_iou_assigner.py @@ -0,0 +1,145 @@ +import torch + +from ..builder import BBOX_ASSIGNERS +from ..iou_calculators import build_iou_calculator +from .max_iou_assigner import MaxIoUAssigner + + +@BBOX_ASSIGNERS.register_module() +class ApproxMaxIoUAssigner(MaxIoUAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with an integer indicating the ground truth + index. (semi-positive index: gt label (0-based), -1: background) + + - -1: negative sample, no assigned gt + - semi-positive integer: positive sample, index (0-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + match_low_quality (bool): Whether to allow quality matches. 
This is + usually allowed for RPN and single stage detectors, but not allowed + in the second stage. + gpu_assign_thr (int): The upper bound of the number of GT for GPU + assign. When the number of gt is above this threshold, will assign + on CPU device. Negative values mean not assign on CPU. + """ + + def __init__(self, + pos_iou_thr, + neg_iou_thr, + min_pos_iou=.0, + gt_max_assign_all=True, + ignore_iof_thr=-1, + ignore_wrt_candidates=True, + match_low_quality=True, + gpu_assign_thr=-1, + iou_calculator=dict(type='BboxOverlaps2D')): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + self.gpu_assign_thr = gpu_assign_thr + self.match_low_quality = match_low_quality + self.iou_calculator = build_iou_calculator(iou_calculator) + + def assign(self, + approxs, + squares, + approxs_per_octave, + gt_bboxes, + gt_bboxes_ignore=None, + gt_labels=None): + """Assign gt to approxs. + + This method assign a gt bbox to each group of approxs (bboxes), + each group of approxs is represent by a base approx (bbox) and + will be assigned with -1, or a semi-positive number. + background_label (-1) means negative sample, + semi-positive number is the index (0-based) of assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to background_label (-1) + 2. use the max IoU of each group of approxs to assign + 2. assign proposals whose iou with all gts < neg_iou_thr to background + 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + approxs (Tensor): Bounding boxes to be assigned, + shape(approxs_per_octave*n, 4). + squares (Tensor): Base Bounding boxes to be assigned, + shape(n, 4). 
+ approxs_per_octave (int): number of approxs per octave + gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. + """ + num_squares = squares.size(0) + num_gts = gt_bboxes.size(0) + + if num_squares == 0 or num_gts == 0: + # No predictions and/or truth, return empty assignment + overlaps = approxs.new(num_gts, num_squares) + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result + + # re-organize anchors by approxs_per_octave x num_squares + approxs = torch.transpose( + approxs.view(num_squares, approxs_per_octave, 4), 0, + 1).contiguous().view(-1, 4) + assign_on_cpu = True if (self.gpu_assign_thr > 0) and ( + num_gts > self.gpu_assign_thr) else False + # compute overlap and assign gt on CPU when number of GT is large + if assign_on_cpu: + device = approxs.device + approxs = approxs.cpu() + gt_bboxes = gt_bboxes.cpu() + if gt_bboxes_ignore is not None: + gt_bboxes_ignore = gt_bboxes_ignore.cpu() + if gt_labels is not None: + gt_labels = gt_labels.cpu() + all_overlaps = self.iou_calculator(approxs, gt_bboxes) + + overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares, + num_gts).max(dim=0) + overlaps = torch.transpose(overlaps, 0, 1) + + if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None + and gt_bboxes_ignore.numel() > 0 and squares.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = self.iou_calculator( + squares, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = self.iou_calculator( + gt_bboxes_ignore, squares, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + 
if assign_on_cpu: + assign_result.gt_inds = assign_result.gt_inds.to(device) + assign_result.max_overlaps = assign_result.max_overlaps.to(device) + if assign_result.labels is not None: + assign_result.labels = assign_result.labels.to(device) + return assign_result diff --git a/insightface/detection/scrfd/mmdet/core/bbox/assigners/assign_result.py b/insightface/detection/scrfd/mmdet/core/bbox/assigners/assign_result.py new file mode 100755 index 0000000000000000000000000000000000000000..4639fbdba0a5b92778e1ab87d61182e54bfb9b6f --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/assigners/assign_result.py @@ -0,0 +1,204 @@ +import torch + +from mmdet.utils import util_mixins + + +class AssignResult(util_mixins.NiceRepr): + """Stores assignments between predicted and truth boxes. + + Attributes: + num_gts (int): the number of truth boxes considered when computing this + assignment + + gt_inds (LongTensor): for each predicted box indicates the 1-based + index of the assigned truth box. 0 means unassigned and -1 means + ignore. + + max_overlaps (FloatTensor): the iou between the predicted box and its + assigned truth box. + + labels (None | LongTensor): If specified, for each predicted box + indicates the category label of the assigned truth box. + + Example: + >>> # An assign result between 4 predicted boxes and 9 true boxes + >>> # where only two boxes were assigned. 
+ >>> num_gts = 9 + >>> max_overlaps = torch.LongTensor([0, .5, .9, 0]) + >>> gt_inds = torch.LongTensor([-1, 1, 2, 0]) + >>> labels = torch.LongTensor([0, 3, 4, 0]) + >>> self = AssignResult(num_gts, gt_inds, max_overlaps, labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + + >>> # Force addition of gt labels (when adding gt as proposals) + >>> new_labels = torch.LongTensor([3, 4, 5]) + >>> self.add_gt_(new_labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + + """ + + def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + # Interface for possible user-defined properties + self._extra_properties = {} + + @property + def num_preds(self): + """int: the number of predictions in this assignment""" + return len(self.gt_inds) + + def set_extra_property(self, key, value): + """Set user-defined new property.""" + assert key not in self.info + self._extra_properties[key] = value + + def get_extra_property(self, key): + """Get user-defined property.""" + return self._extra_properties.get(key, None) + + @property + def info(self): + """dict: a dictionary of info about the object""" + basic_info = { + 'num_gts': self.num_gts, + 'num_preds': self.num_preds, + 'gt_inds': self.gt_inds, + 'max_overlaps': self.max_overlaps, + 'labels': self.labels, + } + basic_info.update(self._extra_properties) + return basic_info + + def __nice__(self): + """str: a "nice" summary string describing this assign result""" + parts = [] + parts.append(f'num_gts={self.num_gts!r}') + if self.gt_inds is None: + parts.append(f'gt_inds={self.gt_inds!r}') + else: + parts.append(f'gt_inds.shape={tuple(self.gt_inds.shape)!r}') + if self.max_overlaps is None: + parts.append(f'max_overlaps={self.max_overlaps!r}') + else: + parts.append('max_overlaps.shape=' + f'{tuple(self.max_overlaps.shape)!r}') + if self.labels is None: + parts.append(f'labels={self.labels!r}') + else: + 
parts.append(f'labels.shape={tuple(self.labels.shape)!r}') + return ', '.join(parts) + + @classmethod + def random(cls, **kwargs): + """Create random AssignResult for tests or debugging. + + Args: + num_preds: number of predicted boxes + num_gts: number of true boxes + p_ignore (float): probability of a predicted box assinged to an + ignored truth + p_assigned (float): probability of a predicted box not being + assigned + p_use_label (float | bool): with labels or not + rng (None | int | numpy.random.RandomState): seed or state + + Returns: + :obj:`AssignResult`: Randomly generated assign results. + + Example: + >>> from mmdet.core.bbox.assigners.assign_result import * # NOQA + >>> self = AssignResult.random() + >>> print(self.info) + """ + from mmdet.core.bbox import demodata + rng = demodata.ensure_rng(kwargs.get('rng', None)) + + num_gts = kwargs.get('num_gts', None) + num_preds = kwargs.get('num_preds', None) + p_ignore = kwargs.get('p_ignore', 0.3) + p_assigned = kwargs.get('p_assigned', 0.7) + p_use_label = kwargs.get('p_use_label', 0.5) + num_classes = kwargs.get('p_use_label', 3) + + if num_gts is None: + num_gts = rng.randint(0, 8) + if num_preds is None: + num_preds = rng.randint(0, 16) + + if num_gts == 0: + max_overlaps = torch.zeros(num_preds, dtype=torch.float32) + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + if p_use_label is True or p_use_label < rng.rand(): + labels = torch.zeros(num_preds, dtype=torch.int64) + else: + labels = None + else: + import numpy as np + # Create an overlap for each predicted box + max_overlaps = torch.from_numpy(rng.rand(num_preds)) + + # Construct gt_inds for each predicted box + is_assigned = torch.from_numpy(rng.rand(num_preds) < p_assigned) + # maximum number of assignments constraints + n_assigned = min(num_preds, min(num_gts, is_assigned.sum())) + + assigned_idxs = np.where(is_assigned)[0] + rng.shuffle(assigned_idxs) + assigned_idxs = assigned_idxs[0:n_assigned] + assigned_idxs.sort() + + is_assigned[:] = 
0 + is_assigned[assigned_idxs] = True + + is_ignore = torch.from_numpy( + rng.rand(num_preds) < p_ignore) & is_assigned + + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + + true_idxs = np.arange(num_gts) + rng.shuffle(true_idxs) + true_idxs = torch.from_numpy(true_idxs) + gt_inds[is_assigned] = true_idxs[:n_assigned] + + gt_inds = torch.from_numpy( + rng.randint(1, num_gts + 1, size=num_preds)) + gt_inds[is_ignore] = -1 + gt_inds[~is_assigned] = 0 + max_overlaps[~is_assigned] = 0 + + if p_use_label is True or p_use_label < rng.rand(): + if num_classes == 0: + labels = torch.zeros(num_preds, dtype=torch.int64) + else: + labels = torch.from_numpy( + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + rng.randint(0, num_classes, size=num_preds)) + labels[~is_assigned] = 0 + else: + labels = None + + self = cls(num_gts, gt_inds, max_overlaps, labels) + return self + + def add_gt_(self, gt_labels): + """Add ground truth as assigned results. 
+ + Args: + gt_labels (torch.Tensor): Labels of gt boxes + """ + self_inds = torch.arange( + 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device) + self.gt_inds = torch.cat([self_inds, self.gt_inds]) + + self.max_overlaps = torch.cat( + [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps]) + + if self.labels is not None: + self.labels = torch.cat([gt_labels, self.labels]) diff --git a/insightface/detection/scrfd/mmdet/core/bbox/assigners/atss_assigner.py b/insightface/detection/scrfd/mmdet/core/bbox/assigners/atss_assigner.py new file mode 100755 index 0000000000000000000000000000000000000000..d41a2f2cc914fa5e9dce1a05128a257c9591df32 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/assigners/atss_assigner.py @@ -0,0 +1,215 @@ +import torch + +from ..builder import BBOX_ASSIGNERS +from ..iou_calculators import build_iou_calculator +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +@BBOX_ASSIGNERS.register_module() +class ATSSAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `0` or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + topk (float): number of bbox selected in each level + """ + + def __init__(self, + topk, + mode=0, + iou_calculator=dict(type='BboxOverlaps2D'), + ignore_iof_thr=-1): + self.topk = topk + self.mode = mode + self.iou_calculator = build_iou_calculator(iou_calculator) + self.ignore_iof_thr = ignore_iof_thr + + # https://github.com/sfzhang15/ATSS/blob/master/atss_core/modeling/rpn/atss/loss.py + + def assign(self, + bboxes, + num_level_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + gt_labels=None): + """Assign gt to bboxes. + + The assignment is done in following steps + + 1. compute iou between all bbox (bbox of all pyramid levels) and gt + 2. 
compute center distance between all bbox and gt + 3. on each pyramid level, for each gt, select k bbox whose center + are closest to the gt center, so we total select k*l bbox as + candidates for each gt + 4. get corresponding iou for the these candidates, and compute the + mean and std, set mean + std as the iou threshold + 5. select these candidates whose iou are greater than or equal to + the threshold as postive + 6. limit the positive sample's center in gt + + + Args: + bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4). + num_level_bboxes (List): num of bboxes in each level + gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. + """ + INF = 100000000 + bboxes = bboxes[:, :4] + num_gt, num_bboxes = gt_bboxes.size(0), bboxes.size(0) + #print('AT1:', num_gt, num_bboxes) + + # compute iou between all bbox and gt + overlaps = self.iou_calculator(bboxes, gt_bboxes) + + # assign 0 by default + assigned_gt_inds = overlaps.new_full((num_bboxes, ), + 0, + dtype=torch.long) + + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = overlaps.new_zeros((num_bboxes, )) + if num_gt == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + if gt_labels is None: + assigned_labels = None + else: + assigned_labels = overlaps.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + # compute center distance between all bbox and gt + gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 + gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 + gt_points = torch.stack((gt_cx, gt_cy), dim=1) + + gt_width = gt_bboxes[:,2] - gt_bboxes[:,0] + gt_height = gt_bboxes[:,3] - gt_bboxes[:,1] + 
gt_area = torch.sqrt( torch.clamp(gt_width*gt_height, min=1e-4) ) + + bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 + bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 + bboxes_points = torch.stack((bboxes_cx, bboxes_cy), dim=1) + + distances = (bboxes_points[:, None, :] - + gt_points[None, :, :]).pow(2).sum(-1).sqrt() + + if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None + and gt_bboxes_ignore.numel() > 0 and bboxes.numel() > 0): + ignore_overlaps = self.iou_calculator( + bboxes, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + ignore_idxs = ignore_max_overlaps > self.ignore_iof_thr + distances[ignore_idxs, :] = INF + assigned_gt_inds[ignore_idxs] = -1 + + # Selecting candidates based on the center distance + candidate_idxs = [] + start_idx = 0 + for level, bboxes_per_level in enumerate(num_level_bboxes): + # on each pyramid level, for each gt, + # select k bbox whose center are closest to the gt center + end_idx = start_idx + bboxes_per_level + distances_per_level = distances[start_idx:end_idx, :] #(A,G) + selectable_k = min(self.topk, bboxes_per_level) + _, topk_idxs_per_level = distances_per_level.topk( + selectable_k, dim=0, largest=False) + #print('AT-LEVEL:', start_idx, end_idx, bboxes_per_level, topk_idxs_per_level.shape) + candidate_idxs.append(topk_idxs_per_level + start_idx) + start_idx = end_idx + candidate_idxs = torch.cat(candidate_idxs, dim=0)# candidate anchors (topk*num_level_bboxes, G) = (AK, G) + + # get corresponding iou for the these candidates, and compute the + # mean and std, set mean + std as the iou threshold + candidate_overlaps = overlaps[candidate_idxs, torch.arange(num_gt)] #(AK,G) + overlaps_mean_per_gt = candidate_overlaps.mean(0) + overlaps_std_per_gt = candidate_overlaps.std(0) + overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt + + is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :] + #print('CAND:', candidate_idxs.shape, candidate_overlaps.shape, is_pos.shape) 
+ #print('BOXES:', bboxes_cx.shape) + + # limit the positive sample's center in gt + for gt_idx in range(num_gt): + candidate_idxs[:, gt_idx] += gt_idx * num_bboxes + ep_bboxes_cx = bboxes_cx.view(1, -1).expand( + num_gt, num_bboxes).contiguous().view(-1) + ep_bboxes_cy = bboxes_cy.view(1, -1).expand( + num_gt, num_bboxes).contiguous().view(-1) + candidate_idxs = candidate_idxs.view(-1) + + # calculate the left, top, right, bottom distance between positive + # bbox center and gt side + l_ = ep_bboxes_cx[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 0] + t_ = ep_bboxes_cy[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].view(-1, num_gt) + b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].view(-1, num_gt) + #is_in_gts = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] > 0.01 + dist_min = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] # (A,G) + dist_min.div_(gt_area) + #print('ATTT:', l_.shape, t_.shape, dist_min.shape, self.mode) + if self.mode==0: + is_in_gts = dist_min > 0.001 + elif self.mode==1: + is_in_gts = dist_min > -0.25 + elif self.mode==2: + is_in_gts = dist_min > -0.15 + #dist_expand = torch.clamp(gt_area / 16.0, min=1.0, max=3.0) + #dist_min.mul_(dist_expand) + #is_in_gts = dist_min > -0.25 + elif self.mode==3: + dist_expand = torch.clamp(gt_area / 16.0, min=1.0, max=6.0) + dist_min.mul_(dist_expand) + is_in_gts = dist_min > -0.2 + elif self.mode==4: + dist_expand = torch.clamp(gt_area / 16.0, min=0.5, max=6.0) + dist_min.mul_(dist_expand) + is_in_gts = dist_min > -0.2 + elif self.mode==5: + dist_div = torch.clamp(gt_area / 16.0, min=0.5, max=3.0) + dist_min.div_(dist_div) + is_in_gts = dist_min > -0.2 + else: + raise ValueError + #print(gt_area.shape, is_in_gts.shape, is_pos.shape) + is_pos = is_pos & is_in_gts + + # if an anchor box is assigned to multiple gts, + # the one with the highest IoU will be selected. 
+ overlaps_inf = torch.full_like(overlaps, + -INF).t().contiguous().view(-1) + index = candidate_idxs.view(-1)[is_pos.view(-1)] + overlaps_inf[index] = overlaps.t().contiguous().view(-1)[index] + overlaps_inf = overlaps_inf.view(num_gt, -1).t() + + max_overlaps, argmax_overlaps = overlaps_inf.max(dim=1) + assigned_gt_inds[ + max_overlaps != -INF] = argmax_overlaps[max_overlaps != -INF] + 1 + + if gt_labels is not None: + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[ + assigned_gt_inds[pos_inds] - 1] + else: + assigned_labels = None + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) diff --git a/insightface/detection/scrfd/mmdet/core/bbox/assigners/base_assigner.py b/insightface/detection/scrfd/mmdet/core/bbox/assigners/base_assigner.py new file mode 100755 index 0000000000000000000000000000000000000000..2da9e0f4aa55b46e0059a037c18cb58577d04871 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/assigners/base_assigner.py @@ -0,0 +1,10 @@ +from abc import ABCMeta, abstractmethod + + +class BaseAssigner(metaclass=ABCMeta): + """Base assigner that assigns boxes to ground truth boxes.""" + + @abstractmethod + def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): + """Assign boxes to either a ground truth boxe or a negative boxes.""" + pass diff --git a/insightface/detection/scrfd/mmdet/core/bbox/assigners/center_region_assigner.py b/insightface/detection/scrfd/mmdet/core/bbox/assigners/center_region_assigner.py new file mode 100755 index 0000000000000000000000000000000000000000..488e3b615318787751cab3211e38dd9471c666be --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/assigners/center_region_assigner.py @@ -0,0 +1,335 @@ +import torch + +from ..builder import BBOX_ASSIGNERS +from ..iou_calculators import 
import torch

from ..builder import BBOX_ASSIGNERS
from ..iou_calculators import build_iou_calculator
from .assign_result import AssignResult
from .base_assigner import BaseAssigner


def scale_boxes(bboxes, scale):
    """Expand (or shrink) an array of boxes around their centers.

    Args:
        bboxes (Tensor): Shape (m, 4)
        scale (float): The scale factor of bboxes

    Returns:
        (Tensor): Shape (m, 4). Scaled bboxes
    """
    assert bboxes.size(1) == 4
    # Half-extents and centers of each box.
    w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5
    h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5
    x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5
    y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5

    w_half *= scale
    h_half *= scale

    boxes_scaled = torch.zeros_like(bboxes)
    boxes_scaled[:, 0] = x_c - w_half
    boxes_scaled[:, 2] = x_c + w_half
    boxes_scaled[:, 1] = y_c - h_half
    boxes_scaled[:, 3] = y_c + h_half
    return boxes_scaled


def is_located_in(points, bboxes):
    """Are points located in bboxes.

    Boundaries are exclusive: a point exactly on a box edge is NOT
    considered inside (strict ``<`` / ``>`` comparisons below).

    Args:
        points (Tensor): Points, shape: (m, 2).
        bboxes (Tensor): Bounding boxes, shape: (n, 4).

    Return:
        Tensor: Flags indicating if points are located in bboxes, shape: (m, n).
    """
    assert points.size(1) == 2
    assert bboxes.size(1) == 4
    return (points[:, 0].unsqueeze(1) > bboxes[:, 0].unsqueeze(0)) & \
           (points[:, 0].unsqueeze(1) < bboxes[:, 2].unsqueeze(0)) & \
           (points[:, 1].unsqueeze(1) > bboxes[:, 1].unsqueeze(0)) & \
           (points[:, 1].unsqueeze(1) < bboxes[:, 3].unsqueeze(0))


def bboxes_area(bboxes):
    """Compute the area of an array of bboxes.

    Args:
        bboxes (Tensor): The coordinates of bboxes. Shape: (m, 4)

    Returns:
        Tensor: Area of the bboxes. Shape: (m, )
    """
    assert bboxes.size(1) == 4
    w = (bboxes[:, 2] - bboxes[:, 0])
    h = (bboxes[:, 3] - bboxes[:, 1])
    areas = w * h
    return areas


@BBOX_ASSIGNERS.register_module()
class CenterRegionAssigner(BaseAssigner):
    """Assign pixels at the center region of a bbox as positive.

    Each proposals will be assigned with `-1`, `0`, or a positive integer
    indicating the ground truth index.

    - -1: negative samples
    - semi-positive numbers: positive sample, index (0-based) of assigned gt

    Args:
        pos_scale (float): Scale of the center (core) region of a gt box;
            pixels whose center falls in this region are candidates for
            positive assignment.
        neg_scale (float): Scale of the shadow region of a gt box; pixels
            between the ``pos_scale`` core and the ``neg_scale`` shadow are
            treated as shadowed (ignored in the loss) rather than negative.
        min_pos_iof (float): Minimum iof of a pixel with a gt to be
            labelled as positive. Default: 1e-2
        ignore_gt_scale (float): Threshold within which the pixels
            are ignored when the gt is labelled as shadowed. Default: 0.5
        foreground_dominate (bool): If True, the bbox will be assigned as
            positive when a gt's kernel region overlaps with another's shadowed
            (ignored) region, otherwise it is set as ignored. Default to False.
    """

    def __init__(self,
                 pos_scale,
                 neg_scale,
                 min_pos_iof=1e-2,
                 ignore_gt_scale=0.5,
                 foreground_dominate=False,
                 iou_calculator=dict(type='BboxOverlaps2D')):
        self.pos_scale = pos_scale
        self.neg_scale = neg_scale
        self.min_pos_iof = min_pos_iof
        self.ignore_gt_scale = ignore_gt_scale
        self.foreground_dominate = foreground_dominate
        self.iou_calculator = build_iou_calculator(iou_calculator)

    def get_gt_priorities(self, gt_bboxes):
        """Get gt priorities according to their areas.

        Smaller gt has higher priority.

        Args:
            gt_bboxes (Tensor): Ground truth boxes, shape (k, 4).

        Returns:
            Tensor: The priority of gts so that gts with larger priority is \
              more likely to be assigned. Shape (k, )
        """
        gt_areas = bboxes_area(gt_bboxes)
        # Rank all gt bbox areas. Smaller objects has larger priority
        _, sort_idx = gt_areas.sort(descending=True)
        # argsort of the sort indices gives each gt's rank in the ordering.
        sort_idx = sort_idx.argsort()
        return sort_idx

    def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
        """Assign gt to bboxes.

        This method assigns gts to every bbox (proposal/anchor), each bbox \
        will be assigned with -1, or a semi-positive number. -1 means \
        negative sample, semi-positive number is the index (0-based) of \
        assigned gt.

        Args:
            bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4).
            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
            gt_bboxes_ignore (tensor, optional): Ground truth bboxes that are
                labelled as `ignored`, e.g., crowd boxes in COCO.
            gt_labels (tensor, optional): Label of gt_bboxes, shape (num_gts,).

        Returns:
            :obj:`AssignResult`: The assigned result. Note that \
              shadowed_labels of shape (N, 2) is also added as an \
              `assign_result` attribute. `shadowed_labels` is a tensor \
              composed of N pairs of [anchor_ind, class_label], where N \
              is the number of anchors that lie in the outer region of a \
              gt, anchor_ind is the shadowed anchor index and class_label \
              is the shadowed class label.

        Example:
            >>> self = CenterRegionAssigner(0.2, 0.2)
            >>> bboxes = torch.Tensor([[0, 0, 10, 10], [10, 10, 20, 20]])
            >>> gt_bboxes = torch.Tensor([[0, 0, 10, 10]])
            >>> assign_result = self.assign(bboxes, gt_bboxes)
            >>> expected_gt_inds = torch.LongTensor([1, 0])
            >>> assert torch.all(assign_result.gt_inds == expected_gt_inds)
        """
        # There are in total 5 steps in the pixel assignment
        # 1. Find core (the center region, say inner 0.2)
        #     and shadow (the relatively ourter part, say inner 0.2-0.5)
        #     regions of every gt.
        # 2. Find all prior bboxes that lie in gt_core and gt_shadow regions
        # 3. Assign prior bboxes in gt_core with a one-hot id of the gt in
        #     the image.
        #    3.1. For overlapping objects, the prior bboxes in gt_core is
        #          assigned with the object with smallest area
        # 4. Assign prior bboxes with class label according to its gt id.
        #    4.1. Assign -1 to prior bboxes lying in shadowed gts
        #    4.2. Assign positive prior boxes with the corresponding label
        # 5. Find pixels lying in the shadow of an object and assign them with
        #     background label, but set the loss weight of its corresponding
        #     gt to zero.
        assert bboxes.size(1) == 4, 'bboxes must have size of 4'
        # 1. Find core positive and shadow region of every gt
        gt_core = scale_boxes(gt_bboxes, self.pos_scale)
        gt_shadow = scale_boxes(gt_bboxes, self.neg_scale)

        # 2. Find prior bboxes that lie in gt_core and gt_shadow regions
        bbox_centers = (bboxes[:, 2:4] + bboxes[:, 0:2]) / 2
        # The center points lie within the gt boxes
        is_bbox_in_gt = is_located_in(bbox_centers, gt_bboxes)
        # Only calculate bbox and gt_core IoF. This enables small prior bboxes
        #   to match large gts
        bbox_and_gt_core_overlaps = self.iou_calculator(
            bboxes, gt_core, mode='iof')
        # The center point of effective priors should be within the gt box
        is_bbox_in_gt_core = is_bbox_in_gt & (
            bbox_and_gt_core_overlaps > self.min_pos_iof)  # shape (n, k)

        is_bbox_in_gt_shadow = (
            self.iou_calculator(bboxes, gt_shadow, mode='iof') >
            self.min_pos_iof)
        # Rule out center effective positive pixels
        is_bbox_in_gt_shadow &= (~is_bbox_in_gt_core)

        num_gts, num_bboxes = gt_bboxes.size(0), bboxes.size(0)
        if num_gts == 0 or num_bboxes == 0:
            # If no gts exist, assign all pixels to negative
            assigned_gt_ids = \
                is_bbox_in_gt_core.new_zeros((num_bboxes,),
                                             dtype=torch.long)
            pixels_in_gt_shadow = assigned_gt_ids.new_empty((0, 2))
        else:
            # Step 3: assign a one-hot gt id to each pixel, and smaller objects
            #    have high priority to assign the pixel.
            sort_idx = self.get_gt_priorities(gt_bboxes)
            assigned_gt_ids, pixels_in_gt_shadow = \
                self.assign_one_hot_gt_indices(is_bbox_in_gt_core,
                                               is_bbox_in_gt_shadow,
                                               gt_priority=sort_idx)

        if gt_bboxes_ignore is not None and gt_bboxes_ignore.numel() > 0:
            # Scale the ignored gt boxes and mark every prior whose center
            # falls inside any of them as ignored (-1).
            gt_bboxes_ignore = scale_boxes(
                gt_bboxes_ignore, scale=self.ignore_gt_scale)
            is_bbox_in_ignored_gts = is_located_in(bbox_centers,
                                                   gt_bboxes_ignore)
            is_bbox_in_ignored_gts = is_bbox_in_ignored_gts.any(dim=1)
            assigned_gt_ids[is_bbox_in_ignored_gts] = -1

        # 4. Assign prior bboxes with class label according to its gt id.
        assigned_labels = None
        shadowed_pixel_labels = None
        if gt_labels is not None:
            # Default assigned label is the background (-1)
            assigned_labels = assigned_gt_ids.new_full((num_bboxes, ), -1)
            pos_inds = torch.nonzero(
                assigned_gt_ids > 0, as_tuple=False).squeeze()
            if pos_inds.numel() > 0:
                assigned_labels[pos_inds] = gt_labels[assigned_gt_ids[pos_inds]
                                                      - 1]
            # 5. Find pixels lying in the shadow of an object
            shadowed_pixel_labels = pixels_in_gt_shadow.clone()
            if pixels_in_gt_shadow.numel() > 0:
                pixel_idx, gt_idx =\
                    pixels_in_gt_shadow[:, 0], pixels_in_gt_shadow[:, 1]
                assert (assigned_gt_ids[pixel_idx] != gt_idx).all(), \
                    'Some pixels are dually assigned to ignore and gt!'
                shadowed_pixel_labels[:, 1] = gt_labels[gt_idx - 1]
                override = (
                    assigned_labels[pixel_idx] == shadowed_pixel_labels[:, 1])
                if self.foreground_dominate:
                    # When a pixel is both positive and shadowed, set it as pos
                    shadowed_pixel_labels = shadowed_pixel_labels[~override]
                else:
                    # When a pixel is both pos and shadowed, set it as shadowed
                    assigned_labels[pixel_idx[override]] = -1
                    assigned_gt_ids[pixel_idx[override]] = 0

        assign_result = AssignResult(
            num_gts, assigned_gt_ids, None, labels=assigned_labels)
        # Add shadowed_labels as assign_result property. Shape: (num_shadow, 2)
        assign_result.set_extra_property('shadowed_labels',
                                         shadowed_pixel_labels)
        return assign_result

    def assign_one_hot_gt_indices(self,
                                  is_bbox_in_gt_core,
                                  is_bbox_in_gt_shadow,
                                  gt_priority=None):
        """Assign only one gt index to each prior box.

        Gts with large gt_priority are more likely to be assigned.

        Args:
            is_bbox_in_gt_core (Tensor): Bool tensor indicating the bbox center
              is in the core area of a gt (e.g. 0-0.2).
              Shape: (num_prior, num_gt).
            is_bbox_in_gt_shadow (Tensor): Bool tensor indicating the bbox
              center is in the shadowed area of a gt (e.g. 0.2-0.5).
              Shape: (num_prior, num_gt).
            gt_priority (Tensor): Priorities of gts. The gt with a higher
              priority is more likely to be assigned to the bbox when the bbox
              match with multiple gts. Shape: (num_gt, ).

        Returns:
            tuple: Returns (assigned_gt_inds, shadowed_gt_inds).

                - assigned_gt_inds: The assigned gt index of each prior bbox \
                    (i.e. index from 1 to num_gts). Shape: (num_prior, ).
                - shadowed_gt_inds: shadowed gt indices. It is a tensor of \
                    shape (num_ignore, 2) with first column being the \
                    shadowed prior bbox indices and the second column the \
                    shadowed gt indices (1-based).
        """
        num_bboxes, num_gts = is_bbox_in_gt_core.shape

        if gt_priority is None:
            gt_priority = torch.arange(
                num_gts, device=is_bbox_in_gt_core.device)
        assert gt_priority.size(0) == num_gts
        # The bigger gt_priority, the more preferable to be assigned
        # The assigned inds are by default 0 (background)
        assigned_gt_inds = is_bbox_in_gt_core.new_zeros((num_bboxes, ),
                                                        dtype=torch.long)
        # Shadowed bboxes are assigned to be background. But the corresponding
        #   label is ignored during loss calculation, which is done through
        #   shadowed_gt_inds
        shadowed_gt_inds = torch.nonzero(is_bbox_in_gt_shadow, as_tuple=False)
        if is_bbox_in_gt_core.sum() == 0:  # No gt match
            shadowed_gt_inds[:, 1] += 1  # 1-based. For consistency issue
            return assigned_gt_inds, shadowed_gt_inds

        # The priority of each prior box and gt pair. If one prior box is
        #  matched bo multiple gts. Only the pair with the highest priority
        #  is saved
        pair_priority = is_bbox_in_gt_core.new_full((num_bboxes, num_gts),
                                                    -1,
                                                    dtype=torch.long)

        # Each bbox could match with multiple gts.
        # The following codes deal with this situation
        # Matched bboxes (to any gt). Shape: (num_pos_anchor, )
        inds_of_match = torch.any(is_bbox_in_gt_core, dim=1)
        # The matched gt index of each positive bbox. Length >= num_pos_anchor
        #   , since one bbox could match multiple gts
        matched_bbox_gt_inds = torch.nonzero(
            is_bbox_in_gt_core, as_tuple=False)[:, 1]
        # Assign priority to each bbox-gt pair.
        pair_priority[is_bbox_in_gt_core] = gt_priority[matched_bbox_gt_inds]
        _, argmax_priority = pair_priority[inds_of_match].max(dim=1)
        assigned_gt_inds[inds_of_match] = argmax_priority + 1  # 1-based
        # Zero-out the assigned anchor box to filter the shadowed gt indices
        is_bbox_in_gt_core[inds_of_match, argmax_priority] = 0
        # Concat the shadowed indices due to overlapping with that out side of
        #   effective scale. shape: (total_num_ignore, 2)
        shadowed_gt_inds = torch.cat(
            (shadowed_gt_inds, torch.nonzero(
                is_bbox_in_gt_core, as_tuple=False)),
            dim=0)
        # `is_bbox_in_gt_core` should be changed back to keep arguments intact.
        is_bbox_in_gt_core[inds_of_match, argmax_priority] = 1
        # 1-based shadowed gt indices, to be consistent with `assigned_gt_inds`
        if shadowed_gt_inds.numel() > 0:
            shadowed_gt_inds[:, 1] += 1
        return assigned_gt_inds, shadowed_gt_inds
@BBOX_ASSIGNERS.register_module()
class GridAssigner(BaseAssigner):
    """Assign a corresponding gt bbox or background to each bbox.

    Each proposals will be assigned with `-1`, `0`, or a positive integer
    indicating the ground truth index.

    - -1: don't care
    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        pos_iou_thr (float): IoU threshold for positive bboxes.
        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
        min_pos_iou (float): Minimum iou for a bbox to be considered as a
            positive bbox. Positive samples can have smaller IoU than
            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
        gt_max_assign_all (bool): Whether to assign all bboxes with the same
            highest overlap with some gt to that gt.
    """

    def __init__(self,
                 pos_iou_thr,
                 neg_iou_thr,
                 min_pos_iou=.0,
                 gt_max_assign_all=True,
                 iou_calculator=dict(type='BboxOverlaps2D')):
        self.pos_iou_thr = pos_iou_thr
        self.neg_iou_thr = neg_iou_thr
        self.min_pos_iou = min_pos_iou
        self.gt_max_assign_all = gt_max_assign_all
        self.iou_calculator = build_iou_calculator(iou_calculator)

    def assign(self, bboxes, box_responsible_flags, gt_bboxes, gt_labels=None):
        """Assign gt to bboxes. The process is very much like the max iou
        assigner, except that positive samples are constrained within the cell
        that the gt boxes fell in.

        This method assign a gt bbox to every bbox (proposal/anchor), each bbox
        will be assigned with -1, 0, or a positive number. -1 means don't care,
        0 means negative sample, positive number is the index (1-based) of
        assigned gt.
        The assignment is done in following steps, the order matters.

        1. assign every bbox to -1
        2. assign proposals whose iou with all gts <= neg_iou_thr to 0
        3. for each bbox within a cell, if the iou with its nearest gt >
            pos_iou_thr and the center of that gt falls inside the cell,
            assign it to that bbox
        4. for each gt bbox, assign its nearest proposals within the cell the
            gt bbox falls in to itself.

        Args:
            bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4).
            box_responsible_flags (Tensor): flag to indicate whether box is
                responsible for prediction, shape(n, )
            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).

        Returns:
            :obj:`AssignResult`: The assign result.
        """
        num_gts, num_bboxes = gt_bboxes.size(0), bboxes.size(0)

        # compute iou between all gt and bboxes
        overlaps = self.iou_calculator(gt_bboxes, bboxes)

        # 1. assign -1 by default
        assigned_gt_inds = overlaps.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)

        if num_gts == 0 or num_bboxes == 0:
            # No ground truth or boxes, return empty assignment
            max_overlaps = overlaps.new_zeros((num_bboxes, ))
            if num_gts == 0:
                # No truth, assign everything to background
                assigned_gt_inds[:] = 0
            if gt_labels is None:
                assigned_labels = None
            else:
                assigned_labels = overlaps.new_full((num_bboxes, ),
                                                    -1,
                                                    dtype=torch.long)
            return AssignResult(
                num_gts,
                assigned_gt_inds,
                max_overlaps,
                labels=assigned_labels)

        # 2. assign negative: below
        # for each anchor, which gt best overlaps with it
        # for each anchor, the max iou of all gts
        # shape of max_overlaps == argmax_overlaps == num_bboxes
        max_overlaps, argmax_overlaps = overlaps.max(dim=0)

        if isinstance(self.neg_iou_thr, float):
            assigned_gt_inds[(max_overlaps >= 0)
                             & (max_overlaps <= self.neg_iou_thr)] = 0
        elif isinstance(self.neg_iou_thr, (tuple, list)):
            assert len(self.neg_iou_thr) == 2
            assigned_gt_inds[(max_overlaps > self.neg_iou_thr[0])
                             & (max_overlaps <= self.neg_iou_thr[1])] = 0

        # 3. assign positive: falls into responsible cell and above
        # positive IOU threshold, the order matters.
        # the prior condition of comparison is to filter out all
        # unrelated anchors, i.e. not box_responsible_flags
        overlaps[:, ~box_responsible_flags.type(torch.bool)] = -1.

        # calculate max_overlaps again, but this time we only consider IOUs
        # for anchors responsible for prediction
        max_overlaps, argmax_overlaps = overlaps.max(dim=0)

        # for each gt, which anchor best overlaps with it
        # for each gt, the max iou of all proposals
        # shape of gt_max_overlaps == gt_argmax_overlaps == num_gts
        gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)

        pos_inds = (max_overlaps >
                    self.pos_iou_thr) & box_responsible_flags.type(torch.bool)
        assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1

        # 4. assign positive to max overlapped anchors within responsible cell
        for i in range(num_gts):
            if gt_max_overlaps[i] > self.min_pos_iou:
                if self.gt_max_assign_all:
                    max_iou_inds = (overlaps[i, :] == gt_max_overlaps[i]) & \
                         box_responsible_flags.type(torch.bool)
                    assigned_gt_inds[max_iou_inds] = i + 1
                elif box_responsible_flags[gt_argmax_overlaps[i]]:
                    assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1

        # assign labels of positive anchors
        if gt_labels is not None:
            assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
            pos_inds = torch.nonzero(
                assigned_gt_inds > 0, as_tuple=False).squeeze()
            if pos_inds.numel() > 0:
                assigned_labels[pos_inds] = gt_labels[
                    assigned_gt_inds[pos_inds] - 1]

        else:
            assigned_labels = None

        return AssignResult(
            num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
@BBOX_ASSIGNERS.register_module()
class HungarianAssigner(BaseAssigner):
    """Computes one-to-one matching between predictions and ground truth.

    This class computes an assignment between the targets and the predictions
    based on the costs. The costs are weighted sum of three components:
    classification cost, regression L1 cost and regression iou cost. The
    targets don't include the no_object, so generally there are more
    predictions than targets. After the one-to-one matching, the un-matched
    are treated as backgrounds. Thus each query prediction will be assigned
    with `0` or a positive integer indicating the ground truth index:

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        cls_weight (int | float, optional): The scale factor for classification
            cost. Default 1.0.
        bbox_weight (int | float, optional): The scale factor for regression
            L1 cost. Default 1.0.
        iou_weight (int | float, optional): The scale factor for regression
            iou cost. Default 1.0.
        iou_calculator (dict | optional): The config for the iou calculation.
            Default type `BboxOverlaps2D`.
        iou_mode (str | optional): "iou" (intersection over union), "iof"
                (intersection over foreground), or "giou" (generalized
                intersection over union). Default "giou".
    """

    def __init__(self,
                 cls_weight=1.,
                 bbox_weight=1.,
                 iou_weight=1.,
                 iou_calculator=dict(type='BboxOverlaps2D'),
                 iou_mode='giou'):
        # By default giou cost is used, as in the official DETR repo.
        self.iou_mode = iou_mode
        self.cls_weight = cls_weight
        self.bbox_weight = bbox_weight
        self.iou_weight = iou_weight
        self.iou_calculator = build_iou_calculator(iou_calculator)

    def assign(self,
               bbox_pred,
               cls_pred,
               gt_bboxes,
               gt_labels,
               img_meta,
               gt_bboxes_ignore=None,
               eps=1e-7):
        """Computes one-to-one matching based on the weighted costs.

        This method assign each query prediction to a ground truth or
        background. The `assigned_gt_inds` with -1 means don't care,
        0 means negative sample, and positive number is the index (1-based)
        of assigned gt.
        The assignment is done in the following steps, the order matters.

        1. assign every prediction to -1
        2. compute the weighted costs
        3. do Hungarian matching on CPU based on the costs
        4. assign all to 0 (background) first, then for each matched pair
           between predictions and gts, treat this prediction as foreground
           and assign the corresponding gt index (plus 1) to it.

        Args:
            bbox_pred (Tensor): Predicted boxes with normalized coordinates
                (cx, cy, w, h), which are all in range [0, 1]. Shape
                [num_query, 4].
            cls_pred (Tensor): Predicted classification logits, shape
                [num_query, num_class].
            gt_bboxes (Tensor): Ground truth boxes with unnormalized
                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
            img_meta (dict): Meta information for current image.
            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
                labelled as `ignored`. Default None.
            eps (int | float, optional): A value added to the denominator for
                numerical stability. Default 1e-7.

        Returns:
            :obj:`AssignResult`: The assigned result.
        """
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)

        # 1. assign -1 by default
        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
                                              -1,
                                              dtype=torch.long)
        assigned_labels = bbox_pred.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)
        if num_gts == 0 or num_bboxes == 0:
            # No ground truth or boxes, return empty assignment
            if num_gts == 0:
                # No ground truth, assign all to background
                assigned_gt_inds[:] = 0
            return AssignResult(
                num_gts, assigned_gt_inds, None, labels=assigned_labels)

        # 2. compute the weighted costs
        # classification cost.
        # Following the official DETR repo, contrary to the loss that
        # NLL is used, we approximate it in 1 - cls_score[gt_label].
        # The 1 is a constant that doesn't change the matching,
        # so it can be omitted.
        cls_score = cls_pred.softmax(-1)
        cls_cost = -cls_score[:, gt_labels]  # [num_bboxes, num_gt]

        # regression L1 cost
        img_h, img_w, _ = img_meta['img_shape']
        factor = torch.Tensor([img_w, img_h, img_w,
                               img_h]).unsqueeze(0).to(gt_bboxes.device)
        gt_bboxes_normalized = gt_bboxes / factor
        bbox_cost = torch.cdist(
            bbox_pred, bbox_xyxy_to_cxcywh(gt_bboxes_normalized),
            p=1)  # [num_bboxes, num_gt]

        # regression iou cost, by default giou is used as in official DETR.
        bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor
        # overlaps: [num_bboxes, num_gt]
        overlaps = self.iou_calculator(
            bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False)
        # The 1 is a constant that doesn't change the matching, so omitted.
        iou_cost = -overlaps

        # weighted sum of above three costs
        cost = self.cls_weight * cls_cost + self.bbox_weight * bbox_cost
        cost = cost + self.iou_weight * iou_cost

        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu()
        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
        matched_row_inds = torch.from_numpy(matched_row_inds).to(
            bbox_pred.device)
        matched_col_inds = torch.from_numpy(matched_col_inds).to(
            bbox_pred.device)

        # 4. assign backgrounds and foregrounds
        # assign all indices to backgrounds first
        assigned_gt_inds[:] = 0
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
        return AssignResult(
            num_gts, assigned_gt_inds, None, labels=assigned_labels)
import torch

from ..builder import BBOX_ASSIGNERS
from ..iou_calculators import build_iou_calculator
from .assign_result import AssignResult
from .base_assigner import BaseAssigner


@BBOX_ASSIGNERS.register_module()
class MaxIoUAssigner(BaseAssigner):
    """Assign a corresponding gt bbox or background to each bbox.

    Each proposals will be assigned with `-1`, or a semi-positive integer
    indicating the ground truth index.

    - -1: negative sample, no assigned gt
    - semi-positive integer: positive sample, index (0-based) of assigned gt

    Args:
        pos_iou_thr (float): IoU threshold for positive bboxes.
        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
        min_pos_iou (float): Minimum iou for a bbox to be considered as a
            positive bbox. Positive samples can have smaller IoU than
            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
        gt_max_assign_all (bool): Whether to assign all bboxes with the same
            highest overlap with some gt to that gt.
        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
            `gt_bboxes_ignore` is specified). Negative values mean not
            ignoring any bboxes.
        ignore_wrt_candidates (bool): Whether to compute the iof between
            `bboxes` and `gt_bboxes_ignore`, or the contrary.
        match_low_quality (bool): Whether to allow low quality matches. This is
            usually allowed for RPN and single stage detectors, but not allowed
            in the second stage. Details are demonstrated in Step 4.
        gpu_assign_thr (int): The upper bound of the number of GT for GPU
            assign. When the number of gt is above this threshold, will assign
            on CPU device. Negative values mean not assign on CPU.
    """

    def __init__(self,
                 pos_iou_thr,
                 neg_iou_thr,
                 min_pos_iou=.0,
                 gt_max_assign_all=True,
                 ignore_iof_thr=-1,
                 ignore_wrt_candidates=True,
                 match_low_quality=True,
                 gpu_assign_thr=-1,
                 iou_calculator=dict(type='BboxOverlaps2D')):
        self.pos_iou_thr = pos_iou_thr
        self.neg_iou_thr = neg_iou_thr
        self.min_pos_iou = min_pos_iou
        self.gt_max_assign_all = gt_max_assign_all
        self.ignore_iof_thr = ignore_iof_thr
        self.ignore_wrt_candidates = ignore_wrt_candidates
        self.gpu_assign_thr = gpu_assign_thr
        self.match_low_quality = match_low_quality
        self.iou_calculator = build_iou_calculator(iou_calculator)

    def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
        """Assign gt to bboxes.

        This method assign a gt bbox to every bbox (proposal/anchor), each bbox
        will be assigned with -1, or a semi-positive number. -1 means negative
        sample, semi-positive number is the index (0-based) of assigned gt.
        The assignment is done in following steps, the order matters.

        1. assign every bbox to the background
        2. assign proposals whose iou with all gts < neg_iou_thr to 0
        3. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
           assign it to that bbox
        4. for each gt bbox, assign its nearest proposals (may be more than
           one) to itself

        Args:
            bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4).
            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
                labelled as `ignored`, e.g., crowd boxes in COCO.
            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).

        Returns:
            :obj:`AssignResult`: The assign result.

        Example:
            >>> self = MaxIoUAssigner(0.5, 0.5)
            >>> bboxes = torch.Tensor([[0, 0, 10, 10], [10, 10, 20, 20]])
            >>> gt_bboxes = torch.Tensor([[0, 0, 10, 9]])
            >>> assign_result = self.assign(bboxes, gt_bboxes)
            >>> expected_gt_inds = torch.LongTensor([1, 0])
            >>> assert torch.all(assign_result.gt_inds == expected_gt_inds)
        """
        assign_on_cpu = True if (self.gpu_assign_thr > 0) and (
            gt_bboxes.shape[0] > self.gpu_assign_thr) else False
        # compute overlap and assign gt on CPU when number of GT is large
        if assign_on_cpu:
            device = bboxes.device
            bboxes = bboxes.cpu()
            gt_bboxes = gt_bboxes.cpu()
            if gt_bboxes_ignore is not None:
                gt_bboxes_ignore = gt_bboxes_ignore.cpu()
            if gt_labels is not None:
                gt_labels = gt_labels.cpu()

        overlaps = self.iou_calculator(gt_bboxes, bboxes)

        if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
                and gt_bboxes_ignore.numel() > 0 and bboxes.numel() > 0):
            # Suppress candidates that heavily overlap ignored regions by
            # setting their overlaps to -1 so they become negatives.
            if self.ignore_wrt_candidates:
                ignore_overlaps = self.iou_calculator(
                    bboxes, gt_bboxes_ignore, mode='iof')
                ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
            else:
                ignore_overlaps = self.iou_calculator(
                    gt_bboxes_ignore, bboxes, mode='iof')
                ignore_max_overlaps, _ = ignore_overlaps.max(dim=0)
            overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1

        assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
        if assign_on_cpu:
            # Move the result back to the original device.
            assign_result.gt_inds = assign_result.gt_inds.to(device)
            assign_result.max_overlaps = assign_result.max_overlaps.to(device)
            if assign_result.labels is not None:
                assign_result.labels = assign_result.labels.to(device)
        return assign_result

    def assign_wrt_overlaps(self, overlaps, gt_labels=None):
        """Assign w.r.t. the overlaps of bboxes with gts.

        Args:
            overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes,
                shape(k, n).
            gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ).

        Returns:
            :obj:`AssignResult`: The assign result.
        """
        num_gts, num_bboxes = overlaps.size(0), overlaps.size(1)

        # 1. assign -1 by default
        assigned_gt_inds = overlaps.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)

        if num_gts == 0 or num_bboxes == 0:
            # No ground truth or boxes, return empty assignment
            max_overlaps = overlaps.new_zeros((num_bboxes, ))
            if num_gts == 0:
                # No truth, assign everything to background
                assigned_gt_inds[:] = 0
            if gt_labels is None:
                assigned_labels = None
            else:
                assigned_labels = overlaps.new_full((num_bboxes, ),
                                                    -1,
                                                    dtype=torch.long)
            return AssignResult(
                num_gts,
                assigned_gt_inds,
                max_overlaps,
                labels=assigned_labels)

        # for each anchor, which gt best overlaps with it
        # for each anchor, the max iou of all gts
        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
        # for each gt, which anchor best overlaps with it
        # for each gt, the max iou of all proposals
        gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)

        # 2. assign negative: below
        # the negative inds are set to be 0
        if isinstance(self.neg_iou_thr, float):
            assigned_gt_inds[(max_overlaps >= 0)
                             & (max_overlaps < self.neg_iou_thr)] = 0
        elif isinstance(self.neg_iou_thr, tuple):
            assert len(self.neg_iou_thr) == 2
            assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0])
                             & (max_overlaps < self.neg_iou_thr[1])] = 0

        # 3. assign positive: above positive IoU threshold
        pos_inds = max_overlaps >= self.pos_iou_thr
        assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1

        if self.match_low_quality:
            # Low-quality matching will overwrite the assigned_gt_inds
            # assigned in Step 3. Thus, the assigned gt might not be the
            # best one for prediction.
            # For example, if bbox A has 0.9 and 0.8 iou with GT bbox 1 & 2,
            # bbox 1 will be assigned as the best target for bbox A in step 3.
            # However, if GT bbox 2's gt_argmax_overlaps = A, bbox A's
            # assigned_gt_inds will be overwritten to be bbox B.
            # This might be the reason that it is not used in ROI Heads.
            for i in range(num_gts):
                if gt_max_overlaps[i] >= self.min_pos_iou:
                    if self.gt_max_assign_all:
                        max_iou_inds = overlaps[i, :] == gt_max_overlaps[i]
                        assigned_gt_inds[max_iou_inds] = i + 1
                    else:
                        assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1

        if gt_labels is not None:
            assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
            pos_inds = torch.nonzero(
                assigned_gt_inds > 0, as_tuple=False).squeeze()
            if pos_inds.numel() > 0:
                assigned_labels[pos_inds] = gt_labels[
                    assigned_gt_inds[pos_inds] - 1]
        else:
            assigned_labels = None

        return AssignResult(
            num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+ + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + """ + + def __init__(self, scale=4, pos_num=3): + self.scale = scale + self.pos_num = pos_num + + def assign(self, points, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): + """Assign gt to points. + + This method assign a gt bbox to every points set, each points set + will be assigned with the background_label (-1), or a label number. + -1 is background, and semi-positive number is the index (0-based) of + assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every points to the background_label (-1) + 2. A point is assigned to some gt bbox if + (i) the point is within the k closest points to the gt bbox + (ii) the distance between this point and the gt is smaller than + other gt bboxes + + Args: + points (Tensor): points to be assigned, shape(n, 3) while last + dimension stands for (x, y, stride). + gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + NOTE: currently unused. + gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. + """ + num_points = points.shape[0] + num_gts = gt_bboxes.shape[0] + + if num_gts == 0 or num_points == 0: + # If no truth assign everything to the background + assigned_gt_inds = points.new_full((num_points, ), + 0, + dtype=torch.long) + if gt_labels is None: + assigned_labels = None + else: + assigned_labels = points.new_full((num_points, ), + -1, + dtype=torch.long) + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + + points_xy = points[:, :2] + points_stride = points[:, 2] + points_lvl = torch.log2( + points_stride).int() # [3...,4...,5...,6...,7...] 
+ lvl_min, lvl_max = points_lvl.min(), points_lvl.max() + + # assign gt box + gt_bboxes_xy = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2 + gt_bboxes_wh = (gt_bboxes[:, 2:] - gt_bboxes[:, :2]).clamp(min=1e-6) + scale = self.scale + gt_bboxes_lvl = ((torch.log2(gt_bboxes_wh[:, 0] / scale) + + torch.log2(gt_bboxes_wh[:, 1] / scale)) / 2).int() + gt_bboxes_lvl = torch.clamp(gt_bboxes_lvl, min=lvl_min, max=lvl_max) + + # stores the assigned gt index of each point + assigned_gt_inds = points.new_zeros((num_points, ), dtype=torch.long) + # stores the assigned gt dist (to this point) of each point + assigned_gt_dist = points.new_full((num_points, ), float('inf')) + points_range = torch.arange(points.shape[0]) + + for idx in range(num_gts): + gt_lvl = gt_bboxes_lvl[idx] + # get the index of points in this level + lvl_idx = gt_lvl == points_lvl + points_index = points_range[lvl_idx] + # get the points in this level + lvl_points = points_xy[lvl_idx, :] + # get the center point of gt + gt_point = gt_bboxes_xy[[idx], :] + # get width and height of gt + gt_wh = gt_bboxes_wh[[idx], :] + # compute the distance between gt center and + # all points in this level + points_gt_dist = ((lvl_points - gt_point) / gt_wh).norm(dim=1) + # find the nearest k points to gt center in this level + min_dist, min_dist_index = torch.topk( + points_gt_dist, self.pos_num, largest=False) + # the index of nearest k points to gt center in this level + min_dist_points_index = points_index[min_dist_index] + # The less_than_recorded_index stores the index + # of min_dist that is less then the assigned_gt_dist. Where + # assigned_gt_dist stores the dist from previous assigned gt + # (if exist) to each point. + less_than_recorded_index = min_dist < assigned_gt_dist[ + min_dist_points_index] + # The min_dist_points_index stores the index of points satisfy: + # (1) it is k nearest to current gt center in this level. + # (2) it is closer to current gt center than other gt center. 
+ min_dist_points_index = min_dist_points_index[ + less_than_recorded_index] + # assign the result + assigned_gt_inds[min_dist_points_index] = idx + 1 + assigned_gt_dist[min_dist_points_index] = min_dist[ + less_than_recorded_index] + + if gt_labels is not None: + assigned_labels = assigned_gt_inds.new_full((num_points, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[ + assigned_gt_inds[pos_inds] - 1] + else: + assigned_labels = None + + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) diff --git a/insightface/detection/scrfd/mmdet/core/bbox/builder.py b/insightface/detection/scrfd/mmdet/core/bbox/builder.py new file mode 100755 index 0000000000000000000000000000000000000000..682683b62ae55396f24e9f9eea0f8193e2e88de6 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/builder.py @@ -0,0 +1,20 @@ +from mmcv.utils import Registry, build_from_cfg + +BBOX_ASSIGNERS = Registry('bbox_assigner') +BBOX_SAMPLERS = Registry('bbox_sampler') +BBOX_CODERS = Registry('bbox_coder') + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build_from_cfg(cfg, BBOX_ASSIGNERS, default_args) + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + return build_from_cfg(cfg, BBOX_SAMPLERS, default_args) + + +def build_bbox_coder(cfg, **default_args): + """Builder of box coder.""" + return build_from_cfg(cfg, BBOX_CODERS, default_args) diff --git a/insightface/detection/scrfd/mmdet/core/bbox/coder/__init__.py b/insightface/detection/scrfd/mmdet/core/bbox/coder/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..ae455ba8fc0e0727e2d581cdc8f20fceededf99a --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/coder/__init__.py @@ -0,0 +1,13 @@ +from .base_bbox_coder import BaseBBoxCoder +from .bucketing_bbox_coder import BucketingBBoxCoder +from 
.delta_xywh_bbox_coder import DeltaXYWHBBoxCoder +from .legacy_delta_xywh_bbox_coder import LegacyDeltaXYWHBBoxCoder +from .pseudo_bbox_coder import PseudoBBoxCoder +from .tblr_bbox_coder import TBLRBBoxCoder +from .yolo_bbox_coder import YOLOBBoxCoder + +__all__ = [ + 'BaseBBoxCoder', 'PseudoBBoxCoder', 'DeltaXYWHBBoxCoder', + 'LegacyDeltaXYWHBBoxCoder', 'TBLRBBoxCoder', 'YOLOBBoxCoder', + 'BucketingBBoxCoder' +] diff --git a/insightface/detection/scrfd/mmdet/core/bbox/coder/base_bbox_coder.py b/insightface/detection/scrfd/mmdet/core/bbox/coder/base_bbox_coder.py new file mode 100755 index 0000000000000000000000000000000000000000..6e4272721534127c66ce3443df527d17ae6fa118 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/coder/base_bbox_coder.py @@ -0,0 +1,19 @@ +from abc import ABCMeta, abstractmethod + + +class BaseBBoxCoder(metaclass=ABCMeta): + """Base bounding box coder.""" + + def __init__(self, **kwargs): + pass + + @abstractmethod + def encode(self, bboxes, gt_bboxes): + """Encode deltas between bboxes and ground truth boxes.""" + pass + + @abstractmethod + def decode(self, bboxes, bboxes_pred): + """Decode the predicted bboxes according to prediction and base + boxes.""" + pass diff --git a/insightface/detection/scrfd/mmdet/core/bbox/coder/bucketing_bbox_coder.py b/insightface/detection/scrfd/mmdet/core/bbox/coder/bucketing_bbox_coder.py new file mode 100755 index 0000000000000000000000000000000000000000..e8c450c5fbe90295aefb888cf1e4c24c26fbed5d --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/coder/bucketing_bbox_coder.py @@ -0,0 +1,346 @@ +import numpy as np +import torch +import torch.nn.functional as F + +from ..builder import BBOX_CODERS +from ..transforms import bbox_rescale +from .base_bbox_coder import BaseBBoxCoder + + +@BBOX_CODERS.register_module() +class BucketingBBoxCoder(BaseBBoxCoder): + """Bucketing BBox Coder for Side-Aware Bounday Localization (SABL). 
+ + Boundary Localization with Bucketing and Bucketing Guided Rescoring + are implemented here. + + Please refer to https://arxiv.org/abs/1912.04260 for more details. + + Args: + num_buckets (int): Number of buckets. + scale_factor (int): Scale factor of proposals to generate buckets. + offset_topk (int): Topk buckets are used to generate + bucket fine regression targets. Defaults to 2. + offset_upperbound (float): Offset upperbound to generate + bucket fine regression targets. + To avoid too large offset displacements. Defaults to 1.0. + cls_ignore_neighbor (bool): Ignore second nearest bucket or Not. + Defaults to True. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + """ + + def __init__(self, + num_buckets, + scale_factor, + offset_topk=2, + offset_upperbound=1.0, + cls_ignore_neighbor=True, + clip_border=True): + super(BucketingBBoxCoder, self).__init__() + self.num_buckets = num_buckets + self.scale_factor = scale_factor + self.offset_topk = offset_topk + self.offset_upperbound = offset_upperbound + self.cls_ignore_neighbor = cls_ignore_neighbor + self.clip_border = clip_border + + def encode(self, bboxes, gt_bboxes): + """Get bucketing estimation and fine regression targets during + training. + + Args: + bboxes (torch.Tensor): source boxes, e.g., object proposals. + gt_bboxes (torch.Tensor): target of the transformation, e.g., + ground truth boxes. + + Returns: + encoded_bboxes(tuple[Tensor]): bucketing estimation + and fine regression targets and weights + """ + + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = bbox2bucket(bboxes, gt_bboxes, self.num_buckets, + self.scale_factor, self.offset_topk, + self.offset_upperbound, + self.cls_ignore_neighbor) + return encoded_bboxes + + def decode(self, bboxes, pred_bboxes, max_shape=None): + """Apply transformation `pred_bboxes` to `boxes`. + Args: + boxes (torch.Tensor): Basic boxes. 
+ pred_bboxes (torch.Tensor): Predictions for bucketing estimation + and fine regression + max_shape (tuple[int], optional): Maximum shape of boxes. + Defaults to None. + + Returns: + torch.Tensor: Decoded boxes. + """ + assert len(pred_bboxes) == 2 + cls_preds, offset_preds = pred_bboxes + assert cls_preds.size(0) == bboxes.size(0) and offset_preds.size( + 0) == bboxes.size(0) + decoded_bboxes = bucket2bbox(bboxes, cls_preds, offset_preds, + self.num_buckets, self.scale_factor, + max_shape, self.clip_border) + + return decoded_bboxes + + +def generat_buckets(proposals, num_buckets, scale_factor=1.0): + """Generate buckets w.r.t bucket number and scale factor of proposals. + + Args: + proposals (Tensor): Shape (n, 4) + num_buckets (int): Number of buckets. + scale_factor (float): Scale factor to rescale proposals. + + Returns: + tuple[Tensor]: (bucket_w, bucket_h, l_buckets, r_buckets, + t_buckets, d_buckets) + + - bucket_w: Width of buckets on x-axis. Shape (n, ). + - bucket_h: Height of buckets on y-axis. Shape (n, ). + - l_buckets: Left buckets. Shape (n, ceil(side_num/2)). + - r_buckets: Right buckets. Shape (n, ceil(side_num/2)). + - t_buckets: Top buckets. Shape (n, ceil(side_num/2)). + - d_buckets: Down buckets. Shape (n, ceil(side_num/2)). 
+ """ + proposals = bbox_rescale(proposals, scale_factor) + + # number of buckets in each side + side_num = int(np.ceil(num_buckets / 2.0)) + pw = proposals[..., 2] - proposals[..., 0] + ph = proposals[..., 3] - proposals[..., 1] + px1 = proposals[..., 0] + py1 = proposals[..., 1] + px2 = proposals[..., 2] + py2 = proposals[..., 3] + + bucket_w = pw / num_buckets + bucket_h = ph / num_buckets + + # left buckets + l_buckets = px1[:, None] + (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_w[:, None] + # right buckets + r_buckets = px2[:, None] - (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_w[:, None] + # top buckets + t_buckets = py1[:, None] + (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_h[:, None] + # down buckets + d_buckets = py2[:, None] - (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_h[:, None] + return bucket_w, bucket_h, l_buckets, r_buckets, t_buckets, d_buckets + + +def bbox2bucket(proposals, + gt, + num_buckets, + scale_factor, + offset_topk=2, + offset_upperbound=1.0, + cls_ignore_neighbor=True): + """Generate buckets estimation and fine regression targets. + + Args: + proposals (Tensor): Shape (n, 4) + gt (Tensor): Shape (n, 4) + num_buckets (int): Number of buckets. + scale_factor (float): Scale factor to rescale proposals. + offset_topk (int): Topk buckets are used to generate + bucket fine regression targets. Defaults to 2. + offset_upperbound (float): Offset allowance to generate + bucket fine regression targets. + To avoid too large offset displacements. Defaults to 1.0. + cls_ignore_neighbor (bool): Ignore second nearest bucket or Not. + Defaults to True. + + Returns: + tuple[Tensor]: (offsets, offsets_weights, bucket_labels, cls_weights). + + - offsets: Fine regression targets. \ + Shape (n, num_buckets*2). + - offsets_weights: Fine regression weights. \ + Shape (n, num_buckets*2). 
+ - bucket_labels: Bucketing estimation labels. \ + Shape (n, num_buckets*2). + - cls_weights: Bucketing estimation weights. \ + Shape (n, num_buckets*2). + """ + assert proposals.size() == gt.size() + + # generate buckets + proposals = proposals.float() + gt = gt.float() + (bucket_w, bucket_h, l_buckets, r_buckets, t_buckets, + d_buckets) = generat_buckets(proposals, num_buckets, scale_factor) + + gx1 = gt[..., 0] + gy1 = gt[..., 1] + gx2 = gt[..., 2] + gy2 = gt[..., 3] + + # generate offset targets and weights + # offsets from buckets to gts + l_offsets = (l_buckets - gx1[:, None]) / bucket_w[:, None] + r_offsets = (r_buckets - gx2[:, None]) / bucket_w[:, None] + t_offsets = (t_buckets - gy1[:, None]) / bucket_h[:, None] + d_offsets = (d_buckets - gy2[:, None]) / bucket_h[:, None] + + # select top-k nearset buckets + l_topk, l_label = l_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + r_topk, r_label = r_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + t_topk, t_label = t_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + d_topk, d_label = d_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + + offset_l_weights = l_offsets.new_zeros(l_offsets.size()) + offset_r_weights = r_offsets.new_zeros(r_offsets.size()) + offset_t_weights = t_offsets.new_zeros(t_offsets.size()) + offset_d_weights = d_offsets.new_zeros(d_offsets.size()) + inds = torch.arange(0, proposals.size(0)).to(proposals).long() + + # generate offset weights of top-k nearset buckets + for k in range(offset_topk): + if k >= 1: + offset_l_weights[inds, l_label[:, + k]] = (l_topk[:, k] < + offset_upperbound).float() + offset_r_weights[inds, r_label[:, + k]] = (r_topk[:, k] < + offset_upperbound).float() + offset_t_weights[inds, t_label[:, + k]] = (t_topk[:, k] < + offset_upperbound).float() + offset_d_weights[inds, d_label[:, + k]] = (d_topk[:, k] < + offset_upperbound).float() + else: + offset_l_weights[inds, l_label[:, 
k]] = 1.0 + offset_r_weights[inds, r_label[:, k]] = 1.0 + offset_t_weights[inds, t_label[:, k]] = 1.0 + offset_d_weights[inds, d_label[:, k]] = 1.0 + + offsets = torch.cat([l_offsets, r_offsets, t_offsets, d_offsets], dim=-1) + offsets_weights = torch.cat([ + offset_l_weights, offset_r_weights, offset_t_weights, offset_d_weights + ], + dim=-1) + + # generate bucket labels and weight + side_num = int(np.ceil(num_buckets / 2.0)) + labels = torch.stack( + [l_label[:, 0], r_label[:, 0], t_label[:, 0], d_label[:, 0]], dim=-1) + + batch_size = labels.size(0) + bucket_labels = F.one_hot(labels.view(-1), side_num).view(batch_size, + -1).float() + bucket_cls_l_weights = (l_offsets.abs() < 1).float() + bucket_cls_r_weights = (r_offsets.abs() < 1).float() + bucket_cls_t_weights = (t_offsets.abs() < 1).float() + bucket_cls_d_weights = (d_offsets.abs() < 1).float() + bucket_cls_weights = torch.cat([ + bucket_cls_l_weights, bucket_cls_r_weights, bucket_cls_t_weights, + bucket_cls_d_weights + ], + dim=-1) + # ignore second nearest buckets for cls if necessay + if cls_ignore_neighbor: + bucket_cls_weights = (~((bucket_cls_weights == 1) & + (bucket_labels == 0))).float() + else: + bucket_cls_weights[:] = 1.0 + return offsets, offsets_weights, bucket_labels, bucket_cls_weights + + +def bucket2bbox(proposals, + cls_preds, + offset_preds, + num_buckets, + scale_factor=1.0, + max_shape=None, + clip_border=True): + """Apply bucketing estimation (cls preds) and fine regression (offset + preds) to generate det bboxes. + + Args: + proposals (Tensor): Boxes to be transformed. Shape (n, 4) + cls_preds (Tensor): bucketing estimation. Shape (n, num_buckets*2). + offset_preds (Tensor): fine regression. Shape (n, num_buckets*2). + num_buckets (int): Number of buckets. + scale_factor (float): Scale factor to rescale proposals. + max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W) + clip_border (bool, optional): Whether clip the objects outside the + border of the image. 
Defaults to True. + + Returns: + tuple[Tensor]: (bboxes, loc_confidence). + + - bboxes: predicted bboxes. Shape (n, 4) + - loc_confidence: localization confidence of predicted bboxes. + Shape (n,). + """ + + side_num = int(np.ceil(num_buckets / 2.0)) + cls_preds = cls_preds.view(-1, side_num) + offset_preds = offset_preds.view(-1, side_num) + + scores = F.softmax(cls_preds, dim=1) + score_topk, score_label = scores.topk(2, dim=1, largest=True, sorted=True) + + rescaled_proposals = bbox_rescale(proposals, scale_factor) + + pw = rescaled_proposals[..., 2] - rescaled_proposals[..., 0] + ph = rescaled_proposals[..., 3] - rescaled_proposals[..., 1] + px1 = rescaled_proposals[..., 0] + py1 = rescaled_proposals[..., 1] + px2 = rescaled_proposals[..., 2] + py2 = rescaled_proposals[..., 3] + + bucket_w = pw / num_buckets + bucket_h = ph / num_buckets + + score_inds_l = score_label[0::4, 0] + score_inds_r = score_label[1::4, 0] + score_inds_t = score_label[2::4, 0] + score_inds_d = score_label[3::4, 0] + l_buckets = px1 + (0.5 + score_inds_l.float()) * bucket_w + r_buckets = px2 - (0.5 + score_inds_r.float()) * bucket_w + t_buckets = py1 + (0.5 + score_inds_t.float()) * bucket_h + d_buckets = py2 - (0.5 + score_inds_d.float()) * bucket_h + + offsets = offset_preds.view(-1, 4, side_num) + inds = torch.arange(proposals.size(0)).to(proposals).long() + l_offsets = offsets[:, 0, :][inds, score_inds_l] + r_offsets = offsets[:, 1, :][inds, score_inds_r] + t_offsets = offsets[:, 2, :][inds, score_inds_t] + d_offsets = offsets[:, 3, :][inds, score_inds_d] + + x1 = l_buckets - l_offsets * bucket_w + x2 = r_buckets - r_offsets * bucket_w + y1 = t_buckets - t_offsets * bucket_h + y2 = d_buckets - d_offsets * bucket_h + + if clip_border and max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1] - 1) + y1 = y1.clamp(min=0, max=max_shape[0] - 1) + x2 = x2.clamp(min=0, max=max_shape[1] - 1) + y2 = y2.clamp(min=0, max=max_shape[0] - 1) + bboxes = torch.cat([x1[:, None], y1[:, None], 
x2[:, None], y2[:, None]], + dim=-1) + + # bucketing guided rescoring + loc_confidence = score_topk[:, 0] + top2_neighbor_inds = (score_label[:, 0] - score_label[:, 1]).abs() == 1 + loc_confidence += score_topk[:, 1] * top2_neighbor_inds.float() + loc_confidence = loc_confidence.view(-1, 4).mean(dim=1) + + return bboxes, loc_confidence diff --git a/insightface/detection/scrfd/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py b/insightface/detection/scrfd/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py new file mode 100755 index 0000000000000000000000000000000000000000..e9eb35790512cdc2befde2b0e11d0950aa2a608e --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py @@ -0,0 +1,204 @@ +import numpy as np +import torch + +from ..builder import BBOX_CODERS +from .base_bbox_coder import BaseBBoxCoder + + +@BBOX_CODERS.register_module() +class DeltaXYWHBBoxCoder(BaseBBoxCoder): + """Delta XYWH BBox coder. + + Following the practice in `R-CNN `_, + this coder encodes bbox (x1, y1, x2, y2) into delta (dx, dy, dw, dh) and + decodes delta (dx, dy, dw, dh) back to original bbox (x1, y1, x2, y2). + + Args: + target_means (Sequence[float]): Denormalizing means of target for + delta coordinates + target_stds (Sequence[float]): Denormalizing standard deviation of + target for delta coordinates + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + """ + + def __init__(self, + target_means=(0., 0., 0., 0.), + target_stds=(1., 1., 1., 1.), + clip_border=True): + super(BaseBBoxCoder, self).__init__() + self.means = target_means + self.stds = target_stds + self.clip_border = clip_border + + def encode(self, bboxes, gt_bboxes): + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor): Source boxes, e.g., object proposals. 
+ gt_bboxes (torch.Tensor): Target of the transformation, e.g., + ground-truth boxes. + + Returns: + torch.Tensor: Box transformation deltas + """ + + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = bbox2delta(bboxes, gt_bboxes, self.means, self.stds) + return encoded_bboxes + + def decode(self, + bboxes, + pred_bboxes, + max_shape=None, + wh_ratio_clip=16 / 1000): + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + boxes (torch.Tensor): Basic boxes. + pred_bboxes (torch.Tensor): Encoded boxes with shape + max_shape (tuple[int], optional): Maximum shape of boxes. + Defaults to None. + wh_ratio_clip (float, optional): The allowed ratio between + width and height. + + Returns: + torch.Tensor: Decoded boxes. + """ + + assert pred_bboxes.size(0) == bboxes.size(0) + decoded_bboxes = delta2bbox(bboxes, pred_bboxes, self.means, self.stds, + max_shape, wh_ratio_clip, self.clip_border) + + return decoded_bboxes + + +def bbox2delta(proposals, gt, means=(0., 0., 0., 0.), stds=(1., 1., 1., 1.)): + """Compute deltas of proposals w.r.t. gt. + + We usually compute the deltas of x, y, w, h of proposals w.r.t ground + truth bboxes to get regression target. + This is the inverse function of :func:`delta2bbox`. + + Args: + proposals (Tensor): Boxes to be transformed, shape (N, ..., 4) + gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4) + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + + Returns: + Tensor: deltas with shape (N, 4), where columns represent dx, dy, + dw, dh. 
+ """ + assert proposals.size() == gt.size() + + proposals = proposals.float() + gt = gt.float() + px = (proposals[..., 0] + proposals[..., 2]) * 0.5 + py = (proposals[..., 1] + proposals[..., 3]) * 0.5 + pw = proposals[..., 2] - proposals[..., 0] + ph = proposals[..., 3] - proposals[..., 1] + + gx = (gt[..., 0] + gt[..., 2]) * 0.5 + gy = (gt[..., 1] + gt[..., 3]) * 0.5 + gw = gt[..., 2] - gt[..., 0] + gh = gt[..., 3] - gt[..., 1] + + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw / pw) + dh = torch.log(gh / ph) + deltas = torch.stack([dx, dy, dw, dh], dim=-1) + + means = deltas.new_tensor(means).unsqueeze(0) + stds = deltas.new_tensor(stds).unsqueeze(0) + deltas = deltas.sub_(means).div_(stds) + + return deltas + + +def delta2bbox(rois, + deltas, + means=(0., 0., 0., 0.), + stds=(1., 1., 1., 1.), + max_shape=None, + wh_ratio_clip=16 / 1000, + clip_border=True): + """Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of :func:`bbox2delta`. + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4) + deltas (Tensor): Encoded offsets with respect to each roi. + Has shape (N, 4 * num_classes). Note N = num_anchors * W * H when + rois is a grid of anchors. Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W) + wh_ratio_clip (float): Maximum aspect ratio for boxes. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + + Returns: + Tensor: Boxes with shape (N, 4), where columns represent + tl_x, tl_y, br_x, br_y. + + References: + .. 
[1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> delta2bbox(rois, deltas, max_shape=(32, 32)) + tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.1409, 0.1409, 2.8591, 2.8591], + [0.0000, 0.3161, 4.1945, 0.6839], + [5.0000, 5.0000, 5.0000, 5.0000]]) + """ + means = deltas.new_tensor(means).view(1, -1).repeat(1, deltas.size(1) // 4) + stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(1) // 4) + denorm_deltas = deltas * stds + means + dx = denorm_deltas[:, 0::4] + dy = denorm_deltas[:, 1::4] + dw = denorm_deltas[:, 2::4] + dh = denorm_deltas[:, 3::4] + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = dw.clamp(min=-max_ratio, max=max_ratio) + dh = dh.clamp(min=-max_ratio, max=max_ratio) + # Compute center of each roi + px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx) + py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy) + # Compute width/height of each roi + pw = (rois[:, 2] - rois[:, 0]).unsqueeze(1).expand_as(dw) + ph = (rois[:, 3] - rois[:, 1]).unsqueeze(1).expand_as(dh) + # Use exp(network energy) to enlarge/shrink each roi + gw = pw * dw.exp() + gh = ph * dh.exp() + # Use network energy to shift the center of each roi + gx = px + pw * dx + gy = py + ph * dy + # Convert center-xy/width/height to top-left, bottom-right + x1 = gx - gw * 0.5 + y1 = gy - gh * 0.5 + x2 = gx + gw * 0.5 + y2 = gy + gh * 0.5 + if clip_border and max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size()) + return bboxes diff --git 
a/insightface/detection/scrfd/mmdet/core/bbox/coder/legacy_delta_xywh_bbox_coder.py b/insightface/detection/scrfd/mmdet/core/bbox/coder/legacy_delta_xywh_bbox_coder.py new file mode 100755 index 0000000000000000000000000000000000000000..74e801a154dce35ee0bf39187035430bbdb4b897 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/coder/legacy_delta_xywh_bbox_coder.py @@ -0,0 +1,212 @@ +import numpy as np +import torch + +from ..builder import BBOX_CODERS +from .base_bbox_coder import BaseBBoxCoder + + +@BBOX_CODERS.register_module() +class LegacyDeltaXYWHBBoxCoder(BaseBBoxCoder): + """Legacy Delta XYWH BBox coder used in MMDet V1.x. + + Following the practice in R-CNN [1]_, this coder encodes bbox (x1, y1, x2, + y2) into delta (dx, dy, dw, dh) and decodes delta (dx, dy, dw, dh) + back to original bbox (x1, y1, x2, y2). + + Note: + The main difference between :class`LegacyDeltaXYWHBBoxCoder` and + :class:`DeltaXYWHBBoxCoder` is whether ``+ 1`` is used during width and + height calculation. We suggest to only use this coder when testing with + MMDet V1.x models. + + References: + .. [1] https://arxiv.org/abs/1311.2524 + + Args: + target_means (Sequence[float]): denormalizing means of target for + delta coordinates + target_stds (Sequence[float]): denormalizing standard deviation of + target for delta coordinates + """ + + def __init__(self, + target_means=(0., 0., 0., 0.), + target_stds=(1., 1., 1., 1.)): + super(BaseBBoxCoder, self).__init__() + self.means = target_means + self.stds = target_stds + + def encode(self, bboxes, gt_bboxes): + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor): source boxes, e.g., object proposals. + gt_bboxes (torch.Tensor): target of the transformation, e.g., + ground-truth boxes. 
+ + Returns: + torch.Tensor: Box transformation deltas + """ + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = legacy_bbox2delta(bboxes, gt_bboxes, self.means, + self.stds) + return encoded_bboxes + + def decode(self, + bboxes, + pred_bboxes, + max_shape=None, + wh_ratio_clip=16 / 1000): + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + boxes (torch.Tensor): Basic boxes. + pred_bboxes (torch.Tensor): Encoded boxes with shape + max_shape (tuple[int], optional): Maximum shape of boxes. + Defaults to None. + wh_ratio_clip (float, optional): The allowed ratio between + width and height. + + Returns: + torch.Tensor: Decoded boxes. + """ + assert pred_bboxes.size(0) == bboxes.size(0) + decoded_bboxes = legacy_delta2bbox(bboxes, pred_bboxes, self.means, + self.stds, max_shape, wh_ratio_clip) + + return decoded_bboxes + + +def legacy_bbox2delta(proposals, + gt, + means=(0., 0., 0., 0.), + stds=(1., 1., 1., 1.)): + """Compute deltas of proposals w.r.t. gt in the MMDet V1.x manner. + + We usually compute the deltas of x, y, w, h of proposals w.r.t ground + truth bboxes to get regression target. + This is the inverse function of `delta2bbox()` + + Args: + proposals (Tensor): Boxes to be transformed, shape (N, ..., 4) + gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4) + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + + Returns: + Tensor: deltas with shape (N, 4), where columns represent dx, dy, + dw, dh. 
def legacy_delta2bbox(rois,
                      deltas,
                      means=(0., 0., 0., 0.),
                      stds=(1., 1., 1., 1.),
                      max_shape=None,
                      wh_ratio_clip=16 / 1000):
    """Apply deltas to shift/scale base boxes in the MMDet V1.x manner.

    Inverse of ``legacy_bbox2delta()``: the rois are anchors or proposed
    bounding boxes and the deltas are network outputs used to shift/scale
    those boxes.

    Args:
        rois (Tensor): Boxes to be transformed. Has shape (N, 4).
        deltas (Tensor): Encoded offsets with respect to each roi.
            Has shape (N, 4 * num_classes). Note N = num_anchors * W * H
            when rois is a grid of anchors.
        means (Sequence[float]): Denormalizing means for delta coordinates.
        stds (Sequence[float]): Denormalizing standard deviation for delta
            coordinates.
        max_shape (tuple[int, int]): Maximum bounds for boxes,
            specified as (H, W).
        wh_ratio_clip (float): Maximum aspect ratio for boxes.

    Returns:
        Tensor: Boxes with shape (N, 4), where columns represent
            tl_x, tl_y, br_x, br_y.

    Example:
        >>> rois = torch.Tensor([[ 0.,  0.,  1.,  1.],
        >>>                      [ 0.,  0.,  1.,  1.],
        >>>                      [ 0.,  0.,  1.,  1.],
        >>>                      [ 5.,  5.,  5.,  5.]])
        >>> deltas = torch.Tensor([[  0.,   0.,   0.,   0.],
        >>>                        [  1.,   1.,   1.,   1.],
        >>>                        [  0.,   0.,   2.,  -1.],
        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
        >>> legacy_delta2bbox(rois, deltas, max_shape=(32, 32))
        tensor([[0.0000, 0.0000, 1.5000, 1.5000],
                [0.0000, 0.0000, 5.2183, 5.2183],
                [0.0000, 0.1321, 7.8891, 0.8679],
                [5.3967, 2.4251, 6.0033, 3.7749]])
    """
    reps = deltas.size(1) // 4
    # Undo the (means, stds) normalization applied at encoding time.
    denorm = deltas * deltas.new_tensor(stds).repeat(1, reps) + \
        deltas.new_tensor(means).repeat(1, reps)
    dx = denorm[:, 0::4]
    dy = denorm[:, 1::4]
    # Clamp dw/dh so exp() below cannot produce extreme aspect ratios.
    bound = np.abs(np.log(wh_ratio_clip))
    dw = denorm[:, 2::4].clamp(min=-bound, max=bound)
    dh = denorm[:, 3::4].clamp(min=-bound, max=bound)
    # Roi centers and the legacy (+1) widths / heights, broadcast to the
    # per-class delta layout.
    px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
    py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
    pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw)
    ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh)
    # Scale width/height multiplicatively, shift the center additively.
    gw = pw * dw.exp()
    gh = ph * dh.exp()
    gx = px + pw * dx
    gy = py + ph * dy
    # Convert center/size back to corners. The true legacy box coder would
    # apply a +-0.5 correction here; omitting it matches the original code
    # and improves results for models trained in MMDetection 1.x
    # (~0.5 bbox AP, 0.2 mask AP).
    x1 = gx - gw * 0.5
    y1 = gy - gh * 0.5
    x2 = gx + gw * 0.5
    y2 = gy + gh * 0.5
    if max_shape is not None:
        x1 = x1.clamp(min=0, max=max_shape[1] - 1)
        y1 = y1.clamp(min=0, max=max_shape[0] - 1)
        x2 = x2.clamp(min=0, max=max_shape[1] - 1)
        y2 = y2.clamp(min=0, max=max_shape[0] - 1)
    return torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas)
@BBOX_CODERS.register_module()
class TBLRBBoxCoder(BaseBBoxCoder):
    """TBLR BBox coder.

    Following the practice in `FSAF `_, this coder encodes gt bboxes
    (x1, y1, x2, y2) into (top, bottom, left, right) offsets and decodes
    them back to the original corner form.

    Args:
        normalizer (list | float): Normalization factor to be divided with
            when coding the coordinates. If it is a list, it should have
            length of 4 indicating normalization factors in tblr dims.
            Otherwise it is a unified float factor for all dims.
            Default: 4.0
        clip_border (bool, optional): Whether clip the objects outside the
            border of the image. Defaults to True.
    """

    def __init__(self, normalizer=4.0, clip_border=True):
        super(BaseBBoxCoder, self).__init__()
        self.normalizer = normalizer
        self.clip_border = clip_border

    def encode(self, bboxes, gt_bboxes):
        """Encode ``gt_bboxes`` as tblr offsets relative to ``bboxes``.

        Args:
            bboxes (torch.Tensor): source boxes, e.g., object proposals.
            gt_bboxes (torch.Tensor): target of the transformation, e.g.,
                ground truth boxes.

        Returns:
            torch.Tensor: Box transformation deltas
        """
        assert bboxes.size(0) == gt_bboxes.size(0)
        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
        # Delegate the actual arithmetic to the module-level helper.
        return bboxes2tblr(bboxes, gt_bboxes, normalizer=self.normalizer)

    def decode(self, bboxes, pred_bboxes, max_shape=None):
        """Apply transformation ``pred_bboxes`` to ``bboxes``.

        Args:
            bboxes (torch.Tensor): Basic boxes.
            pred_bboxes (torch.Tensor): Encoded tblr offsets.
            max_shape (tuple[int], optional): Maximum shape of boxes.
                Defaults to None.

        Returns:
            torch.Tensor: Decoded boxes.
        """
        assert pred_bboxes.size(0) == bboxes.size(0)
        return tblr2bboxes(
            bboxes,
            pred_bboxes,
            normalizer=self.normalizer,
            max_shape=max_shape,
            clip_border=self.clip_border)
def tblr2bboxes(priors,
                tblr,
                normalizer=4.0,
                normalize_by_wh=True,
                max_shape=None,
                clip_border=True):
    """Decode tblr outputs to prediction boxes.

    The process includes 3 steps: 1) De-normalize tblr coordinates by
    multiplying them with `normalizer`; 2) De-normalize tblr coordinates by
    the prior bbox width and height if `normalize_by_wh` is `True`;
    3) Convert the tblr (top, bottom, left, right) pair relative to the
    center of priors back to (xmin, ymin, xmax, ymax) coordinates.

    Args:
        priors (Tensor): Prior boxes in point form (x0, y0, x1, y1),
            shape (n, 4).
        tblr (Tensor): Coords of network output in tblr form, shape (n, 4).
        normalizer (Sequence[float] | float): Normalization parameter of
            encoded boxes. By list, it represents the normalization factors
            at tblr dims. By float, it is the unified normalization factor
            at all dims. Default: 4.0
        normalize_by_wh (bool): Whether the tblr coordinates have been
            normalized by the side length (wh) of prior bboxes.
        max_shape (tuple, optional): Shape of the image. Decoded bboxes
            exceeding it will be clamped.
        clip_border (bool, optional): Whether clip the objects outside the
            border of the image. Defaults to True.

    Return:
        decoded boxes (Tensor), shape (n, 4)
    """
    if not isinstance(normalizer, float):
        normalizer = torch.tensor(normalizer, device=priors.device)
        assert len(normalizer) == 4, 'Normalizer must have length = 4'
    assert priors.size(0) == tblr.size(0)
    # Step 1: undo the scalar/per-dim normalization.
    decoded = tblr * normalizer
    centers = (priors[:, 0:2] + priors[:, 2:4]) / 2
    if normalize_by_wh:
        # Step 2: undo the per-anchor scaling (tb by height, lr by width).
        w, h = torch.split(priors[:, 2:4] - priors[:, 0:2], 1, dim=1)
        decoded[:, :2] *= h
        decoded[:, 2:] *= w
    # Step 3: offsets relative to the prior center -> corner coordinates.
    top, bottom, left, right = decoded.split((1, 1, 1, 1), dim=1)
    cx = centers[:, 0].unsqueeze(1)
    cy = centers[:, 1].unsqueeze(1)
    boxes = torch.cat((cx - left, cy - top, cx + right, cy + bottom), dim=1)
    if clip_border and max_shape is not None:
        # max_shape is (H, W): x is bounded by W, y by H.
        boxes[:, 0].clamp_(min=0, max=max_shape[1])
        boxes[:, 1].clamp_(min=0, max=max_shape[0])
        boxes[:, 2].clamp_(min=0, max=max_shape[1])
        boxes[:, 3].clamp_(min=0, max=max_shape[0])
    return boxes
+ cx, cy in [0., 1.], denotes relative center position w.r.t the center of + bboxes. dw, dh are the same as :obj:`DeltaXYWHBBoxCoder`. + + Args: + eps (float): Min value of cx, cy when encoding. + """ + + def __init__(self, eps=1e-6): + super(BaseBBoxCoder, self).__init__() + self.eps = eps + + def encode(self, bboxes, gt_bboxes, stride): + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor): Source boxes, e.g., anchors. + gt_bboxes (torch.Tensor): Target of the transformation, e.g., + ground-truth boxes. + stride (torch.Tensor | int): Stride of bboxes. + + Returns: + torch.Tensor: Box transformation deltas + """ + + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + x_center_gt = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) * 0.5 + y_center_gt = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) * 0.5 + w_gt = gt_bboxes[..., 2] - gt_bboxes[..., 0] + h_gt = gt_bboxes[..., 3] - gt_bboxes[..., 1] + x_center = (bboxes[..., 0] + bboxes[..., 2]) * 0.5 + y_center = (bboxes[..., 1] + bboxes[..., 3]) * 0.5 + w = bboxes[..., 2] - bboxes[..., 0] + h = bboxes[..., 3] - bboxes[..., 1] + w_target = torch.log((w_gt / w).clamp(min=self.eps)) + h_target = torch.log((h_gt / h).clamp(min=self.eps)) + x_center_target = ((x_center_gt - x_center) / stride + 0.5).clamp( + self.eps, 1 - self.eps) + y_center_target = ((y_center_gt - y_center) / stride + 0.5).clamp( + self.eps, 1 - self.eps) + encoded_bboxes = torch.stack( + [x_center_target, y_center_target, w_target, h_target], dim=-1) + return encoded_bboxes + + def decode(self, bboxes, pred_bboxes, stride): + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + boxes (torch.Tensor): Basic boxes, e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + torch.Tensor: Decoded boxes. 
def ensure_rng(rng=None):
    """Simple version of the ``kwarray.ensure_rng``.

    Args:
        rng (int | numpy.random.RandomState | None):
            if None, then defaults to the global rng. Otherwise this can be
            an integer or a RandomState class.

    Returns:
        (numpy.random.RandomState) : rng -
            a numpy random number generator

    References:
        https://gitlab.kitware.com/computer-vision/kwarray/blob/master/kwarray/util_random.py#L270
    """
    if rng is None:
        # Reuse numpy's global generator object.
        return np.random.mtrand._rand
    if isinstance(rng, int):
        # Treat the int as a seed for a fresh generator.
        return np.random.RandomState(rng)
    return rng


def random_boxes(num=1, scale=1, rng=None):
    """Simple version of ``kwimage.Boxes.random``.

    Returns:
        Tensor: shape (n, 4) in x1, y1, x2, y2 format.

    References:
        https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390

    Example:
        >>> num = 3
        >>> scale = 512
        >>> rng = 0
        >>> boxes = random_boxes(num, scale, rng)
        >>> print(boxes)
        tensor([[280.9925, 278.9802, 308.6148, 366.1769],
                [216.9113, 330.6978, 224.0446, 456.5878],
                [405.3632, 196.3221, 493.3953, 270.7942]])
    """
    rng = ensure_rng(rng)

    raw = rng.rand(num, 4).astype(np.float32)
    # Columns 0/2 are x candidates, columns 1/3 are y candidates; sort each
    # pair so that (x1, y1) is the top-left and (x2, y2) the bottom-right.
    xs = raw[:, [0, 2]]
    ys = raw[:, [1, 3]]
    tlbr = np.empty_like(raw)
    tlbr[:, 0] = xs.min(axis=1) * scale
    tlbr[:, 1] = ys.min(axis=1) * scale
    tlbr[:, 2] = xs.max(axis=1) * scale
    tlbr[:, 3] = ys.max(axis=1) * scale

    return torch.from_numpy(tlbr)
@IOU_CALCULATORS.register_module()
class BboxOverlaps2D(object):
    """2D Overlaps (e.g. IoUs, GIoUs) Calculator."""

    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
        """Calculate IoU between 2D bboxes.

        Args:
            bboxes1 (Tensor): bboxes have shape (m, 4) in <x1, y1, x2, y2>
                format, or shape (m, 5) in <x1, y1, x2, y2, score> format.
            bboxes2 (Tensor): bboxes have shape (n, 4) in <x1, y1, x2, y2>
                format, shape (n, 5) in <x1, y1, x2, y2, score> format, or
                be empty. If ``is_aligned`` is ``True``, then m and n must
                be equal.
            mode (str): "iou" (intersection over union), "iof" (intersection
                over foreground), or "giou" (generalized intersection over
                union).
            is_aligned (bool, optional): If True, then m and n must be equal.
                Default False.

        Returns:
            Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
        """
        assert bboxes1.size(-1) in [0, 4, 5]
        assert bboxes2.size(-1) in [0, 4, 5]
        # Drop the trailing score column when 5-column boxes are given.
        if bboxes2.size(-1) == 5:
            bboxes2 = bboxes2[..., :4]
        if bboxes1.size(-1) == 5:
            bboxes1 = bboxes1[..., :4]
        return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)

    def __repr__(self):
        """str: a string describing the module"""
        repr_str = self.__class__.__name__ + '()'
        return repr_str


def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
    """Calculate overlap between two set of bboxes.

    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format
            or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format
            or empty. B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection
            over foreground) or "giou" (generalized intersection over
            union). Default "iou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for
            numerical stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> overlaps = bbox_overlaps(bboxes1, bboxes2)
        >>> assert overlaps.shape == (3, 3)
        >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
        >>> assert overlaps.shape == (3, )

    Example:
        >>> empty = torch.empty(0, 4)
        >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
    """

    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
    # Either the boxes are empty or the length of boxes's last dimension is 4
    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.size(-2)
    cols = bboxes2.size(-2)
    if is_aligned:
        assert rows == cols

    # Early exit: either side empty -> return an empty result of the right
    # shape. NOTE(review): Tensor.new(...) returns uninitialized memory;
    # harmless for a zero-element tensor but a dated API — new_empty/new_zeros
    # is the modern equivalent. Left as-is (doc-only change).
    if rows * cols == 0:
        if is_aligned:
            return bboxes1.new(batch_shape + (rows, ))
        else:
            return bboxes1.new(batch_shape + (rows, cols))

    # Areas use the (x2 - x1) convention WITHOUT the legacy +1.
    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
        bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
        bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        # Pairwise over aligned rows only: intersection corners.
        lt = torch.max(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        wh = (rb - lt).clamp(min=0)  # [B, rows, 2]
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            # "iof": normalize by the foreground (bboxes1) area only.
            union = area1
        if mode == 'giou':
            # Smallest box enclosing both, needed for the GIoU penalty term.
            enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        # Full m x n cross product via broadcasting on inserted axes.
        lt = torch.max(bboxes1[..., :, None, :2],
                       bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = torch.min(bboxes1[..., :, None, 2:],
                       bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = (rb - lt).clamp(min=0)  # [B, rows, cols, 2]
        overlap = wh[..., 0] * wh[..., 1]

        if mode in ['iou', 'giou']:
            union = area1[..., None] + area2[..., None, :] - overlap
        else:
            union = area1[..., None]
        if mode == 'giou':
            enclosed_lt = torch.min(bboxes1[..., :, None, :2],
                                    bboxes2[..., None, :, :2])
            enclosed_rb = torch.max(bboxes1[..., :, None, 2:],
                                    bboxes2[..., None, :, 2:])

    # Guard the denominator with eps to avoid division by zero.
    eps = union.new_tensor([eps])
    union = torch.max(union, eps)
    ious = overlap / union
    if mode in ['iou', 'iof']:
        return ious
    # calculate gious: IoU minus the normalized empty area of the
    # enclosing box.
    enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = torch.max(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return gious
class BaseSampler(metaclass=ABCMeta):
    """Base class of samplers.

    Subclasses implement ``_sample_pos`` / ``_sample_neg``; ``sample()``
    orchestrates them into a full positive/negative split.
    """

    def __init__(self,
                 num,
                 pos_fraction,
                 neg_pos_ub=-1,
                 add_gt_as_proposals=True,
                 **kwargs):
        # num: total number of samples to draw.
        # pos_fraction: fraction of `num` reserved for positives.
        # neg_pos_ub: upper bound on the neg/pos ratio; -1 disables it.
        # add_gt_as_proposals: prepend gt boxes to the candidate pool.
        self.num = num
        self.pos_fraction = pos_fraction
        self.neg_pos_ub = neg_pos_ub
        self.add_gt_as_proposals = add_gt_as_proposals
        # By default the sampler handles both sides itself; subclasses
        # (e.g. a combined sampler) may swap these out.
        self.pos_sampler = self
        self.neg_sampler = self

    @abstractmethod
    def _sample_pos(self, assign_result, num_expected, **kwargs):
        """Sample positive samples."""
        pass

    @abstractmethod
    def _sample_neg(self, assign_result, num_expected, **kwargs):
        """Sample negative samples."""
        pass

    def sample(self,
               assign_result,
               bboxes,
               gt_bboxes,
               gt_labels=None,
               **kwargs):
        """Sample positive and negative bboxes.

        This is a simple implementation of bbox sampling given candidates,
        assigning results and ground truth bboxes.

        Args:
            assign_result (:obj:`AssignResult`): Bbox assigning results.
            bboxes (Tensor): Boxes to be sampled from.
            gt_bboxes (Tensor): Ground truth bboxes.
            gt_labels (Tensor, optional): Class labels of ground truth bboxes.

        Returns:
            :obj:`SamplingResult`: Sampling result.

        Example:
            >>> from mmdet.core.bbox import RandomSampler
            >>> from mmdet.core.bbox import AssignResult
            >>> from mmdet.core.bbox.demodata import ensure_rng, random_boxes
            >>> rng = ensure_rng(None)
            >>> assign_result = AssignResult.random(rng=rng)
            >>> bboxes = random_boxes(assign_result.num_preds, rng=rng)
            >>> gt_bboxes = random_boxes(assign_result.num_gts, rng=rng)
            >>> gt_labels = None
            >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1,
            >>>                      add_gt_as_proposals=False)
            >>> self = self.sample(assign_result, bboxes, gt_bboxes, gt_labels)
        """
        # Promote a single box (shape (4,)) to a batch of one.
        if len(bboxes.shape) < 2:
            bboxes = bboxes[None, :]

        # Drop any extra columns (e.g. scores) beyond x1, y1, x2, y2.
        bboxes = bboxes[:, :4]

        # gt_flags marks which entries of the candidate pool are gt boxes.
        # NOTE(review): uint8 masks are a legacy PyTorch idiom; bool is the
        # modern dtype — kept as-is here (doc-only change).
        gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8)
        if self.add_gt_as_proposals and len(gt_bboxes) > 0:
            if gt_labels is None:
                raise ValueError(
                    'gt_labels must be given when add_gt_as_proposals is True')
            # Prepend gts so their indices line up with assign_result.add_gt_.
            bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
            assign_result.add_gt_(gt_labels)
            gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8)
            gt_flags = torch.cat([gt_ones, gt_flags])

        num_expected_pos = int(self.num * self.pos_fraction)
        pos_inds = self.pos_sampler._sample_pos(
            assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
        # We found that sampled indices have duplicated items occasionally.
        # (may be a bug of PyTorch)
        pos_inds = pos_inds.unique()
        num_sampled_pos = pos_inds.numel()
        # Negatives fill whatever budget the positives did not use.
        num_expected_neg = self.num - num_sampled_pos
        if self.neg_pos_ub >= 0:
            # Cap negatives at neg_pos_ub * (number of positives, min 1).
            _pos = max(1, num_sampled_pos)
            neg_upper_bound = int(self.neg_pos_ub * _pos)
            if num_expected_neg > neg_upper_bound:
                num_expected_neg = neg_upper_bound
        neg_inds = self.neg_sampler._sample_neg(
            assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
        neg_inds = neg_inds.unique()

        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
                                         assign_result, gt_flags)
        return sampling_result


@BBOX_SAMPLERS.register_module()
class CombinedSampler(BaseSampler):
    """A sampler that combines positive sampler and negative sampler."""

    def __init__(self, pos_sampler, neg_sampler, **kwargs):
        super(CombinedSampler, self).__init__(**kwargs)
        # Replace the default self-sampling with dedicated sub-samplers
        # built from their configs.
        self.pos_sampler = build_sampler(pos_sampler, **kwargs)
        self.neg_sampler = build_sampler(neg_sampler, **kwargs)

    def _sample_pos(self, **kwargs):
        """Sample positive samples."""
        raise NotImplementedError

    def _sample_neg(self, **kwargs):
        """Sample negative samples."""
        raise NotImplementedError
b/insightface/detection/scrfd/mmdet/core/bbox/samplers/instance_balanced_pos_sampler.py @@ -0,0 +1,55 @@ +import numpy as np +import torch + +from ..builder import BBOX_SAMPLERS +from .random_sampler import RandomSampler + + +@BBOX_SAMPLERS.register_module() +class InstanceBalancedPosSampler(RandomSampler): + """Instance balanced sampler that samples equal number of positive samples + for each instance.""" + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Sample positive boxes. + + Args: + assign_result (:obj:`AssignResult`): The assigned results of boxes. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. + """ + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + unique_gt_inds = assign_result.gt_inds[pos_inds].unique() + num_gts = len(unique_gt_inds) + num_per_gt = int(round(num_expected / float(num_gts)) + 1) + sampled_inds = [] + for i in unique_gt_inds: + inds = torch.nonzero( + assign_result.gt_inds == i.item(), as_tuple=False) + if inds.numel() != 0: + inds = inds.squeeze(1) + else: + continue + if len(inds) > num_per_gt: + inds = self.random_choice(inds, num_per_gt) + sampled_inds.append(inds) + sampled_inds = torch.cat(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array( + list(set(pos_inds.cpu()) - set(sampled_inds.cpu()))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + extra_inds = torch.from_numpy(extra_inds).to( + assign_result.gt_inds.device).long() + sampled_inds = torch.cat([sampled_inds, extra_inds]) + elif len(sampled_inds) > num_expected: + sampled_inds = self.random_choice(sampled_inds, num_expected) + return sampled_inds diff --git 
@BBOX_SAMPLERS.register_module()
class IoUBalancedNegSampler(RandomSampler):
    """IoU Balanced Sampling.

    arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019)

    Sampling proposals according to their IoU. `floor_fraction` of needed
    RoIs are sampled from proposals whose IoU are lower than `floor_thr`
    randomly. The others are sampled from proposals whose IoU are higher
    than `floor_thr`. These proposals are sampled from some bins evenly,
    which are split by `num_bins` via IoU evenly.

    Args:
        num (int): number of proposals.
        pos_fraction (float): fraction of positive proposals.
        floor_thr (float): threshold (minimum) IoU for IoU balanced
            sampling, set to -1 if all using IoU balanced sampling.
        floor_fraction (float): sampling fraction of proposals under
            floor_thr.
        num_bins (int): number of bins in IoU balanced sampling.
    """

    def __init__(self,
                 num,
                 pos_fraction,
                 floor_thr=-1,
                 floor_fraction=0,
                 num_bins=3,
                 **kwargs):
        super(IoUBalancedNegSampler, self).__init__(num, pos_fraction,
                                                    **kwargs)
        assert floor_thr >= 0 or floor_thr == -1
        assert 0 <= floor_fraction <= 1
        assert num_bins >= 1

        self.floor_thr = floor_thr
        self.floor_fraction = floor_fraction
        self.num_bins = num_bins

    def sample_via_interval(self, max_overlaps, full_set, num_expected):
        """Sample according to the iou interval.

        Args:
            max_overlaps (torch.Tensor): IoU between bounding boxes and
                ground truth boxes.
            full_set (set(int)): A full set of indices of boxes.
            num_expected (int): Number of expected samples.

        Returns:
            np.ndarray: Indices of samples
        """
        max_iou = max_overlaps.max()
        # Split [floor_thr, max_iou] into num_bins equal IoU intervals and
        # draw an even share from each.
        iou_interval = (max_iou - self.floor_thr) / self.num_bins
        per_num_expected = int(num_expected / self.num_bins)

        sampled_inds = []
        for i in range(self.num_bins):
            start_iou = self.floor_thr + i * iou_interval
            end_iou = self.floor_thr + (i + 1) * iou_interval
            tmp_set = set(
                np.where(
                    np.logical_and(max_overlaps >= start_iou,
                                   max_overlaps < end_iou))[0])
            tmp_inds = list(tmp_set & full_set)
            if len(tmp_inds) > per_num_expected:
                tmp_sampled_set = self.random_choice(tmp_inds,
                                                     per_num_expected)
            else:
                # FIX: `np.int` was deprecated in NumPy 1.20 and removed in
                # 1.24; the builtin `int` is its exact equivalent.
                tmp_sampled_set = np.array(tmp_inds, dtype=int)
            sampled_inds.append(tmp_sampled_set)

        sampled_inds = np.concatenate(sampled_inds)
        if len(sampled_inds) < num_expected:
            # Bins undershot (rounding / sparse bins): top up randomly from
            # the remaining candidates.
            num_extra = num_expected - len(sampled_inds)
            extra_inds = np.array(list(full_set - set(sampled_inds)))
            if len(extra_inds) > num_extra:
                extra_inds = self.random_choice(extra_inds, num_extra)
            sampled_inds = np.concatenate([sampled_inds, extra_inds])

        return sampled_inds

    def _sample_neg(self, assign_result, num_expected, **kwargs):
        """Sample negative boxes.

        Args:
            assign_result (:obj:`AssignResult`): The assigned results of
                boxes.
            num_expected (int): The number of expected negative samples

        Returns:
            Tensor or ndarray: sampled indices.
        """
        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
        if neg_inds.numel() != 0:
            neg_inds = neg_inds.squeeze(1)
        if len(neg_inds) <= num_expected:
            return neg_inds
        else:
            max_overlaps = assign_result.max_overlaps.cpu().numpy()
            # balance sampling for negative samples
            neg_set = set(neg_inds.cpu().numpy())

            if self.floor_thr > 0:
                floor_set = set(
                    np.where(
                        np.logical_and(max_overlaps >= 0,
                                       max_overlaps < self.floor_thr))[0])
                iou_sampling_set = set(
                    np.where(max_overlaps >= self.floor_thr)[0])
            elif self.floor_thr == 0:
                floor_set = set(np.where(max_overlaps == 0)[0])
                iou_sampling_set = set(
                    np.where(max_overlaps > self.floor_thr)[0])
            else:
                floor_set = set()
                iou_sampling_set = set(
                    np.where(max_overlaps > self.floor_thr)[0])
                # for sampling interval calculation
                # NOTE(review): this permanently rewrites self.floor_thr on
                # the first call with floor_thr == -1, so later calls take
                # the `elif` branch instead. Kept as-is to preserve the
                # original (upstream) behavior — confirm before changing.
                self.floor_thr = 0

            floor_neg_inds = list(floor_set & neg_set)
            iou_sampling_neg_inds = list(iou_sampling_set & neg_set)
            num_expected_iou_sampling = int(num_expected *
                                            (1 - self.floor_fraction))
            if len(iou_sampling_neg_inds) > num_expected_iou_sampling:
                if self.num_bins >= 2:
                    iou_sampled_inds = self.sample_via_interval(
                        max_overlaps, set(iou_sampling_neg_inds),
                        num_expected_iou_sampling)
                else:
                    iou_sampled_inds = self.random_choice(
                        iou_sampling_neg_inds, num_expected_iou_sampling)
            else:
                # FIX: np.int -> int (removed in NumPy 1.24).
                iou_sampled_inds = np.array(
                    iou_sampling_neg_inds, dtype=int)
            num_expected_floor = num_expected - len(iou_sampled_inds)
            if len(floor_neg_inds) > num_expected_floor:
                sampled_floor_inds = self.random_choice(
                    floor_neg_inds, num_expected_floor)
            else:
                # FIX: np.int -> int (removed in NumPy 1.24).
                sampled_floor_inds = np.array(floor_neg_inds, dtype=int)
            sampled_inds = np.concatenate(
                (sampled_floor_inds, iou_sampled_inds))
            if len(sampled_inds) < num_expected:
                num_extra = num_expected - len(sampled_inds)
                extra_inds = np.array(list(neg_set - set(sampled_inds)))
                if len(extra_inds) > num_extra:
                    extra_inds = self.random_choice(extra_inds, num_extra)
                sampled_inds = np.concatenate((sampled_inds, extra_inds))
            sampled_inds = torch.from_numpy(sampled_inds).long().to(
                assign_result.gt_inds.device)
            return sampled_inds
num_expected, + bboxes=None, + feats=None, + **kwargs): + """Sample positive boxes. + + Args: + assign_result (:obj:`AssignResult`): Assigned results + num_expected (int): Number of expected positive samples + bboxes (torch.Tensor, optional): Boxes. Defaults to None. + feats (list[torch.Tensor], optional): Multi-level features. + Defaults to None. + + Returns: + torch.Tensor: Indices of positive samples + """ + # Sample some hard positive samples + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds], + assign_result.labels[pos_inds], feats) + + def _sample_neg(self, + assign_result, + num_expected, + bboxes=None, + feats=None, + **kwargs): + """Sample negative boxes. + + Args: + assign_result (:obj:`AssignResult`): Assigned results + num_expected (int): Number of expected negative samples + bboxes (torch.Tensor, optional): Boxes. Defaults to None. + feats (list[torch.Tensor], optional): Multi-level features. + Defaults to None. 
+ + Returns: + torch.Tensor: Indices of negative samples + """ + # Sample some hard negative samples + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + neg_labels = assign_result.labels.new_empty( + neg_inds.size(0)).fill_(self.bbox_head.num_classes) + return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds], + neg_labels, feats) diff --git a/insightface/detection/scrfd/mmdet/core/bbox/samplers/pseudo_sampler.py b/insightface/detection/scrfd/mmdet/core/bbox/samplers/pseudo_sampler.py new file mode 100755 index 0000000000000000000000000000000000000000..2bd81abcdc62debc14772659d7a171f20bf33364 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/samplers/pseudo_sampler.py @@ -0,0 +1,41 @@ +import torch + +from ..builder import BBOX_SAMPLERS +from .base_sampler import BaseSampler +from .sampling_result import SamplingResult + + +@BBOX_SAMPLERS.register_module() +class PseudoSampler(BaseSampler): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError + + def sample(self, assign_result, bboxes, gt_bboxes, **kwargs): + """Directly returns the positive and negative indices of samples. 
+ + Args: + assign_result (:obj:`AssignResult`): Assigned results + bboxes (torch.Tensor): Bounding boxes + gt_bboxes (torch.Tensor): Ground truth boxes + + Returns: + :obj:`SamplingResult`: sampler results + """ + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8) + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) + return sampling_result diff --git a/insightface/detection/scrfd/mmdet/core/bbox/samplers/random_sampler.py b/insightface/detection/scrfd/mmdet/core/bbox/samplers/random_sampler.py new file mode 100755 index 0000000000000000000000000000000000000000..f34b006e8bb0b55c74aa1c3b792f3664ada93162 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/samplers/random_sampler.py @@ -0,0 +1,78 @@ +import torch + +from ..builder import BBOX_SAMPLERS +from .base_sampler import BaseSampler + + +@BBOX_SAMPLERS.register_module() +class RandomSampler(BaseSampler): + """Random sampler. + + Args: + num (int): Number of samples + pos_fraction (float): Fraction of positive samples + neg_pos_up (int, optional): Upper bound number of negative and + positive samples. Defaults to -1. + add_gt_as_proposals (bool, optional): Whether to add ground truth + boxes as proposals. Defaults to True. + """ + + def __init__(self, + num, + pos_fraction, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + from mmdet.core.bbox import demodata + super(RandomSampler, self).__init__(num, pos_fraction, neg_pos_ub, + add_gt_as_proposals) + self.rng = demodata.ensure_rng(kwargs.get('rng', None)) + + def random_choice(self, gallery, num): + """Random select some elements from the gallery. + + If `gallery` is a Tensor, the returned indices will be a Tensor; + If `gallery` is a ndarray or list, the returned indices will be a + ndarray. 
+ + Args: + gallery (Tensor | ndarray | list): indices pool. + num (int): expected sample num. + + Returns: + Tensor or ndarray: sampled indices. + """ + assert len(gallery) >= num + + is_tensor = isinstance(gallery, torch.Tensor) + if not is_tensor: + if torch.cuda.is_available(): + device = torch.cuda.current_device() + else: + device = 'cpu' + gallery = torch.tensor(gallery, dtype=torch.long, device=device) + perm = torch.randperm(gallery.numel(), device=gallery.device)[:num] + rand_inds = gallery[perm] + if not is_tensor: + rand_inds = rand_inds.cpu().numpy() + return rand_inds + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Randomly sample some positive samples.""" + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Randomly sample some negative samples.""" + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + return self.random_choice(neg_inds, num_expected) diff --git a/insightface/detection/scrfd/mmdet/core/bbox/samplers/sampling_result.py b/insightface/detection/scrfd/mmdet/core/bbox/samplers/sampling_result.py new file mode 100755 index 0000000000000000000000000000000000000000..419a8e39a3c307a7cd9cfd0565a20037ded0d646 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/samplers/sampling_result.py @@ -0,0 +1,152 @@ +import torch + +from mmdet.utils import util_mixins + + +class SamplingResult(util_mixins.NiceRepr): + """Bbox sampling result. 
+ + Example: + >>> # xdoctest: +IGNORE_WANT + >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random(rng=10) + >>> print(f'self = {self}') + self = + """ + + def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, + gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_bboxes = bboxes[pos_inds] + self.neg_bboxes = bboxes[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_bboxes.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + + self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :] + + if assign_result.labels is not None: + self.pos_gt_labels = assign_result.labels[pos_inds] + else: + self.pos_gt_labels = None + + @property + def bboxes(self): + """torch.Tensor: concatenated positive and negative boxes""" + return torch.cat([self.pos_bboxes, self.neg_bboxes]) + + def to(self, device): + """Change the device of the data inplace. 
+ + Example: + >>> self = SamplingResult.random() + >>> print(f'self = {self.to(None)}') + >>> # xdoctest: +REQUIRES(--gpu) + >>> print(f'self = {self.to(0)}') + """ + _dict = self.__dict__ + for key, value in _dict.items(): + if isinstance(value, torch.Tensor): + _dict[key] = value.to(device) + return self + + def __nice__(self): + data = self.info.copy() + data['pos_bboxes'] = data.pop('pos_bboxes').shape + data['neg_bboxes'] = data.pop('neg_bboxes').shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = ' ' + ',\n '.join(parts) + return '{\n' + body + '\n}' + + @property + def info(self): + """Returns a dictionary of info about the object.""" + return { + 'pos_inds': self.pos_inds, + 'neg_inds': self.neg_inds, + 'pos_bboxes': self.pos_bboxes, + 'neg_bboxes': self.neg_bboxes, + 'pos_is_gt': self.pos_is_gt, + 'num_gts': self.num_gts, + 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, + } + + @classmethod + def random(cls, rng=None, **kwargs): + """ + Args: + rng (None | int | numpy.random.RandomState): seed or state. + kwargs (keyword arguments): + - num_preds: number of predicted boxes + - num_gts: number of true boxes + - p_ignore (float): probability of a predicted box assinged to \ + an ignored truth. + - p_assigned (float): probability of a predicted box not being \ + assigned. + - p_use_label (float | bool): with labels or not. + + Returns: + :obj:`SamplingResult`: Randomly generated sampling result. + + Example: + >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random() + >>> print(self.__dict__) + """ + from mmdet.core.bbox.samplers.random_sampler import RandomSampler + from mmdet.core.bbox.assigners.assign_result import AssignResult + from mmdet.core.bbox import demodata + rng = demodata.ensure_rng(rng) + + # make probabalistic? 
+ num = 32 + pos_fraction = 0.5 + neg_pos_ub = -1 + + assign_result = AssignResult.random(rng=rng, **kwargs) + + # Note we could just compute an assignment + bboxes = demodata.random_boxes(assign_result.num_preds, rng=rng) + gt_bboxes = demodata.random_boxes(assign_result.num_gts, rng=rng) + + if rng.rand() > 0.2: + # sometimes algorithms squeeze their data, be robust to that + gt_bboxes = gt_bboxes.squeeze() + bboxes = bboxes.squeeze() + + if assign_result.labels is None: + gt_labels = None + else: + gt_labels = None # todo + + if gt_labels is None: + add_gt_as_proposals = False + else: + add_gt_as_proposals = True # make probabalistic? + + sampler = RandomSampler( + num, + pos_fraction, + neg_pos_ub=neg_pos_ub, + add_gt_as_proposals=add_gt_as_proposals, + rng=rng) + self = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels) + return self diff --git a/insightface/detection/scrfd/mmdet/core/bbox/samplers/score_hlr_sampler.py b/insightface/detection/scrfd/mmdet/core/bbox/samplers/score_hlr_sampler.py new file mode 100755 index 0000000000000000000000000000000000000000..11d46b97705db60fb6a4eb5fa7da10ac78acb8bc --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/samplers/score_hlr_sampler.py @@ -0,0 +1,264 @@ +import torch +from mmcv.ops import nms_match + +from ..builder import BBOX_SAMPLERS +from ..transforms import bbox2roi +from .base_sampler import BaseSampler +from .sampling_result import SamplingResult + + +@BBOX_SAMPLERS.register_module() +class ScoreHLRSampler(BaseSampler): + r"""Importance-based Sample Reweighting (ISR_N), described in `Prime Sample + Attention in Object Detection `_. + + Score hierarchical local rank (HLR) differentiates with RandomSampler in + negative part. It firstly computes Score-HLR in a two-step way, + then linearly maps score hlr to the loss weights. + + Args: + num (int): Total number of sampled RoIs. + pos_fraction (float): Fraction of positive samples. 
+ context (:class:`BaseRoIHead`): RoI head that the sampler belongs to. + neg_pos_ub (int): Upper bound of the ratio of num negative to num + positive, -1 means no upper bound. + add_gt_as_proposals (bool): Whether to add ground truth as proposals. + k (float): Power of the non-linear mapping. + bias (float): Shift of the non-linear mapping. + score_thr (float): Minimum score that a negative sample is to be + considered as valid bbox. + """ + + def __init__(self, + num, + pos_fraction, + context, + neg_pos_ub=-1, + add_gt_as_proposals=True, + k=0.5, + bias=0, + score_thr=0.05, + iou_thr=0.5, + **kwargs): + super().__init__(num, pos_fraction, neg_pos_ub, add_gt_as_proposals) + self.k = k + self.bias = bias + self.score_thr = score_thr + self.iou_thr = iou_thr + self.context = context + # context of cascade detectors is a list, so distinguish them here. + if not hasattr(context, 'num_stages'): + self.bbox_roi_extractor = context.bbox_roi_extractor + self.bbox_head = context.bbox_head + self.with_shared_head = context.with_shared_head + if self.with_shared_head: + self.shared_head = context.shared_head + else: + self.bbox_roi_extractor = context.bbox_roi_extractor[ + context.current_stage] + self.bbox_head = context.bbox_head[context.current_stage] + + @staticmethod + def random_choice(gallery, num): + """Randomly select some elements from the gallery. + + If `gallery` is a Tensor, the returned indices will be a Tensor; + If `gallery` is a ndarray or list, the returned indices will be a + ndarray. + + Args: + gallery (Tensor | ndarray | list): indices pool. + num (int): expected sample num. + + Returns: + Tensor or ndarray: sampled indices. 
+ """ + assert len(gallery) >= num + + is_tensor = isinstance(gallery, torch.Tensor) + if not is_tensor: + if torch.cuda.is_available(): + device = torch.cuda.current_device() + else: + device = 'cpu' + gallery = torch.tensor(gallery, dtype=torch.long, device=device) + perm = torch.randperm(gallery.numel(), device=gallery.device)[:num] + rand_inds = gallery[perm] + if not is_tensor: + rand_inds = rand_inds.cpu().numpy() + return rand_inds + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Randomly sample some positive samples.""" + pos_inds = torch.nonzero(assign_result.gt_inds > 0).flatten() + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, + assign_result, + num_expected, + bboxes, + feats=None, + img_meta=None, + **kwargs): + """Sample negative samples. + + Score-HLR sampler is done in the following steps: + 1. Take the maximum positive score prediction of each negative samples + as s_i. + 2. Filter out negative samples whose s_i <= score_thr, the left samples + are called valid samples. + 3. Use NMS-Match to divide valid samples into different groups, + samples in the same group will greatly overlap with each other + 4. Rank the matched samples in two-steps to get Score-HLR. + (1) In the same group, rank samples with their scores. + (2) In the same score rank across different groups, + rank samples with their scores again. + 5. Linearly map Score-HLR to the final label weights. + + Args: + assign_result (:obj:`AssignResult`): result of assigner. + num_expected (int): Expected number of samples. + bboxes (Tensor): bbox to be sampled. + feats (Tensor): Features come from FPN. + img_meta (dict): Meta information dictionary. 
+ """ + neg_inds = torch.nonzero(assign_result.gt_inds == 0).flatten() + num_neg = neg_inds.size(0) + if num_neg == 0: + return neg_inds, None + with torch.no_grad(): + neg_bboxes = bboxes[neg_inds] + neg_rois = bbox2roi([neg_bboxes]) + bbox_result = self.context._bbox_forward(feats, neg_rois) + cls_score, bbox_pred = bbox_result['cls_score'], bbox_result[ + 'bbox_pred'] + + ori_loss = self.bbox_head.loss( + cls_score=cls_score, + bbox_pred=None, + rois=None, + labels=neg_inds.new_full((num_neg, ), + self.bbox_head.num_classes), + label_weights=cls_score.new_ones(num_neg), + bbox_targets=None, + bbox_weights=None, + reduction_override='none')['loss_cls'] + + # filter out samples with the max score lower than score_thr + max_score, argmax_score = cls_score.softmax(-1)[:, :-1].max(-1) + valid_inds = (max_score > self.score_thr).nonzero().view(-1) + invalid_inds = (max_score <= self.score_thr).nonzero().view(-1) + num_valid = valid_inds.size(0) + num_invalid = invalid_inds.size(0) + + num_expected = min(num_neg, num_expected) + num_hlr = min(num_valid, num_expected) + num_rand = num_expected - num_hlr + if num_valid > 0: + valid_rois = neg_rois[valid_inds] + valid_max_score = max_score[valid_inds] + valid_argmax_score = argmax_score[valid_inds] + valid_bbox_pred = bbox_pred[valid_inds] + + # valid_bbox_pred shape: [num_valid, #num_classes, 4] + valid_bbox_pred = valid_bbox_pred.view( + valid_bbox_pred.size(0), -1, 4) + selected_bbox_pred = valid_bbox_pred[range(num_valid), + valid_argmax_score] + pred_bboxes = self.bbox_head.bbox_coder.decode( + valid_rois[:, 1:], selected_bbox_pred) + pred_bboxes_with_score = torch.cat( + [pred_bboxes, valid_max_score[:, None]], -1) + group = nms_match(pred_bboxes_with_score, self.iou_thr) + + # imp: importance + imp = cls_score.new_zeros(num_valid) + for g in group: + g_score = valid_max_score[g] + # g_score has already sorted + rank = g_score.new_tensor(range(g_score.size(0))) + imp[g] = num_valid - rank + g_score + _, 
imp_rank_inds = imp.sort(descending=True) + _, imp_rank = imp_rank_inds.sort() + hlr_inds = imp_rank_inds[:num_expected] + + if num_rand > 0: + rand_inds = torch.randperm(num_invalid)[:num_rand] + select_inds = torch.cat( + [valid_inds[hlr_inds], invalid_inds[rand_inds]]) + else: + select_inds = valid_inds[hlr_inds] + + neg_label_weights = cls_score.new_ones(num_expected) + + up_bound = max(num_expected, num_valid) + imp_weights = (up_bound - + imp_rank[hlr_inds].float()) / up_bound + neg_label_weights[:num_hlr] = imp_weights + neg_label_weights[num_hlr:] = imp_weights.min() + neg_label_weights = (self.bias + + (1 - self.bias) * neg_label_weights).pow( + self.k) + ori_selected_loss = ori_loss[select_inds] + new_loss = ori_selected_loss * neg_label_weights + norm_ratio = ori_selected_loss.sum() / new_loss.sum() + neg_label_weights *= norm_ratio + else: + neg_label_weights = cls_score.new_ones(num_expected) + select_inds = torch.randperm(num_neg)[:num_expected] + + return neg_inds[select_inds], neg_label_weights + + def sample(self, + assign_result, + bboxes, + gt_bboxes, + gt_labels=None, + img_meta=None, + **kwargs): + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + bboxes (Tensor): Boxes to be sampled from. + gt_bboxes (Tensor): Ground truth bboxes. + gt_labels (Tensor, optional): Class labels of ground truth bboxes. + + Returns: + tuple[:obj:`SamplingResult`, Tensor]: Sampling result and negetive + label weights. 
+ """ + bboxes = bboxes[:, :4] + + gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8) + if self.add_gt_as_proposals: + bboxes = torch.cat([gt_bboxes, bboxes], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos( + assign_result, num_expected_pos, bboxes=bboxes, **kwargs) + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds, neg_label_weights = self.neg_sampler._sample_neg( + assign_result, + num_expected_neg, + bboxes, + img_meta=img_meta, + **kwargs) + + return SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags), neg_label_weights diff --git a/insightface/detection/scrfd/mmdet/core/bbox/transforms.py b/insightface/detection/scrfd/mmdet/core/bbox/transforms.py new file mode 100755 index 0000000000000000000000000000000000000000..8bd6d4fb1b99b5af2f427021623e8b09e2f2c7f2 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/bbox/transforms.py @@ -0,0 +1,270 @@ +import numpy as np +import torch + + +def bbox_flip(bboxes, img_shape, direction='horizontal'): + """Flip bboxes horizontally or vertically. + + Args: + bboxes (Tensor): Shape (..., 4*k) + img_shape (tuple): Image shape. + direction (str): Flip direction, options are "horizontal", "vertical", + "diagonal". Default: "horizontal" + + Returns: + Tensor: Flipped bboxes. 
+ """ + assert bboxes.shape[-1] % 4 == 0 + assert direction in ['horizontal', 'vertical', 'diagonal'] + flipped = bboxes.clone() + if direction == 'horizontal': + flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] + flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4] + elif direction == 'vertical': + flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4] + flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4] + else: + flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] + flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4] + flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4] + flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4] + return flipped + + +def bbox_mapping(bboxes, + img_shape, + scale_factor, + flip, + flip_direction='horizontal'): + """Map bboxes from the original image scale to testing scale.""" + new_bboxes = bboxes * bboxes.new_tensor(scale_factor) + if flip: + new_bboxes = bbox_flip(new_bboxes, img_shape, flip_direction) + return new_bboxes + + +def bbox_mapping_back(bboxes, + img_shape, + scale_factor, + flip, + flip_direction='horizontal'): + """Map bboxes from testing scale to original image scale.""" + new_bboxes = bbox_flip(bboxes, img_shape, + flip_direction) if flip else bboxes + new_bboxes = new_bboxes.view(-1, 4) / new_bboxes.new_tensor(scale_factor) + return new_bboxes.view(bboxes.shape) + + +def bbox2roi(bbox_list): + """Convert a list of bboxes to roi format. + + Args: + bbox_list (list[Tensor]): a list of bboxes corresponding to a batch + of images. + + Returns: + Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2] + """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + if bboxes.size(0) > 0: + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1) + else: + rois = bboxes.new_zeros((0, 5)) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +def roi2bbox(rois): + """Convert rois to bounding box format. 
+ + Args: + rois (torch.Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + list[torch.Tensor]: Converted boxes of corresponding rois. + """ + bbox_list = [] + img_ids = torch.unique(rois[:, 0].cpu(), sorted=True) + for img_id in img_ids: + inds = (rois[:, 0] == img_id.item()) + bbox = rois[inds, 1:] + bbox_list.append(bbox) + return bbox_list + + +def bbox2result(bboxes, labels, num_classes): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor | np.ndarray): shape (n, 5) + labels (torch.Tensor | np.ndarray): shape (n, ) + num_classes (int): class number, including background class + + Returns: + list(ndarray): bbox results of each class + """ + if bboxes.shape[0] == 0: + return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)] + else: + if isinstance(bboxes, torch.Tensor): + bboxes = bboxes.detach().cpu().numpy() + labels = labels.detach().cpu().numpy() + return [bboxes[labels == i, :] for i in range(num_classes)] + + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. + """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return torch.stack([x1, y1, x2, y2], -1) + +def distance2kps(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. 
+ distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded kps. + """ + preds = [] + for i in range(0, distance.shape[1], 2): + px = points[:, i%2] + distance[:, i] + py = points[:, i%2+1] + distance[:, i+1] + if max_shape is not None: + px = px.clamp(min=0, max=max_shape[1]) + py = py.clamp(min=0, max=max_shape[0]) + preds.append(px) + preds.append(py) + return torch.stack(preds, -1) + + +def bbox2distance(points, bbox, max_dis=None, eps=0.1): + """Decode bounding box based on distances. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + bbox (Tensor): Shape (n, 4), "xyxy" format + max_dis (float): Upper bound of the distance. + eps (float): a small value to ensure target < max_dis, instead <= + + Returns: + Tensor: Decoded distances. + """ + left = points[:, 0] - bbox[:, 0] + top = points[:, 1] - bbox[:, 1] + right = bbox[:, 2] - points[:, 0] + bottom = bbox[:, 3] - points[:, 1] + if max_dis is not None: + left = left.clamp(min=0, max=max_dis - eps) + top = top.clamp(min=0, max=max_dis - eps) + right = right.clamp(min=0, max=max_dis - eps) + bottom = bottom.clamp(min=0, max=max_dis - eps) + return torch.stack([left, top, right, bottom], -1) + +def kps2distance(points, kps, max_dis=None, eps=0.1): + """Decode bounding box based on distances. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + kps (Tensor): Shape (n, K), "xyxy" format + max_dis (float): Upper bound of the distance. + eps (float): a small value to ensure target < max_dis, instead <= + + Returns: + Tensor: Decoded distances. 
+ """ + + preds = [] + for i in range(0, kps.shape[1], 2): + px = kps[:, i] - points[:, i%2] + py = kps[:, i+1] - points[:, i%2+1] + if max_dis is not None: + px = px.clamp(min=0, max=max_dis - eps) + py = py.clamp(min=0, max=max_dis - eps) + preds.append(px) + preds.append(py) + return torch.stack(preds, -1) + +def bbox_rescale(bboxes, scale_factor=1.0): + """Rescale bounding box w.r.t. scale_factor. + + Args: + bboxes (Tensor): Shape (n, 4) for bboxes or (n, 5) for rois + scale_factor (float): rescale factor + + Returns: + Tensor: Rescaled bboxes. + """ + if bboxes.size(1) == 5: + bboxes_ = bboxes[:, 1:] + inds_ = bboxes[:, 0] + else: + bboxes_ = bboxes + cx = (bboxes_[:, 0] + bboxes_[:, 2]) * 0.5 + cy = (bboxes_[:, 1] + bboxes_[:, 3]) * 0.5 + w = bboxes_[:, 2] - bboxes_[:, 0] + h = bboxes_[:, 3] - bboxes_[:, 1] + w = w * scale_factor + h = h * scale_factor + x1 = cx - 0.5 * w + x2 = cx + 0.5 * w + y1 = cy - 0.5 * h + y2 = cy + 0.5 * h + if bboxes.size(1) == 5: + rescaled_bboxes = torch.stack([inds_, x1, y1, x2, y2], dim=-1) + else: + rescaled_bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + return rescaled_bboxes + + +def bbox_cxcywh_to_xyxy(bbox): + """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, x2, y2). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. + """ + cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1) + bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), (cx + 0.5 * w), (cy + 0.5 * h)] + return torch.cat(bbox_new, dim=-1) + + +def bbox_xyxy_to_cxcywh(bbox): + """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, w, h). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. 
+ """ + x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1) + bbox_new = [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)] + return torch.cat(bbox_new, dim=-1) diff --git a/insightface/detection/scrfd/mmdet/core/evaluation/__init__.py b/insightface/detection/scrfd/mmdet/core/evaluation/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..5fef9b346f88a5a705e7a04c0495a463ed45a613 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/evaluation/__init__.py @@ -0,0 +1,17 @@ +from .class_names import (cityscapes_classes, coco_classes, dataset_aliases, + get_classes, imagenet_det_classes, + imagenet_vid_classes, voc_classes) +from .eval_hooks import DistEvalHook, EvalHook +from .mean_ap import average_precision, eval_map, print_map_summary +from .recall import (eval_recalls, plot_iou_recall, plot_num_recall, + print_recall_summary) +from .widerface import wider_evaluation, get_widerface_gts + +__all__ = [ + 'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes', + 'coco_classes', 'cityscapes_classes', 'dataset_aliases', 'get_classes', + 'DistEvalHook', 'EvalHook', 'average_precision', 'eval_map', + 'print_map_summary', 'eval_recalls', 'print_recall_summary', + 'plot_num_recall', 'plot_iou_recall', + 'wider_evaluation', 'get_widerface_gts', +] diff --git a/insightface/detection/scrfd/mmdet/core/evaluation/bbox_overlaps.py b/insightface/detection/scrfd/mmdet/core/evaluation/bbox_overlaps.py new file mode 100755 index 0000000000000000000000000000000000000000..93559ea0f25369d552a5365312fa32b9ffec9226 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/evaluation/bbox_overlaps.py @@ -0,0 +1,48 @@ +import numpy as np + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6): + """Calculate the ious between each bbox of bboxes1 and bboxes2. 
+ + Args: + bboxes1(ndarray): shape (n, 4) + bboxes2(ndarray): shape (k, 4) + mode(str): iou (intersection over union) or iof (intersection + over foreground) + + Returns: + ious(ndarray): shape (n, k) + """ + + assert mode in ['iou', 'iof'] + + bboxes1 = bboxes1.astype(np.float32) + bboxes2 = bboxes2.astype(np.float32) + rows = bboxes1.shape[0] + cols = bboxes2.shape[0] + ious = np.zeros((rows, cols), dtype=np.float32) + if rows * cols == 0: + return ious + exchange = False + if bboxes1.shape[0] > bboxes2.shape[0]: + bboxes1, bboxes2 = bboxes2, bboxes1 + ious = np.zeros((cols, rows), dtype=np.float32) + exchange = True + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) + for i in range(bboxes1.shape[0]): + x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) + y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) + x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) + y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) + overlap = np.maximum(x_end - x_start, 0) * np.maximum( + y_end - y_start, 0) + if mode == 'iou': + union = area1[i] + area2 - overlap + else: + union = area1[i] if not exchange else area2 + union = np.maximum(union, eps) + ious[i, :] = overlap / union + if exchange: + ious = ious.T + return ious diff --git a/insightface/detection/scrfd/mmdet/core/evaluation/class_names.py b/insightface/detection/scrfd/mmdet/core/evaluation/class_names.py new file mode 100755 index 0000000000000000000000000000000000000000..4b8845f3fa1ee6b0c24c764b349d4dc7c6e8fe32 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/evaluation/class_names.py @@ -0,0 +1,116 @@ +import mmcv + + +def wider_face_classes(): + return ['face'] + + +def voc_classes(): + return [ + 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' + ] + + +def 
imagenet_det_classes(): + return [ + 'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo', + 'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam', + 'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap', + 'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder', + 'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito', + 'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle', + 'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker', + 'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew', + 'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper', + 'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly', + 'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig', + 'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog', + 'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart', + 'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger', + 'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim', + 'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse', + 'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle', + 'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard', + 'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can', + 'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace', + 'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume', + 'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza', + 'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine', + 'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse', + 'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator', + 'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler', + 'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver', + 'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile', + 
'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula', + 'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer', + 'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine', + 'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie', + 'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet', + 'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin', + 'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft', + 'whale', 'wine_bottle', 'zebra' + ] + + +def imagenet_vid_classes(): + return [ + 'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', + 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda', + 'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit', + 'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle', + 'watercraft', 'whale', 'zebra' + ] + + +def coco_classes(): + return [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', + 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard', + 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush' + ] + + +def cityscapes_classes(): + return [ + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle' + ] + + +dataset_aliases = { + 'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'], + 
'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'], + 'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'], + 'coco': ['coco', 'mscoco', 'ms_coco'], + 'wider_face': ['WIDERFaceDataset', 'wider_face', 'WDIERFace'], + 'cityscapes': ['cityscapes'] +} + + +def get_classes(dataset): + """Get class names of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if mmcv.is_str(dataset): + if dataset in alias2name: + labels = eval(alias2name[dataset] + '_classes()') + else: + raise ValueError(f'Unrecognized dataset: {dataset}') + else: + raise TypeError(f'dataset must a str, but got {type(dataset)}') + return labels diff --git a/insightface/detection/scrfd/mmdet/core/evaluation/eval_hooks.py b/insightface/detection/scrfd/mmdet/core/evaluation/eval_hooks.py new file mode 100755 index 0000000000000000000000000000000000000000..15d9abd9291e2e0a5395e5e51be3c9e27cb887e9 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/evaluation/eval_hooks.py @@ -0,0 +1,255 @@ +import os.path as osp +import warnings +from math import inf + +import mmcv +from mmcv.runner import Hook +from torch.utils.data import DataLoader + +from mmdet.utils import get_root_logger + + +class EvalHook(Hook): + """Evaluation hook. + + Notes: + If new arguments are added for EvalHook, tools/test.py, + tools/eval_metric.py may be effected. + + Attributes: + dataloader (DataLoader): A PyTorch dataloader. + start (int, optional): Evaluation starting epoch. It enables evaluation + before the training starts if ``start`` <= the resuming epoch. + If None, whether to evaluate is merely decided by ``interval``. + Default: None. + interval (int): Evaluation interval (by epochs). Default: 1. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. The information about best + checkpoint would be save in best.json. + Options are the evaluation metrics to the test dataset. 
e.g., + ``bbox_mAP``, ``segm_mAP`` for bbox detection and instance + segmentation. ``AR@100`` for proposal recall. If ``save_best`` is + ``auto``, the first key will be used. The interval of + ``CheckpointHook`` should device EvalHook. Default: None. + rule (str, optional): Comparison rule for best score. If set to None, + it will infer a reasonable rule. Keys such as 'mAP' or 'AR' will + be inferred by 'greater' rule. Keys contain 'loss' will be inferred + by 'less' rule. Options are 'greater', 'less'. Default: None. + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. + """ + + rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} + init_value_map = {'greater': -inf, 'less': inf} + greater_keys = ['mAP', 'AR'] + less_keys = ['loss'] + + def __init__(self, + dataloader, + start=None, + interval=1, + save_best=None, + rule=None, + **eval_kwargs): + if not isinstance(dataloader, DataLoader): + raise TypeError('dataloader must be a pytorch DataLoader, but got' + f' {type(dataloader)}') + if not interval > 0: + raise ValueError(f'interval must be positive, but got {interval}') + if start is not None and start < 0: + warnings.warn( + f'The evaluation start epoch {start} is smaller than 0, ' + f'use 0 instead', UserWarning) + start = 0 + self.dataloader = dataloader + self.interval = interval + self.start = start + assert isinstance(save_best, str) or save_best is None + self.save_best = save_best + self.eval_kwargs = eval_kwargs + self.initial_epoch_flag = True + + self.logger = get_root_logger() + + if self.save_best is not None: + self._init_rule(rule, self.save_best) + + def _init_rule(self, rule, key_indicator): + """Initialize rule, key_indicator, comparison_func, and best score. + + Args: + rule (str | None): Comparison rule for best score. + key_indicator (str | None): Key indicator to determine the + comparison rule. 
+ """ + if rule not in self.rule_map and rule is not None: + raise KeyError(f'rule must be greater, less or None, ' + f'but got {rule}.') + + if rule is None: + if key_indicator != 'auto': + if any(key in key_indicator for key in self.greater_keys): + rule = 'greater' + elif any(key in key_indicator for key in self.less_keys): + rule = 'less' + else: + raise ValueError(f'Cannot infer the rule for key ' + f'{key_indicator}, thus a specific rule ' + f'must be specified.') + self.rule = rule + self.key_indicator = key_indicator + if self.rule is not None: + self.compare_func = self.rule_map[self.rule] + + def before_run(self, runner): + if self.save_best is not None: + if runner.meta is None: + warnings.warn('runner.meta is None. Creating a empty one.') + runner.meta = dict() + runner.meta.setdefault('hook_msgs', dict()) + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training.""" + if not self.initial_epoch_flag: + return + if self.start is not None and runner.epoch >= self.start: + self.after_train_epoch(runner) + self.initial_epoch_flag = False + + def evaluation_flag(self, runner): + """Judge whether to perform_evaluation after this epoch. + + Returns: + bool: The flag indicating whether to perform evaluation. + """ + if self.start is None: + if not self.every_n_epochs(runner, self.interval): + # No evaluation during the interval epochs. + return False + elif (runner.epoch + 1) < self.start: + # No evaluation if start is larger than the current epoch. + return False + else: + # Evaluation only at epochs 3, 5, 7... 
if start==3 and interval==2 + if (runner.epoch + 1 - self.start) % self.interval: + return False + return True + + def after_train_epoch(self, runner): + if not self.evaluation_flag(runner): + return + from mmdet.apis import single_gpu_test + results = single_gpu_test(runner.model, self.dataloader, show=False) + key_score = self.evaluate(runner, results) + if self.save_best: + best_score = runner.meta['hook_msgs'].get( + 'best_score', self.init_value_map[self.rule]) + if self.compare_func(key_score, best_score): + best_score = key_score + runner.meta['hook_msgs']['best_score'] = best_score + last_ckpt = runner.meta['hook_msgs']['last_ckpt'] + runner.meta['hook_msgs']['best_ckpt'] = last_ckpt + mmcv.symlink( + last_ckpt, + osp.join(runner.work_dir, + f'best_{self.key_indicator}.pth')) + self.logger.info( + f'Now best checkpoint is epoch_{runner.epoch + 1}.pth.' + f'Best {self.key_indicator} is {best_score:0.4f}') + + def evaluate(self, runner, results): + eval_res = self.dataloader.dataset.evaluate( + results, logger=runner.logger, **self.eval_kwargs) + for name, val in eval_res.items(): + runner.log_buffer.output[name] = val + runner.log_buffer.ready = True + if self.save_best is not None: + if self.key_indicator == 'auto': + # infer from eval_results + self._init_rule(self.rule, list(eval_res.keys())[0]) + return eval_res[self.key_indicator] + else: + return None + + +class DistEvalHook(EvalHook): + """Distributed evaluation hook. + + Notes: + If new arguments are added, tools/test.py may be effected. + + Attributes: + dataloader (DataLoader): A PyTorch dataloader. + start (int, optional): Evaluation starting epoch. It enables evaluation + before the training starts if ``start`` <= the resuming epoch. + If None, whether to evaluate is merely decided by ``interval``. + Default: None. + interval (int): Evaluation interval (by epochs). Default: 1. + tmpdir (str | None): Temporary directory to save the results of all + processes. Default: None. 
+ gpu_collect (bool): Whether to use gpu or cpu to collect results. + Default: False. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. The information about best + checkpoint would be save in best.json. + Options are the evaluation metrics to the test dataset. e.g., + ``bbox_mAP``, ``segm_mAP`` for bbox detection and instance + segmentation. ``AR@100`` for proposal recall. If ``save_best`` is + ``auto``, the first key will be used. The interval of + ``CheckpointHook`` should device EvalHook. Default: None. + rule (str | None): Comparison rule for best score. If set to None, + it will infer a reasonable rule. Default: 'None'. + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. + """ + + def __init__(self, + dataloader, + start=None, + interval=1, + tmpdir=None, + gpu_collect=False, + save_best=None, + rule=None, + **eval_kwargs): + super().__init__( + dataloader, + start=start, + interval=interval, + save_best=save_best, + rule=rule, + **eval_kwargs) + self.tmpdir = tmpdir + self.gpu_collect = gpu_collect + + def after_train_epoch(self, runner): + if not self.evaluation_flag(runner): + return + + from mmdet.apis import multi_gpu_test + tmpdir = self.tmpdir + if tmpdir is None: + tmpdir = osp.join(runner.work_dir, '.eval_hook') + results = multi_gpu_test( + runner.model, + self.dataloader, + tmpdir=tmpdir, + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + key_score = self.evaluate(runner, results) + if self.save_best: + best_score = runner.meta['hook_msgs'].get( + 'best_score', self.init_value_map[self.rule]) + if self.compare_func(key_score, best_score): + best_score = key_score + runner.meta['hook_msgs']['best_score'] = best_score + last_ckpt = runner.meta['hook_msgs']['last_ckpt'] + runner.meta['hook_msgs']['best_ckpt'] = last_ckpt + mmcv.symlink( + last_ckpt, + osp.join(runner.work_dir, + f'best_{self.key_indicator}.pth')) + self.logger.info( + 
f'Now best checkpoint is {last_ckpt}.' + f'Best {self.key_indicator} is {best_score:0.4f}') diff --git a/insightface/detection/scrfd/mmdet/core/evaluation/mean_ap.py b/insightface/detection/scrfd/mmdet/core/evaluation/mean_ap.py new file mode 100755 index 0000000000000000000000000000000000000000..f44314b423220e4673426b831979b141244c687d --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/evaluation/mean_ap.py @@ -0,0 +1,469 @@ +from multiprocessing import Pool + +import mmcv +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable + +from .bbox_overlaps import bbox_overlaps +from .class_names import get_classes + + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). + + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in 
range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + + +def tpfp_imagenet(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + default_iou_thr=0.5, + area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + default_iou_thr (float): IoU threshold to be considered as matched for + medium and large bboxes (small ones have special rules). + Default: 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. Default: None. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + # an indicator of ignored gts + gt_ignore_inds = np.concatenate( + (np.zeros(gt_bboxes.shape[0], dtype=np.bool), + np.ones(gt_bboxes_ignore.shape[0], dtype=np.bool))) + # stack gt_bboxes and gt_bboxes_ignore for convenience + gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore)) + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp + # of a certain scale. + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * ( + det_bboxes[:, 3] - det_bboxes[:, 1]) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps(det_bboxes, gt_bboxes - 1) + gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)), + default_iou_thr) + # sort all detections by scores in descending order + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = gt_w * gt_h + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + max_iou = -1 + matched_gt = -1 + # find best overlapped available gt + for j in range(num_gts): + # different from PASCAL VOC: allow finding other gts if the + # best overlaped ones are already matched by other det bboxes + if gt_covered[j]: + continue + elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou: + max_iou = ious[i, j] + matched_gt = j + # there are 4 cases for a det bbox: + # 1. it matches a gt, tp = 1, fp = 0 + # 2. it matches an ignored gt, tp = 0, fp = 0 + # 3. it matches no gt and within area range, tp = 0, fp = 1 + # 4. 
it matches no gt but is beyond area range, tp = 0, fp = 0 + if matched_gt >= 0: + gt_covered[matched_gt] = 1 + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + tp[k, i] = 1 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def tpfp_default(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + iou_thr=0.5, + area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. Default: None. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + # an indicator of ignored gts + gt_ignore_inds = np.concatenate( + (np.zeros(gt_bboxes.shape[0], dtype=np.bool), + np.ones(gt_bboxes_ignore.shape[0], dtype=np.bool))) + # stack gt_bboxes and gt_bboxes_ignore for convenience + gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore)) + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of + # a certain scale + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * ( + det_bboxes[:, 3] - det_bboxes[:, 1]) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + + ious = bbox_overlaps(det_bboxes, gt_bboxes) + # for each det, the max iou with all gts + ious_max = ious.max(axis=1) + # for each det, which gt overlaps most with it + ious_argmax = ious.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def get_cls_results(det_results, annotations, class_id): + """Get det results and gt information of a certain class. + + Args: + det_results (list[list]): Same as `eval_map()`. + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. 
+ + Returns: + tuple[list[np.ndarray]]: detected bboxes, gt bboxes, ignored gt bboxes + """ + cls_dets = [img_res[class_id] for img_res in det_results] + cls_gts = [] + cls_gts_ignore = [] + for ann in annotations: + gt_inds = ann['labels'] == class_id + cls_gts.append(ann['bboxes'][gt_inds, :]) + + if ann.get('labels_ignore', None) is not None: + ignore_inds = ann['labels_ignore'] == class_id + cls_gts_ignore.append(ann['bboxes_ignore'][ignore_inds, :]) + else: + cls_gts_ignore.append(np.empty((0, 4), dtype=np.float32)) + + return cls_dets, cls_gts, cls_gts_ignore + + +def eval_map(det_results, + annotations, + scale_ranges=None, + iou_thr=0.5, + dataset=None, + logger=None, + tpfp_fn=None, + nproc=4): + """Evaluate mAP of a dataset. + + Args: + det_results (list[list]): [[cls1_det, cls2_det, ...], ...]. + The outer list indicates images, and the inner list indicates + per-class detected bboxes. + annotations (list[dict]): Ground truth annotations where each item of + the list indicates an image. Keys of annotations are: + + - `bboxes`: numpy array of shape (n, 4) + - `labels`: numpy array of shape (n, ) + - `bboxes_ignore` (optional): numpy array of shape (k, 4) + - `labels_ignore` (optional): numpy array of shape (k, ) + scale_ranges (list[tuple] | None): Range of scales to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. A range of + (32, 64) means the area range between (32**2, 64**2). + Default: None. + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + dataset (list[str] | str | None): Dataset name or dataset classes, + there are minor differences in metrics for different datsets, e.g. + "voc07", "imagenet_det", etc. Default: None. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmdet.utils.print_log()` for details. Default: None. + tpfp_fn (callable | None): The function used to determine true/ + false positives. 
If None, :func:`tpfp_default` is used as default + unless dataset is 'det' or 'vid' (:func:`tpfp_imagenet` in this + case). If it is given as a function, then this function is used + to evaluate tp & fp. Default None. + nproc (int): Processes used for computing TP and FP. + Default: 4. + + Returns: + tuple: (mAP, [dict, dict, ...]) + """ + assert len(det_results) == len(annotations) + + num_imgs = len(det_results) + num_scales = len(scale_ranges) if scale_ranges is not None else 1 + num_classes = len(det_results[0]) # positive class num + area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges] + if scale_ranges is not None else None) + + pool = Pool(nproc) + eval_results = [] + for i in range(num_classes): + # get gt and det bboxes of this class + cls_dets, cls_gts, cls_gts_ignore = get_cls_results( + det_results, annotations, i) + # choose proper function according to datasets to compute tp and fp + if tpfp_fn is None: + if dataset in ['det', 'vid']: + tpfp_fn = tpfp_imagenet + else: + tpfp_fn = tpfp_default + if not callable(tpfp_fn): + raise ValueError( + f'tpfp_fn has to be a function or None, but got {tpfp_fn}') + + # compute tp and fp for each image with multiple processes + tpfp = pool.starmap( + tpfp_fn, + zip(cls_dets, cls_gts, cls_gts_ignore, + [iou_thr for _ in range(num_imgs)], + [area_ranges for _ in range(num_imgs)])) + tp, fp = tuple(zip(*tpfp)) + # calculate gt number of each scale + # ignored gts or gts beyond the specific scale are not counted + num_gts = np.zeros(num_scales, dtype=int) + for j, bbox in enumerate(cls_gts): + if area_ranges is None: + num_gts[0] += bbox.shape[0] + else: + gt_areas = (bbox[:, 2] - bbox[:, 0]) * ( + bbox[:, 3] - bbox[:, 1]) + for k, (min_area, max_area) in enumerate(area_ranges): + num_gts[k] += np.sum((gt_areas >= min_area) + & (gt_areas < max_area)) + # sort all det bboxes by score, also sort tp and fp + cls_dets = np.vstack(cls_dets) + num_dets = cls_dets.shape[0] + sort_inds = np.argsort(-cls_dets[:, -1]) + 
tp = np.hstack(tp)[:, sort_inds] + fp = np.hstack(fp)[:, sort_inds] + # calculate recall and precision with tp and fp + tp = np.cumsum(tp, axis=1) + fp = np.cumsum(fp, axis=1) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts[:, np.newaxis], eps) + precisions = tp / np.maximum((tp + fp), eps) + # calculate AP + if scale_ranges is None: + recalls = recalls[0, :] + precisions = precisions[0, :] + num_gts = num_gts.item() + mode = 'area' if dataset != 'voc07' else '11points' + ap = average_precision(recalls, precisions, mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + pool.close() + if scale_ranges is not None: + # shape (num_classes, num_scales) + all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results]) + all_num_gts = np.vstack( + [cls_result['num_gts'] for cls_result in eval_results]) + mean_ap = [] + for i in range(num_scales): + if np.any(all_num_gts[:, i] > 0): + mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean()) + else: + mean_ap.append(0.0) + else: + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if aps else 0.0 + + print_map_summary( + mean_ap, eval_results, dataset, area_ranges, logger=logger) + + return mean_ap, eval_results + + +def print_map_summary(mean_ap, + results, + dataset=None, + scale_ranges=None, + logger=None): + """Print mAP and results of each class. + + A table will be printed to show the gts/dets/recall/AP of each class and + the mAP. + + Args: + mean_ap (float): Calculated from `eval_map()`. + results (list[dict]): Calculated from `eval_map()`. + dataset (list[str] | str | None): Dataset name or dataset classes. + scale_ranges (list[tuple] | None): Range of scales to be evaluated. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmdet.utils.print_log()` for details. Default: None. 
+ """ + + if logger == 'silent': + return + + if isinstance(results[0]['ap'], np.ndarray): + num_scales = len(results[0]['ap']) + else: + num_scales = 1 + + if scale_ranges is not None: + assert len(scale_ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + if dataset is None: + label_names = [str(i) for i in range(num_classes)] + elif mmcv.is_str(dataset): + label_names = get_classes(dataset) + else: + label_names = dataset + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + + header = ['class', 'gts', 'dets', 'recall', 'ap'] + for i in range(num_scales): + if scale_ranges is not None: + print_log(f'Scale range {scale_ranges[i]}', logger=logger) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}' + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}']) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) diff --git a/insightface/detection/scrfd/mmdet/core/evaluation/recall.py b/insightface/detection/scrfd/mmdet/core/evaluation/recall.py new file mode 100755 index 0000000000000000000000000000000000000000..ea6277abc1d14024d5234fa38e3182c6a9c7ad3e --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/evaluation/recall.py @@ -0,0 +1,189 @@ +from collections.abc import Sequence + +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable + +from .bbox_overlaps import bbox_overlaps + + +def 
_recalls(all_ious, proposal_nums, thrs): + + img_num = all_ious.shape[0] + total_gt_num = sum([ious.shape[0] for ious in all_ious]) + + _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) + for k, proposal_num in enumerate(proposal_nums): + tmp_ious = np.zeros(0) + for i in range(img_num): + ious = all_ious[i][:, :proposal_num].copy() + gt_ious = np.zeros((ious.shape[0])) + if ious.size == 0: + tmp_ious = np.hstack((tmp_ious, gt_ious)) + continue + for j in range(ious.shape[0]): + gt_max_overlaps = ious.argmax(axis=1) + max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] + gt_idx = max_ious.argmax() + gt_ious[j] = max_ious[gt_idx] + box_idx = gt_max_overlaps[gt_idx] + ious[gt_idx, :] = -1 + ious[:, box_idx] = -1 + tmp_ious = np.hstack((tmp_ious, gt_ious)) + _ious[k, :] = tmp_ious + + _ious = np.fliplr(np.sort(_ious, axis=1)) + recalls = np.zeros((proposal_nums.size, thrs.size)) + for i, thr in enumerate(thrs): + recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num) + + return recalls + + +def set_recall_param(proposal_nums, iou_thrs): + """Check proposal_nums and iou_thrs and set correct format.""" + if isinstance(proposal_nums, Sequence): + _proposal_nums = np.array(proposal_nums) + elif isinstance(proposal_nums, int): + _proposal_nums = np.array([proposal_nums]) + else: + _proposal_nums = proposal_nums + + if iou_thrs is None: + _iou_thrs = np.array([0.5]) + elif isinstance(iou_thrs, Sequence): + _iou_thrs = np.array(iou_thrs) + elif isinstance(iou_thrs, float): + _iou_thrs = np.array([iou_thrs]) + else: + _iou_thrs = iou_thrs + + return _proposal_nums, _iou_thrs + + +def eval_recalls(gts, + proposals, + proposal_nums=None, + iou_thrs=0.5, + logger=None): + """Calculate recalls. + + Args: + gts (list[ndarray]): a list of arrays of shape (n, 4) + proposals (list[ndarray]): a list of arrays of shape (k, 4) or (k, 5) + proposal_nums (int | Sequence[int]): Top N proposals to be evaluated. 
+ iou_thrs (float | Sequence[float]): IoU thresholds. Default: 0.5. + logger (logging.Logger | str | None): The way to print the recall + summary. See `mmdet.utils.print_log()` for details. Default: None. + + Returns: + ndarray: recalls of different ious and proposal nums + """ + + img_num = len(gts) + assert img_num == len(proposals) + + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps(gts[i], img_proposal[:prop_num, :4]) + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + + print_recall_summary(recalls, proposal_nums, iou_thrs, logger=logger) + return recalls + + +def print_recall_summary(recalls, + proposal_nums, + iou_thrs, + row_idxs=None, + col_idxs=None, + logger=None): + """Print recalls in a table. + + Args: + recalls (ndarray): calculated from `bbox_recalls` + proposal_nums (ndarray or list): top N proposals + iou_thrs (ndarray or list): iou thresholds + row_idxs (ndarray): which rows(proposal nums) to print + col_idxs (ndarray): which cols(iou thresholds) to print + logger (logging.Logger | str | None): The way to print the recall + summary. See `mmdet.utils.print_log()` for details. Default: None. 
+ """ + proposal_nums = np.array(proposal_nums, dtype=np.int32) + iou_thrs = np.array(iou_thrs) + if row_idxs is None: + row_idxs = np.arange(proposal_nums.size) + if col_idxs is None: + col_idxs = np.arange(iou_thrs.size) + row_header = [''] + iou_thrs[col_idxs].tolist() + table_data = [row_header] + for i, num in enumerate(proposal_nums[row_idxs]): + row = [f'{val:.3f}' for val in recalls[row_idxs[i], col_idxs].tolist()] + row.insert(0, num) + table_data.append(row) + table = AsciiTable(table_data) + print_log('\n' + table.table, logger=logger) + + +def plot_num_recall(recalls, proposal_nums): + """Plot Proposal_num-Recalls curve. + + Args: + recalls(ndarray or list): shape (k,) + proposal_nums(ndarray or list): same shape as `recalls` + """ + if isinstance(proposal_nums, np.ndarray): + _proposal_nums = proposal_nums.tolist() + else: + _proposal_nums = proposal_nums + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot([0] + _proposal_nums, [0] + _recalls) + plt.xlabel('Proposal num') + plt.ylabel('Recall') + plt.axis([0, proposal_nums.max(), 0, 1]) + f.show() + + +def plot_iou_recall(recalls, iou_thrs): + """Plot IoU-Recalls curve. 
+ + Args: + recalls(ndarray or list): shape (k,) + iou_thrs(ndarray or list): same shape as `recalls` + """ + if isinstance(iou_thrs, np.ndarray): + _iou_thrs = iou_thrs.tolist() + else: + _iou_thrs = iou_thrs + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot(_iou_thrs + [1.0], _recalls + [0.]) + plt.xlabel('IoU') + plt.ylabel('Recall') + plt.axis([iou_thrs.min(), 1, 0, 1]) + f.show() diff --git a/insightface/detection/scrfd/mmdet/core/evaluation/widerface.py b/insightface/detection/scrfd/mmdet/core/evaluation/widerface.py new file mode 100755 index 0000000000000000000000000000000000000000..a1a3ad375d347fbb64525ed4b8fe03cb5f6fc70a --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/evaluation/widerface.py @@ -0,0 +1,564 @@ +""" +WiderFace evaluation code +author: wondervictor +mail: tianhengcheng@gmail.com +copyright@wondervictor +""" + +from __future__ import absolute_import +import os +import tqdm +import pickle +import datetime +import argparse +import numpy as np +from scipy.io import loadmat +#from facedet.evaluation.box_utils import jaccard +#from facedet.evaluation.bbox import bbox_overlaps +#import torch +#from mmdet.core.bbox import bbox_overlaps + +#def intersect(box_a, box_b): +# A = box_a.size(0) +# B = box_b.size(0) +# max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), +# box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) +# min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), +# box_b[:, :2].unsqueeze(0).expand(A, B, 2)) +# inter = torch.clamp((max_xy - min_xy), min=0) +# return inter[:, :, 0] * inter[:, :, 1] +# +#def jaccard(box_a, box_b): +# inter = intersect(box_a, box_b) +# #torch.cuda.empty_cache() +# if not inter.is_cuda: +# box_a_cpu = box_a.cpu() +# box_b_cpu = box_b.cpu() +# area_a_cpu = ((box_a_cpu[:, 2]-box_a_cpu[:, 0]) * +# (box_a_cpu[:, 3]-box_a_cpu[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] +# area_b_cpu 
= ((box_b_cpu[:, 2]-box_b_cpu[:, 0]) * +# (box_b_cpu[:, 3]-box_b_cpu[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] +# union_cpu = area_a_cpu + area_b_cpu - inter.cpu() +# return inter / union_cpu +# else: +# area_a = ((box_a[:, 2]-box_a[:, 0]) * +# (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] +# area_b = ((box_b[:, 2]-box_b[:, 0]) * +# (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] +# union = area_a + area_b - inter +# +# return inter / union # [A,B] +# +def bbox_overlaps(boxes, query_boxes): + n_ = boxes.shape[0] + k_ = query_boxes.shape[0] + overlaps = np.zeros((n_, k_), dtype=np.float) + for k in range(k_): + query_box_area = (query_boxes[k, 2] - query_boxes[k, 0] + + 1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1) + for n in range(n_): + iw = min(boxes[n, 2], query_boxes[k, 2]) - max( + boxes[n, 0], query_boxes[k, 0]) + 1 + if iw > 0: + ih = min(boxes[n, 3], query_boxes[k, 3]) - max( + boxes[n, 1], query_boxes[k, 1]) + 1 + if ih > 0: + box_area = (boxes[n, 2] - boxes[n, 0] + + 1) * (boxes[n, 3] - boxes[n, 1] + 1) + all_area = float(box_area + query_box_area - iw * ih) + overlaps[n, k] = iw * ih / all_area + return overlaps + +def bbox_overlap(a, b): + x1 = np.maximum(a[:,0], b[0]) + y1 = np.maximum(a[:,1], b[1]) + x2 = np.minimum(a[:,2], b[2]) + y2 = np.minimum(a[:,3], b[3]) + w = x2-x1+1 + h = y2-y1+1 + inter = w*h + aarea = (a[:,2]-a[:,0]+1) * (a[:,3]-a[:,1]+1) + barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) + o = inter / (aarea+barea-inter) + o[w<=0] = 0 + o[h<=0] = 0 + return o + +def __bbox_overlap(a, b): + x1 = torch.max(a[:,0], b[0]) + y1 = torch.max(a[:,1], b[1]) + x2 = torch.min(a[:,2], b[2]) + y2 = torch.min(a[:,3], b[3]) + w = x2-x1+1 + h = y2-y1+1 + inter = w*h + aarea = (a[:,2]-a[:,0]+1) * (a[:,3]-a[:,1]+1) + barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) + o = inter / (aarea+barea-inter) + o[w<=0] = 0 + o[h<=0] = 0 + return o + +def np_around(array, num_decimals=0): + #return array + return np.around(array, 
decimals=num_decimals) + +#def compute_iou(box_a, box_b): +# x0 = np.maximum(box_a[:,0], box_b[0]) +# y0 = np.maximum(box_a[:,1], box_b[1]) +# x1 = np.minimum(box_a[:,2], box_b[2]) +# y1 = np.minimum(box_a[:,3], box_b[3]) +# #print ('x0', x0[0], x1[0], y0[0], y1[0], box_a[0], box_b[:]) +# #w = np.maximum(x1 - x0 + 1, 0) +# w = np_around(x1 - x0 + 1) +# #h = np.maximum(y1 - y0 + 1, 0) +# h = np_around(y1 - y0 + 1) +# inter = np_around(w * h) +# area_a = (box_a[:,2] - box_a[:,0] + 1) * (box_a[:,3] - box_a[:,1] + 1) +# area_a = np_around(area_a) +# area_b = (box_b[2] - box_b[0] + 1) * (box_b[3] - box_b[1] + 1) +# area_b = np_around(area_b) +# iou = inter / (area_a + area_b - inter) +# iou[w <= 0] = 0 +# iou[h <=0] = 0 +# return iou + +def np_round(val, decimals=4): + return val + #if isinstance(val, np.ndarray): + # val = np.around(val, decimals=decimals) + #return val + + +def get_gt_boxes(gt_dir): + """ gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)""" + + gt_mat = loadmat(os.path.join(gt_dir, 'wider_face_val.mat')) + hard_mat = loadmat(os.path.join(gt_dir, 'wider_hard_val.mat')) + medium_mat = loadmat(os.path.join(gt_dir, 'wider_medium_val.mat')) + easy_mat = loadmat(os.path.join(gt_dir, 'wider_easy_val.mat')) + + facebox_list = gt_mat['face_bbx_list'] + event_list = gt_mat['event_list'] + file_list = gt_mat['file_list'] + + hard_gt_list = hard_mat['gt_list'] + medium_gt_list = medium_mat['gt_list'] + easy_gt_list = easy_mat['gt_list'] + + return facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list + + +def get_gt_boxes_from_txt(gt_path, cache_dir): + + cache_file = os.path.join(cache_dir, 'gt_cache.pkl') + if os.path.exists(cache_file): + with open(cache_file, 'rb') as f: + boxes = pickle.load(f) + return boxes + + f = open(gt_path, 'r') + state = 0 + lines = f.readlines() + lines = list(map(lambda x: x.rstrip('\r\n'), lines)) + boxes = {} + #print(len(lines)) + f.close() + current_boxes = [] + 
current_name = None + for line in lines: + if state == 0 and '--' in line: + state = 1 + current_name = line + continue + if state == 1: + state = 2 + continue + + if state == 2 and '--' in line: + state = 1 + boxes[current_name] = np.array(current_boxes).astype('float32') + current_name = line + current_boxes = [] + continue + + if state == 2: + box = [float(x) for x in line.split(' ')[:4]] + current_boxes.append(box) + continue + + with open(cache_file, 'wb') as f: + pickle.dump(boxes, f) + return boxes + + +def read_pred_file(filepath): + + with open(filepath, 'r') as f: + lines = f.readlines() + img_file = lines[0].rstrip('\n\r') + lines = lines[2:] + + boxes = np.array(list(map(lambda x: [float(a) for a in x.rstrip('\r\n').split(' ')], lines))).astype('float') + return img_file.split('/')[-1], boxes + + +def get_preds(pred_dir): + events = os.listdir(pred_dir) + boxes = dict() + pbar = tqdm.tqdm(events) + + for event in pbar: + pbar.set_description('Reading Predictions ') + event_dir = os.path.join(pred_dir, event) + event_images = os.listdir(event_dir) + current_event = dict() + for imgtxt in event_images: + imgname, _boxes = read_pred_file(os.path.join(event_dir, imgtxt)) + current_event[imgname.rstrip('.jpg')] = _boxes + boxes[event] = current_event + return boxes + + +def norm_score(pred): + """ norm score + pred {key: [[x1,y1,x2,y2,s]]} + """ + + max_score = -1 + min_score = 2 + + for _, k in pred.items(): + for _, v in k.items(): + if len(v) == 0: + continue + _min = np.min(v[:, -1]) + _max = np.max(v[:, -1]) + max_score = max(_max, max_score) + min_score = min(_min, min_score) + + diff = max_score - min_score + for _, k in pred.items(): + for _, v in k.items(): + if len(v) == 0: + continue + v[:, -1] = (v[:, -1] - min_score).astype(np.float64)/diff + return pred + + +def image_eval(pred, gt, ignore, iou_thresh, mpp): + """ single image evaluation + pred: Nx5 + gt: Nx4 + ignore: + """ + + + _pred = pred.copy() + _gt = gt.copy() + pred_recall = 
np.zeros(_pred.shape[0]) + recall_list = np.zeros(_gt.shape[0]) + proposal_list = np.ones(_pred.shape[0]) + + _pred[:, 2] = _pred[:, 2] + _pred[:, 0] + _pred[:, 3] = _pred[:, 3] + _pred[:, 1] + _gt[:, 2] = _gt[:, 2] + _gt[:, 0] + _gt[:, 3] = _gt[:, 3] + _gt[:, 1] + + gt_overlap_list = mpp.starmap(bbox_overlap, zip([_gt]*_pred.shape[0],[_pred[h] for h in range(_pred.shape[0])])) + + #use_cuda = True + #if use_cuda: + # _pred = torch.cuda.FloatTensor(_pred[:,:4]) + # _gt = torch.cuda.FloatTensor(_gt) + #else: + # _pred = torch.FloatTensor(_pred[:,:4]) + # _gt = torch.FloatTensor(_gt) + + #overlaps = jaccard(_pred, _gt).cpu().numpy() + #overlaps = compute_iou((_pred[:, :4]), (_gt)) + + #overlaps = bbox_overlaps(_pred, _gt) + + #if use_cuda: + # overlaps = overlaps.cpu().numpy() + #else: + # overlaps = overlaps.numpy() + + for h in range(_pred.shape[0]): + + #gt_overlap = overlaps[h] + #gt_overlap = bbox_overlap(_gt, _pred[h]) + gt_overlap = gt_overlap_list[h] + #if use_cuda: + # gt_overlap = gt_overlap.cpu().numpy() + #else: + # gt_overlap = gt_overlap.numpy() + + #max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax() + #gt_overlap = compute_iou(_gt, _pred[h, :4]) + #exit() + #exit() + #print ('overlap', gt_overlap) + max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax() + + if max_overlap >= iou_thresh: + if ignore[max_idx] == 0: + recall_list[max_idx] = -1 + proposal_list[h] = -1 + elif recall_list[max_idx] == 0: + recall_list[max_idx] = 1 + + r_keep_index = np.where(recall_list == 1)[0] + pred_recall[h] = len(r_keep_index) + + return pred_recall, proposal_list + + +def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall): + pr_info = np.zeros((thresh_num, 2)).astype('float') + fp = np.zeros((pred_info.shape[0],), dtype=np.int) + last_info = [-1, -1] + for t in range(thresh_num): + + thresh = 1 - (t+1)/thresh_num + r_index = np.where(pred_info[:, 4] >= thresh)[0] + if len(r_index) == 0: + pr_info[t, 0] = 0 + pr_info[t, 1] = 0 + else: + 
r_index = r_index[-1] + p_index = np.where(proposal_list[:r_index+1] == 1)[0] + pr_info[t, 0] = len(p_index) #valid pred number + pr_info[t, 1] = pred_recall[r_index] # valid gt number + + if t>0 and pr_info[t, 0] > pr_info[t-1,0] and pr_info[t, 1]==pr_info[t-1,1]: + fp[r_index] = 1 + #if thresh>=0.85: + # print(thresh, t, pr_info[t]) + #print(pr_info[:10,0]) + #print(pr_info[:10,1]) + return pr_info, fp + + +def dataset_pr_info(thresh_num, pr_curve, count_face): + _pr_curve = np.zeros((thresh_num, 2)) + for i in range(thresh_num): + #_pr_curve[i, 0] = round(pr_curve[i, 1] / pr_curve[i, 0], 4) + #_pr_curve[i, 1] = round(pr_curve[i, 1] / count_face, 4) + _pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0] + _pr_curve[i, 1] = pr_curve[i, 1] / count_face + return _pr_curve + + +def voc_ap(rec, prec): + + # correct AP calculation + # first append sentinel values at the end + #print ('rec:', rec) + #print ('pre:', prec) + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np_round(np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])) + return ap + + +def wider_evaluation(pred, gt_path, iou_thresh=0.5, debug=False): + #pred = get_preds(pred) + pred = norm_score(pred) + thresh_num = 1000 + #thresh_num = 2000 + facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list = get_gt_boxes(gt_path) + event_num = len(event_list) + settings = ['easy', 'medium', 'hard'] + setting_gts = [easy_gt_list, medium_gt_list, hard_gt_list] + from multiprocessing import Pool + #from multiprocessing.pool import ThreadPool + mpp = Pool(8) + aps = [-1.0, -1.0, -1.0] + meta = {} + #setting_id = 2 + print('') + for setting_id in range(3): + #for 
setting_id in range(1): + ta = datetime.datetime.now() + # different setting + #iou_th = 0.5 #+ 0.05 * idx + iou_th = iou_thresh + # different setting + gt_list = setting_gts[setting_id] + count_face = 0 + pr_curve = np.zeros((thresh_num, 2)).astype('float') + # [hard, medium, easy] + #pbar = tqdm.tqdm(range(event_num)) + #for i in pbar: + high_score_count = 0 + high_score_fp_count = 0 + for i in range(event_num): + #pbar.set_description('Processing {}'.format(settings[setting_id])) + event_name = str(event_list[i][0][0]) + img_list = file_list[i][0] + pred_list = pred[event_name] + sub_gt_list = gt_list[i][0] + # img_pr_info_list = np.zeros((len(img_list), thresh_num, 2)) + gt_bbx_list = facebox_list[i][0] + + for j in range(len(img_list)): + img_name = str(img_list[j][0][0]) + pred_info = pred_list[img_name] + + gt_boxes = gt_bbx_list[j][0].astype('float') + keep_index = sub_gt_list[j][0] + #print ('keep_index', keep_index) + count_face += len(keep_index) + + + if len(gt_boxes) == 0 or len(pred_info) == 0: + continue + #ignore = np.zeros(gt_boxes.shape[0]) + #if len(keep_index) != 0: + # ignore[keep_index-1] = 1 + #assert len(keep_index)>0 + ignore = np.zeros(gt_boxes.shape[0], dtype=np.int) + if len(keep_index) != 0: + ignore[keep_index-1] = 1 + pred_info = np_round(pred_info,1) + #print('ignore:', len(ignore), len(np.where(ignore==1)[0])) + #pred_sort_idx= np.argsort(pred_info[:,4]) + #pred_info = pred_info[pred_sort_idx][::-1] + #print ('pred_info', pred_info[:20, 4]) + #exit() + + + gt_boxes = np_round(gt_boxes) + #ignore = np_round(ignore) + pred_recall, proposal_list = image_eval(pred_info, gt_boxes, ignore, iou_th, mpp) + #print(pred_recall[:10], proposal_list[:10]) + #print('1 stage', pred_recall, proposal_list) + #print(pred_info.shape, pred_recall.shape) + + _img_pr_info, fp = img_pr_info(thresh_num, pred_info, proposal_list, pred_recall) + #for f in range(pred_info.shape[0]): + # _score = pred_info[f,4] + # if _score<0.929: + # break + # 
high_score_count+=1 + # if fp[f]==1: + # w = pred_info[f, 2] + # h = pred_info[f, 3] + # print('fp:', event_name, img_name, _score, w, h) + # high_score_fp_count+=1 + pr_curve += _img_pr_info + #print ('pr_curve', pr_curve, count_face) + pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face) + #print(pr_curve.shape) + + propose = pr_curve[:, 0] + recall = pr_curve[:, 1] + #for f in range(thresh_num): + # print('R-P:', recall[f], propose[f]) + for srecall in np.arange(0.1, 1.0001, 0.1): + rindex = len(np.where(recall<=srecall)[0])-1 + rthresh = 1.0 - float(rindex)/thresh_num + print('Recall-Precision-Thresh:', recall[rindex], propose[rindex], rthresh) + + ap = voc_ap(recall, propose) + aps[setting_id] = ap + tb = datetime.datetime.now() + #print('high score count:', high_score_count) + #print('high score fp count:', high_score_fp_count) + print('%s cost %.4f seconds, ap: %.5f'%(settings[setting_id], (tb-ta).total_seconds(), ap)) + + return aps + +def get_widerface_gts(gt_path): + facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list = get_gt_boxes(gt_path) + event_num = len(event_list) + + settings = ['easy', 'medium', 'hard'] + setting_gts = [easy_gt_list, medium_gt_list, hard_gt_list] + all_results = [] + for setting_id in range(3): + results = {} + gt_list = setting_gts[setting_id] + count_face = 0 + # [hard, medium, easy] + #pbar = tqdm.tqdm(range(event_num)) + #for i in pbar: + for i in range(event_num): + #pbar.set_description('Processing {}'.format(settings[setting_id])) + event_name = str(event_list[i][0][0]) + img_list = file_list[i][0] + sub_gt_list = gt_list[i][0] + # img_pr_info_list = np.zeros((len(img_list), thresh_num, 2)) + gt_bbx_list = facebox_list[i][0] + results[event_name] = {} + + for j in range(len(img_list)): + + gt_boxes = gt_bbx_list[j][0].astype('float').copy() + gt_boxes[:,2] += gt_boxes[:,0] + gt_boxes[:,3] += gt_boxes[:,1] + keep_index = sub_gt_list[j][0].copy() + #print ('keep_index', keep_index.shape) + 
count_face += len(keep_index) + + + if len(gt_boxes) == 0: + results[event_name][str(img_list[j][0][0])] = np.empty( (0,4) ) + continue + keep_index -= 1 + keep_index = keep_index.flatten() + #ignore = np.zeros(gt_boxes.shape[0]) + #if len(keep_index) != 0: + # ignore[keep_index-1] = 1 + #assert len(keep_index)>0 + #ignore = np.zeros(gt_boxes.shape[0], dtype=np.int) + #if len(keep_index) != 0: + # ignore[keep_index-1] = 1 + #print('ignore:', len(ignore), len(np.where(ignore==1)[0])) + #pred_sort_idx= np.argsort(pred_info[:,4]) + #pred_info = pred_info[pred_sort_idx][::-1] + #print ('pred_info', pred_info[:20, 4]) + #exit() + #if setting_id==2 and len(keep_index)=v1.0.4') + register_extra_symbolics(opset_version) + + return model, tensor_data + + +def build_model_from_cfg(config_path, checkpoint_path): + """Build a model from config and load the given checkpoint. + + Args: + config_path (str): the OpenMMLab config for the model we want to + export to ONNX + checkpoint_path (str): Path to the corresponding checkpoint + + Returns: + torch.nn.Module: the built model + """ + from mmdet.models import build_detector + + cfg = mmcv.Config.fromfile(config_path) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + cfg.model.pretrained = None + cfg.data.test.test_mode = True + + # build the model + model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) + load_checkpoint(model, checkpoint_path, map_location='cpu') + model.cpu().eval() + return model + + +def preprocess_example_input(input_config): + """Prepare an example input image for ``generate_inputs_and_wrap_model``. + + Args: + input_config (dict): customized config describing the example input. + + Returns: + tuple: (one_img, one_meta), tensor of the example input image and \ + meta information for the example input image. 
+ + Examples: + >>> from mmdet.core.export import preprocess_example_input + >>> input_config = { + >>> 'input_shape': (1,3,224,224), + >>> 'input_path': 'demo/demo.jpg', + >>> 'normalize_cfg': { + >>> 'mean': (123.675, 116.28, 103.53), + >>> 'std': (58.395, 57.12, 57.375) + >>> } + >>> } + >>> one_img, one_meta = preprocess_example_input(input_config) + >>> print(one_img.shape) + torch.Size([1, 3, 224, 224]) + >>> print(one_meta) + {'img_shape': (224, 224, 3), + 'ori_shape': (224, 224, 3), + 'pad_shape': (224, 224, 3), + 'filename': '.png', + 'scale_factor': 1.0, + 'flip': False} + """ + input_path = input_config['input_path'] + input_shape = input_config['input_shape'] + one_img = mmcv.imread(input_path) + one_img = mmcv.imresize(one_img, input_shape[2:][::-1]) + show_img = one_img.copy() + if 'normalize_cfg' in input_config.keys(): + normalize_cfg = input_config['normalize_cfg'] + mean = np.array(normalize_cfg['mean'], dtype=np.float32) + std = np.array(normalize_cfg['std'], dtype=np.float32) + one_img = mmcv.imnormalize(one_img, mean, std) + one_img = one_img.transpose(2, 0, 1) + one_img = torch.from_numpy(one_img).unsqueeze(0).float().requires_grad_( + True) + (_, C, H, W) = input_shape + one_meta = { + 'img_shape': (H, W, C), + 'ori_shape': (H, W, C), + 'pad_shape': (H, W, C), + 'filename': '.png', + 'scale_factor': 1.0, + 'flip': False, + 'show_img': show_img, + } + + return one_img, one_meta diff --git a/insightface/detection/scrfd/mmdet/core/fp16/__init__.py b/insightface/detection/scrfd/mmdet/core/fp16/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..0a68c28b9014f3452c661eaa1daab08153da59f9 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/fp16/__init__.py @@ -0,0 +1,8 @@ +from .deprecated_fp16_utils import \ + DeprecatedFp16OptimizerHook as Fp16OptimizerHook +from .deprecated_fp16_utils import deprecated_auto_fp16 as auto_fp16 +from .deprecated_fp16_utils import deprecated_force_fp32 as force_fp32 +from 
.deprecated_fp16_utils import \ + deprecated_wrap_fp16_model as wrap_fp16_model + +__all__ = ['auto_fp16', 'force_fp32', 'Fp16OptimizerHook', 'wrap_fp16_model'] diff --git a/insightface/detection/scrfd/mmdet/core/fp16/deprecated_fp16_utils.py b/insightface/detection/scrfd/mmdet/core/fp16/deprecated_fp16_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..1b15b047e0b40ab494037449465bda1de2c8ecf7 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/fp16/deprecated_fp16_utils.py @@ -0,0 +1,47 @@ +import warnings + +from mmcv.runner import (Fp16OptimizerHook, auto_fp16, force_fp32, + wrap_fp16_model) + + +class DeprecatedFp16OptimizerHook(Fp16OptimizerHook): + """A wrapper class for the FP16 optimizer hook. This class wraps + :class:`Fp16OptimizerHook` in `mmcv.runner` and shows a warning that the + :class:`Fp16OptimizerHook` from `mmdet.core` will be deprecated. + + Refer to :class:`Fp16OptimizerHook` in `mmcv.runner` for more details. + + Args: + loss_scale (float): Scale factor multiplied with loss. + """ + + def __init__(*args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'Importing Fp16OptimizerHook from "mmdet.core" will be ' + 'deprecated in the future. Please import them from "mmcv.runner" ' + 'instead') + + +def deprecated_auto_fp16(*args, **kwargs): + warnings.warn( + 'Importing auto_fp16 from "mmdet.core" will be ' + 'deprecated in the future. Please import them from "mmcv.runner" ' + 'instead') + return auto_fp16(*args, **kwargs) + + +def deprecated_force_fp32(*args, **kwargs): + warnings.warn( + 'Importing force_fp32 from "mmdet.core" will be ' + 'deprecated in the future. Please import them from "mmcv.runner" ' + 'instead') + return force_fp32(*args, **kwargs) + + +def deprecated_wrap_fp16_model(*args, **kwargs): + warnings.warn( + 'Importing wrap_fp16_model from "mmdet.core" will be ' + 'deprecated in the future. 
Please import them from "mmcv.runner" ' + 'instead') + wrap_fp16_model(*args, **kwargs) diff --git a/insightface/detection/scrfd/mmdet/core/mask/__init__.py b/insightface/detection/scrfd/mmdet/core/mask/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..ab1e88bc686d5c2fe72b3114cb2b3e372e73a0f8 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/mask/__init__.py @@ -0,0 +1,8 @@ +from .mask_target import mask_target +from .structures import BaseInstanceMasks, BitmapMasks, PolygonMasks +from .utils import encode_mask_results, split_combined_polys + +__all__ = [ + 'split_combined_polys', 'mask_target', 'BaseInstanceMasks', 'BitmapMasks', + 'PolygonMasks', 'encode_mask_results' +] diff --git a/insightface/detection/scrfd/mmdet/core/mask/mask_target.py b/insightface/detection/scrfd/mmdet/core/mask/mask_target.py new file mode 100755 index 0000000000000000000000000000000000000000..18e423507086e8bc0ba36ff01138a0808a0735b2 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/mask/mask_target.py @@ -0,0 +1,62 @@ +import numpy as np +import torch +from torch.nn.modules.utils import _pair + + +def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list, + cfg): + """Compute mask target for positive proposals in multiple images. + + Args: + pos_proposals_list (list[Tensor]): Positive proposals in multiple + images. + pos_assigned_gt_inds_list (list[Tensor]): Assigned GT indices for each + positive proposals. + gt_masks_list (list[:obj:`BaseInstanceMasks`]): Ground truth masks of + each image. + cfg (dict): Config dict that specifies the mask size. + + Returns: + list[Tensor]: Mask target of each image. 
+ """ + cfg_list = [cfg for _ in range(len(pos_proposals_list))] + mask_targets = map(mask_target_single, pos_proposals_list, + pos_assigned_gt_inds_list, gt_masks_list, cfg_list) + mask_targets = list(mask_targets) + if len(mask_targets) > 0: + mask_targets = torch.cat(mask_targets) + return mask_targets + + +def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg): + """Compute mask target for each positive proposal in the image. + + Args: + pos_proposals (Tensor): Positive proposals. + pos_assigned_gt_inds (Tensor): Assigned GT inds of positive proposals. + gt_masks (:obj:`BaseInstanceMasks`): GT masks in the format of Bitmap + or Polygon. + cfg (dict): Config dict that indicate the mask size. + + Returns: + Tensor: Mask target of each positive proposals in the image. + """ + device = pos_proposals.device + mask_size = _pair(cfg.mask_size) + num_pos = pos_proposals.size(0) + if num_pos > 0: + proposals_np = pos_proposals.cpu().numpy() + maxh, maxw = gt_masks.height, gt_masks.width + proposals_np[:, [0, 2]] = np.clip(proposals_np[:, [0, 2]], 0, maxw) + proposals_np[:, [1, 3]] = np.clip(proposals_np[:, [1, 3]], 0, maxh) + pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() + + mask_targets = gt_masks.crop_and_resize( + proposals_np, mask_size, device=device, + inds=pos_assigned_gt_inds).to_ndarray() + + mask_targets = torch.from_numpy(mask_targets).float().to(device) + else: + mask_targets = pos_proposals.new_zeros((0, ) + mask_size) + + return mask_targets diff --git a/insightface/detection/scrfd/mmdet/core/mask/structures.py b/insightface/detection/scrfd/mmdet/core/mask/structures.py new file mode 100755 index 0000000000000000000000000000000000000000..6f5da5547f27045da43894c7c08f8c75b71c82d8 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/mask/structures.py @@ -0,0 +1,827 @@ +from abc import ABCMeta, abstractmethod + +import cv2 +import mmcv +import numpy as np +import pycocotools.mask as maskUtils +import torch +from 
mmcv.ops.roi_align import roi_align + + +class BaseInstanceMasks(metaclass=ABCMeta): + """Base class for instance masks.""" + + @abstractmethod + def rescale(self, scale, interpolation='nearest'): + """Rescale masks as large as possible while keeping the aspect ratio. + For details can refer to `mmcv.imrescale`. + + Args: + scale (tuple[int]): The maximum size (h, w) of rescaled mask. + interpolation (str): Same as :func:`mmcv.imrescale`. + + Returns: + BaseInstanceMasks: The rescaled masks. + """ + pass + + @abstractmethod + def resize(self, out_shape, interpolation='nearest'): + """Resize masks to the given out_shape. + + Args: + out_shape: Target (h, w) of resized mask. + interpolation (str): See :func:`mmcv.imresize`. + + Returns: + BaseInstanceMasks: The resized masks. + """ + pass + + @abstractmethod + def flip(self, flip_direction='horizontal'): + """Flip masks alone the given direction. + + Args: + flip_direction (str): Either 'horizontal' or 'vertical'. + + Returns: + BaseInstanceMasks: The flipped masks. + """ + pass + + @abstractmethod + def pad(self, out_shape, pad_val): + """Pad masks to the given size of (h, w). + + Args: + out_shape (tuple[int]): Target (h, w) of padded mask. + pad_val (int): The padded value. + + Returns: + BaseInstanceMasks: The padded masks. + """ + pass + + @abstractmethod + def crop(self, bbox): + """Crop each mask by the given bbox. + + Args: + bbox (ndarray): Bbox in format [x1, y1, x2, y2], shape (4, ). + + Return: + BaseInstanceMasks: The cropped masks. + """ + pass + + @abstractmethod + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device, + interpolation='bilinear'): + """Crop and resize masks by the given bboxes. + + This function is mainly used in mask targets computation. 
+ It firstly align mask to bboxes by assigned_inds, then crop mask by the + assigned bbox and resize to the size of (mask_h, mask_w) + + Args: + bboxes (Tensor): Bboxes in format [x1, y1, x2, y2], shape (N, 4) + out_shape (tuple[int]): Target (h, w) of resized mask + inds (ndarray): Indexes to assign masks to each bbox + device (str): Device of bboxes + interpolation (str): See `mmcv.imresize` + + Return: + BaseInstanceMasks: the cropped and resized masks. + """ + pass + + @abstractmethod + def expand(self, expanded_h, expanded_w, top, left): + """see :class:`Expand`.""" + pass + + @property + @abstractmethod + def areas(self): + """ndarray: areas of each instance.""" + pass + + @abstractmethod + def to_ndarray(self): + """Convert masks to the format of ndarray. + + Return: + ndarray: Converted masks in the format of ndarray. + """ + pass + + @abstractmethod + def to_tensor(self, dtype, device): + """Convert masks to the format of Tensor. + + Args: + dtype (str): Dtype of converted mask. + device (torch.device): Device of converted masks. + + Returns: + Tensor: Converted masks in the format of Tensor. + """ + pass + + @abstractmethod + def translate(self, + out_shape, + offset, + direction='horizontal', + fill_val=0, + interpolation='bilinear'): + """Translate the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + offset (int | float): The offset for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + fill_val (int | float): Border value. Default 0. + interpolation (str): Same as :func:`mmcv.imtranslate`. + + Returns: + Translated masks. + """ + pass + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + magnitude (int | float): The magnitude used for shear. + direction (str): The shear direction, either "horizontal" + or "vertical". 
+ border_value (int | tuple[int]): Value used in case of a + constant border. Default 0. + interpolation (str): Same as in :func:`mmcv.imshear`. + + Returns: + ndarray: Sheared masks. + """ + pass + + @abstractmethod + def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0): + """Rotate the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + angle (int | float): Rotation angle in degrees. Positive values + mean counter-clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the + rotation in source image. If not specified, the center of + the image will be used. + scale (int | float): Isotropic scale factor. + fill_val (int | float): Border value. Default 0 for masks. + + Returns: + Rotated masks. + """ + pass + + +class BitmapMasks(BaseInstanceMasks): + """This class represents masks in the form of bitmaps. + + Args: + masks (ndarray): ndarray of masks in shape (N, H, W), where N is + the number of objects. + height (int): height of masks + width (int): width of masks + """ + + def __init__(self, masks, height, width): + self.height = height + self.width = width + if len(masks) == 0: + self.masks = np.empty((0, self.height, self.width), dtype=np.uint8) + else: + assert isinstance(masks, (list, np.ndarray)) + if isinstance(masks, list): + assert isinstance(masks[0], np.ndarray) + assert masks[0].ndim == 2 # (H, W) + else: + assert masks.ndim == 3 # (N, H, W) + + self.masks = np.stack(masks).reshape(-1, height, width) + assert self.masks.shape[1] == self.height + assert self.masks.shape[2] == self.width + + def __getitem__(self, index): + """Index the BitmapMask. + + Args: + index (int | ndarray): Indices in the format of integer or ndarray. + + Returns: + :obj:`BitmapMasks`: Indexed bitmap masks. 
+ """ + masks = self.masks[index].reshape(-1, self.height, self.width) + return BitmapMasks(masks, self.height, self.width) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'num_masks={len(self.masks)}, ' + s += f'height={self.height}, ' + s += f'width={self.width})' + return s + + def __len__(self): + """Number of masks.""" + return len(self.masks) + + def rescale(self, scale, interpolation='nearest'): + """See :func:`BaseInstanceMasks.rescale`.""" + if len(self.masks) == 0: + new_w, new_h = mmcv.rescale_size((self.width, self.height), scale) + rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8) + else: + rescaled_masks = np.stack([ + mmcv.imrescale(mask, scale, interpolation=interpolation) + for mask in self.masks + ]) + height, width = rescaled_masks.shape[1:] + return BitmapMasks(rescaled_masks, height, width) + + def resize(self, out_shape, interpolation='nearest'): + """See :func:`BaseInstanceMasks.resize`.""" + if len(self.masks) == 0: + resized_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + resized_masks = np.stack([ + mmcv.imresize(mask, out_shape, interpolation=interpolation) + for mask in self.masks + ]) + return BitmapMasks(resized_masks, *out_shape) + + def flip(self, flip_direction='horizontal'): + """See :func:`BaseInstanceMasks.flip`.""" + assert flip_direction in ('horizontal', 'vertical', 'diagonal') + + if len(self.masks) == 0: + flipped_masks = self.masks + else: + flipped_masks = np.stack([ + mmcv.imflip(mask, direction=flip_direction) + for mask in self.masks + ]) + return BitmapMasks(flipped_masks, self.height, self.width) + + def pad(self, out_shape, pad_val=0): + """See :func:`BaseInstanceMasks.pad`.""" + if len(self.masks) == 0: + padded_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + padded_masks = np.stack([ + mmcv.impad(mask, shape=out_shape, pad_val=pad_val) + for mask in self.masks + ]) + return BitmapMasks(padded_masks, *out_shape) 
+ + def crop(self, bbox): + """See :func:`BaseInstanceMasks.crop`.""" + assert isinstance(bbox, np.ndarray) + assert bbox.ndim == 1 + + # clip the boundary + bbox = bbox.copy() + bbox[0::2] = np.clip(bbox[0::2], 0, self.width) + bbox[1::2] = np.clip(bbox[1::2], 0, self.height) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + + if len(self.masks) == 0: + cropped_masks = np.empty((0, h, w), dtype=np.uint8) + else: + cropped_masks = self.masks[:, y1:y1 + h, x1:x1 + w] + return BitmapMasks(cropped_masks, h, w) + + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device='cpu', + interpolation='bilinear'): + """See :func:`BaseInstanceMasks.crop_and_resize`.""" + if len(self.masks) == 0: + empty_masks = np.empty((0, *out_shape), dtype=np.uint8) + return BitmapMasks(empty_masks, *out_shape) + + # convert bboxes to tensor + if isinstance(bboxes, np.ndarray): + bboxes = torch.from_numpy(bboxes).to(device=device) + if isinstance(inds, np.ndarray): + inds = torch.from_numpy(inds).to(device=device) + + num_bbox = bboxes.shape[0] + fake_inds = torch.arange( + num_bbox, device=device).to(dtype=bboxes.dtype)[:, None] + rois = torch.cat([fake_inds, bboxes], dim=1) # Nx5 + rois = rois.to(device=device) + if num_bbox > 0: + gt_masks_th = torch.from_numpy(self.masks).to(device).index_select( + 0, inds).to(dtype=rois.dtype) + targets = roi_align(gt_masks_th[:, None, :, :], rois, out_shape, + 1.0, 0, 'avg', True).squeeze(1) + resized_masks = (targets >= 0.5).cpu().numpy() + else: + resized_masks = [] + return BitmapMasks(resized_masks, *out_shape) + + def expand(self, expanded_h, expanded_w, top, left): + """See :func:`BaseInstanceMasks.expand`.""" + if len(self.masks) == 0: + expanded_mask = np.empty((0, expanded_h, expanded_w), + dtype=np.uint8) + else: + expanded_mask = np.zeros((len(self), expanded_h, expanded_w), + dtype=np.uint8) + expanded_mask[:, top:top + self.height, + left:left + self.width] = self.masks + return 
BitmapMasks(expanded_mask, expanded_h, expanded_w) + + def translate(self, + out_shape, + offset, + direction='horizontal', + fill_val=0, + interpolation='bilinear'): + """Translate the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + offset (int | float): The offset for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + fill_val (int | float): Border value. Default 0 for masks. + interpolation (str): Same as :func:`mmcv.imtranslate`. + + Returns: + BitmapMasks: Translated BitmapMasks. + """ + if len(self.masks) == 0: + translated_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + translated_masks = mmcv.imtranslate( + self.masks.transpose((1, 2, 0)), + offset, + direction, + border_value=fill_val, + interpolation=interpolation) + if translated_masks.ndim == 2: + translated_masks = translated_masks[:, :, None] + translated_masks = translated_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(translated_masks, *out_shape) + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + magnitude (int | float): The magnitude used for shear. + direction (str): The shear direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as in :func:`mmcv.imshear`. + + Returns: + BitmapMasks: The sheared masks. 
+ """ + if len(self.masks) == 0: + sheared_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + sheared_masks = mmcv.imshear( + self.masks.transpose((1, 2, 0)), + magnitude, + direction, + border_value=border_value, + interpolation=interpolation) + if sheared_masks.ndim == 2: + sheared_masks = sheared_masks[:, :, None] + sheared_masks = sheared_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(sheared_masks, *out_shape) + + def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0): + """Rotate the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + angle (int | float): Rotation angle in degrees. Positive values + mean counter-clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the + rotation in source image. If not specified, the center of + the image will be used. + scale (int | float): Isotropic scale factor. + fill_val (int | float): Border value. Default 0 for masks. + + Returns: + BitmapMasks: Rotated BitmapMasks. + """ + if len(self.masks) == 0: + rotated_masks = np.empty((0, *out_shape), dtype=self.masks.dtype) + else: + rotated_masks = mmcv.imrotate( + self.masks.transpose((1, 2, 0)), + angle, + center=center, + scale=scale, + border_value=fill_val) + if rotated_masks.ndim == 2: + # case when only one mask, (h, w) + rotated_masks = rotated_masks[:, :, None] # (h, w, 1) + rotated_masks = rotated_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(rotated_masks, *out_shape) + + @property + def areas(self): + """See :py:attr:`BaseInstanceMasks.areas`.""" + return self.masks.sum((1, 2)) + + def to_ndarray(self): + """See :func:`BaseInstanceMasks.to_ndarray`.""" + return self.masks + + def to_tensor(self, dtype, device): + """See :func:`BaseInstanceMasks.to_tensor`.""" + return torch.tensor(self.masks, dtype=dtype, device=device) + + +class PolygonMasks(BaseInstanceMasks): + """This class represents masks in the form of polygons. 
+ + Polygons is a list of three levels. The first level of the list + corresponds to objects, the second level to the polys that compose the + object, the third level to the poly coordinates + + Args: + masks (list[list[ndarray]]): The first level of the list + corresponds to objects, the second level to the polys that + compose the object, the third level to the poly coordinates + height (int): height of masks + width (int): width of masks + """ + + def __init__(self, masks, height, width): + assert isinstance(masks, list) + if len(masks) > 0: + assert isinstance(masks[0], list) + assert isinstance(masks[0][0], np.ndarray) + + self.height = height + self.width = width + self.masks = masks + + def __getitem__(self, index): + """Index the polygon masks. + + Args: + index (ndarray | List): The indices. + + Returns: + :obj:`PolygonMasks`: The indexed polygon masks. + """ + if isinstance(index, np.ndarray): + index = index.tolist() + if isinstance(index, list): + masks = [self.masks[i] for i in index] + else: + try: + masks = self.masks[index] + except Exception: + raise ValueError( + f'Unsupported input of type {type(index)} for indexing!') + if len(masks) and isinstance(masks[0], np.ndarray): + masks = [masks] # ensure a list of three levels + return PolygonMasks(masks, self.height, self.width) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'num_masks={len(self.masks)}, ' + s += f'height={self.height}, ' + s += f'width={self.width})' + return s + + def __len__(self): + """Number of masks.""" + return len(self.masks) + + def rescale(self, scale, interpolation=None): + """see :func:`BaseInstanceMasks.rescale`""" + new_w, new_h = mmcv.rescale_size((self.width, self.height), scale) + if len(self.masks) == 0: + rescaled_masks = PolygonMasks([], new_h, new_w) + else: + rescaled_masks = self.resize((new_h, new_w)) + return rescaled_masks + + def resize(self, out_shape, interpolation=None): + """see 
:func:`BaseInstanceMasks.resize`""" + if len(self.masks) == 0: + resized_masks = PolygonMasks([], *out_shape) + else: + h_scale = out_shape[0] / self.height + w_scale = out_shape[1] / self.width + resized_masks = [] + for poly_per_obj in self.masks: + resized_poly = [] + for p in poly_per_obj: + p = p.copy() + p[0::2] *= w_scale + p[1::2] *= h_scale + resized_poly.append(p) + resized_masks.append(resized_poly) + resized_masks = PolygonMasks(resized_masks, *out_shape) + return resized_masks + + def flip(self, flip_direction='horizontal'): + """see :func:`BaseInstanceMasks.flip`""" + assert flip_direction in ('horizontal', 'vertical', 'diagonal') + if len(self.masks) == 0: + flipped_masks = PolygonMasks([], self.height, self.width) + else: + flipped_masks = [] + for poly_per_obj in self.masks: + flipped_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + if flip_direction == 'horizontal': + p[0::2] = self.width - p[0::2] + elif flip_direction == 'vertical': + p[1::2] = self.height - p[1::2] + else: + p[0::2] = self.width - p[0::2] + p[1::2] = self.height - p[1::2] + flipped_poly_per_obj.append(p) + flipped_masks.append(flipped_poly_per_obj) + flipped_masks = PolygonMasks(flipped_masks, self.height, + self.width) + return flipped_masks + + def crop(self, bbox): + """see :func:`BaseInstanceMasks.crop`""" + assert isinstance(bbox, np.ndarray) + assert bbox.ndim == 1 + + # clip the boundary + bbox = bbox.copy() + bbox[0::2] = np.clip(bbox[0::2], 0, self.width) + bbox[1::2] = np.clip(bbox[1::2], 0, self.height) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + + if len(self.masks) == 0: + cropped_masks = PolygonMasks([], h, w) + else: + cropped_masks = [] + for poly_per_obj in self.masks: + cropped_poly_per_obj = [] + for p in poly_per_obj: + # pycocotools will clip the boundary + p = p.copy() + p[0::2] -= bbox[0] + p[1::2] -= bbox[1] + cropped_poly_per_obj.append(p) + cropped_masks.append(cropped_poly_per_obj) + cropped_masks = 
PolygonMasks(cropped_masks, h, w) + return cropped_masks + + def pad(self, out_shape, pad_val=0): + """padding has no effect on polygons`""" + return PolygonMasks(self.masks, *out_shape) + + def expand(self, *args, **kwargs): + """TODO: Add expand for polygon""" + raise NotImplementedError + + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device='cpu', + interpolation='bilinear'): + """see :func:`BaseInstanceMasks.crop_and_resize`""" + out_h, out_w = out_shape + if len(self.masks) == 0: + return PolygonMasks([], out_h, out_w) + + resized_masks = [] + for i in range(len(bboxes)): + mask = self.masks[inds[i]] + bbox = bboxes[i, :] + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + h_scale = out_h / max(h, 0.1) # avoid too large scale + w_scale = out_w / max(w, 0.1) + + resized_mask = [] + for p in mask: + p = p.copy() + # crop + # pycocotools will clip the boundary + p[0::2] -= bbox[0] + p[1::2] -= bbox[1] + + # resize + p[0::2] *= w_scale + p[1::2] *= h_scale + resized_mask.append(p) + resized_masks.append(resized_mask) + return PolygonMasks(resized_masks, *out_shape) + + def translate(self, + out_shape, + offset, + direction='horizontal', + fill_val=None, + interpolation=None): + """Translate the PolygonMasks.""" + assert fill_val is None or fill_val == 0, 'Here fill_val is not '\ + f'used, and defaultly should be None or 0. got {fill_val}.' 
+ if len(self.masks) == 0: + translated_masks = PolygonMasks([], *out_shape) + else: + translated_masks = [] + for poly_per_obj in self.masks: + translated_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + if direction == 'horizontal': + p[0::2] = np.clip(p[0::2] + offset, 0, out_shape[1]) + elif direction == 'vertical': + p[1::2] = np.clip(p[1::2] + offset, 0, out_shape[0]) + translated_poly_per_obj.append(p) + translated_masks.append(translated_poly_per_obj) + translated_masks = PolygonMasks(translated_masks, *out_shape) + return translated_masks + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """See :func:`BaseInstanceMasks.shear`.""" + if len(self.masks) == 0: + sheared_masks = PolygonMasks([], *out_shape) + else: + sheared_masks = [] + if direction == 'horizontal': + shear_matrix = np.stack([[1, magnitude], + [0, 1]]).astype(np.float32) + elif direction == 'vertical': + shear_matrix = np.stack([[1, 0], [magnitude, + 1]]).astype(np.float32) + for poly_per_obj in self.masks: + sheared_poly = [] + for p in poly_per_obj: + p = np.stack([p[0::2], p[1::2]], axis=0) # [2, n] + new_coords = np.matmul(shear_matrix, p) # [2, n] + new_coords[0, :] = np.clip(new_coords[0, :], 0, + out_shape[1]) + new_coords[1, :] = np.clip(new_coords[1, :], 0, + out_shape[0]) + sheared_poly.append( + new_coords.transpose((1, 0)).reshape(-1)) + sheared_masks.append(sheared_poly) + sheared_masks = PolygonMasks(sheared_masks, *out_shape) + return sheared_masks + + def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0): + """See :func:`BaseInstanceMasks.rotate`.""" + if len(self.masks) == 0: + rotated_masks = PolygonMasks([], *out_shape) + else: + rotated_masks = [] + rotate_matrix = cv2.getRotationMatrix2D(center, -angle, scale) + for poly_per_obj in self.masks: + rotated_poly = [] + for p in poly_per_obj: + p = p.copy() + coords = np.stack([p[0::2], p[1::2]], axis=1) # [n, 2] + # pad 1 to 
convert from format [x, y] to homogeneous + # coordinates format [x, y, 1] + coords = np.concatenate( + (coords, np.ones((coords.shape[0], 1), coords.dtype)), + axis=1) # [n, 3] + rotated_coords = np.matmul( + rotate_matrix[None, :, :], + coords[:, :, None])[..., 0] # [n, 2, 1] -> [n, 2] + rotated_coords[:, 0] = np.clip(rotated_coords[:, 0], 0, + out_shape[1]) + rotated_coords[:, 1] = np.clip(rotated_coords[:, 1], 0, + out_shape[0]) + rotated_poly.append(rotated_coords.reshape(-1)) + rotated_masks.append(rotated_poly) + rotated_masks = PolygonMasks(rotated_masks, *out_shape) + return rotated_masks + + def to_bitmap(self): + """convert polygon masks to bitmap masks.""" + bitmap_masks = self.to_ndarray() + return BitmapMasks(bitmap_masks, self.height, self.width) + + @property + def areas(self): + """Compute areas of masks. + + This func is modified from `detectron2 + `_. + The function only works with Polygons using the shoelace formula. + + Return: + ndarray: areas of each instance + """ # noqa: W501 + area = [] + for polygons_per_obj in self.masks: + area_per_obj = 0 + for p in polygons_per_obj: + area_per_obj += self._polygon_area(p[0::2], p[1::2]) + area.append(area_per_obj) + return np.asarray(area) + + def _polygon_area(self, x, y): + """Compute the area of a component of a polygon. 
+ + Using the shoelace formula: + https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates + + Args: + x (ndarray): x coordinates of the component + y (ndarray): y coordinates of the component + + Return: + float: the are of the component + """ # noqa: 501 + return 0.5 * np.abs( + np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))) + + def to_ndarray(self): + """Convert masks to the format of ndarray.""" + if len(self.masks) == 0: + return np.empty((0, self.height, self.width), dtype=np.uint8) + bitmap_masks = [] + for poly_per_obj in self.masks: + bitmap_masks.append( + polygon_to_bitmap(poly_per_obj, self.height, self.width)) + return np.stack(bitmap_masks) + + def to_tensor(self, dtype, device): + """See :func:`BaseInstanceMasks.to_tensor`.""" + if len(self.masks) == 0: + return torch.empty((0, self.height, self.width), + dtype=dtype, + device=device) + ndarray_masks = self.to_ndarray() + return torch.tensor(ndarray_masks, dtype=dtype, device=device) + + +def polygon_to_bitmap(polygons, height, width): + """Convert masks from the form of polygons to bitmaps. + + Args: + polygons (list[ndarray]): masks in polygon representation + height (int): mask height + width (int): mask width + + Return: + ndarray: the converted masks in bitmap representation + """ + rles = maskUtils.frPyObjects(polygons, height, width) + rle = maskUtils.merge(rles) + bitmap_mask = maskUtils.decode(rle).astype(np.bool) + return bitmap_mask diff --git a/insightface/detection/scrfd/mmdet/core/mask/utils.py b/insightface/detection/scrfd/mmdet/core/mask/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..c88208291ab2a605bee9fe6c1a28a443b74c6372 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/mask/utils.py @@ -0,0 +1,63 @@ +import mmcv +import numpy as np +import pycocotools.mask as mask_util + + +def split_combined_polys(polys, poly_lens, polys_per_mask): + """Split the combined 1-D polys into masks. 
+ + A mask is represented as a list of polys, and a poly is represented as + a 1-D array. In dataset, all masks are concatenated into a single 1-D + tensor. Here we need to split the tensor into original representations. + + Args: + polys (list): a list (length = image num) of 1-D tensors + poly_lens (list): a list (length = image num) of poly length + polys_per_mask (list): a list (length = image num) of poly number + of each mask + + Returns: + list: a list (length = image num) of list (length = mask num) of \ + list (length = poly num) of numpy array. + """ + mask_polys_list = [] + for img_id in range(len(polys)): + polys_single = polys[img_id] + polys_lens_single = poly_lens[img_id].tolist() + polys_per_mask_single = polys_per_mask[img_id].tolist() + + split_polys = mmcv.slice_list(polys_single, polys_lens_single) + mask_polys = mmcv.slice_list(split_polys, polys_per_mask_single) + mask_polys_list.append(mask_polys) + return mask_polys_list + + +# TODO: move this function to more proper place +def encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. + + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + + Returns: + list | tuple: RLE encoded mask. 
+ """ + if isinstance(mask_results, tuple): # mask scoring + cls_segms, cls_mask_scores = mask_results + else: + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [[] for _ in range(num_classes)] + for i in range(len(cls_segms)): + for cls_segm in cls_segms[i]: + encoded_mask_results[i].append( + mask_util.encode( + np.array( + cls_segm[:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + if isinstance(mask_results, tuple): + return encoded_mask_results, cls_mask_scores + else: + return encoded_mask_results diff --git a/insightface/detection/scrfd/mmdet/core/post_processing/__init__.py b/insightface/detection/scrfd/mmdet/core/post_processing/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..880b3f06609b050aae163b2e38088c1ee4aa0998 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/post_processing/__init__.py @@ -0,0 +1,8 @@ +from .bbox_nms import fast_nms, multiclass_nms +from .merge_augs import (merge_aug_bboxes, merge_aug_masks, + merge_aug_proposals, merge_aug_scores) + +__all__ = [ + 'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes', + 'merge_aug_scores', 'merge_aug_masks', 'fast_nms' +] diff --git a/insightface/detection/scrfd/mmdet/core/post_processing/bbox_nms.py b/insightface/detection/scrfd/mmdet/core/post_processing/bbox_nms.py new file mode 100755 index 0000000000000000000000000000000000000000..7ab322a2a9f5b822184f95fe2a042ef144431509 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/post_processing/bbox_nms.py @@ -0,0 +1,149 @@ +import torch +from mmcv.ops.nms import batched_nms + +from mmdet.core.bbox.iou_calculators import bbox_overlaps + + +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + nms_cfg, + max_num=-1, + score_factors=None, + return_inds=False): + """NMS for multi-class bboxes. 
+ + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_thr (float): NMS IoU threshold + max_num (int, optional): if there are more than max_num bboxes after + NMS, only top max_num will be kept. Default to -1. + score_factors (Tensor, optional): The factors multiplied to scores + before applying NMS. Default to None. + return_inds (bool, optional): Whether return the indices of kept + bboxes. Default to False. + + Returns: + tuple: (bboxes, labels, indices (optional)), tensors of shape (k, 5), + (k), and (k). Labels are 0-based. + """ + num_classes = multi_scores.size(1) - 1 + #print('!!!!!', multi_bboxes.shape) + # exclude background category + if multi_bboxes.shape[1] > 4: + bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) + else: + bboxes = multi_bboxes[:, None].expand( + multi_scores.size(0), num_classes, 4) + + scores = multi_scores[:, :-1] + if score_factors is not None: + scores = scores * score_factors[:, None] + + labels = torch.arange(num_classes, dtype=torch.long) + labels = labels.view(1, -1).expand_as(scores) + + bboxes = bboxes.reshape(-1, 4) + scores = scores.reshape(-1) + labels = labels.reshape(-1) + + # remove low scoring boxes + valid_mask = scores > score_thr + inds = valid_mask.nonzero(as_tuple=False).squeeze(1) + bboxes, scores, labels = bboxes[inds], scores[inds], labels[inds] + if inds.numel() == 0: + if torch.onnx.is_in_onnx_export(): + raise RuntimeError('[ONNX Error] Can not record NMS ' + 'as it has not been executed this time') + return bboxes, labels + + # TODO: add size check before feed into batched_nms + dets, keep = batched_nms(bboxes, scores, labels, nms_cfg) + + if max_num > 0: + dets = dets[:max_num] + keep = keep[:max_num] + + if return_inds: + return dets, labels[keep], keep + 
else: + return dets, labels[keep] + + +def fast_nms(multi_bboxes, + multi_scores, + multi_coeffs, + score_thr, + iou_thr, + top_k, + max_num=-1): + """Fast NMS in `YOLACT `_. + + Fast NMS allows already-removed detections to suppress other detections so + that every instance can be decided to be kept or discarded in parallel, + which is not possible in traditional NMS. This relaxation allows us to + implement Fast NMS entirely in standard GPU-accelerated matrix operations. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class+1), where the last column + contains scores of the background class, but this will be ignored. + multi_coeffs (Tensor): shape (n, #class*coeffs_dim). + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + iou_thr (float): IoU threshold to be considered as conflicted. + top_k (int): if there are more than top_k bboxes before NMS, + only top top_k will be kept. + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. If -1, keep all the bboxes. + Default: -1. + + Returns: + tuple: (bboxes, labels, coefficients), tensors of shape (k, 5), (k, 1), + and (k, coeffs_dim). Labels are 0-based. 
+ """ + + scores = multi_scores[:, :-1].t() # [#class, n] + scores, idx = scores.sort(1, descending=True) + + idx = idx[:, :top_k].contiguous() + scores = scores[:, :top_k] # [#class, topk] + num_classes, num_dets = idx.size() + boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4) + coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1) + + iou = bbox_overlaps(boxes, boxes) # [#class, topk, topk] + iou.triu_(diagonal=1) + iou_max, _ = iou.max(dim=1) + + # Now just filter out the ones higher than the threshold + keep = iou_max <= iou_thr + + # Second thresholding introduces 0.2 mAP gain at negligible time cost + keep *= scores > score_thr + + # Assign each kept detection to its corresponding class + classes = torch.arange( + num_classes, device=boxes.device)[:, None].expand_as(keep) + classes = classes[keep] + + boxes = boxes[keep] + coeffs = coeffs[keep] + scores = scores[keep] + + # Only keep the top max_num highest scores across all classes + scores, idx = scores.sort(0, descending=True) + if max_num > 0: + idx = idx[:max_num] + scores = scores[:max_num] + + classes = classes[idx] + boxes = boxes[idx] + coeffs = coeffs[idx] + + cls_dets = torch.cat([boxes, scores[:, None]], dim=1) + return cls_dets, classes, coeffs diff --git a/insightface/detection/scrfd/mmdet/core/post_processing/merge_augs.py b/insightface/detection/scrfd/mmdet/core/post_processing/merge_augs.py new file mode 100755 index 0000000000000000000000000000000000000000..167093ebf1d016806b2b997f28207887231b2e6b --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/post_processing/merge_augs.py @@ -0,0 +1,117 @@ +import numpy as np +import torch +from mmcv.ops import nms + +from ..bbox import bbox_mapping_back + + +def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg): + """Merge augmented proposals (multiscale, flip, etc.) + + Args: + aug_proposals (list[Tensor]): proposals from different testing + schemes, shape (n, 5). 
Note that they are not rescaled to the + original image size. + + img_metas (list[dict]): list of image info dict where each dict has: + 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + rpn_test_cfg (dict): rpn test config. + + Returns: + Tensor: shape (n, 4), proposals corresponding to original image scale. + """ + recovered_proposals = [] + for proposals, img_info in zip(aug_proposals, img_metas): + img_shape = img_info['img_shape'] + scale_factor = img_info['scale_factor'] + flip = img_info['flip'] + flip_direction = img_info['flip_direction'] + _proposals = proposals.clone() + _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape, + scale_factor, flip, + flip_direction) + recovered_proposals.append(_proposals) + aug_proposals = torch.cat(recovered_proposals, dim=0) + merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(), + aug_proposals[:, -1].contiguous(), + rpn_test_cfg.nms_thr) + scores = merged_proposals[:, 4] + _, order = scores.sort(0, descending=True) + num = min(rpn_test_cfg.max_num, merged_proposals.shape[0]) + order = order[:num] + merged_proposals = merged_proposals[order, :] + return merged_proposals + + +def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. 
+ + Returns: + tuple: (bboxes, scores) + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + flip_direction = img_info[0]['flip_direction'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip, + flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.stack(recovered_bboxes).mean(dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.stack(aug_scores).mean(dim=0) + return bboxes, scores + + +def merge_aug_scores(aug_scores): + """Merge augmented bbox scores.""" + if isinstance(aug_scores[0], torch.Tensor): + return torch.mean(torch.stack(aug_scores), dim=0) + else: + return np.mean(aug_scores, axis=0) + + +def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None): + """Merge augmented mask prediction. + + Args: + aug_masks (list[ndarray]): shape (n, #class, h, w) + img_shapes (list[ndarray]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. 
+ + Returns: + tuple: (bboxes, scores) + """ + recovered_masks = [] + for mask, img_info in zip(aug_masks, img_metas): + flip = img_info[0]['flip'] + flip_direction = img_info[0]['flip_direction'] + if flip: + if flip_direction == 'horizontal': + mask = mask[:, :, :, ::-1] + elif flip_direction == 'vertical': + mask = mask[:, :, ::-1, :] + else: + raise ValueError( + f"Invalid flipping direction '{flip_direction}'") + recovered_masks.append(mask) + + if weights is None: + merged_masks = np.mean(recovered_masks, axis=0) + else: + merged_masks = np.average( + np.array(recovered_masks), axis=0, weights=np.array(weights)) + return merged_masks diff --git a/insightface/detection/scrfd/mmdet/core/utils/__init__.py b/insightface/detection/scrfd/mmdet/core/utils/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..b49b3fc468f29121b8ea74ede09b6aea069d47fb --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/utils/__init__.py @@ -0,0 +1,7 @@ +from .dist_utils import DistOptimizerHook, allreduce_grads, reduce_mean +from .misc import multi_apply, unmap + +__all__ = [ + 'allreduce_grads', 'DistOptimizerHook', 'reduce_mean', 'multi_apply', + 'unmap' +] diff --git a/insightface/detection/scrfd/mmdet/core/utils/dist_utils.py b/insightface/detection/scrfd/mmdet/core/utils/dist_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..5fe77753313783f95bd7111038ef8b58ee4e4bc5 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/utils/dist_utils.py @@ -0,0 +1,69 @@ +import warnings +from collections import OrderedDict + +import torch.distributed as dist +from mmcv.runner import OptimizerHook +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for 
tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + """Allreduce gradients. + + Args: + params (list[torch.Parameters]): List of parameters of a model + coalesce (bool, optional): Whether allreduce parameters as a whole. + Defaults to True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Defaults to -1. + """ + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + world_size = dist.get_world_size() + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) + + +class DistOptimizerHook(OptimizerHook): + """Deprecated optimizer hook for distributed training.""" + + def __init__(self, *args, **kwargs): + warnings.warn('"DistOptimizerHook" is deprecated, please switch to' + '"mmcv.runner.OptimizerHook".') + super().__init__(*args, **kwargs) + + +def reduce_mean(tensor): + """"Obtain the mean of tensor on different GPUs.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor diff --git a/insightface/detection/scrfd/mmdet/core/utils/misc.py b/insightface/detection/scrfd/mmdet/core/utils/misc.py new file mode 100755 index 0000000000000000000000000000000000000000..407116e997f7931c5c027b117e8396320b3a72ff --- /dev/null +++ b/insightface/detection/scrfd/mmdet/core/utils/misc.py @@ -0,0 +1,41 @@ +from functools import partial + +import torch +from six.moves import map, zip + + 
+def multi_apply(func, *args, **kwargs): + """Apply function to a list of arguments. + + Note: + This function applies the ``func`` to multiple inputs and + map the multiple outputs of the ``func`` into different + list. Each list contains the same type of outputs corresponding + to different inputs. + + Args: + func (Function): A function that will be applied to a list of + arguments + + Returns: + tuple(list): A tuple containing multiple list, each list contains \ + a kind of returned results by the function + """ + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def unmap(data, count, inds, fill=0): + """Unmap a subset of item (data) back to the original set of items (of size + count)""" + if data.dim() == 1: + ret = data.new_full((count, ), fill) + ret[inds.type(torch.bool)] = data + else: + new_size = (count, ) + data.size()[1:] + ret = data.new_full(new_size, fill) + #print(inds) + #print('CCC', ret.shape, inds.shape, data.shape) + ret[inds.type(torch.bool), :] = data + return ret diff --git a/insightface/detection/scrfd/mmdet/datasets/__init__.py b/insightface/detection/scrfd/mmdet/datasets/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..2302e4e32ccd4acc109b8a818d934c386845e4b0 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/__init__.py @@ -0,0 +1,24 @@ +from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset +from .cityscapes import CityscapesDataset +from .coco import CocoDataset +from .custom import CustomDataset +from .retinaface import RetinaFaceDataset +from .dataset_wrappers import (ClassBalancedDataset, ConcatDataset, + RepeatDataset) +from .deepfashion import DeepFashionDataset +from .lvis import LVISDataset, LVISV1Dataset, LVISV05Dataset +from .samplers import DistributedGroupSampler, DistributedSampler, GroupSampler +from .utils import replace_ImageToTensor +from .voc import VOCDataset +from 
.wider_face import WIDERFaceDataset +from .xml_style import XMLDataset + +__all__ = [ + 'CustomDataset', 'XMLDataset', 'CocoDataset', 'DeepFashionDataset', + 'VOCDataset', 'CityscapesDataset', 'LVISDataset', 'LVISV05Dataset', + 'LVISV1Dataset', 'GroupSampler', 'DistributedGroupSampler', + 'DistributedSampler', 'build_dataloader', 'ConcatDataset', 'RepeatDataset', + 'RetinaFaceDataset', + 'ClassBalancedDataset', 'WIDERFaceDataset', 'DATASETS', 'PIPELINES', + 'build_dataset', 'replace_ImageToTensor' +] diff --git a/insightface/detection/scrfd/mmdet/datasets/builder.py b/insightface/detection/scrfd/mmdet/datasets/builder.py new file mode 100755 index 0000000000000000000000000000000000000000..16d9ae34b8ec1852b095bdcd75e89a8f2be67efb --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/builder.py @@ -0,0 +1,143 @@ +import copy +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from mmcv.utils import Registry, build_from_cfg +from torch.utils.data import DataLoader + +from .samplers import DistributedGroupSampler, DistributedSampler, GroupSampler + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + hard_limit = rlimit[1] + soft_limit = min(4096, hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +DATASETS = Registry('dataset') +PIPELINES = Registry('pipeline') + + +def _concat_dataset(cfg, default_args=None): + from .dataset_wrappers import ConcatDataset + ann_files = cfg['ann_file'] + img_prefixes = cfg.get('img_prefix', None) + seg_prefixes = cfg.get('seg_prefix', None) + proposal_files = cfg.get('proposal_file', None) + separate_eval = cfg.get('separate_eval', True) + + datasets = [] + num_dset = len(ann_files) + for i in range(num_dset): + data_cfg = copy.deepcopy(cfg) + # pop 'separate_eval' since it is not a 
def build_dataset(cfg, default_args=None):
    """Build a dataset from a config dict (or list of config dicts).

    Supports the wrapper types ``ConcatDataset``, ``RepeatDataset`` and
    ``ClassBalancedDataset``, plus implicit concatenation when ``ann_file``
    is a list.
    """
    from .dataset_wrappers import (ConcatDataset, RepeatDataset,
                                   ClassBalancedDataset)
    # a bare list/tuple of configs means "concatenate these datasets"
    if isinstance(cfg, (list, tuple)):
        return ConcatDataset([build_dataset(c, default_args) for c in cfg])
    if cfg['type'] == 'ConcatDataset':
        return ConcatDataset(
            [build_dataset(c, default_args) for c in cfg['datasets']],
            cfg.get('separate_eval', True))
    if cfg['type'] == 'RepeatDataset':
        return RepeatDataset(
            build_dataset(cfg['dataset'], default_args), cfg['times'])
    if cfg['type'] == 'ClassBalancedDataset':
        return ClassBalancedDataset(
            build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
    if isinstance(cfg.get('ann_file'), (list, tuple)):
        return _concat_dataset(cfg, default_args)
    return build_from_cfg(cfg, DATASETS, default_args)


def build_dataloader(dataset,
                     samples_per_gpu,
                     workers_per_gpu,
                     num_gpus=1,
                     dist=True,
                     shuffle=True,
                     seed=None,
                     **kwargs):
    """Build PyTorch DataLoader.

    In distributed training, each GPU/process has a dataloader.
    In non-distributed training, there is only one dataloader for all GPUs.

    Args:
        dataset (Dataset): A PyTorch dataset.
        samples_per_gpu (int): Number of training samples on each GPU, i.e.,
            batch size of each GPU.
        workers_per_gpu (int): How many subprocesses to use for data loading
            for each GPU.
        num_gpus (int): Number of GPUs. Only used in non-distributed training.
        dist (bool): Distributed training/test or not. Default: True.
        shuffle (bool): Whether to shuffle the data at every epoch.
            Default: True.
        seed (int, optional): Base seed used to derive per-worker seeds.
        kwargs: any keyword argument to be used to initialize DataLoader

    Returns:
        DataLoader: A PyTorch dataloader.
    """
    rank, world_size = get_dist_info()
    if dist:
        # Distributed: each process loads its own shard, so batch size and
        # worker count are already per-GPU. DistributedGroupSampler shuffles
        # while keeping images on each GPU within the same group.
        sampler = (
            DistributedGroupSampler(dataset, samples_per_gpu, world_size,
                                    rank)
            if shuffle else
            DistributedSampler(dataset, world_size, rank, shuffle=False))
        batch_size, num_workers = samples_per_gpu, workers_per_gpu
    else:
        sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None
        batch_size = num_gpus * samples_per_gpu
        num_workers = num_gpus * workers_per_gpu

    # deterministic per-worker seeding only when an explicit seed was given
    init_fn = None
    if seed is not None:
        init_fn = partial(
            worker_init_fn, num_workers=num_workers, rank=rank, seed=seed)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
        collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
        pin_memory=False,
        worker_init_fn=init_fn,
        **kwargs)


def worker_init_fn(worker_id, num_workers, rank, seed):
    """Seed numpy and python RNGs of a dataloader worker deterministically.

    The per-worker seed is ``num_workers * rank + worker_id + seed`` so every
    (rank, worker) pair draws an independent, reproducible stream.
    """
    derived_seed = num_workers * rank + worker_id + seed
    np.random.seed(derived_seed)
    random.seed(derived_seed)
# Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/datasets/cityscapes.py # noqa
# and https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa

import glob
import os
import os.path as osp
import tempfile
from collections import OrderedDict

import mmcv
import numpy as np
import pycocotools.mask as maskUtils
from mmcv.utils import print_log

from .builder import DATASETS
from .coco import CocoDataset


@DATASETS.register_module()
class CityscapesDataset(CocoDataset):
    """COCO-style dataset for the Cityscapes instance segmentation benchmark.

    Reuses the COCO loading/evaluation machinery from ``CocoDataset`` and
    adds result dumping (txt + png masks) and evaluation via the official
    ``cityscapesscripts`` tooling.
    """

    CLASSES = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
               'bicycle')

    def _filter_imgs(self, min_size=32):
        """Filter images too small or without ground truths.

        Args:
            min_size (int): Minimum side length for an image to be kept.

        Returns:
            list[int]: Indices of valid images in ``self.data_infos``.
        """
        valid_inds = []
        # obtain images that contain annotation
        ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values())
        # obtain images that contain annotations of the required categories
        ids_in_cat = set()
        for class_id in self.cat_ids:
            ids_in_cat |= set(self.coco.cat_img_map[class_id])
        # merge the image id sets of the two conditions and use the merged set
        # to filter out images if self.filter_empty_gt=True
        ids_in_cat &= ids_with_ann

        valid_img_ids = []
        for i, img_info in enumerate(self.data_infos):
            img_id = img_info['id']
            ann_ids = self.coco.getAnnIds(imgIds=[img_id])
            ann_info = self.coco.loadAnns(ann_ids)
            # an image whose instances are all crowd regions has no usable GT
            all_iscrowd = all([_['iscrowd'] for _ in ann_info])
            if self.filter_empty_gt and (self.img_ids[i] not in ids_in_cat
                                         or all_iscrowd):
                continue
            if min(img_info['width'], img_info['height']) >= min_size:
                valid_inds.append(i)
                valid_img_ids.append(img_id)
        self.img_ids = valid_img_ids
        return valid_inds

    def _parse_ann_info(self, img_info, ann_info):
        """Parse bbox and mask annotation.

        Args:
            img_info (dict): Image info of an image.
            ann_info (list[dict]): Annotation info of an image.

        Returns:
            dict: A dict containing the following keys: bboxes, \
                bboxes_ignore, labels, masks, seg_map. \
                "masks" are already decoded into binary masks.
        """
        gt_bboxes = []
        gt_labels = []
        gt_bboxes_ignore = []
        gt_masks_ann = []

        for ann in ann_info:
            if ann.get('ignore', False):
                continue
            x1, y1, w, h = ann['bbox']
            if ann['area'] <= 0 or w < 1 or h < 1:
                continue
            if ann['category_id'] not in self.cat_ids:
                continue
            bbox = [x1, y1, x1 + w, y1 + h]
            # crowd regions are kept separately and only used to ignore
            # detections during evaluation
            if ann.get('iscrowd', False):
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_labels.append(self.cat2label[ann['category_id']])
                gt_masks_ann.append(ann['segmentation'])

        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(
            bboxes=gt_bboxes,
            labels=gt_labels,
            bboxes_ignore=gt_bboxes_ignore,
            masks=gt_masks_ann,
            seg_map=img_info['segm_file'])

        return ann

    def results2txt(self, results, outfile_prefix):
        """Dump the detection results to a txt file.

        Args:
            results (list[list | tuple]): Testing results of the
                dataset.
            outfile_prefix (str): The filename prefix of the json files.
                If the prefix is "somepath/xxx",
                the txt files will be named "somepath/xxx.txt".

        Returns:
            list[str]: Result txt files which contains corresponding \
                instance segmentation images.
        """
        try:
            import cityscapesscripts.helpers.labels as CSLabels
        except ImportError:
            raise ImportError('Please run "pip install cityscapesscripts" to '
                              'install cityscapesscripts first.')
        result_files = []
        os.makedirs(outfile_prefix, exist_ok=True)
        prog_bar = mmcv.ProgressBar(len(self))
        for idx in range(len(self)):
            result = results[idx]
            filename = self.data_infos[idx]['filename']
            basename = osp.splitext(osp.basename(filename))[0]
            pred_txt = osp.join(outfile_prefix, basename + '_pred.txt')

            bbox_result, segm_result = result
            bboxes = np.vstack(bbox_result)
            # segm results
            if isinstance(segm_result, tuple):
                # Some detectors use different scores for bbox and mask,
                # like Mask Scoring R-CNN. Score of segm will be used instead
                # of bbox score.
                segms = mmcv.concat_list(segm_result[0])
                mask_score = segm_result[1]
            else:
                # use bbox score for mask score
                segms = mmcv.concat_list(segm_result)
                mask_score = [bbox[-1] for bbox in bboxes]
            labels = [
                np.full(bbox.shape[0], i, dtype=np.int32)
                for i, bbox in enumerate(bbox_result)
            ]
            labels = np.concatenate(labels)

            assert len(bboxes) == len(segms) == len(labels)
            num_instances = len(bboxes)
            prog_bar.update()
            with open(pred_txt, 'w') as fout:
                for i in range(num_instances):
                    pred_class = labels[i]
                    classes = self.CLASSES[pred_class]
                    class_id = CSLabels.name2label[classes].id
                    score = mask_score[i]
                    mask = maskUtils.decode(segms[i]).astype(np.uint8)
                    png_filename = osp.join(outfile_prefix,
                                            basename + f'_{i}_{classes}.png')
                    mmcv.imwrite(mask, png_filename)
                    fout.write(f'{osp.basename(png_filename)} {class_id} '
                               f'{score}\n')
            result_files.append(pred_txt)

        return result_files

    def format_results(self, results, txtfile_prefix=None):
        """Format the results to txt (standard format for Cityscapes
        evaluation).

        Args:
            results (list): Testing results of the dataset.
            txtfile_prefix (str | None): The prefix of txt files. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                If not specified, a temp file will be created. Default: None.

        Returns:
            tuple: (result_files, tmp_dir), result_files is a dict containing \
                the json filepaths, tmp_dir is the temporal directory created \
                for saving txt/png files when txtfile_prefix is not specified.
        """
        assert isinstance(results, list), 'results must be a list'
        assert len(results) == len(self), (
            'The length of results is not equal to the dataset len: {} != {}'.
            format(len(results), len(self)))

        if txtfile_prefix is None:
            tmp_dir = tempfile.TemporaryDirectory()
            txtfile_prefix = osp.join(tmp_dir.name, 'results')
        else:
            tmp_dir = None
        result_files = self.results2txt(results, txtfile_prefix)

        return result_files, tmp_dir

    def evaluate(self,
                 results,
                 metric='bbox',
                 logger=None,
                 outfile_prefix=None,
                 classwise=False,
                 proposal_nums=(100, 300, 1000),
                 iou_thrs=np.arange(0.5, 0.96, 0.05)):
        """Evaluation in Cityscapes/COCO protocol.

        Args:
            results (list[list | tuple]): Testing results of the dataset.
            metric (str | list[str]): Metrics to be evaluated. Options are
                'bbox', 'segm', 'proposal', 'proposal_fast'.
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.
            outfile_prefix (str | None): The prefix of output file. It includes
                the file path and the prefix of filename, e.g., "a/b/prefix".
                If results are evaluated with COCO protocol, it would be the
                prefix of output json file. For example, the metric is 'bbox'
                and 'segm', then json files would be "a/b/prefix.bbox.json" and
                "a/b/prefix.segm.json".
                If results are evaluated with cityscapes protocol, it would be
                the prefix of output txt/png files. The output files would be
                png images under folder "a/b/prefix/xxx/" and the file name of
                images would be written into a txt file
                "a/b/prefix/xxx_pred.txt", where "xxx" is the video name of
                cityscapes. If not specified, a temp file will be created.
                Default: None.
            classwise (bool): Whether to evaluating the AP for each class.
            proposal_nums (Sequence[int]): Proposal number used for evaluating
                recalls, such as recall@100, recall@1000.
                Default: (100, 300, 1000).
            iou_thrs (Sequence[float]): IoU threshold used for evaluating
                recalls. If set to a list, the average recall of all IoUs will
                also be computed. Default: np.arange(0.5, 0.96, 0.05).

        Returns:
            dict[str, float]: COCO style evaluation metric or cityscapes mAP \
                and AP@50.
        """
        eval_results = dict()

        # copy so that removing 'cityscapes' below does not mutate the
        # caller's metric list
        metrics = metric.copy() if isinstance(metric, list) else [metric]

        if 'cityscapes' in metrics:
            eval_results.update(
                self._evaluate_cityscapes(results, outfile_prefix, logger))
            metrics.remove('cityscapes')

        # left metrics are all coco metric
        if len(metrics) > 0:
            # create CocoDataset with CityscapesDataset annotation
            self_coco = CocoDataset(self.ann_file, self.pipeline.transforms,
                                    None, self.data_root, self.img_prefix,
                                    self.seg_prefix, self.proposal_file,
                                    self.test_mode, self.filter_empty_gt)
            # TODO: remove this in the future
            # reload annotations of correct class
            self_coco.CLASSES = self.CLASSES
            self_coco.data_infos = self_coco.load_annotations(self.ann_file)
            eval_results.update(
                self_coco.evaluate(results, metrics, logger, outfile_prefix,
                                   classwise, proposal_nums, iou_thrs))

        return eval_results

    def _evaluate_cityscapes(self, results, txtfile_prefix, logger):
        """Evaluation in Cityscapes protocol.

        Args:
            results (list): Testing results of the dataset.
            txtfile_prefix (str | None): The prefix of output txt file
            logger (logging.Logger | str | None): Logger used for printing
                related information during evaluation. Default: None.

        Returns:
            dict[str: float]: Cityscapes evaluation results, contains 'mAP' \
                and 'AP@50'.
        """

        try:
            import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval  # noqa
        except ImportError:
            raise ImportError('Please run "pip install cityscapesscripts" to '
                              'install cityscapesscripts first.')
        msg = 'Evaluating in Cityscapes style'
        if logger is None:
            msg = '\n' + msg
        print_log(msg, logger=logger)

        result_files, tmp_dir = self.format_results(results, txtfile_prefix)

        if tmp_dir is None:
            result_dir = osp.join(txtfile_prefix, 'results')
        else:
            result_dir = osp.join(tmp_dir.name, 'results')

        eval_results = OrderedDict()
        print_log(f'Evaluating results under {result_dir} ...', logger=logger)

        # set global states in cityscapes evaluation API
        CSEval.args.cityscapesPath = os.path.join(self.img_prefix, '../..')
        CSEval.args.predictionPath = os.path.abspath(result_dir)
        CSEval.args.predictionWalk = None
        CSEval.args.JSONOutput = False
        CSEval.args.colorized = False
        CSEval.args.gtInstancesFile = os.path.join(result_dir,
                                                   'gtInstances.json')
        CSEval.args.groundTruthSearch = os.path.join(
            self.img_prefix.replace('leftImg8bit', 'gtFine'),
            '*/*_gtFine_instanceIds.png')

        groundTruthImgList = glob.glob(CSEval.args.groundTruthSearch)
        assert len(groundTruthImgList), 'Cannot find ground truth images' \
            f' in {CSEval.args.groundTruthSearch}.'
        predictionImgList = []
        for gt in groundTruthImgList:
            predictionImgList.append(CSEval.getPrediction(gt, CSEval.args))
        CSEval_results = CSEval.evaluateImgLists(predictionImgList,
                                                 groundTruthImgList,
                                                 CSEval.args)['averages']

        eval_results['mAP'] = CSEval_results['allAp']
        eval_results['AP@50'] = CSEval_results['allAp50%']
        if tmp_dir is not None:
            tmp_dir.cleanup()
        return eval_results
Then run pip ' + 'install mmpycocotools to install open-mmlab forked ' + 'pycocotools.') + + +@DATASETS.register_module() +class CocoDataset(CustomDataset): + + CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', + 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') + + def load_annotations(self, ann_file): + """Load annotation from COCO style annotation file. + + Args: + ann_file (str): Path of annotation file. + + Returns: + list[dict]: Annotation info from COCO api. + """ + + self.coco = COCO(ann_file) + self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.img_ids = self.coco.get_img_ids() + data_infos = [] + for i in self.img_ids: + info = self.coco.load_imgs([i])[0] + info['filename'] = info['file_name'] + data_infos.append(info) + return data_infos + + def get_ann_info(self, idx): + """Get COCO annotation by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. 
+ """ + + img_id = self.data_infos[idx]['id'] + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + ann_info = self.coco.load_anns(ann_ids) + return self._parse_ann_info(self.data_infos[idx], ann_info) + + def get_cat_ids(self, idx): + """Get COCO category ids by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. + """ + + img_id = self.data_infos[idx]['id'] + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + ann_info = self.coco.load_anns(ann_ids) + return [ann['category_id'] for ann in ann_info] + + def _filter_imgs(self, min_size=32): + """Filter images too small or without ground truths.""" + valid_inds = [] + # obtain images that contain annotation + ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values()) + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.coco.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= ids_with_ann + + valid_img_ids = [] + for i, img_info in enumerate(self.data_infos): + img_id = self.img_ids[i] + if self.filter_empty_gt and img_id not in ids_in_cat: + continue + if min(img_info['width'], img_info['height']) >= min_size: + valid_inds.append(i) + valid_img_ids.append(img_id) + self.img_ids = valid_img_ids + return valid_inds + + def _parse_ann_info(self, img_info, ann_info): + """Parse bbox and mask annotation. + + Args: + ann_info (list[dict]): Annotation info of an image. + with_mask (bool): Whether to parse mask annotations. + + Returns: + dict: A dict containing the following keys: bboxes, bboxes_ignore,\ + labels, masks, seg_map. "masks" are raw annotations and not \ + decoded into binary masks. 
+ """ + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(self.cat2label[ann['category_id']]) + gt_masks_ann.append(ann.get('segmentation', None)) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + seg_map = img_info['filename'].replace('jpg', 'png') + + ann = dict( + bboxes=gt_bboxes, + labels=gt_labels, + bboxes_ignore=gt_bboxes_ignore, + masks=gt_masks_ann, + seg_map=seg_map) + + return ann + + def xyxy2xywh(self, bbox): + """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO + evaluation. + + Args: + bbox (numpy.ndarray): The bounding boxes, shape (4, ), in + ``xyxy`` order. + + Returns: + list[float]: The converted bounding boxes, in ``xywh`` order. 
+ """ + + _bbox = bbox.tolist() + return [ + _bbox[0], + _bbox[1], + _bbox[2] - _bbox[0], + _bbox[3] - _bbox[1], + ] + + def _proposal2json(self, results): + """Convert proposal results to COCO json style.""" + json_results = [] + for idx in range(len(self)): + img_id = self.img_ids[idx] + bboxes = results[idx] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = 1 + json_results.append(data) + return json_results + + def _det2json(self, results): + """Convert detection results to COCO json style.""" + json_results = [] + for idx in range(len(self)): + img_id = self.img_ids[idx] + result = results[idx] + for label in range(len(result)): + bboxes = result[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = self.cat_ids[label] + json_results.append(data) + return json_results + + def _segm2json(self, results): + """Convert instance segmentation results to COCO json style.""" + bbox_json_results = [] + segm_json_results = [] + for idx in range(len(self)): + img_id = self.img_ids[idx] + det, seg = results[idx] + for label in range(len(det)): + # bbox results + bboxes = det[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = self.cat_ids[label] + bbox_json_results.append(data) + + # segm results + # some detectors use different scores for bbox and mask + if isinstance(seg, tuple): + segms = seg[0][label] + mask_score = seg[1][label] + else: + segms = seg[label] + mask_score = [bbox[4] for bbox in bboxes] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(mask_score[i]) + 
data['category_id'] = self.cat_ids[label] + if isinstance(segms[i]['counts'], bytes): + segms[i]['counts'] = segms[i]['counts'].decode() + data['segmentation'] = segms[i] + segm_json_results.append(data) + return bbox_json_results, segm_json_results + + def results2json(self, results, outfile_prefix): + """Dump the detection results to a COCO style json file. + + There are 3 types of results: proposals, bbox predictions, mask + predictions, and they have different data types. This method will + automatically recognize the type, and dump them to json files. + + Args: + results (list[list | tuple | ndarray]): Testing results of the + dataset. + outfile_prefix (str): The filename prefix of the json files. If the + prefix is "somepath/xxx", the json files will be named + "somepath/xxx.bbox.json", "somepath/xxx.segm.json", + "somepath/xxx.proposal.json". + + Returns: + dict[str: str]: Possible keys are "bbox", "segm", "proposal", and \ + values are corresponding filenames. + """ + result_files = dict() + if isinstance(results[0], list): + json_results = self._det2json(results) + result_files['bbox'] = f'{outfile_prefix}.bbox.json' + result_files['proposal'] = f'{outfile_prefix}.bbox.json' + mmcv.dump(json_results, result_files['bbox']) + elif isinstance(results[0], tuple): + json_results = self._segm2json(results) + result_files['bbox'] = f'{outfile_prefix}.bbox.json' + result_files['proposal'] = f'{outfile_prefix}.bbox.json' + result_files['segm'] = f'{outfile_prefix}.segm.json' + mmcv.dump(json_results[0], result_files['bbox']) + mmcv.dump(json_results[1], result_files['segm']) + elif isinstance(results[0], np.ndarray): + json_results = self._proposal2json(results) + result_files['proposal'] = f'{outfile_prefix}.proposal.json' + mmcv.dump(json_results, result_files['proposal']) + else: + raise TypeError('invalid type of results') + return result_files + + def fast_eval_recall(self, results, proposal_nums, iou_thrs, logger=None): + gt_bboxes = [] + for i in 
range(len(self.img_ids)): + ann_ids = self.coco.get_ann_ids(img_ids=self.img_ids[i]) + ann_info = self.coco.load_anns(ann_ids) + if len(ann_info) == 0: + gt_bboxes.append(np.zeros((0, 4))) + continue + bboxes = [] + for ann in ann_info: + if ann.get('ignore', False) or ann['iscrowd']: + continue + x1, y1, w, h = ann['bbox'] + bboxes.append([x1, y1, x1 + w, y1 + h]) + bboxes = np.array(bboxes, dtype=np.float32) + if bboxes.shape[0] == 0: + bboxes = np.zeros((0, 4)) + gt_bboxes.append(bboxes) + + recalls = eval_recalls( + gt_bboxes, results, proposal_nums, iou_thrs, logger=logger) + ar = recalls.mean(axis=1) + return ar + + def format_results(self, results, jsonfile_prefix=None, **kwargs): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[tuple | numpy.ndarray]): Testing results of the + dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing \ + the json filepaths, tmp_dir is the temporal directory created \ + for saving json files when jsonfile_prefix is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + result_files = self.results2json(results, jsonfile_prefix) + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + classwise=False, + proposal_nums=(100, 300, 1000), + iou_thrs=None, + metric_items=None): + """Evaluation in COCO protocol. 
+ + Args: + results (list[list | tuple]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. Options are + 'bbox', 'segm', 'proposal', 'proposal_fast'. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + classwise (bool): Whether to evaluating the AP for each class. + proposal_nums (Sequence[int]): Proposal number used for evaluating + recalls, such as recall@100, recall@1000. + Default: (100, 300, 1000). + iou_thrs (Sequence[float], optional): IoU threshold used for + evaluating recalls/mAPs. If set to a list, the average of all + IoUs will also be computed. If not specified, [0.50, 0.55, + 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used. + Default: None. + metric_items (list[str] | str, optional): Metric items that will + be returned. If not specified, ``['AR@100', 'AR@300', + 'AR@1000', 'AR_s@1000', 'AR_m@1000', 'AR_l@1000' ]`` will be + used when ``metric=='proposal'``, ``['mAP', 'mAP_50', 'mAP_75', + 'mAP_s', 'mAP_m', 'mAP_l']`` will be used when + ``metric=='bbox' or metric=='segm'``. + + Returns: + dict[str, float]: COCO style evaluation metric. 
+ """ + + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + if iou_thrs is None: + iou_thrs = np.linspace( + .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + if metric_items is not None: + if not isinstance(metric_items, list): + metric_items = [metric_items] + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + eval_results = OrderedDict() + cocoGt = self.coco + for metric in metrics: + msg = f'Evaluating {metric}...' + if logger is None: + msg = '\n' + msg + print_log(msg, logger=logger) + + if metric == 'proposal_fast': + ar = self.fast_eval_recall( + results, proposal_nums, iou_thrs, logger='silent') + log_msg = [] + for i, num in enumerate(proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') + log_msg = ''.join(log_msg) + print_log(log_msg, logger=logger) + continue + + if metric not in result_files: + raise KeyError(f'{metric} is not in results') + try: + cocoDt = cocoGt.loadRes(result_files[metric]) + except IndexError: + print_log( + 'The testing results of the whole dataset is empty.', + logger=logger, + level=logging.ERROR) + break + + iou_type = 'bbox' if metric == 'proposal' else metric + cocoEval = COCOeval(cocoGt, cocoDt, iou_type) + cocoEval.params.catIds = self.cat_ids + cocoEval.params.imgIds = self.img_ids + cocoEval.params.maxDets = list(proposal_nums) + cocoEval.params.iouThrs = iou_thrs + # mapping of cocoEval.stats + coco_metric_names = { + 'mAP': 0, + 'mAP_50': 1, + 'mAP_75': 2, + 'mAP_s': 3, + 'mAP_m': 4, + 'mAP_l': 5, + 'AR@100': 6, + 'AR@300': 7, + 'AR@1000': 8, + 'AR_s@1000': 9, + 'AR_m@1000': 10, + 'AR_l@1000': 11 + } + if metric_items is not None: + for metric_item in metric_items: + if metric_item not in coco_metric_names: + raise KeyError( + f'metric item 
{metric_item} is not supported') + + if metric == 'proposal': + cocoEval.params.useCats = 0 + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + if metric_items is None: + metric_items = [ + 'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000', + 'AR_m@1000', 'AR_l@1000' + ] + + for item in metric_items: + val = float( + f'{cocoEval.stats[coco_metric_names[item]]:.3f}') + eval_results[item] = val + else: + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + if classwise: # Compute per-category AP + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/ + precisions = cocoEval.eval['precision'] + # precision: (iou, recall, cls, area range, max dets) + assert len(self.cat_ids) == precisions.shape[2] + + results_per_category = [] + for idx, catId in enumerate(self.cat_ids): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = self.coco.loadCats(catId)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + results_per_category.append( + (f'{nm["name"]}', f'{float(ap):0.3f}')) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list( + itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest(*[ + results_flatten[i::num_columns] + for i in range(num_columns) + ]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + print_log('\n' + table.table, logger=logger) + + if metric_items is None: + metric_items = [ + 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' + ] + + for metric_item in metric_items: + key = f'{metric}_{metric_item}' + val = float( + f'{cocoEval.stats[coco_metric_names[metric_item]]:.3f}' + ) + eval_results[key] = val + ap = cocoEval.stats[:6] + eval_results[f'{metric}_mAP_copypaste'] = ( + f'{ap[0]:.3f} 
{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' + f'{ap[4]:.3f} {ap[5]:.3f}') + if tmp_dir is not None: + tmp_dir.cleanup() + return eval_results diff --git a/insightface/detection/scrfd/mmdet/datasets/custom.py b/insightface/detection/scrfd/mmdet/datasets/custom.py new file mode 100755 index 0000000000000000000000000000000000000000..1f78a1f5ac41f6ce889c1c175efed5508f6e5e2f --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/custom.py @@ -0,0 +1,363 @@ +import os.path as osp +import warnings +from collections import OrderedDict + +import mmcv +import numpy as np +from torch.utils.data import Dataset + +from mmdet.core import eval_map, eval_recalls +from .builder import DATASETS +from .pipelines import Compose + + +@DATASETS.register_module() +class CustomDataset(Dataset): + """Custom dataset for detection. + + The annotation format is shown as follows. The `ann` field is optional for + testing. + + .. code-block:: none + + [ + { + 'filename': 'a.jpg', + 'width': 1280, + 'height': 720, + 'ann': { + 'bboxes': (n, 4) in (x1, y1, x2, y2) order. + 'labels': (n, ), + 'bboxes_ignore': (k, 4), (optional field) + 'labels_ignore': (k, 4) (optional field) + } + }, + ... + ] + + Args: + ann_file (str): Annotation file path. + pipeline (list[dict]): Processing pipeline. + classes (str | Sequence[str], optional): Specify classes to load. + If is None, ``cls.CLASSES`` will be used. Default: None. + data_root (str, optional): Data root for ``ann_file``, + ``img_prefix``, ``seg_prefix``, ``proposal_file`` if specified. + test_mode (bool, optional): If set True, annotation will not be loaded. + filter_empty_gt (bool, optional): If set true, images without bounding + boxes of the dataset's classes will be filtered out. This option + only works when `test_mode=False`, i.e., we never filter images + during tests. 
+ """ + + CLASSES = None + + def __init__(self, + ann_file, + pipeline, + classes=None, + data_root=None, + img_prefix='', + seg_prefix=None, + proposal_file=None, + test_mode=False, + filter_empty_gt=True, + support_mosaic=False, + ): + self.ann_file = ann_file + self.data_root = data_root + self.img_prefix = img_prefix + self.seg_prefix = seg_prefix + self.proposal_file = proposal_file + self.test_mode = test_mode + self.filter_empty_gt = filter_empty_gt + self.support_mosaic = support_mosaic + self.CLASSES = self.get_classes(classes) + + # join paths if data_root is specified + if self.data_root is not None: + if not osp.isabs(self.ann_file): + self.ann_file = osp.join(self.data_root, self.ann_file) + if not (self.img_prefix is None or osp.isabs(self.img_prefix)): + self.img_prefix = osp.join(self.data_root, self.img_prefix) + if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)): + self.seg_prefix = osp.join(self.data_root, self.seg_prefix) + if not (self.proposal_file is None + or osp.isabs(self.proposal_file)): + self.proposal_file = osp.join(self.data_root, + self.proposal_file) + # load annotations (and proposals) + self.data_infos = self.load_annotations(self.ann_file) + + if self.proposal_file is not None: + self.proposals = self.load_proposals(self.proposal_file) + else: + self.proposals = None + + # filter images too small and containing no annotations + if not test_mode: + valid_inds = self._filter_imgs() + self.data_infos = [self.data_infos[i] for i in valid_inds] + if self.proposals is not None: + self.proposals = [self.proposals[i] for i in valid_inds] + # set group flag for the sampler + self._set_group_flag() + + # processing pipeline + self.pipeline = Compose(pipeline) + + def __len__(self): + """Total number of samples of data.""" + return len(self.data_infos) + + def load_annotations(self, ann_file): + """Load annotation from annotation file.""" + return mmcv.load(ann_file) + + def load_proposals(self, proposal_file): + """Load 
proposal from proposal file.""" + return mmcv.load(proposal_file) + + def get_ann_info(self, idx): + """Get annotation by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. + """ + + return self.data_infos[idx]['ann'] + + def get_cat_ids(self, idx): + """Get category ids by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. + """ + + return self.data_infos[idx]['ann']['labels'].astype(np.int).tolist() + + def pre_pipeline(self, results): + """Prepare results dict for pipeline.""" + results['img_prefix'] = self.img_prefix + results['seg_prefix'] = self.seg_prefix + results['proposal_file'] = self.proposal_file + results['bbox_fields'] = [] + results['mask_fields'] = [] + results['seg_fields'] = [] + + def _filter_imgs(self, min_size=32): + """Filter images too small.""" + if self.filter_empty_gt: + warnings.warn( + 'CustomDataset does not support filtering empty gt images.') + valid_inds = [] + for i, img_info in enumerate(self.data_infos): + if min(img_info['width'], img_info['height']) >= min_size: + valid_inds.append(i) + return valid_inds + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. + """ + self.flag = np.zeros(len(self), dtype=np.uint8) + for i in range(len(self)): + img_info = self.data_infos[i] + if img_info['width'] / img_info['height'] > 1: + self.flag[i] = 1 + + def _rand_another(self, idx): + """Get another random index from the same group as the given index.""" + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def __getitem__(self, idx): + """Get training/test data after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Training/test data (with annotation if `test_mode` is set \ + True). 
+ """ + + if self.test_mode: + return self.prepare_test_img(idx) + while True: + if self.support_mosaic: + num_samples = np.random.choice([1,4,9], p=[0.4,0.4,0.2]) + else: + num_samples = 1 + if num_samples==1: + data = self.prepare_train_img(idx) + else: + data = self.prepare_multiple_train_imgs(idx, num_samples) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def prepare_train_img(self, idx): + """Get training data and annotations after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Training data and annotation after pipeline with new keys \ + introduced by pipeline. + """ + + img_info = self.data_infos[idx] + ann_info = self.get_ann_info(idx) + results = dict(img_info=img_info, ann_info=ann_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def prepare_multiple_train_imgs(self, idx, num_samples): + """Get training data and annotations (of multiple images) after + pipeline. + While the main sample is determined by idx, the other samples + are selected randomly. + Args: + idx (int): Index of the main sample. + Returns: + dict: Training data and annotation after pipeline with new keys \ + introduced by pipeline. + """ + num = num_samples - 1 + + n_samples = len(self) + total_indices = list(range(n_samples)) + total_indices.remove(idx) + + selected_indices = random.choice(total_indices, num, False) + selected_indices = np.insert(selected_indices, 0, idx) + + results = [] + for i in selected_indices: + img_info = self.data_infos[i] + ann_info = self.get_ann_info(i) + results_i = dict(img_info=img_info, ann_info=ann_info) + if self.proposals is not None: + results_i['proposals'] = self.proposals[i] + self.pre_pipeline(results_i) + results.append(results_i) + return self.pipeline(results) + + + def prepare_test_img(self, idx): + """Get testing data after pipeline. + + Args: + idx (int): Index of data. 
+ + Returns: + dict: Testing data after pipeline with new keys intorduced by \ + piepline. + """ + + img_info = self.data_infos[idx] + results = dict(img_info=img_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + @classmethod + def get_classes(cls, classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Returns: + tuple[str] or list[str]: Names of categories of the dataset. + """ + if classes is None: + return cls.CLASSES + + if isinstance(classes, str): + # take it as a file path + class_names = mmcv.list_from_file(classes) + elif isinstance(classes, (tuple, list)): + class_names = classes + else: + raise ValueError(f'Unsupported type {type(classes)} of classes.') + + return class_names + + def format_results(self, results, **kwargs): + """Place holder to format result to dataset specific output.""" + pass + + def evaluate(self, + results, + metric='mAP', + logger=None, + proposal_nums=(100, 300, 1000), + iou_thr=0.5, + scale_ranges=None): + """Evaluate the dataset. + + Args: + results (list): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | None | str): Logger used for printing + related information during evaluation. Default: None. + proposal_nums (Sequence[int]): Proposal number used for evaluating + recalls, such as recall@100, recall@1000. + Default: (100, 300, 1000). + iou_thr (float | list[float]): IoU threshold. It must be a float + when evaluating mAP, and can be a list when evaluating recall. + Default: 0.5. 
+ scale_ranges (list[tuple] | None): Scale ranges for evaluating mAP. + Default: None. + """ + + if not isinstance(metric, str): + assert len(metric) == 1 + metric = metric[0] + allowed_metrics = ['mAP', 'recall'] + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + annotations = [self.get_ann_info(i) for i in range(len(self))] + eval_results = OrderedDict() + if metric == 'mAP': + assert isinstance(iou_thr, float) + mean_ap, _ = eval_map( + results, + annotations, + scale_ranges=scale_ranges, + iou_thr=iou_thr, + dataset=self.CLASSES, + logger=logger) + eval_results['mAP'] = mean_ap + elif metric == 'recall': + gt_bboxes = [ann['bboxes'] for ann in annotations] + if isinstance(iou_thr, float): + iou_thr = [iou_thr] + recalls = eval_recalls( + gt_bboxes, results, proposal_nums, iou_thr, logger=logger) + for i, num in enumerate(proposal_nums): + for j, iou in enumerate(iou_thr): + eval_results[f'recall@{num}@{iou}'] = recalls[i, j] + if recalls.shape[1] > 1: + ar = recalls.mean(axis=1) + for i, num in enumerate(proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + return eval_results diff --git a/insightface/detection/scrfd/mmdet/datasets/dataset_wrappers.py b/insightface/detection/scrfd/mmdet/datasets/dataset_wrappers.py new file mode 100755 index 0000000000000000000000000000000000000000..55ad5cb60e581a96bdbd1fbbeebc2f46f8c4e899 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/dataset_wrappers.py @@ -0,0 +1,282 @@ +import bisect +import math +from collections import defaultdict + +import numpy as np +from mmcv.utils import print_log +from torch.utils.data.dataset import ConcatDataset as _ConcatDataset + +from .builder import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class ConcatDataset(_ConcatDataset): + """A wrapper of concatenated dataset. + + Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but + concat the group flag for image aspect ratio. 
+ + Args: + datasets (list[:obj:`Dataset`]): A list of datasets. + separate_eval (bool): Whether to evaluate the results + separately if it is used as validation dataset. + Defaults to True. + """ + + def __init__(self, datasets, separate_eval=True): + super(ConcatDataset, self).__init__(datasets) + self.CLASSES = datasets[0].CLASSES + self.separate_eval = separate_eval + if not separate_eval: + if any([isinstance(ds, CocoDataset) for ds in datasets]): + raise NotImplementedError( + 'Evaluating concatenated CocoDataset as a whole is not' + ' supported! Please set "separate_eval=True"') + elif len(set([type(ds) for ds in datasets])) != 1: + raise NotImplementedError( + 'All the datasets should have same types') + + if hasattr(datasets[0], 'flag'): + flags = [] + for i in range(0, len(datasets)): + flags.append(datasets[i].flag) + self.flag = np.concatenate(flags) + + def get_cat_ids(self, idx): + """Get category ids of concatenated dataset by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. + """ + + if idx < 0: + if -idx > len(self): + raise ValueError( + 'absolute value of index should not exceed dataset length') + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].get_cat_ids(sample_idx) + + def evaluate(self, results, logger=None, **kwargs): + """Evaluate the results. + + Args: + results (list[list | tuple]): Testing results of the dataset. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + + Returns: + dict[str: float]: AP results of the total dataset or each separate + dataset if `self.separate_eval=True`. 
+ """ + assert len(results) == self.cumulative_sizes[-1], \ + ('Dataset and results have different sizes: ' + f'{self.cumulative_sizes[-1]} v.s. {len(results)}') + + # Check whether all the datasets support evaluation + for dataset in self.datasets: + assert hasattr(dataset, 'evaluate'), \ + f'{type(dataset)} does not implement evaluate function' + + if self.separate_eval: + dataset_idx = -1 + total_eval_results = dict() + for size, dataset in zip(self.cumulative_sizes, self.datasets): + start_idx = 0 if dataset_idx == -1 else \ + self.cumulative_sizes[dataset_idx] + end_idx = self.cumulative_sizes[dataset_idx + 1] + + results_per_dataset = results[start_idx:end_idx] + print_log( + f'\nEvaluateing {dataset.ann_file} with ' + f'{len(results_per_dataset)} images now', + logger=logger) + + eval_results_per_dataset = dataset.evaluate( + results_per_dataset, logger=logger, **kwargs) + dataset_idx += 1 + for k, v in eval_results_per_dataset.items(): + total_eval_results.update({f'{dataset_idx}_{k}': v}) + + return total_eval_results + elif any([isinstance(ds, CocoDataset) for ds in self.datasets]): + raise NotImplementedError( + 'Evaluating concatenated CocoDataset as a whole is not' + ' supported! Please set "separate_eval=True"') + elif len(set([type(ds) for ds in self.datasets])) != 1: + raise NotImplementedError( + 'All the datasets should have same types') + else: + original_data_infos = self.datasets[0].data_infos + self.datasets[0].data_infos = sum( + [dataset.data_infos for dataset in self.datasets], []) + eval_results = self.datasets[0].evaluate( + results, logger=logger, **kwargs) + self.datasets[0].data_infos = original_data_infos + return eval_results + + +@DATASETS.register_module() +class RepeatDataset(object): + """A wrapper of repeated dataset. + + The length of repeated dataset will be `times` larger than the original + dataset. This is useful when the data loading time is long but the dataset + is small. 
Using RepeatDataset can reduce the data loading time between + epochs. + + Args: + dataset (:obj:`Dataset`): The dataset to be repeated. + times (int): Repeat times. + """ + + def __init__(self, dataset, times): + self.dataset = dataset + self.times = times + self.CLASSES = dataset.CLASSES + if hasattr(self.dataset, 'flag'): + self.flag = np.tile(self.dataset.flag, times) + + self._ori_len = len(self.dataset) + + def __getitem__(self, idx): + return self.dataset[idx % self._ori_len] + + def get_cat_ids(self, idx): + """Get category ids of repeat dataset by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. + """ + + return self.dataset.get_cat_ids(idx % self._ori_len) + + def __len__(self): + """Length after repetition.""" + return self.times * self._ori_len + + +# Modified from https://github.com/facebookresearch/detectron2/blob/41d475b75a230221e21d9cac5d69655e3415e3a4/detectron2/data/samplers/distributed_sampler.py#L57 # noqa +@DATASETS.register_module() +class ClassBalancedDataset(object): + """A wrapper of repeated dataset with repeat factor. + + Suitable for training on class imbalanced datasets like LVIS. Following + the sampling strategy in the `paper `_, + in each epoch, an image may appear multiple times based on its + "repeat factor". + The repeat factor for an image is a function of the frequency the rarest + category labeled in that image. The "frequency of category c" in [0, 1] + is defined by the fraction of images in the training set (without repeats) + in which category c appears. + The dataset needs to instantiate :func:`self.get_cat_ids` to support + ClassBalancedDataset. + + The repeat factor is computed as followed. + + 1. For each category c, compute the fraction # of images + that contain it: :math:`f(c)` + 2. For each category c, compute the category-level repeat factor: + :math:`r(c) = max(1, sqrt(t/f(c)))` + 3. 
For each image I, compute the image-level repeat factor: + :math:`r(I) = max_{c in I} r(c)` + + Args: + dataset (:obj:`CustomDataset`): The dataset to be repeated. + oversample_thr (float): frequency threshold below which data is + repeated. For categories with ``f_c >= oversample_thr``, there is + no oversampling. For categories with ``f_c < oversample_thr``, the + degree of oversampling following the square-root inverse frequency + heuristic above. + filter_empty_gt (bool, optional): If set true, images without bounding + boxes will not be oversampled. Otherwise, they will be categorized + as the pure background class and involved into the oversampling. + Default: True. + """ + + def __init__(self, dataset, oversample_thr, filter_empty_gt=True): + self.dataset = dataset + self.oversample_thr = oversample_thr + self.filter_empty_gt = filter_empty_gt + self.CLASSES = dataset.CLASSES + + repeat_factors = self._get_repeat_factors(dataset, oversample_thr) + repeat_indices = [] + for dataset_idx, repeat_factor in enumerate(repeat_factors): + repeat_indices.extend([dataset_idx] * math.ceil(repeat_factor)) + self.repeat_indices = repeat_indices + + flags = [] + if hasattr(self.dataset, 'flag'): + for flag, repeat_factor in zip(self.dataset.flag, repeat_factors): + flags.extend([flag] * int(math.ceil(repeat_factor))) + assert len(flags) == len(repeat_indices) + self.flag = np.asarray(flags, dtype=np.uint8) + + def _get_repeat_factors(self, dataset, repeat_thr): + """Get repeat factor for each images in the dataset. + + Args: + dataset (:obj:`CustomDataset`): The dataset + repeat_thr (float): The threshold of frequency. If an image + contains the categories whose frequency below the threshold, + it would be repeated. + + Returns: + list[float]: The repeat factors for each images in the dataset. + """ + + # 1. 
For each category c, compute the fraction # of images + # that contain it: f(c) + category_freq = defaultdict(int) + num_images = len(dataset) + for idx in range(num_images): + cat_ids = set(self.dataset.get_cat_ids(idx)) + if len(cat_ids) == 0 and not self.filter_empty_gt: + cat_ids = set([len(self.CLASSES)]) + for cat_id in cat_ids: + category_freq[cat_id] += 1 + for k, v in category_freq.items(): + category_freq[k] = v / num_images + + # 2. For each category c, compute the category-level repeat factor: + # r(c) = max(1, sqrt(t/f(c))) + category_repeat = { + cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq)) + for cat_id, cat_freq in category_freq.items() + } + + # 3. For each image I, compute the image-level repeat factor: + # r(I) = max_{c in I} r(c) + repeat_factors = [] + for idx in range(num_images): + cat_ids = set(self.dataset.get_cat_ids(idx)) + if len(cat_ids) == 0 and not self.filter_empty_gt: + cat_ids = set([len(self.CLASSES)]) + repeat_factor = 1 + if len(cat_ids) > 0: + repeat_factor = max( + {category_repeat[cat_id] + for cat_id in cat_ids}) + repeat_factors.append(repeat_factor) + + return repeat_factors + + def __getitem__(self, idx): + ori_index = self.repeat_indices[idx] + return self.dataset[ori_index] + + def __len__(self): + """Length after repetition.""" + return len(self.repeat_indices) diff --git a/insightface/detection/scrfd/mmdet/datasets/deepfashion.py b/insightface/detection/scrfd/mmdet/datasets/deepfashion.py new file mode 100755 index 0000000000000000000000000000000000000000..1125376091f2d4ee6843ae4f2156b3b0453be369 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/deepfashion.py @@ -0,0 +1,10 @@ +from .builder import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class DeepFashionDataset(CocoDataset): + + CLASSES = ('top', 'skirt', 'leggings', 'dress', 'outer', 'pants', 'bag', + 'neckwear', 'headwear', 'eyeglass', 'belt', 'footwear', 'hair', + 'skin', 'face') diff --git 
a/insightface/detection/scrfd/mmdet/datasets/lvis.py b/insightface/detection/scrfd/mmdet/datasets/lvis.py new file mode 100755 index 0000000000000000000000000000000000000000..9f3eba0663a4dcde4432ed128ba7bd31160732d7 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/lvis.py @@ -0,0 +1,744 @@ +import itertools +import logging +import os.path as osp +import tempfile +from collections import OrderedDict + +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable + +from .builder import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class LVISV05Dataset(CocoDataset): + + CLASSES = ( + 'acorn', 'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', + 'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', + 'antenna', 'apple', 'apple_juice', 'applesauce', 'apricot', 'apron', + 'aquarium', 'armband', 'armchair', 'armoire', 'armor', 'artichoke', + 'trash_can', 'ashtray', 'asparagus', 'atomizer', 'avocado', 'award', + 'awning', 'ax', 'baby_buggy', 'basketball_backboard', 'backpack', + 'handbag', 'suitcase', 'bagel', 'bagpipe', 'baguet', 'bait', 'ball', + 'ballet_skirt', 'balloon', 'bamboo', 'banana', 'Band_Aid', 'bandage', + 'bandanna', 'banjo', 'banner', 'barbell', 'barge', 'barrel', + 'barrette', 'barrow', 'baseball_base', 'baseball', 'baseball_bat', + 'baseball_cap', 'baseball_glove', 'basket', 'basketball_hoop', + 'basketball', 'bass_horn', 'bat_(animal)', 'bath_mat', 'bath_towel', + 'bathrobe', 'bathtub', 'batter_(food)', 'battery', 'beachball', 'bead', + 'beaker', 'bean_curd', 'beanbag', 'beanie', 'bear', 'bed', + 'bedspread', 'cow', 'beef_(food)', 'beeper', 'beer_bottle', 'beer_can', + 'beetle', 'bell', 'bell_pepper', 'belt', 'belt_buckle', 'bench', + 'beret', 'bib', 'Bible', 'bicycle', 'visor', 'binder', 'binoculars', + 'bird', 'birdfeeder', 'birdbath', 'birdcage', 'birdhouse', + 'birthday_cake', 'birthday_card', 'biscuit_(bread)', 'pirate_flag', + 'black_sheep', 
'blackboard', 'blanket', 'blazer', 'blender', 'blimp', + 'blinker', 'blueberry', 'boar', 'gameboard', 'boat', 'bobbin', + 'bobby_pin', 'boiled_egg', 'bolo_tie', 'deadbolt', 'bolt', 'bonnet', + 'book', 'book_bag', 'bookcase', 'booklet', 'bookmark', + 'boom_microphone', 'boot', 'bottle', 'bottle_opener', 'bouquet', + 'bow_(weapon)', 'bow_(decorative_ribbons)', 'bow-tie', 'bowl', + 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'bowling_pin', + 'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere', + 'bread-bin', 'breechcloth', 'bridal_gown', 'briefcase', + 'bristle_brush', 'broccoli', 'broach', 'broom', 'brownie', + 'brussels_sprouts', 'bubble_gum', 'bucket', 'horse_buggy', 'bull', + 'bulldog', 'bulldozer', 'bullet_train', 'bulletin_board', + 'bulletproof_vest', 'bullhorn', 'corned_beef', 'bun', 'bunk_bed', + 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butcher_knife', + 'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', + 'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf', + 'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)', + 'can', 'can_opener', 'candelabrum', 'candle', 'candle_holder', + 'candy_bar', 'candy_cane', 'walking_cane', 'canister', 'cannon', + 'canoe', 'cantaloup', 'canteen', 'cap_(headwear)', 'bottle_cap', + 'cape', 'cappuccino', 'car_(automobile)', 'railcar_(part_of_a_train)', + 'elevator_car', 'car_battery', 'identity_card', 'card', 'cardigan', + 'cargo_ship', 'carnation', 'horse_carriage', 'carrot', 'tote_bag', + 'cart', 'carton', 'cash_register', 'casserole', 'cassette', 'cast', + 'cat', 'cauliflower', 'caviar', 'cayenne_(spice)', 'CD_player', + 'celery', 'cellular_telephone', 'chain_mail', 'chair', 'chaise_longue', + 'champagne', 'chandelier', 'chap', 'checkbook', 'checkerboard', + 'cherry', 'chessboard', 'chest_of_drawers_(furniture)', + 'chicken_(animal)', 'chicken_wire', 'chickpea', 'Chihuahua', + 'chili_(vegetable)', 'chime', 'chinaware', 'crisp_(potato_chip)', + 'poker_chip', 
'chocolate_bar', 'chocolate_cake', 'chocolate_milk', + 'chocolate_mousse', 'choker', 'chopping_board', 'chopstick', + 'Christmas_tree', 'slide', 'cider', 'cigar_box', 'cigarette', + 'cigarette_case', 'cistern', 'clarinet', 'clasp', 'cleansing_agent', + 'clementine', 'clip', 'clipboard', 'clock', 'clock_tower', + 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', 'coat', + 'coat_hanger', 'coatrack', 'cock', 'coconut', 'coffee_filter', + 'coffee_maker', 'coffee_table', 'coffeepot', 'coil', 'coin', + 'colander', 'coleslaw', 'coloring_material', 'combination_lock', + 'pacifier', 'comic_book', 'computer_keyboard', 'concrete_mixer', + 'cone', 'control', 'convertible_(automobile)', 'sofa_bed', 'cookie', + 'cookie_jar', 'cooking_utensil', 'cooler_(for_food)', + 'cork_(bottle_plug)', 'corkboard', 'corkscrew', 'edible_corn', + 'cornbread', 'cornet', 'cornice', 'cornmeal', 'corset', + 'romaine_lettuce', 'costume', 'cougar', 'coverall', 'cowbell', + 'cowboy_hat', 'crab_(animal)', 'cracker', 'crape', 'crate', 'crayon', + 'cream_pitcher', 'credit_card', 'crescent_roll', 'crib', 'crock_pot', + 'crossbar', 'crouton', 'crow', 'crown', 'crucifix', 'cruise_ship', + 'police_cruiser', 'crumb', 'crutch', 'cub_(animal)', 'cube', + 'cucumber', 'cufflink', 'cup', 'trophy_cup', 'cupcake', 'hair_curler', + 'curling_iron', 'curtain', 'cushion', 'custard', 'cutting_tool', + 'cylinder', 'cymbal', 'dachshund', 'dagger', 'dartboard', + 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', + 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', 'tux', + 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', + 'dishwasher_detergent', 'diskette', 'dispenser', 'Dixie_cup', 'dog', + 'dog_collar', 'doll', 'dollar', 'dolphin', 'domestic_ass', 'eye_mask', + 'doorbell', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly', + 'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit', + 'dresser', 'drill', 'drinking_fountain', 'drone', 'dropper', + 
'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling', + 'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan', + 'Dutch_oven', 'eagle', 'earphone', 'earplug', 'earring', 'easel', + 'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater', + 'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk', + 'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan', + 'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)', + 'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', 'fire_alarm', + 'fire_engine', 'fire_extinguisher', 'fire_hose', 'fireplace', + 'fireplug', 'fish', 'fish_(food)', 'fishbowl', 'fishing_boat', + 'fishing_rod', 'flag', 'flagpole', 'flamingo', 'flannel', 'flash', + 'flashlight', 'fleece', 'flip-flop_(sandal)', 'flipper_(footwear)', + 'flower_arrangement', 'flute_glass', 'foal', 'folding_chair', + 'food_processor', 'football_(American)', 'football_helmet', + 'footstool', 'fork', 'forklift', 'freight_car', 'French_toast', + 'freshener', 'frisbee', 'frog', 'fruit_juice', 'fruit_salad', + 'frying_pan', 'fudge', 'funnel', 'futon', 'gag', 'garbage', + 'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', 'garlic', + 'gasmask', 'gazelle', 'gelatin', 'gemstone', 'giant_panda', + 'gift_wrap', 'ginger', 'giraffe', 'cincture', + 'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles', + 'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose', + 'gorilla', 'gourd', 'surgical_gown', 'grape', 'grasshopper', 'grater', + 'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle', + 'grillroom', 'grinder_(tool)', 'grits', 'grizzly', 'grocery_bag', + 'guacamole', 'guitar', 'gull', 'gun', 'hair_spray', 'hairbrush', + 'hairnet', 'hairpin', 'ham', 'hamburger', 'hammer', 'hammock', + 'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel', + 'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw', + 'hardback_book', 'harmonium', 'hat', 'hatbox', 'hatch', 'veil', + 'headband', 'headboard', 
'headlight', 'headscarf', 'headset', + 'headstall_(for_horses)', 'hearing_aid', 'heart', 'heater', + 'helicopter', 'helmet', 'heron', 'highchair', 'hinge', 'hippopotamus', + 'hockey_stick', 'hog', 'home_plate_(baseball)', 'honey', 'fume_hood', + 'hook', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', + 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', + 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', + 'ice_tea', 'igniter', 'incense', 'inhaler', 'iPod', + 'iron_(for_clothing)', 'ironing_board', 'jacket', 'jam', 'jean', + 'jeep', 'jelly_bean', 'jersey', 'jet_plane', 'jewelry', 'joystick', + 'jumpsuit', 'kayak', 'keg', 'kennel', 'kettle', 'key', 'keycard', + 'kilt', 'kimono', 'kitchen_sink', 'kitchen_table', 'kite', 'kitten', + 'kiwi_fruit', 'knee_pad', 'knife', 'knight_(chess_piece)', + 'knitting_needle', 'knob', 'knocker_(on_a_door)', 'koala', 'lab_coat', + 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', 'lamb-chop', 'lamp', + 'lamppost', 'lampshade', 'lantern', 'lanyard', 'laptop_computer', + 'lasagna', 'latch', 'lawn_mower', 'leather', 'legging_(clothing)', + 'Lego', 'lemon', 'lemonade', 'lettuce', 'license_plate', 'life_buoy', + 'life_jacket', 'lightbulb', 'lightning_rod', 'lime', 'limousine', + 'linen_paper', 'lion', 'lip_balm', 'lipstick', 'liquor', 'lizard', + 'Loafer_(type_of_shoe)', 'log', 'lollipop', 'lotion', + 'speaker_(stero_equipment)', 'loveseat', 'machine_gun', 'magazine', + 'magnet', 'mail_slot', 'mailbox_(at_home)', 'mallet', 'mammoth', + 'mandarin_orange', 'manger', 'manhole', 'map', 'marker', 'martini', + 'mascot', 'mashed_potato', 'masher', 'mask', 'mast', + 'mat_(gym_equipment)', 'matchbox', 'mattress', 'measuring_cup', + 'measuring_stick', 'meatball', 'medicine', 'melon', 'microphone', + 'microscope', 'microwave_oven', 'milestone', 'milk', 'minivan', + 'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', 'money', + 'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor', + 
'motor_scooter', 'motor_vehicle', 'motorboat', 'motorcycle', + 'mound_(baseball)', 'mouse_(animal_rodent)', + 'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom', + 'music_stool', 'musical_instrument', 'nailfile', 'nameplate', 'napkin', + 'neckerchief', 'necklace', 'necktie', 'needle', 'nest', 'newsstand', + 'nightshirt', 'nosebag_(for_animals)', 'noseband_(for_animals)', + 'notebook', 'notepad', 'nut', 'nutcracker', 'oar', 'octopus_(food)', + 'octopus_(animal)', 'oil_lamp', 'olive_oil', 'omelet', 'onion', + 'orange_(fruit)', 'orange_juice', 'oregano', 'ostrich', 'ottoman', + 'overalls_(clothing)', 'owl', 'packet', 'inkpad', 'pad', 'paddle', + 'padlock', 'paintbox', 'paintbrush', 'painting', 'pajamas', 'palette', + 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', 'pantyhose', + 'papaya', 'paperclip', 'paper_plate', 'paper_towel', 'paperback_book', + 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', + 'parchment', 'parka', 'parking_meter', 'parrot', + 'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport', + 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', + 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'pegboard', + 'pelican', 'pen', 'pencil', 'pencil_box', 'pencil_sharpener', + 'pendulum', 'penguin', 'pennant', 'penny_(coin)', 'pepper', + 'pepper_mill', 'perfume', 'persimmon', 'baby', 'pet', 'petfood', + 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', + 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', + 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', + 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', + 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', + 'plate', 'platter', 'playing_card', 'playpen', 'pliers', + 'plow_(farm_equipment)', 'pocket_watch', 'pocketknife', + 'poker_(fire_stirring_tool)', 'pole', 'police_van', 'polo_shirt', + 'poncho', 'pony', 'pool_table', 'pop_(soda)', 'portrait', + 'postbox_(public)', 
'postcard', 'poster', 'pot', 'flowerpot', 'potato', + 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', 'printer', + 'projectile_(weapon)', 'projector', 'propeller', 'prune', 'pudding', + 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher', 'puppet', + 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit', 'race_car', + 'racket', 'radar', 'radiator', 'radio_receiver', 'radish', 'raft', + 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat', + 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', + 'recliner', 'record_player', 'red_cabbage', 'reflector', + 'remote_control', 'rhinoceros', 'rib_(food)', 'rifle', 'ring', + 'river_boat', 'road_map', 'robe', 'rocking_chair', 'roller_skate', + 'Rollerblade', 'rolling_pin', 'root_beer', + 'router_(computer_equipment)', 'rubber_band', 'runner_(carpet)', + 'plastic_bag', 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', + 'safety_pin', 'sail', 'salad', 'salad_plate', 'salami', + 'salmon_(fish)', 'salmon_(food)', 'salsa', 'saltshaker', + 'sandal_(type_of_shoe)', 'sandwich', 'satchel', 'saucepan', 'saucer', + 'sausage', 'sawhorse', 'saxophone', 'scale_(measuring_instrument)', + 'scarecrow', 'scarf', 'school_bus', 'scissors', 'scoreboard', + 'scrambled_eggs', 'scraper', 'scratcher', 'screwdriver', + 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', + 'seashell', 'seedling', 'serving_dish', 'sewing_machine', 'shaker', + 'shampoo', 'shark', 'sharpener', 'Sharpie', 'shaver_(electric)', + 'shaving_cream', 'shawl', 'shears', 'sheep', 'shepherd_dog', + 'sherbert', 'shield', 'shirt', 'shoe', 'shopping_bag', 'shopping_cart', + 'short_pants', 'shot_glass', 'shoulder_bag', 'shovel', 'shower_head', + 'shower_curtain', 'shredder_(for_paper)', 'sieve', 'signboard', 'silo', + 'sink', 'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', + 'ski_pole', 'skirt', 'sled', 'sleeping_bag', 'sling_(bandage)', + 'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman', + 'snowmobile', 
'soap', 'soccer_ball', 'sock', 'soda_fountain', + 'carbonated_water', 'sofa', 'softball', 'solar_array', 'sombrero', + 'soup', 'soup_bowl', 'soupspoon', 'sour_cream', 'soya_milk', + 'space_shuttle', 'sparkler_(fireworks)', 'spatula', 'spear', + 'spectacles', 'spice_rack', 'spider', 'sponge', 'spoon', 'sportswear', + 'spotlight', 'squirrel', 'stapler_(stapling_machine)', 'starfish', + 'statue_(sculpture)', 'steak_(food)', 'steak_knife', + 'steamer_(kitchen_appliance)', 'steering_wheel', 'stencil', + 'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', 'stirrer', + 'stirrup', 'stockings_(leg_wear)', 'stool', 'stop_sign', 'brake_light', + 'stove', 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry', + 'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer', + 'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', + 'sunglasses', 'sunhat', 'sunscreen', 'surfboard', 'sushi', 'mop', + 'sweat_pants', 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato', + 'swimsuit', 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table', + 'table', 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', + 'taillight', 'tambourine', 'army_tank', 'tank_(storage_vessel)', + 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', + 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', + 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', + 'telephone_pole', 'telephoto_lens', 'television_camera', + 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', + 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', + 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', 'tinfoil', + 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', 'toaster_oven', + 'toilet', 'toilet_tissue', 'tomato', 'tongs', 'toolbox', 'toothbrush', + 'toothpaste', 'toothpick', 'cover', 'tortilla', 'tow_truck', 'towel', + 'towel_rack', 'toy', 'tractor_(farm_equipment)', 'traffic_light', + 'dirt_bike', 'trailer_truck', 
'train_(railroad_vehicle)', 'trampoline', + 'tray', 'tree_house', 'trench_coat', 'triangle_(musical_instrument)', + 'tricycle', 'tripod', 'trousers', 'truck', 'truffle_(chocolate)', + 'trunk', 'vat', 'turban', 'turkey_(bird)', 'turkey_(food)', 'turnip', + 'turtle', 'turtleneck_(clothing)', 'typewriter', 'umbrella', + 'underwear', 'unicycle', 'urinal', 'urn', 'vacuum_cleaner', 'valve', + 'vase', 'vending_machine', 'vent', 'videotape', 'vinegar', 'violin', + 'vodka', 'volleyball', 'vulture', 'waffle', 'waffle_iron', 'wagon', + 'wagon_wheel', 'walking_stick', 'wall_clock', 'wall_socket', 'wallet', + 'walrus', 'wardrobe', 'wasabi', 'automatic_washer', 'watch', + 'water_bottle', 'water_cooler', 'water_faucet', 'water_filter', + 'water_heater', 'water_jug', 'water_gun', 'water_scooter', 'water_ski', + 'water_tower', 'watering_can', 'watermelon', 'weathervane', 'webcam', + 'wedding_cake', 'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', + 'whipped_cream', 'whiskey', 'whistle', 'wick', 'wig', 'wind_chime', + 'windmill', 'window_box_(for_plants)', 'windshield_wiper', 'windsock', + 'wine_bottle', 'wine_bucket', 'wineglass', 'wing_chair', + 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon', 'wreath', + 'wrench', 'wristband', 'wristlet', 'yacht', 'yak', 'yogurt', + 'yoke_(animal_equipment)', 'zebra', 'zucchini') + + def load_annotations(self, ann_file): + """Load annotation from lvis style annotation file. + + Args: + ann_file (str): Path of annotation file. + + Returns: + list[dict]: Annotation info from LVIS api. + """ + + try: + import lvis + assert lvis.__version__ >= '10.5.3' + from lvis import LVIS + except AssertionError: + raise AssertionError('Incompatible version of lvis is installed. ' + 'Run pip uninstall lvis first. Then run pip ' + 'install mmlvis to install open-mmlab forked ' + 'lvis. ') + except ImportError: + raise ImportError('Package lvis is not installed. 
Please run pip ' + 'install mmlvis to install open-mmlab forked ' + 'lvis.') + self.coco = LVIS(ann_file) + assert not self.custom_classes, 'LVIS custom classes is not supported' + self.cat_ids = self.coco.get_cat_ids() + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.img_ids = self.coco.get_img_ids() + data_infos = [] + for i in self.img_ids: + info = self.coco.load_imgs([i])[0] + if info['file_name'].startswith('COCO'): + # Convert form the COCO 2014 file naming convention of + # COCO_[train/val/test]2014_000000000000.jpg to the 2017 + # naming convention of 000000000000.jpg + # (LVIS v1 will fix this naming issue) + info['filename'] = info['file_name'][-16:] + else: + info['filename'] = info['file_name'] + data_infos.append(info) + return data_infos + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + classwise=False, + proposal_nums=(100, 300, 1000), + iou_thrs=np.arange(0.5, 0.96, 0.05)): + """Evaluation in LVIS protocol. + + Args: + results (list[list | tuple]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. Options are + 'bbox', 'segm', 'proposal', 'proposal_fast'. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): + classwise (bool): Whether to evaluating the AP for each class. + proposal_nums (Sequence[int]): Proposal number used for evaluating + recalls, such as recall@100, recall@1000. + Default: (100, 300, 1000). + iou_thrs (Sequence[float]): IoU threshold used for evaluating + recalls. If set to a list, the average recall of all IoUs will + also be computed. Default: 0.5. + + Returns: + dict[str, float]: LVIS style metrics. + """ + + try: + import lvis + assert lvis.__version__ >= '10.5.3' + from lvis import LVISResults, LVISEval + except AssertionError: + raise AssertionError('Incompatible version of lvis is installed. 
' + 'Run pip uninstall lvis first. Then run pip ' + 'install mmlvis to install open-mmlab forked ' + 'lvis. ') + except ImportError: + raise ImportError('Package lvis is not installed. Please run pip ' + 'install mmlvis to install open-mmlab forked ' + 'lvis.') + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError('metric {} is not supported'.format(metric)) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + result_files = self.results2json(results, jsonfile_prefix) + + eval_results = OrderedDict() + # get original api + lvis_gt = self.coco + for metric in metrics: + msg = 'Evaluating {}...'.format(metric) + if logger is None: + msg = '\n' + msg + print_log(msg, logger=logger) + + if metric == 'proposal_fast': + ar = self.fast_eval_recall( + results, proposal_nums, iou_thrs, logger='silent') + log_msg = [] + for i, num in enumerate(proposal_nums): + eval_results['AR@{}'.format(num)] = ar[i] + log_msg.append('\nAR@{}\t{:.4f}'.format(num, ar[i])) + log_msg = ''.join(log_msg) + print_log(log_msg, logger=logger) + continue + + if metric not in result_files: + raise KeyError('{} is not in results'.format(metric)) + try: + lvis_dt = LVISResults(lvis_gt, result_files[metric]) + except IndexError: + print_log( + 'The testing results of the whole dataset is empty.', + logger=logger, + level=logging.ERROR) + break + + iou_type = 'bbox' if metric == 'proposal' else metric + lvis_eval = LVISEval(lvis_gt, lvis_dt, iou_type) + lvis_eval.params.imgIds = self.img_ids + if metric == 'proposal': + lvis_eval.params.useCats = 0 + 
lvis_eval.params.maxDets = list(proposal_nums) + lvis_eval.evaluate() + lvis_eval.accumulate() + lvis_eval.summarize() + for k, v in lvis_eval.get_results().items(): + if k.startswith('AR'): + val = float('{:.3f}'.format(float(v))) + eval_results[k] = val + else: + lvis_eval.evaluate() + lvis_eval.accumulate() + lvis_eval.summarize() + lvis_results = lvis_eval.get_results() + if classwise: # Compute per-category AP + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/ + precisions = lvis_eval.eval['precision'] + # precision: (iou, recall, cls, area range, max dets) + assert len(self.cat_ids) == precisions.shape[2] + + results_per_category = [] + for idx, catId in enumerate(self.cat_ids): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = self.coco.load_cats(catId)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + results_per_category.append( + (f'{nm["name"]}', f'{float(ap):0.3f}')) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list( + itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest(*[ + results_flatten[i::num_columns] + for i in range(num_columns) + ]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + print_log('\n' + table.table, logger=logger) + + for k, v in lvis_results.items(): + if k.startswith('AP'): + key = '{}_{}'.format(metric, k) + val = float('{:.3f}'.format(float(v))) + eval_results[key] = val + ap_summary = ' '.join([ + '{}:{:.3f}'.format(k, float(v)) + for k, v in lvis_results.items() if k.startswith('AP') + ]) + eval_results['{}_mAP_copypaste'.format(metric)] = ap_summary + lvis_eval.print_results() + if tmp_dir is not None: + tmp_dir.cleanup() + return eval_results + + +LVISDataset = 
LVISV05Dataset +DATASETS.register_module(name='LVISDataset', module=LVISDataset) + + +@DATASETS.register_module() +class LVISV1Dataset(LVISDataset): + + CLASSES = ( + 'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', 'alcohol', + 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', 'antenna', + 'apple', 'applesauce', 'apricot', 'apron', 'aquarium', + 'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor', + 'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer', + 'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy', + 'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel', + 'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon', + 'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo', + 'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow', + 'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap', + 'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)', + 'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)', + 'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie', + 'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper', + 'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', + 'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', + 'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath', + 'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card', + 'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket', + 'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry', + 'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg', + 'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase', + 'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle', + 'bottle_opener', 'bouquet', 'bow_(weapon)', 'bow_(decorative_ribbons)', + 'bow-tie', 'bowl', 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'box', + 'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere', + 
'bread-bin', 'bread', 'breechcloth', 'bridal_gown', 'briefcase', + 'broccoli', 'broach', 'broom', 'brownie', 'brussels_sprouts', + 'bubble_gum', 'bucket', 'horse_buggy', 'bull', 'bulldog', 'bulldozer', + 'bullet_train', 'bulletin_board', 'bulletproof_vest', 'bullhorn', + 'bun', 'bunk_bed', 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', + 'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', + 'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf', + 'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)', + 'can', 'can_opener', 'candle', 'candle_holder', 'candy_bar', + 'candy_cane', 'walking_cane', 'canister', 'canoe', 'cantaloup', + 'canteen', 'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino', + 'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car', + 'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship', + 'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton', + 'cash_register', 'casserole', 'cassette', 'cast', 'cat', 'cauliflower', + 'cayenne_(spice)', 'CD_player', 'celery', 'cellular_telephone', + 'chain_mail', 'chair', 'chaise_longue', 'chalice', 'chandelier', + 'chap', 'checkbook', 'checkerboard', 'cherry', 'chessboard', + 'chicken_(animal)', 'chickpea', 'chili_(vegetable)', 'chime', + 'chinaware', 'crisp_(potato_chip)', 'poker_chip', 'chocolate_bar', + 'chocolate_cake', 'chocolate_milk', 'chocolate_mousse', 'choker', + 'chopping_board', 'chopstick', 'Christmas_tree', 'slide', 'cider', + 'cigar_box', 'cigarette', 'cigarette_case', 'cistern', 'clarinet', + 'clasp', 'cleansing_agent', 'cleat_(for_securing_rope)', 'clementine', + 'clip', 'clipboard', 'clippers_(for_plants)', 'cloak', 'clock', + 'clock_tower', 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', + 'coat', 'coat_hanger', 'coatrack', 'cock', 'cockroach', + 'cocoa_(beverage)', 'coconut', 'coffee_maker', 'coffee_table', + 'coffeepot', 'coil', 'coin', 'colander', 'coleslaw', + 'coloring_material', 'combination_lock', 
'pacifier', 'comic_book', + 'compass', 'computer_keyboard', 'condiment', 'cone', 'control', + 'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie', + 'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)', + 'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet', + 'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall', + 'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker', + 'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib', + 'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown', + 'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch', + 'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup', + 'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain', + 'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard', + 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', + 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', 'tux', + 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', + 'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup', + 'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin', + 'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly', + 'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit', + 'dresser', 'drill', 'drone', 'dropper', 'drum_(musical_instrument)', + 'drumstick', 'duck', 'duckling', 'duct_tape', 'duffel_bag', 'dumbbell', + 'dumpster', 'dustpan', 'eagle', 'earphone', 'earplug', 'earring', + 'easel', 'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater', + 'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk', + 'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan', + 'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)', + 'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', 'fire_alarm', + 'fire_engine', 'fire_extinguisher', 'fire_hose', 'fireplace', + 'fireplug', 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl', + 
'fishing_rod', 'flag', 'flagpole', 'flamingo', 'flannel', 'flap', + 'flash', 'flashlight', 'fleece', 'flip-flop_(sandal)', + 'flipper_(footwear)', 'flower_arrangement', 'flute_glass', 'foal', + 'folding_chair', 'food_processor', 'football_(American)', + 'football_helmet', 'footstool', 'fork', 'forklift', 'freight_car', + 'French_toast', 'freshener', 'frisbee', 'frog', 'fruit_juice', + 'frying_pan', 'fudge', 'funnel', 'futon', 'gag', 'garbage', + 'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', 'garlic', + 'gasmask', 'gazelle', 'gelatin', 'gemstone', 'generator', + 'giant_panda', 'gift_wrap', 'ginger', 'giraffe', 'cincture', + 'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles', + 'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose', + 'gorilla', 'gourd', 'grape', 'grater', 'gravestone', 'gravy_boat', + 'green_bean', 'green_onion', 'griddle', 'grill', 'grits', 'grizzly', + 'grocery_bag', 'guitar', 'gull', 'gun', 'hairbrush', 'hairnet', + 'hairpin', 'halter_top', 'ham', 'hamburger', 'hammer', 'hammock', + 'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel', + 'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw', + 'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', 'headband', + 'headboard', 'headlight', 'headscarf', 'headset', + 'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet', + 'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog', + 'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah', + 'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', + 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', + 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', + 'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board', + 'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey', + 'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak', + 'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono', + 'kitchen_sink', 
'kitchen_table', 'kite', 'kitten', 'kiwi_fruit', + 'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)', + 'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', + 'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard', + 'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather', + 'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', 'lettuce', + 'license_plate', 'life_buoy', 'life_jacket', 'lightbulb', + 'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor', + 'lizard', 'log', 'lollipop', 'speaker_(stero_equipment)', 'loveseat', + 'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)', + 'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', 'manger', + 'manhole', 'map', 'marker', 'martini', 'mascot', 'mashed_potato', + 'masher', 'mask', 'mast', 'mat_(gym_equipment)', 'matchbox', + 'mattress', 'measuring_cup', 'measuring_stick', 'meatball', 'medicine', + 'melon', 'microphone', 'microscope', 'microwave_oven', 'milestone', + 'milk', 'milk_can', 'milkshake', 'minivan', 'mint_candy', 'mirror', + 'mitten', 'mixer_(kitchen_tool)', 'money', + 'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor', + 'motor_scooter', 'motor_vehicle', 'motorcycle', 'mound_(baseball)', + 'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom', + 'music_stool', 'musical_instrument', 'nailfile', 'napkin', + 'neckerchief', 'necklace', 'necktie', 'needle', 'nest', 'newspaper', + 'newsstand', 'nightshirt', 'nosebag_(for_animals)', + 'noseband_(for_animals)', 'notebook', 'notepad', 'nut', 'nutcracker', + 'oar', 'octopus_(food)', 'octopus_(animal)', 'oil_lamp', 'olive_oil', + 'omelet', 'onion', 'orange_(fruit)', 'orange_juice', 'ostrich', + 'ottoman', 'oven', 'overalls_(clothing)', 'owl', 'packet', 'inkpad', + 'pad', 'paddle', 'padlock', 'paintbrush', 'painting', 'pajamas', + 'palette', 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', + 'pantyhose', 'papaya', 'paper_plate', 
'paper_towel', 'paperback_book', + 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', 'parasol', + 'parchment', 'parka', 'parking_meter', 'parrot', + 'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport', + 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', + 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg', + 'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box', + 'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)', + 'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet', + 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', + 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', + 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', + 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', + 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', + 'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)', + 'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)', + 'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)', + 'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', 'potato', + 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', 'pretzel', + 'printer', 'projectile_(weapon)', 'projector', 'propeller', 'prune', + 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher', + 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit', + 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', 'radish', + 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat', + 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', + 'recliner', 'record_player', 'reflector', 'remote_control', + 'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map', + 'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade', + 'rolling_pin', 'root_beer', 'router_(computer_equipment)', + 'rubber_band', 'runner_(carpet)', 'plastic_bag', + 
'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin', + 'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)', + 'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)', + 'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse', + 'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf', + 'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver', + 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', + 'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark', + 'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl', + 'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt', + 'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass', + 'shoulder_bag', 'shovel', 'shower_head', 'shower_cap', + 'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink', + 'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole', + 'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)', + 'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman', + 'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball', + 'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon', + 'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)', + 'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish', + 'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)', + 'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish', + 'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel', + 'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', 'stirrer', + 'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', 'strainer', + 'strap', 'straw_(for_drinking)', 'strawberry', 'street_sign', + 'streetlight', 'string_cheese', 'stylus', 'subwoofer', 'sugar_bowl', + 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', 'sunglasses', + 'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', 'sweatband', + 'sweater', 
'sweatshirt', 'sweet_potato', 'swimsuit', 'sword', + 'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table', + 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight', + 'tambourine', 'army_tank', 'tank_(storage_vessel)', + 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', + 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', + 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', + 'telephone_pole', 'telephoto_lens', 'television_camera', + 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', + 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', + 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', 'tinfoil', + 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', 'toaster_oven', + 'toilet', 'toilet_tissue', 'tomato', 'tongs', 'toolbox', 'toothbrush', + 'toothpaste', 'toothpick', 'cover', 'tortilla', 'tow_truck', 'towel', + 'towel_rack', 'toy', 'tractor_(farm_equipment)', 'traffic_light', + 'dirt_bike', 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', + 'tray', 'trench_coat', 'triangle_(musical_instrument)', 'tricycle', + 'tripod', 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat', + 'turban', 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)', + 'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn', + 'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest', + 'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture', + 'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick', + 'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe', + 'washbasin', 'automatic_washer', 'watch', 'water_bottle', + 'water_cooler', 'water_faucet', 'water_heater', 'water_jug', + 'water_gun', 'water_scooter', 'water_ski', 'water_tower', + 'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake', + 'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream', + 'whistle', 'wig', 'wind_chime', 'windmill', 
'window_box_(for_plants)', + 'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket', + 'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon', + 'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt', + 'yoke_(animal_equipment)', 'zebra', 'zucchini') + + def load_annotations(self, ann_file): + try: + import lvis + assert lvis.__version__ >= '10.5.3' + from lvis import LVIS + except AssertionError: + raise AssertionError('Incompatible version of lvis is installed. ' + 'Run pip uninstall lvis first. Then run pip ' + 'install mmlvis to install open-mmlab forked ' + 'lvis. ') + except ImportError: + raise ImportError('Package lvis is not installed. Please run pip ' + 'install mmlvis to install open-mmlab forked ' + 'lvis.') + self.coco = LVIS(ann_file) + assert not self.custom_classes, 'LVIS custom classes is not supported' + self.cat_ids = self.coco.get_cat_ids() + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.img_ids = self.coco.get_img_ids() + data_infos = [] + for i in self.img_ids: + info = self.coco.load_imgs([i])[0] + # coco_url is used in LVISv1 instead of file_name + # e.g. 
http://images.cocodataset.org/train2017/000000391895.jpg + # train/val split in specified in url + info['filename'] = info['coco_url'].replace( + 'http://images.cocodataset.org/', '') + data_infos.append(info) + return data_infos diff --git a/insightface/detection/scrfd/mmdet/datasets/pipelines/__init__.py b/insightface/detection/scrfd/mmdet/datasets/pipelines/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..ccaabe4bc22142b468cb4cf35bc920ce4f9efed3 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/pipelines/__init__.py @@ -0,0 +1,27 @@ +from .auto_augment import (AutoAugment, BrightnessTransform, ColorTransform, + ContrastTransform, EqualizeTransform, Rotate, Shear, + Translate) +from .compose import Compose +from .formating import (Collect, DefaultFormatBundle, ImageToTensor, + ToDataContainer, ToTensor, Transpose, to_tensor) +from .instaboost import InstaBoost +from .loading import (LoadAnnotations, LoadImageFromFile, LoadImageFromWebcam, + LoadMultiChannelImageFromFiles, LoadProposals) +from .test_time_aug import MultiScaleFlipAug +from .transforms import (Albu, CutOut, Expand, MinIoURandomCrop, Normalize, + Pad, PhotoMetricDistortion, RandomCenterCropPad, + RandomSquareCrop, + RandomCrop, RandomFlip, Resize, SegRescale) + +__all__ = [ + 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', + 'Transpose', 'Collect', 'DefaultFormatBundle', 'LoadAnnotations', + 'LoadImageFromFile', 'LoadImageFromWebcam', + 'LoadMultiChannelImageFromFiles', 'LoadProposals', 'MultiScaleFlipAug', + 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 'Normalize', 'SegRescale', + 'RandomSquareCrop', + 'MinIoURandomCrop', 'Expand', 'PhotoMetricDistortion', 'Albu', + 'InstaBoost', 'RandomCenterCropPad', 'AutoAugment', 'CutOut', 'Shear', + 'Rotate', 'ColorTransform', 'EqualizeTransform', 'BrightnessTransform', + 'ContrastTransform', 'Translate' +] diff --git a/insightface/detection/scrfd/mmdet/datasets/pipelines/auto_augment.py 
import copy

import cv2
import mmcv
import numpy as np

from ..builder import PIPELINES
from .compose import Compose

# Augmentation "level" values are interpreted on a fixed 0..10 scale.
_MAX_LEVEL = 10


def level_to_value(level, max_value):
    """Linearly map ``level`` from [0, _MAX_LEVEL] onto [0, max_value]."""
    return (level / _MAX_LEVEL) * max_value


def enhance_level_to_value(level, a=1.8, b=0.1):
    """Map ``level`` onto an enhancement factor in [b, a + b]."""
    return (level / _MAX_LEVEL) * a + b


def random_negative(value, random_negative_prob):
    """Negate ``value`` with probability ``random_negative_prob``."""
    if np.random.rand() < random_negative_prob:
        return -value
    return value


def bbox2fields():
    """Return the key mappings from bbox fields to their corresponding
    label, mask and segmentation fields."""
    label_map = {
        'gt_bboxes': 'gt_labels',
        'gt_bboxes_ignore': 'gt_labels_ignore'
    }
    mask_map = {
        'gt_bboxes': 'gt_masks',
        'gt_bboxes_ignore': 'gt_masks_ignore'
    }
    seg_map = {'gt_bboxes': 'gt_semantic_seg'}
    return label_map, mask_map, seg_map


@PIPELINES.register_module()
class AutoAugment(object):
    """Auto augmentation.

    This data augmentation is proposed in `Learning Data Augmentation
    Strategies for Object Detection `_.

    Each entry of ``policies`` is itself a list of augmentation configs
    (dicts with a ``type`` key); on every call one policy is sampled
    uniformly at random and applied to the input results dict.

    Args:
        policies (list[list[dict]]): The candidate augmentation policies.
    """

    def __init__(self, policies):
        # Validate the nested policy structure up front so that a bad
        # config fails at construction time, not mid-training.
        assert isinstance(policies, list) and len(policies) > 0, \
            'Policies must be a non-empty list.'
        for stage in policies:
            assert isinstance(stage, list) and len(stage) > 0, \
                'Each policy in policies must be a non-empty list.'
            for augment in stage:
                assert isinstance(augment, dict) and 'type' in augment, \
                    'Each specific augmentation must be a dict with key' \
                    ' "type".'

        # Deep-copy so later mutation of the caller's config cannot
        # change the composed transforms.
        self.policies = copy.deepcopy(policies)
        self.transforms = [Compose(stage) for stage in self.policies]

    def __call__(self, results):
        """Apply one randomly chosen policy to ``results``."""
        chosen = np.random.choice(self.transforms)
        return chosen(results)

    def __repr__(self):
        return f'{self.__class__.__name__}(policies={self.policies})'
@PIPELINES.register_module()
class Shear(object):
    """Apply Shear Transformation to image (and its corresponding bbox, mask,
    segmentation).

    Args:
        level (int | float): The level should be in range [0,_MAX_LEVEL].
        img_fill_val (int | float | tuple): The filled values for image border.
            If float, the same fill value will be used for all the three
            channels of image. If tuple, the should be 3 elements.
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equals ``ignore_label`` in ``semantic_head``
            of the corresponding config. Default 255.
        prob (float): The probability for performing Shear and should be in
            range [0, 1].
        direction (str): The direction for shear, either "horizontal"
            or "vertical".
        max_shear_magnitude (float): The maximum magnitude for Shear
            transformation.
        random_negative_prob (float): The probability that turns the
            offset negative. Should be in range [0,1]
        interpolation (str): Same as in :func:`mmcv.imshear`.
    """

    def __init__(self,
                 level,
                 img_fill_val=128,
                 seg_ignore_label=255,
                 prob=0.5,
                 direction='horizontal',
                 max_shear_magnitude=0.3,
                 random_negative_prob=0.5,
                 interpolation='bilinear'):
        assert isinstance(level, (int, float)), 'The level must be type ' \
            f'int or float, got {type(level)}.'
        assert 0 <= level <= _MAX_LEVEL, 'The level should be in range ' \
            f'[0,{_MAX_LEVEL}], got {level}.'
        # Normalize the fill value to a 3-float tuple (one per channel).
        if isinstance(img_fill_val, (float, int)):
            img_fill_val = tuple([float(img_fill_val)] * 3)
        elif isinstance(img_fill_val, tuple):
            assert len(img_fill_val) == 3, 'img_fill_val as tuple must ' \
                f'have 3 elements. got {len(img_fill_val)}.'
            img_fill_val = tuple([float(val) for val in img_fill_val])
        else:
            raise ValueError(
                'img_fill_val must be float or tuple with 3 elements.')
        assert np.all([0 <= val <= 255 for val in img_fill_val]), 'all ' \
            'elements of img_fill_val should between range [0,255].' \
            f'got {img_fill_val}.'
        assert 0 <= prob <= 1.0, 'The probability of shear should be in ' \
            f'range [0,1]. got {prob}.'
        assert direction in ('horizontal', 'vertical'), 'direction must ' \
            f'in be either "horizontal" or "vertical". got {direction}.'
        assert isinstance(max_shear_magnitude, float), 'max_shear_magnitude ' \
            f'should be type float. got {type(max_shear_magnitude)}.'
        assert 0. <= max_shear_magnitude <= 1., 'Defaultly ' \
            'max_shear_magnitude should be in range [0,1]. ' \
            f'got {max_shear_magnitude}.'
        self.level = level
        # Shear magnitude is the level scaled into [0, max_shear_magnitude].
        self.magnitude = level_to_value(level, max_shear_magnitude)
        self.img_fill_val = img_fill_val
        self.seg_ignore_label = seg_ignore_label
        self.prob = prob
        self.direction = direction
        self.max_shear_magnitude = max_shear_magnitude
        self.random_negative_prob = random_negative_prob
        self.interpolation = interpolation

    def _shear_img(self,
                   results,
                   magnitude,
                   direction='horizontal',
                   interpolation='bilinear'):
        """Shear the image.

        Args:
            results (dict): Result dict from loading pipeline.
            magnitude (int | float): The magnitude used for shear.
            direction (str): The direction for shear, either "horizontal"
                or "vertical".
            interpolation (str): Same as in :func:`mmcv.imshear`.
        """
        for key in results.get('img_fields', ['img']):
            img = results[key]
            img_sheared = mmcv.imshear(
                img,
                magnitude,
                direction,
                border_value=self.img_fill_val,
                interpolation=interpolation)
            # Preserve the input dtype (mmcv may promote internally).
            results[key] = img_sheared.astype(img.dtype)

    def _shear_bboxes(self, results, magnitude):
        """Shear the bboxes."""
        h, w, c = results['img_shape']
        if self.direction == 'horizontal':
            shear_matrix = np.stack([[1, magnitude],
                                     [0, 1]]).astype(np.float32)  # [2, 2]
        else:
            shear_matrix = np.stack([[1, 0], [magnitude,
                                              1]]).astype(np.float32)
        for key in results.get('bbox_fields', []):
            min_x, min_y, max_x, max_y = np.split(
                results[key], results[key].shape[-1], axis=-1)
            # Shear all four box corners, then take the axis-aligned
            # envelope of the transformed corners as the new box.
            coordinates = np.stack([[min_x, min_y], [max_x, min_y],
                                    [min_x, max_y],
                                    [max_x, max_y]])  # [4, 2, nb_box, 1]
            coordinates = coordinates[..., 0].transpose(
                (2, 1, 0)).astype(np.float32)  # [nb_box, 2, 4]
            new_coords = np.matmul(shear_matrix[None, :, :],
                                   coordinates)  # [nb_box, 2, 4]
            min_x = np.min(new_coords[:, 0, :], axis=-1)
            min_y = np.min(new_coords[:, 1, :], axis=-1)
            max_x = np.max(new_coords[:, 0, :], axis=-1)
            max_y = np.max(new_coords[:, 1, :], axis=-1)
            # Clip back into the image; max is clipped against min so the
            # resulting box can never be inverted.
            min_x = np.clip(min_x, a_min=0, a_max=w)
            min_y = np.clip(min_y, a_min=0, a_max=h)
            max_x = np.clip(max_x, a_min=min_x, a_max=w)
            max_y = np.clip(max_y, a_min=min_y, a_max=h)
            results[key] = np.stack([min_x, min_y, max_x, max_y],
                                    axis=-1).astype(results[key].dtype)

    def _shear_masks(self,
                     results,
                     magnitude,
                     direction='horizontal',
                     fill_val=0,
                     interpolation='bilinear'):
        """Shear the masks."""
        h, w, c = results['img_shape']
        for key in results.get('mask_fields', []):
            masks = results[key]
            results[key] = masks.shear((h, w),
                                       magnitude,
                                       direction,
                                       border_value=fill_val,
                                       interpolation=interpolation)

    def _shear_seg(self,
                   results,
                   magnitude,
                   direction='horizontal',
                   fill_val=255,
                   interpolation='bilinear'):
        """Shear the segmentation maps."""
        for key in results.get('seg_fields', []):
            seg = results[key]
            results[key] = mmcv.imshear(
                seg,
                magnitude,
                direction,
                border_value=fill_val,
                interpolation=interpolation).astype(seg.dtype)

    def _filter_invalid(self, results, min_bbox_size=0):
        """Filter bboxes and corresponding masks too small after shear
        augmentation."""
        bbox2label, bbox2mask, _ = bbox2fields()
        for key in results.get('bbox_fields', []):
            bbox_w = results[key][:, 2] - results[key][:, 0]
            bbox_h = results[key][:, 3] - results[key][:, 1]
            valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size)
            valid_inds = np.nonzero(valid_inds)[0]
            results[key] = results[key][valid_inds]
            # label fields. e.g. gt_labels and gt_labels_ignore
            label_key = bbox2label.get(key)
            if label_key in results:
                results[label_key] = results[label_key][valid_inds]
            # mask fields, e.g. gt_masks and gt_masks_ignore
            mask_key = bbox2mask.get(key)
            if mask_key in results:
                results[mask_key] = results[mask_key][valid_inds]

    def __call__(self, results):
        """Call function to shear images, bounding boxes, masks and semantic
        segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Sheared results.
        """
        if np.random.rand() > self.prob:
            return results
        magnitude = random_negative(self.magnitude, self.random_negative_prob)
        self._shear_img(results, magnitude, self.direction, self.interpolation)
        self._shear_bboxes(results, magnitude)
        # fill_val set to 0 for background of mask.
        self._shear_masks(
            results,
            magnitude,
            self.direction,
            fill_val=0,
            interpolation=self.interpolation)
        self._shear_seg(
            results,
            magnitude,
            self.direction,
            fill_val=self.seg_ignore_label,
            interpolation=self.interpolation)
        self._filter_invalid(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(level={self.level}, '
        repr_str += f'img_fill_val={self.img_fill_val}, '
        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
        repr_str += f'prob={self.prob}, '
        repr_str += f'direction={self.direction}, '
        repr_str += f'max_shear_magnitude={self.max_shear_magnitude}, '
        repr_str += f'random_negative_prob={self.random_negative_prob}, '
        repr_str += f'interpolation={self.interpolation})'
        return repr_str
@PIPELINES.register_module()
class Rotate(object):
    """Apply Rotate Transformation to image (and its corresponding bbox, mask,
    segmentation).

    Args:
        level (int | float): The level should be in range [0,_MAX_LEVEL].
        scale (int | float): Isotropic scale factor. Same in
            ``mmcv.imrotate``.
        center (int | float | tuple[float]): Center point (w, h) of the
            rotation in the source image. If None, the center of the
            image will be used. Same in ``mmcv.imrotate``.
        img_fill_val (int | float | tuple): The fill value for image border.
            If float, the same value will be used for all the three
            channels of image. If tuple, the should be 3 elements (e.g.
            equals the number of channels for image).
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equals ``ignore_label`` in ``semantic_head``
            of the corresponding config. Default 255.
        prob (float): The probability for perform transformation and
            should be in range 0 to 1.
        max_rotate_angle (int | float): The maximum angles for rotate
            transformation.
        random_negative_prob (float): The probability that turns the
            offset negative.
    """

    def __init__(self,
                 level,
                 scale=1,
                 center=None,
                 img_fill_val=128,
                 seg_ignore_label=255,
                 prob=0.5,
                 max_rotate_angle=30,
                 random_negative_prob=0.5):
        assert isinstance(level, (int, float)), \
            f'The level must be type int or float. got {type(level)}.'
        # Message matches the enforced range [0, _MAX_LEVEL].
        assert 0 <= level <= _MAX_LEVEL, \
            f'The level should be in range [0,{_MAX_LEVEL}]. got {level}.'
        assert isinstance(scale, (int, float)), \
            f'The scale must be type int or float. got type {type(scale)}.'
        # A scalar center means (x, y) with x == y.
        if isinstance(center, (int, float)):
            center = (center, center)
        elif isinstance(center, tuple):
            assert len(center) == 2, 'center with type tuple must have '\
                f'2 elements. got {len(center)} elements.'
        else:
            assert center is None, 'center must be None or type int, '\
                f'float or tuple, got type {type(center)}.'
        # Normalize the fill value to a 3-float tuple (one per channel).
        if isinstance(img_fill_val, (float, int)):
            img_fill_val = tuple([float(img_fill_val)] * 3)
        elif isinstance(img_fill_val, tuple):
            assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\
                f'have 3 elements. got {len(img_fill_val)}.'
            img_fill_val = tuple([float(val) for val in img_fill_val])
        else:
            raise ValueError(
                'img_fill_val must be float or tuple with 3 elements.')
        assert np.all([0 <= val <= 255 for val in img_fill_val]), \
            'all elements of img_fill_val should between range [0,255]. '\
            f'got {img_fill_val}.'
        # BUGFIX: the second literal was missing its f-prefix, so the
        # message printed the literal text '{prob}' instead of the value.
        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\
            f'got {prob}.'
        assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\
            f'should be type int or float. got type {type(max_rotate_angle)}.'
        self.level = level
        self.scale = scale
        # Rotation angle in degrees. Positive values mean
        # clockwise rotation.
        self.angle = level_to_value(level, max_rotate_angle)
        self.center = center
        self.img_fill_val = img_fill_val
        self.seg_ignore_label = seg_ignore_label
        self.prob = prob
        self.max_rotate_angle = max_rotate_angle
        self.random_negative_prob = random_negative_prob

    def _rotate_img(self, results, angle, center=None, scale=1.0):
        """Rotate the image.

        Args:
            results (dict): Result dict from loading pipeline.
            angle (float): Rotation angle in degrees, positive values
                mean clockwise rotation. Same in ``mmcv.imrotate``.
            center (tuple[float], optional): Center point (w, h) of the
                rotation. Same in ``mmcv.imrotate``.
            scale (int | float): Isotropic scale factor. Same in
                ``mmcv.imrotate``.
        """
        for key in results.get('img_fields', ['img']):
            img = results[key].copy()
            img_rotated = mmcv.imrotate(
                img, angle, center, scale, border_value=self.img_fill_val)
            results[key] = img_rotated.astype(img.dtype)

    def _rotate_bboxes(self, results, rotate_matrix):
        """Rotate the bboxes."""
        h, w, c = results['img_shape']
        for key in results.get('bbox_fields', []):
            min_x, min_y, max_x, max_y = np.split(
                results[key], results[key].shape[-1], axis=-1)
            # Rotate all four box corners, then take the axis-aligned
            # envelope of the rotated corners as the new box.
            coordinates = np.stack([[min_x, min_y], [max_x, min_y],
                                    [min_x, max_y],
                                    [max_x, max_y]])  # [4, 2, nb_bbox, 1]
            # pad 1 to convert from format [x, y] to homogeneous
            # coordinates format [x, y, 1]
            coordinates = np.concatenate(
                (coordinates,
                 np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)),
                axis=1)  # [4, 3, nb_bbox, 1]
            coordinates = coordinates.transpose(
                (2, 0, 1, 3))  # [nb_bbox, 4, 3, 1]
            rotated_coords = np.matmul(rotate_matrix,
                                       coordinates)  # [nb_bbox, 4, 2, 1]
            rotated_coords = rotated_coords[..., 0]  # [nb_bbox, 4, 2]
            min_x, min_y = np.min(
                rotated_coords[:, :, 0], axis=1), np.min(
                    rotated_coords[:, :, 1], axis=1)
            max_x, max_y = np.max(
                rotated_coords[:, :, 0], axis=1), np.max(
                    rotated_coords[:, :, 1], axis=1)
            # Clip back into the image; max is clipped against min so the
            # resulting box can never be inverted.
            min_x, min_y = np.clip(
                min_x, a_min=0, a_max=w), np.clip(
                    min_y, a_min=0, a_max=h)
            max_x, max_y = np.clip(
                max_x, a_min=min_x, a_max=w), np.clip(
                    max_y, a_min=min_y, a_max=h)
            results[key] = np.stack([min_x, min_y, max_x, max_y],
                                    axis=-1).astype(results[key].dtype)

    def _rotate_masks(self,
                      results,
                      angle,
                      center=None,
                      scale=1.0,
                      fill_val=0):
        """Rotate the masks."""
        h, w, c = results['img_shape']
        for key in results.get('mask_fields', []):
            masks = results[key]
            results[key] = masks.rotate((h, w), angle, center, scale, fill_val)

    def _rotate_seg(self,
                    results,
                    angle,
                    center=None,
                    scale=1.0,
                    fill_val=255):
        """Rotate the segmentation map."""
        for key in results.get('seg_fields', []):
            seg = results[key].copy()
            results[key] = mmcv.imrotate(
                seg, angle, center, scale,
                border_value=fill_val).astype(seg.dtype)

    def _filter_invalid(self, results, min_bbox_size=0):
        """Filter bboxes and corresponding masks too small after rotate
        augmentation."""
        bbox2label, bbox2mask, _ = bbox2fields()
        for key in results.get('bbox_fields', []):
            bbox_w = results[key][:, 2] - results[key][:, 0]
            bbox_h = results[key][:, 3] - results[key][:, 1]
            valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size)
            valid_inds = np.nonzero(valid_inds)[0]
            results[key] = results[key][valid_inds]
            # label fields. e.g. gt_labels and gt_labels_ignore
            label_key = bbox2label.get(key)
            if label_key in results:
                results[label_key] = results[label_key][valid_inds]
            # mask fields, e.g. gt_masks and gt_masks_ignore
            mask_key = bbox2mask.get(key)
            if mask_key in results:
                results[mask_key] = results[mask_key][valid_inds]

    def __call__(self, results):
        """Call function to rotate images, bounding boxes, masks and semantic
        segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Rotated results.
        """
        if np.random.rand() > self.prob:
            return results
        h, w = results['img'].shape[:2]
        center = self.center
        if center is None:
            center = ((w - 1) * 0.5, (h - 1) * 0.5)
        angle = random_negative(self.angle, self.random_negative_prob)
        self._rotate_img(results, angle, center, self.scale)
        # cv2 treats positive angles as counter-clockwise, hence -angle.
        rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale)
        self._rotate_bboxes(results, rotate_matrix)
        self._rotate_masks(results, angle, center, self.scale, fill_val=0)
        self._rotate_seg(
            results, angle, center, self.scale, fill_val=self.seg_ignore_label)
        self._filter_invalid(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(level={self.level}, '
        repr_str += f'scale={self.scale}, '
        repr_str += f'center={self.center}, '
        repr_str += f'img_fill_val={self.img_fill_val}, '
        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
        repr_str += f'prob={self.prob}, '
        repr_str += f'max_rotate_angle={self.max_rotate_angle}, '
        repr_str += f'random_negative_prob={self.random_negative_prob})'
        return repr_str
@PIPELINES.register_module()
class Translate(object):
    """Translate the images, bboxes, masks and segmentation maps horizontally
    or vertically.

    Args:
        level (int | float): The level for Translate and should be in
            range [0,_MAX_LEVEL].
        prob (float): The probability for performing translation and
            should be in range [0, 1].
        img_fill_val (int | float | tuple): The filled value for image
            border. If float, the same fill value will be used for all
            the three channels of image. If tuple, the should be 3
            elements (e.g. equals the number of channels for image).
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equals ``ignore_label`` in ``semantic_head``
            of the corresponding config. Default 255.
        direction (str): The translate direction, either "horizontal"
            or "vertical".
        max_translate_offset (int | float): The maximum pixel's offset for
            Translate.
        random_negative_prob (float): The probability that turns the
            offset negative.
        min_size (int | float): The minimum pixel for filtering
            invalid bboxes after the translation.
    """

    def __init__(self,
                 level,
                 prob=0.5,
                 img_fill_val=128,
                 seg_ignore_label=255,
                 direction='horizontal',
                 max_translate_offset=250.,
                 random_negative_prob=0.5,
                 min_size=0):
        assert isinstance(level, (int, float)), \
            'The level must be type int or float.'
        assert 0 <= level <= _MAX_LEVEL, \
            'The level used for calculating Translate\'s offset should be ' \
            'in range [0,_MAX_LEVEL]'
        assert 0 <= prob <= 1.0, \
            'The probability of translation should be in range [0, 1].'
        # Normalize the fill value to a 3-float tuple (one per channel).
        if isinstance(img_fill_val, (float, int)):
            img_fill_val = tuple([float(img_fill_val)] * 3)
        elif isinstance(img_fill_val, tuple):
            assert len(img_fill_val) == 3, \
                'img_fill_val as tuple must have 3 elements.'
            img_fill_val = tuple([float(val) for val in img_fill_val])
        else:
            raise ValueError('img_fill_val must be type float or tuple.')
        assert np.all([0 <= val <= 255 for val in img_fill_val]), \
            'all elements of img_fill_val should between range [0,255].'
        assert direction in ('horizontal', 'vertical'), \
            'direction should be "horizontal" or "vertical".'
        assert isinstance(max_translate_offset, (int, float)), \
            'The max_translate_offset must be type int or float.'
        # the offset used for translation
        self.offset = int(level_to_value(level, max_translate_offset))
        self.level = level
        self.prob = prob
        self.img_fill_val = img_fill_val
        self.seg_ignore_label = seg_ignore_label
        self.direction = direction
        self.max_translate_offset = max_translate_offset
        self.random_negative_prob = random_negative_prob
        self.min_size = min_size

    def _translate_img(self, results, offset, direction='horizontal'):
        """Translate the image.

        Args:
            results (dict): Result dict from loading pipeline.
            offset (int | float): The offset for translate.
            direction (str): The translate direction, either "horizontal"
                or "vertical".
        """
        for key in results.get('img_fields', ['img']):
            img = results[key].copy()
            results[key] = mmcv.imtranslate(
                img, offset, direction, self.img_fill_val).astype(img.dtype)

    def _translate_bboxes(self, results, offset):
        """Shift bboxes horizontally or vertically, according to offset."""
        h, w, c = results['img_shape']
        for key in results.get('bbox_fields', []):
            min_x, min_y, max_x, max_y = np.split(
                results[key], results[key].shape[-1], axis=-1)
            if self.direction == 'horizontal':
                min_x = np.maximum(0, min_x + offset)
                max_x = np.minimum(w, max_x + offset)
            elif self.direction == 'vertical':
                min_y = np.maximum(0, min_y + offset)
                max_y = np.minimum(h, max_y + offset)

            # the boxs translated outside of image will be filtered along with
            # the corresponding masks, by invoking ``_filter_invalid``.
            results[key] = np.concatenate([min_x, min_y, max_x, max_y],
                                          axis=-1)

    def _translate_masks(self,
                         results,
                         offset,
                         direction='horizontal',
                         fill_val=0):
        """Translate masks horizontally or vertically."""
        h, w, c = results['img_shape']
        for key in results.get('mask_fields', []):
            masks = results[key]
            results[key] = masks.translate((h, w), offset, direction, fill_val)

    def _translate_seg(self,
                       results,
                       offset,
                       direction='horizontal',
                       fill_val=255):
        """Translate segmentation maps horizontally or vertically."""
        for key in results.get('seg_fields', []):
            seg = results[key].copy()
            results[key] = mmcv.imtranslate(seg, offset, direction,
                                            fill_val).astype(seg.dtype)

    def _filter_invalid(self, results, min_size=0):
        """Filter bboxes and masks too small or translated out of image."""
        bbox2label, bbox2mask, _ = bbox2fields()
        for key in results.get('bbox_fields', []):
            bbox_w = results[key][:, 2] - results[key][:, 0]
            bbox_h = results[key][:, 3] - results[key][:, 1]
            valid_inds = (bbox_w > min_size) & (bbox_h > min_size)
            valid_inds = np.nonzero(valid_inds)[0]
            results[key] = results[key][valid_inds]
            # label fields. e.g. gt_labels and gt_labels_ignore
            label_key = bbox2label.get(key)
            if label_key in results:
                results[label_key] = results[label_key][valid_inds]
            # mask fields, e.g. gt_masks and gt_masks_ignore
            mask_key = bbox2mask.get(key)
            if mask_key in results:
                results[mask_key] = results[mask_key][valid_inds]
        return results

    def __call__(self, results):
        """Call function to translate images, bounding boxes, masks and
        semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Translated results.
        """
        if np.random.rand() > self.prob:
            return results
        offset = random_negative(self.offset, self.random_negative_prob)
        self._translate_img(results, offset, self.direction)
        self._translate_bboxes(results, offset)
        # fill_val defaultly 0 for BitmapMasks and None for PolygonMasks.
        self._translate_masks(results, offset, self.direction)
        # fill_val set to ``seg_ignore_label`` for the ignored value
        # of segmentation map.
        self._translate_seg(
            results, offset, self.direction, fill_val=self.seg_ignore_label)
        self._filter_invalid(results, min_size=self.min_size)
        return results

    def __repr__(self):
        # Added for consistency with the sibling transforms (Shear, Rotate,
        # ColorTransform, ...), which all implement __repr__.
        repr_str = self.__class__.__name__
        repr_str += f'(level={self.level}, '
        repr_str += f'prob={self.prob}, '
        repr_str += f'img_fill_val={self.img_fill_val}, '
        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
        repr_str += f'direction={self.direction}, '
        repr_str += f'max_translate_offset={self.max_translate_offset}, '
        repr_str += f'random_negative_prob={self.random_negative_prob}, '
        repr_str += f'min_size={self.min_size})'
        return repr_str
@PIPELINES.register_module()
class ColorTransform(object):
    """Apply Color transformation to image. The bboxes, masks, and
    segmentations are not modified.

    Args:
        level (int | float): Should be in range [0,_MAX_LEVEL].
        prob (float): The probability for performing Color transformation.
    """

    def __init__(self, level, prob=0.5):
        assert isinstance(level, (int, float)), \
            'The level must be type int or float.'
        assert 0 <= level <= _MAX_LEVEL, \
            'The level should be in range [0,_MAX_LEVEL].'
        assert 0 <= prob <= 1.0, \
            'The probability should be in range [0,1].'
        self.level = level
        self.prob = prob
        # Enhancement factor derived once from level; fixed per instance.
        self.factor = enhance_level_to_value(level)

    def _adjust_color_img(self, results, factor=1.0):
        """Apply Color transformation to image."""
        for key in results.get('img_fields', ['img']):
            # NOTE defaultly the image should be BGR format
            img = results[key]
            results[key] = mmcv.adjust_color(img, factor).astype(img.dtype)

    def __call__(self, results):
        """Call function for Color transformation.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Colored results.
        """
        if np.random.rand() > self.prob:
            return results
        self._adjust_color_img(results, self.factor)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(level={self.level}, '
        repr_str += f'prob={self.prob})'
        return repr_str


@PIPELINES.register_module()
class EqualizeTransform(object):
    """Apply Equalize transformation to image. The bboxes, masks and
    segmentations are not modified.

    Args:
        prob (float): The probability for performing Equalize transformation.
    """

    def __init__(self, prob=0.5):
        assert 0 <= prob <= 1.0, \
            'The probability should be in range [0,1].'
        self.prob = prob

    def _imequalize(self, results):
        """Equalizes the histogram of one image."""
        for key in results.get('img_fields', ['img']):
            img = results[key]
            results[key] = mmcv.imequalize(img).astype(img.dtype)

    def __call__(self, results):
        """Call function for Equalize transformation.

        Args:
            results (dict): Results dict from loading pipeline.

        Returns:
            dict: Results after the transformation.
        """
        if np.random.rand() > self.prob:
            return results
        self._imequalize(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(prob={self.prob})'
        # BUGFIX: the return statement was missing, so repr() raised
        # "TypeError: __repr__ returned non-string (type NoneType)".
        return repr_str
@PIPELINES.register_module()
class BrightnessTransform(object):
    """Adjust image brightness. Bboxes, masks and segmentation maps are
    left untouched.

    Args:
        level (int | float): Should be in range [0,_MAX_LEVEL].
        prob (float): The probability for performing Brightness transformation.
    """

    def __init__(self, level, prob=0.5):
        assert isinstance(level, (int, float)), \
            'The level must be type int or float.'
        assert 0 <= level <= _MAX_LEVEL, \
            'The level should be in range [0,_MAX_LEVEL].'
        assert 0 <= prob <= 1.0, \
            'The probability should be in range [0,1].'
        self.level = level
        self.prob = prob
        # The enhancement factor is derived once from level and reused.
        self.factor = enhance_level_to_value(level)

    def _adjust_brightness_img(self, results, factor=1.0):
        """Adjust the brightness of every image field in ``results``."""
        for field in results.get('img_fields', ['img']):
            image = results[field]
            adjusted = mmcv.adjust_brightness(image, factor)
            # Cast back: mmcv may promote the dtype internally.
            results[field] = adjusted.astype(image.dtype)

    def __call__(self, results):
        """Apply the brightness adjustment with probability ``self.prob``.

        Args:
            results (dict): Results dict from loading pipeline.

        Returns:
            dict: Results after the transformation.
        """
        if np.random.rand() > self.prob:
            return results
        self._adjust_brightness_img(results, self.factor)
        return results

    def __repr__(self):
        return (f'{self.__class__.__name__}'
                f'(level={self.level}, '
                f'prob={self.prob})')
+ self.level = level + self.prob = prob + self.factor = enhance_level_to_value(level) + + def _adjust_contrast_img(self, results, factor=1.0): + """Adjust the image contrast.""" + for key in results.get('img_fields', ['img']): + img = results[key] + results[key] = mmcv.adjust_contrast(img, factor).astype(img.dtype) + + def __call__(self, results): + """Call function for Contrast transformation. + + Args: + results (dict): Results dict from loading pipeline. + + Returns: + dict: Results after the transformation. + """ + if np.random.rand() > self.prob: + return results + self._adjust_contrast_img(results, self.factor) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(level={self.level}, ' + repr_str += f'prob={self.prob})' + return repr_str diff --git a/insightface/detection/scrfd/mmdet/datasets/pipelines/compose.py b/insightface/detection/scrfd/mmdet/datasets/pipelines/compose.py new file mode 100755 index 0000000000000000000000000000000000000000..ca48f1c935755c486edc2744e1713e2b5ba3cdc8 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/pipelines/compose.py @@ -0,0 +1,51 @@ +import collections + +from mmcv.utils import build_from_cfg + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class Compose(object): + """Compose multiple transforms sequentially. + + Args: + transforms (Sequence[dict | callable]): Sequence of transform object or + config dict to be composed. + """ + + def __init__(self, transforms): + assert isinstance(transforms, collections.abc.Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError('transform must be callable or a dict') + + def __call__(self, data): + """Call function to apply transforms sequentially. 
+ + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. + """ + + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += '\n' + format_string += f' {t}' + format_string += '\n)' + return format_string diff --git a/insightface/detection/scrfd/mmdet/datasets/pipelines/formating.py b/insightface/detection/scrfd/mmdet/datasets/pipelines/formating.py new file mode 100755 index 0000000000000000000000000000000000000000..afb75edc2f87239435eb9a8e51fb9b9e7d0d154d --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/pipelines/formating.py @@ -0,0 +1,364 @@ +from collections.abc import Sequence + +import mmcv +import numpy as np +import torch +from mmcv.parallel import DataContainer as DC + +from ..builder import PIPELINES + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PIPELINES.register_module() +class ToTensor(object): + """Convert some results to :obj:`torch.Tensor` by given keys. + + Args: + keys (Sequence[str]): Keys that need to be converted to Tensor. 
+ """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert data in results to :obj:`torch.Tensor`. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data converted + to :obj:`torch.Tensor`. + """ + for key in self.keys: + results[key] = to_tensor(results[key]) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PIPELINES.register_module() +class ImageToTensor(object): + """Convert image to :obj:`torch.Tensor` by given keys. + + The dimension order of input image is (H, W, C). The pipeline will convert + it to (C, H, W). If only 2 dimension (H, W) is given, the output would be + (1, H, W). + + Args: + keys (Sequence[str]): Key of images to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert image in results to :obj:`torch.Tensor` and + transpose the channel order. + + Args: + results (dict): Result dict contains the image data to convert. + + Returns: + dict: The result dict contains the image converted + to :obj:`torch.Tensor` and transposed to (C, H, W) order. + """ + for key in self.keys: + img = results[key] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + results[key] = to_tensor(img.transpose(2, 0, 1)) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PIPELINES.register_module() +class Transpose(object): + """Transpose some results by given keys. + + Args: + keys (Sequence[str]): Keys of results to be transposed. + order (Sequence[int]): Order of transpose. + """ + + def __init__(self, keys, order): + self.keys = keys + self.order = order + + def __call__(self, results): + """Call function to transpose the channel order of data in results. + + Args: + results (dict): Result dict contains the data to transpose. 
+ + Returns: + dict: The result dict contains the data transposed to \ + ``self.order``. + """ + for key in self.keys: + results[key] = results[key].transpose(self.order) + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, order={self.order})' + + +@PIPELINES.register_module() +class ToDataContainer(object): + """Convert results to :obj:`mmcv.DataContainer` by given fields. + + Args: + fields (Sequence[dict]): Each field is a dict like + ``dict(key='xxx', **kwargs)``. The ``key`` in result will + be converted to :obj:`mmcv.DataContainer` with ``**kwargs``. + Default: ``(dict(key='img', stack=True), dict(key='gt_bboxes'), + dict(key='gt_labels'))``. + """ + + def __init__(self, + fields=(dict(key='img', stack=True), dict(key='gt_bboxes'), + dict(key='gt_labels'))): + self.fields = fields + + def __call__(self, results): + """Call function to convert data in results to + :obj:`mmcv.DataContainer`. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data converted to \ + :obj:`mmcv.DataContainer`. + """ + + for field in self.fields: + field = field.copy() + key = field.pop('key') + results[key] = DC(results[key], **field) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(fields={self.fields})' + + +@PIPELINES.register_module() +class DefaultFormatBundle(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. 
+ + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + """ + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with \ + default bundle. + """ + + if 'img' in results: + img = results['img'] + # add default meta keys + results = self._add_default_meta_keys(results) + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + for key in ['proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_keypointss', 'gt_labels']: + if key not in results: + continue + results[key] = DC(to_tensor(results[key])) + if 'gt_masks' in results: + results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) + return results + + def _add_default_meta_keys(self, results): + """Add default meta keys. + + We set default meta keys including `pad_shape`, `scale_factor` and + `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and + `Pad` are implemented during the whole pipeline. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + results (dict): Updated result dict contains the data to convert. 
+ """ + img = results['img'] + results.setdefault('pad_shape', img.shape) + results.setdefault('scale_factor', 1.0) + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results.setdefault( + 'img_norm_cfg', + dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False)) + return results + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class Collect(object): + """Collect data from the loader relevant to the specific task. + + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + + - "img_shape": shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. + + - "scale_factor": a float indicating the preprocessing scale + + - "flip": a boolean indicating if image flip transform was used + + - "filename": path to the image file + + - "ori_shape": original shape of the image as a tuple (h, w, c) + + - "pad_shape": image shape after padding + + - "img_norm_cfg": a dict of normalization information: + + - mean - per channel mean subtraction + - std - per channel std divisor + - to_rgb - bool indicating if bgr was converted to rgb + + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
+ Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'img_norm_cfg')`` + """ + + def __init__(self, + keys, + meta_keys=('filename', 'ori_filename', 'ori_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'img_norm_cfg')): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:mmcv.DataContainer. + + Args: + results (dict): Result dict contains the data to collect. + + Returns: + dict: The result dict contains the following keys + + - keys in``self.keys`` + - ``img_metas`` + """ + + data = {} + img_meta = {} + for key in self.meta_keys: + img_meta[key] = results[key] + data['img_metas'] = DC(img_meta, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' + + +@PIPELINES.register_module() +class WrapFieldsToLists(object): + """Wrap fields of the data dictionary into lists for evaluation. + + This class can be used as a last step of a test or validation + pipeline for single image evaluation or inference. + + Example: + >>> test_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + >>> dict(type='Pad', size_divisor=32), + >>> dict(type='ImageToTensor', keys=['img']), + >>> dict(type='Collect', keys=['img']), + >>> dict(type='WrapFieldsToLists') + >>> ] + """ + + def __call__(self, results): + """Call function to wrap fields into lists. + + Args: + results (dict): Result dict contains the data to wrap. + + Returns: + dict: The result dict where value of ``self.keys`` are wrapped \ + into list. 
+ """ + + # Wrap dict fields into lists + for key, val in results.items(): + results[key] = [val] + return results + + def __repr__(self): + return f'{self.__class__.__name__}()' diff --git a/insightface/detection/scrfd/mmdet/datasets/pipelines/instaboost.py b/insightface/detection/scrfd/mmdet/datasets/pipelines/instaboost.py new file mode 100755 index 0000000000000000000000000000000000000000..38b6819f60587a6e0c0f6d57bfda32bb3a7a4267 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/pipelines/instaboost.py @@ -0,0 +1,98 @@ +import numpy as np + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class InstaBoost(object): + r"""Data augmentation method in `InstaBoost: Boosting Instance + Segmentation Via Probability Map Guided Copy-Pasting + `_. + + Refer to https://github.com/GothicAi/Instaboost for implementation details. + """ + + def __init__(self, + action_candidate=('normal', 'horizontal', 'skip'), + action_prob=(1, 0, 0), + scale=(0.8, 1.2), + dx=15, + dy=15, + theta=(-1, 1), + color_prob=0.5, + hflag=False, + aug_ratio=0.5): + try: + import instaboostfast as instaboost + except ImportError: + raise ImportError( + 'Please run "pip install instaboostfast" ' + 'to install instaboostfast first for instaboost augmentation.') + self.cfg = instaboost.InstaBoostConfig(action_candidate, action_prob, + scale, dx, dy, theta, + color_prob, hflag) + self.aug_ratio = aug_ratio + + def _load_anns(self, results): + labels = results['ann_info']['labels'] + masks = results['ann_info']['masks'] + bboxes = results['ann_info']['bboxes'] + n = len(labels) + + anns = [] + for i in range(n): + label = labels[i] + bbox = bboxes[i] + mask = masks[i] + x1, y1, x2, y2 = bbox + # assert (x2 - x1) >= 1 and (y2 - y1) >= 1 + bbox = [x1, y1, x2 - x1, y2 - y1] + anns.append({ + 'category_id': label, + 'segmentation': mask, + 'bbox': bbox + }) + + return anns + + def _parse_anns(self, results, anns, img): + gt_bboxes = [] + gt_labels = [] + gt_masks_ann = [] + for 
ann in anns: + x1, y1, w, h = ann['bbox'] + # TODO: more essential bug need to be fixed in instaboost + if w <= 0 or h <= 0: + continue + bbox = [x1, y1, x1 + w, y1 + h] + gt_bboxes.append(bbox) + gt_labels.append(ann['category_id']) + gt_masks_ann.append(ann['segmentation']) + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + results['ann_info']['labels'] = gt_labels + results['ann_info']['bboxes'] = gt_bboxes + results['ann_info']['masks'] = gt_masks_ann + results['img'] = img + return results + + def __call__(self, results): + img = results['img'] + orig_type = img.dtype + anns = self._load_anns(results) + if np.random.choice([0, 1], p=[1 - self.aug_ratio, self.aug_ratio]): + try: + import instaboostfast as instaboost + except ImportError: + raise ImportError('Please run "pip install instaboostfast" ' + 'to install instaboostfast first.') + anns, img = instaboost.get_new_data( + anns, img.astype(np.uint8), self.cfg, background=None) + + results = self._parse_anns(results, anns, img.astype(orig_type)) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(cfg={self.cfg}, aug_ratio={self.aug_ratio})' + return repr_str diff --git a/insightface/detection/scrfd/mmdet/datasets/pipelines/loading.py b/insightface/detection/scrfd/mmdet/datasets/pipelines/loading.py new file mode 100755 index 0000000000000000000000000000000000000000..7f8c1a5c14f213bfb2d37bd6cf84efb39ff70abe --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/pipelines/loading.py @@ -0,0 +1,480 @@ +import os.path as osp + +import mmcv +import numpy as np +import pycocotools.mask as maskUtils + +from mmdet.core import BitmapMasks, PolygonMasks +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class LoadImageFromFile(object): + """Load an image from file. + + Required keys are "img_prefix" and "img_info" (a dict that must contain the + key "filename"). 
Added or updated keys are "filename", "img", "img_shape", + "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), + "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + to_float32=False, + color_type='color', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.file_client_args = file_client_args.copy() + self.file_client = None + + def __call__(self, results): + """Call functions to load image and get image meta information. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded image and meta information. 
+ """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + if results['img_prefix'] is not None: + filename = osp.join(results['img_prefix'], + results['img_info']['filename']) + else: + filename = results['img_info']['filename'] + + img_bytes = self.file_client.get(filename) + img = mmcv.imfrombytes(img_bytes, flag=self.color_type) + if self.to_float32: + img = img.astype(np.float32) + + results['filename'] = filename + results['ori_filename'] = results['img_info']['filename'] + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + results['img_fields'] = ['img'] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str + + +@PIPELINES.register_module() +class LoadImageFromWebcam(LoadImageFromFile): + """Load an image from webcam. + + Similar with :obj:`LoadImageFromFile`, but the image read from webcam is in + ``results['img']``. + """ + + def __call__(self, results): + """Call functions to add image meta information. + + Args: + results (dict): Result dict with Webcam read image in + ``results['img']``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + img = results['img'] + if self.to_float32: + img = img.astype(np.float32) + + results['filename'] = None + results['ori_filename'] = None + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + results['img_fields'] = ['img'] + return results + + +@PIPELINES.register_module() +class LoadMultiChannelImageFromFiles(object): + """Load multi-channel images from a list of separate channel files. + + Required keys are "img_prefix" and "img_info" (a dict that must contain the + key "filename", which is expected to be a list of filenames). 
+ Added or updated keys are "filename", "img", "img_shape", + "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), + "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + to_float32=False, + color_type='unchanged', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.file_client_args = file_client_args.copy() + self.file_client = None + + def __call__(self, results): + """Call functions to load multiple images and get images meta + information. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded images and meta information. 
+ """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + if results['img_prefix'] is not None: + filename = [ + osp.join(results['img_prefix'], fname) + for fname in results['img_info']['filename'] + ] + else: + filename = results['img_info']['filename'] + + img = [] + for name in filename: + img_bytes = self.file_client.get(name) + img.append(mmcv.imfrombytes(img_bytes, flag=self.color_type)) + img = np.stack(img, axis=-1) + if self.to_float32: + img = img.astype(np.float32) + + results['filename'] = filename + results['ori_filename'] = results['img_info']['filename'] + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results['img_norm_cfg'] = dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str + + +@PIPELINES.register_module() +class LoadAnnotations(object): + """Load mutiple types of annotations. + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. + Default: True. + with_label (bool): Whether to parse and load the label annotation. + Default: True. + with_keypoints (bool): Whether to parse and load the keypoints annotation. + Default: False. + with_mask (bool): Whether to parse and load the mask annotation. + Default: False. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. Default: False. + poly2mask (bool): Whether to convert the instance masks from polygons + to bitmaps. Default: True. + file_client_args (dict): Arguments to instantiate a FileClient. 
+ See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + with_bbox=True, + with_label=True, + with_keypoints=False, + with_mask=False, + with_seg=False, + poly2mask=True, + file_client_args=dict(backend='disk')): + self.with_bbox = with_bbox + self.with_label = with_label + self.with_keypoints = with_keypoints + self.with_mask = with_mask + self.with_seg = with_seg + self.poly2mask = poly2mask + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _load_bboxes(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. + """ + + ann_info = results['ann_info'] + results['gt_bboxes'] = ann_info['bboxes'].copy() + + gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) + if gt_bboxes_ignore is not None: + results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() + results['bbox_fields'].append('gt_bboxes_ignore') + results['bbox_fields'].append('gt_bboxes') + return results + def _load_keypoints(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. + """ + + ann_info = results['ann_info'] + results['gt_keypointss'] = ann_info['keypointss'].copy() + + results['keypoints_fields'] = ['gt_keypointss'] + return results + + def _load_labels(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded label annotations. + """ + + results['gt_labels'] = results['ann_info']['labels'].copy() + return results + + def _poly2mask(self, mask_ann, img_h, img_w): + """Private function to convert masks represented with polygon to + bitmaps. 
+ + Args: + mask_ann (list | dict): Polygon mask annotation input. + img_h (int): The height of output mask. + img_w (int): The width of output mask. + + Returns: + numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). + """ + + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def process_polygons(self, polygons): + """Convert polygons to list of ndarray and filter invalid polygons. + + Args: + polygons (list[list]): Polygons of one instance. + + Returns: + list[numpy.ndarray]: Processed polygons. + """ + + polygons = [np.array(p) for p in polygons] + valid_polygons = [] + for polygon in polygons: + if len(polygon) % 2 == 0 and len(polygon) >= 6: + valid_polygons.append(polygon) + return valid_polygons + + def _load_masks(self, results): + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded mask annotations. + If ``self.poly2mask`` is set ``True``, `gt_mask` will contain + :obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used. + """ + + h, w = results['img_info']['height'], results['img_info']['width'] + gt_masks = results['ann_info']['masks'] + if self.poly2mask: + gt_masks = BitmapMasks( + [self._poly2mask(mask, h, w) for mask in gt_masks], h, w) + else: + gt_masks = PolygonMasks( + [self.process_polygons(polygons) for polygons in gt_masks], h, + w) + results['gt_masks'] = gt_masks + results['mask_fields'].append('gt_masks') + return results + + def _load_semantic_seg(self, results): + """Private function to load semantic segmentation annotations. 
+ + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. + """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + filename = osp.join(results['seg_prefix'], + results['ann_info']['seg_map']) + img_bytes = self.file_client.get(filename) + results['gt_semantic_seg'] = mmcv.imfrombytes( + img_bytes, flag='unchanged').squeeze() + results['seg_fields'].append('gt_semantic_seg') + return results + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box, label, mask and + semantic segmentation annotations. + """ + + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_keypoints: + results = self._load_keypoints(results) + if self.with_mask: + results = self._load_masks(results) + if self.with_seg: + results = self._load_semantic_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_keypoints={self.with_keypoints}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg})' + repr_str += f'poly2mask={self.poly2mask})' + repr_str += f'poly2mask={self.file_client_args})' + return repr_str + + +@PIPELINES.register_module() +class LoadProposals(object): + """Load proposal pipeline. + + Required key is "proposals". Updated keys are "proposals", "bbox_fields". + + Args: + num_max_proposals (int, optional): Maximum number of proposals to load. + If not specified, all proposals will be loaded. 
+ """ + + def __init__(self, num_max_proposals=None): + self.num_max_proposals = num_max_proposals + + def __call__(self, results): + """Call function to load proposals from file. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded proposal annotations. + """ + + proposals = results['proposals'] + if proposals.shape[1] not in (4, 5): + raise AssertionError( + 'proposals should have shapes (n, 4) or (n, 5), ' + f'but found {proposals.shape}') + proposals = proposals[:, :4] + + if self.num_max_proposals is not None: + proposals = proposals[:self.num_max_proposals] + + if len(proposals) == 0: + proposals = np.array([[0, 0, 0, 0]], dtype=np.float32) + results['proposals'] = proposals + results['bbox_fields'].append('proposals') + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(num_max_proposals={self.num_max_proposals})' + + +@PIPELINES.register_module() +class FilterAnnotations(object): + """Filter invalid annotations. + + Args: + min_gt_bbox_wh (tuple[int]): Minimum width and height of ground truth + boxes. 
+ """ + + def __init__(self, min_gt_bbox_wh): + # TODO: add more filter options + self.min_gt_bbox_wh = min_gt_bbox_wh + + def __call__(self, results): + assert 'gt_bboxes' in results + gt_bboxes = results['gt_bboxes'] + w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + keep = (w > self.min_gt_bbox_wh[0]) & (h > self.min_gt_bbox_wh[1]) + if not keep.any(): + return None + else: + keys = ('gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg') + for key in keys: + if key in results: + results[key] = results[key][keep] + return results diff --git a/insightface/detection/scrfd/mmdet/datasets/pipelines/test_time_aug.py b/insightface/detection/scrfd/mmdet/datasets/pipelines/test_time_aug.py new file mode 100755 index 0000000000000000000000000000000000000000..2b0ad4db3e0f42219c713ac899c7fd7d4368d322 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/pipelines/test_time_aug.py @@ -0,0 +1,119 @@ +import warnings + +import mmcv + +from ..builder import PIPELINES +from .compose import Compose + + +@PIPELINES.register_module() +class MultiScaleFlipAug(object): + """Test-time augmentation with multiple scales and flipping. + + An example configuration is as followed: + + .. code-block:: + + img_scale=[(1333, 400), (1333, 800)], + flip=True, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ] + + After MultiScaleFLipAug with above configuration, the results are wrapped + into lists of the same length as followed: + + .. code-block:: + + dict( + img=[...], + img_shape=[...], + scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)] + flip=[False, True, False, True] + ... + ) + + Args: + transforms (list[dict]): Transforms to apply in each augmentation. + img_scale (tuple | list[tuple] | None): Images scales for resizing. 
+ scale_factor (float | list[float] | None): Scale factors for resizing. + flip (bool): Whether apply flip augmentation. Default: False. + flip_direction (str | list[str]): Flip augmentation directions, + options are "horizontal" and "vertical". If flip_direction is list, + multiple flip augmentations will be applied. + It has no effect when flip == False. Default: "horizontal". + """ + + def __init__(self, + transforms, + img_scale=None, + scale_factor=None, + flip=False, + flip_direction='horizontal'): + self.transforms = Compose(transforms) + assert (img_scale is None) ^ (scale_factor is None), ( + 'Must have but only one variable can be setted') + if img_scale is not None: + self.img_scale = img_scale if isinstance(img_scale, + list) else [img_scale] + self.scale_key = 'scale' + assert mmcv.is_list_of(self.img_scale, tuple) + else: + self.img_scale = scale_factor if isinstance( + scale_factor, list) else [scale_factor] + self.scale_key = 'scale_factor' + + self.flip = flip + self.flip_direction = flip_direction if isinstance( + flip_direction, list) else [flip_direction] + assert mmcv.is_list_of(self.flip_direction, str) + if not self.flip and self.flip_direction != ['horizontal']: + warnings.warn( + 'flip_direction has no effect when flip is set to False') + if (self.flip + and not any([t['type'] == 'RandomFlip' for t in transforms])): + warnings.warn( + 'flip has no effect when RandomFlip is not in transforms') + + def __call__(self, results): + """Call function to apply test time augment transforms on results. + + Args: + results (dict): Result dict contains the data to transform. + + Returns: + dict[str: list]: The augmented data, where each value is wrapped + into a list. 
+ """ + + aug_data = [] + flip_args = [(False, None)] + if self.flip: + flip_args += [(True, direction) + for direction in self.flip_direction] + for scale in self.img_scale: + for flip, direction in flip_args: + _results = results.copy() + _results[self.scale_key] = scale + _results['flip'] = flip + _results['flip_direction'] = direction + data = self.transforms(_results) + aug_data.append(data) + # list of dict to dict of list + aug_data_dict = {key: [] for key in aug_data[0]} + for data in aug_data: + for key, val in data.items(): + aug_data_dict[key].append(val) + return aug_data_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(transforms={self.transforms}, ' + repr_str += f'img_scale={self.img_scale}, flip={self.flip})' + repr_str += f'flip_direction={self.flip_direction}' + return repr_str diff --git a/insightface/detection/scrfd/mmdet/datasets/pipelines/transforms.py b/insightface/detection/scrfd/mmdet/datasets/pipelines/transforms.py new file mode 100755 index 0000000000000000000000000000000000000000..869acfb5b11ce48a5b483cdf3dcffac7a35740e4 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/pipelines/transforms.py @@ -0,0 +1,2037 @@ +import inspect + +import mmcv +import numpy as np +from numpy import random +import cv2 + +from mmdet.core import PolygonMasks +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps +from ..builder import PIPELINES + +try: + from imagecorruptions import corrupt +except ImportError: + corrupt = None + +try: + import albumentations + from albumentations import Compose +except ImportError: + albumentations = None + Compose = None + + +@PIPELINES.register_module() +class Resize(object): + """Resize images & bbox & mask. + + This transform resizes the input image to some scale. Bboxes and masks are + then resized with the same scale factor. 
If the input dict contains the key + "scale", then the scale in the input dict is used, otherwise the specified + scale in the init method is used. If the input dict contains the key + "scale_factor" (if MultiScaleFlipAug does not give img_scale but + scale_factor), the actual scale will be computed by image shape and + scale_factor. + + `img_scale` can either be a tuple (single-scale) or a list of tuple + (multi-scale). There are 3 multiscale modes: + + - ``ratio_range is not None``: randomly sample a ratio from the ratio \ + range and multiply it with the image scale. + - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ + sample a scale from the multiscale range. + - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ + sample a scale from multiple scales. + + Args: + img_scale (tuple or list[tuple]): Images scales for resizing. + multiscale_mode (str): Either "range" or "value". + ratio_range (tuple[float]): (min_ratio, max_ratio) + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + override (bool, optional): Whether to override `scale` and + `scale_factor` so as to call resize twice. Default False. If True, + after the first resizing, the existed `scale` and `scale_factor` + will be ignored so the second resizing can be allowed. + This option is a work-around for multiple times of resize in DETR. + Defaults to False. 
+ """ + + def __init__(self, + img_scale=None, + multiscale_mode='range', + ratio_range=None, + keep_ratio=True, + bbox_clip_border=True, + backend='cv2', + override=False): + if img_scale is None: + self.img_scale = None + else: + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + assert mmcv.is_list_of(self.img_scale, tuple) + + if ratio_range is not None: + # mode 1: given a scale and a range of image ratio + assert len(self.img_scale) == 1 + else: + # mode 2: given multiple scales or a range of scales + assert multiscale_mode in ['value', 'range'] + + self.backend = backend + self.multiscale_mode = multiscale_mode + self.ratio_range = ratio_range + self.keep_ratio = keep_ratio + # TODO: refactor the override option in Resize + self.override = override + self.bbox_clip_border = bbox_clip_border + + @staticmethod + def random_select(img_scales): + """Randomly select an img_scale from given candidates. + + Args: + img_scales (list[tuple]): Images scales for selection. + + Returns: + (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ + where ``img_scale`` is the selected image scale and \ + ``scale_idx`` is the selected index in the given candidates. + """ + + assert mmcv.is_list_of(img_scales, tuple) + scale_idx = np.random.randint(len(img_scales)) + img_scale = img_scales[scale_idx] + return img_scale, scale_idx + + @staticmethod + def random_sample(img_scales): + """Randomly sample an img_scale when ``multiscale_mode=='range'``. + + Args: + img_scales (list[tuple]): Images scale range for sampling. + There must be two tuples in img_scales, which specify the lower + and uper bound of image scales. + + Returns: + (tuple, None): Returns a tuple ``(img_scale, None)``, where \ + ``img_scale`` is sampled scale and None is just a placeholder \ + to be consistent with :func:`random_select`. 
+ """ + + assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), + max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), + max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale, None + + @staticmethod + def random_sample_ratio(img_scale, ratio_range): + """Randomly sample an img_scale when ``ratio_range`` is specified. + + A ratio will be randomly sampled from the range specified by + ``ratio_range``. Then it would be multiplied with ``img_scale`` to + generate sampled scale. + + Args: + img_scale (tuple): Images scale base to multiply with ratio. + ratio_range (tuple[float]): The minimum and maximum ratio to scale + the ``img_scale``. + + Returns: + (tuple, None): Returns a tuple ``(scale, None)``, where \ + ``scale`` is sampled ratio multiplied with ``img_scale`` and \ + None is just a placeholder to be consistent with \ + :func:`random_select`. + """ + + assert isinstance(img_scale, tuple) and len(img_scale) == 2 + min_ratio, max_ratio = ratio_range + assert min_ratio <= max_ratio + ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio + scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) + return scale, None + + def _random_scale(self, results): + """Randomly sample an img_scale according to ``ratio_range`` and + ``multiscale_mode``. + + If ``ratio_range`` is specified, a ratio will be sampled and be + multiplied with ``img_scale``. + If multiple scales are specified by ``img_scale``, a scale will be + sampled according to ``multiscale_mode``. + Otherwise, single scale will be used. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: Two new keys 'scale` and 'scale_idx` are added into \ + ``results``, which would be used by subsequent pipelines. 
+ """ + + if self.ratio_range is not None: + scale, scale_idx = self.random_sample_ratio( + self.img_scale[0], self.ratio_range) + elif len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + elif self.multiscale_mode == 'range': + scale, scale_idx = self.random_sample(self.img_scale) + elif self.multiscale_mode == 'value': + scale, scale_idx = self.random_select(self.img_scale) + else: + raise NotImplementedError + + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + for key in results.get('img_fields', ['img']): + if self.keep_ratio: + img, scale_factor = mmcv.imrescale( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results[key].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + results[key] = img + + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img_shape'] = img.shape + # in case that there is no padding + results['pad_shape'] = img.shape + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_bboxes(self, results): + """Resize bounding boxes with ``results['scale_factor']``.""" + for key in results.get('bbox_fields', []): + bboxes = results[key] * results['scale_factor'] + if self.bbox_clip_border: + img_shape = results['img_shape'] + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + results[key] = bboxes + + def _resize_keypoints(self, results): + for key in results.get('keypoints_fields', []): + keypointss = results[key].copy() + factors = results['scale_factor'] + 
assert factors[0]==factors[2] + assert factors[1]==factors[3] + #print('AAA', results['scale_factor']) + keypointss[:,:,0] *= factors[0] + keypointss[:,:,1] *= factors[1] + if self.bbox_clip_border: + img_shape = results['img_shape'] + keypointss[:,:, 0] = np.clip(keypointss[:,:, 0], 0, img_shape[1]) + keypointss[:,:, 1] = np.clip(keypointss[:,:, 1], 0, img_shape[0]) + results[key] = keypointss + + def _resize_masks(self, results): + """Resize masks with ``results['scale']``""" + for key in results.get('mask_fields', []): + if results[key] is None: + continue + if self.keep_ratio: + results[key] = results[key].rescale(results['scale']) + else: + results[key] = results[key].resize(results['img_shape'][:2]) + + def _resize_seg(self, results): + """Resize semantic segmentation map with ``results['scale']``.""" + for key in results.get('seg_fields', []): + if self.keep_ratio: + gt_seg = mmcv.imrescale( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + else: + gt_seg = mmcv.imresize( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + results['gt_semantic_seg'] = gt_seg + + def __call__(self, results): + """Call function to resize images, bounding boxes, masks, semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \ + 'keep_ratio' keys are added into result dict. 
+ """ + + if 'scale' not in results: + if 'scale_factor' in results: + img_shape = results['img'].shape[:2] + scale_factor = results['scale_factor'] + assert isinstance(scale_factor, float) + results['scale'] = tuple( + [int(x * scale_factor) for x in img_shape][::-1]) + else: + self._random_scale(results) + else: + if not self.override: + assert 'scale_factor' not in results, ( + 'scale and scale_factor cannot be both set.') + else: + results.pop('scale') + if 'scale_factor' in results: + results.pop('scale_factor') + self._random_scale(results) + + self._resize_img(results) + self._resize_bboxes(results) + self._resize_keypoints(results) + self._resize_masks(results) + self._resize_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'multiscale_mode={self.multiscale_mode}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'keep_ratio={self.keep_ratio})' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class RandomFlip(object): + """Flip the image & bbox & mask. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + When random flip is enabled, ``flip_ratio``/``direction`` can either be a + float/string or tuple of float/string. There are 3 flip modes: + + - ``flip_ratio`` is float, ``direction`` is string: the image will be + ``direction``ly flipped with probability of ``flip_ratio`` . + E.g., ``flip_ratio=0.5``, ``direction='horizontal'``, + then image will be horizontally flipped with probability of 0.5. + - ``flip_ratio`` is float, ``direction`` is list of string: the image wil + be ``direction[i]``ly flipped with probability of + ``flip_ratio/len(direction)``. 
+ E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``, + then image will be horizontally flipped with probability of 0.25, + vertically with probability of 0.25. + - ``flip_ratio`` is list of float, ``direction`` is list of string: + given ``len(flip_ratio) == len(direction)``, the image wil + be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``. + E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with probability + of 0.3, vertically with probability of 0.5 + + Args: + flip_ratio (float | list[float], optional): The flipping probability. + Default: None. + direction(str | list[str], optional): The flipping direction. Options + are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'. + If input is a list, the length must equal ``flip_ratio``. Each + element in ``flip_ratio`` indicates the flip probability of + corresponding direction. + """ + + def __init__(self, flip_ratio=None, direction='horizontal'): + if isinstance(flip_ratio, list): + assert mmcv.is_list_of(flip_ratio, float) + assert 0 <= sum(flip_ratio) <= 1 + elif isinstance(flip_ratio, float): + assert 0 <= flip_ratio <= 1 + elif flip_ratio is None: + pass + else: + raise ValueError('flip_ratios must be None, float, ' + 'or list of float') + self.flip_ratio = flip_ratio + + valid_directions = ['horizontal', 'vertical', 'diagonal'] + if isinstance(direction, str): + assert direction in valid_directions + elif isinstance(direction, list): + assert mmcv.is_list_of(direction, str) + assert set(direction).issubset(set(valid_directions)) + else: + raise ValueError('direction must be either str or list of str') + self.direction = direction + + if isinstance(flip_ratio, list): + assert len(self.flip_ratio) == len(self.direction) + + def bbox_flip(self, bboxes, img_shape, direction): + """Flip bboxes horizontally. 
+ + Args: + bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k) + img_shape (tuple[int]): Image shape (height, width) + direction (str): Flip direction. Options are 'horizontal', + 'vertical'. + + Returns: + numpy.ndarray: Flipped bounding boxes. + """ + + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.copy() + if direction == 'horizontal': + w = img_shape[1] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + elif direction == 'vertical': + h = img_shape[0] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + elif direction == 'diagonal': + w = img_shape[1] + h = img_shape[0] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + else: + raise ValueError(f"Invalid flipping direction '{direction}'") + return flipped + + def keypoints_flip(self, keypointss, img_shape, direction): + + assert direction == 'horizontal' + assert keypointss.shape[-1] == 3 + assert keypointss.shape[1]==5 + + assert keypointss.ndim==3 + flipped = keypointss.copy() + flip_order = [1,0,2,4,3] + for idx, a in enumerate(flip_order): + flipped[:,idx,:] = keypointss[:,a,:] + w = img_shape[1] + flipped[..., 0] = w - flipped[..., 0] + return flipped + + def __call__(self, results): + """Call function to flip bounding boxes, masks, semantic segmentation + maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction' keys are added \ + into result dict. 
+ """ + + if 'flip' not in results: + if isinstance(self.direction, list): + # None means non-flip + direction_list = self.direction + [None] + else: + # None means non-flip + direction_list = [self.direction, None] + + if isinstance(self.flip_ratio, list): + non_flip_ratio = 1 - sum(self.flip_ratio) + flip_ratio_list = self.flip_ratio + [non_flip_ratio] + else: + non_flip_ratio = 1 - self.flip_ratio + # exclude non-flip + single_ratio = self.flip_ratio / (len(direction_list) - 1) + flip_ratio_list = [single_ratio] * (len(direction_list) - + 1) + [non_flip_ratio] + + cur_dir = np.random.choice(direction_list, p=flip_ratio_list) + + results['flip'] = cur_dir is not None + if 'flip_direction' not in results: + results['flip_direction'] = cur_dir + if results['flip']: + # flip image + for key in results.get('img_fields', ['img']): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + # flip bboxes + for key in results.get('bbox_fields', []): + results[key] = self.bbox_flip(results[key], + results['img_shape'], + results['flip_direction']) + for key in results.get('keypoints_fields', []): + results[key] = self.keypoints_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip masks + for key in results.get('mask_fields', []): + results[key] = results[key].flip(results['flip_direction']) + + # flip segs + for key in results.get('seg_fields', []): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})' + + +@PIPELINES.register_module() +class Pad(object): + """Pad the image & mask. + + There are two padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. + Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", + + Args: + size (tuple, optional): Fixed padding size. 
+ size_divisor (int, optional): The divisor of padded size. + pad_val (float, optional): Padding value, 0 by default. + """ + + def __init__(self, size=None, size_divisor=None, pad_val=0): + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + # only one of size and size_divisor should be valid + assert size is not None or size_divisor is not None + assert size is None or size_divisor is None + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + for key in results.get('img_fields', ['img']): + if self.size is not None: + padded_img = mmcv.impad( + results[key], shape=self.size, pad_val=self.pad_val) + elif self.size_divisor is not None: + padded_img = mmcv.impad_to_multiple( + results[key], self.size_divisor, pad_val=self.pad_val) + results[key] = padded_img + results['pad_shape'] = padded_img.shape + results['pad_fixed_size'] = self.size + results['pad_size_divisor'] = self.size_divisor + + def _pad_masks(self, results): + """Pad masks according to ``results['pad_shape']``.""" + pad_shape = results['pad_shape'][:2] + for key in results.get('mask_fields', []): + results[key] = results[key].pad(pad_shape, pad_val=self.pad_val) + + def _pad_seg(self, results): + """Pad semantic segmentation map according to + ``results['pad_shape']``.""" + for key in results.get('seg_fields', []): + results[key] = mmcv.impad( + results[key], shape=results['pad_shape'][:2]) + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Updated result dict. 
+ """ + self._pad_img(results) + self._pad_masks(results) + self._pad_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.size}, ' + repr_str += f'size_divisor={self.size_divisor}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str + + +@PIPELINES.register_module() +class Normalize(object): + """Normalize the image. + + Added key is "img_norm_cfg". + + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. + """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + def __call__(self, results): + """Call function to normalize images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Normalized results, 'img_norm_cfg' key is added into + result dict. + """ + for key in results.get('img_fields', ['img']): + results[key] = mmcv.imnormalize(results[key], self.mean, self.std, + self.to_rgb) + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' + return repr_str + + +@PIPELINES.register_module() +class RandomCrop(object): + """Random crop the image & bboxes & masks. + + The absolute `crop_size` is sampled based on `crop_type` and `image_size`, + then the cropped results are generated. + + Args: + crop_size (tuple): The relative ratio or absolute pixels of + height and width. + crop_type (str, optional): one of "relative_range", "relative", + "absolute", "absolute_range". "relative" randomly crops + (h * crop_size[0], w * crop_size[1]) part from an input of size + (h, w). 
"relative_range" uniformly samples relative crop size from + range [crop_size[0], 1] and [crop_size[1], 1] for height and width + respectively. "absolute" crops from an input with absolute size + (crop_size[0], crop_size[1]). "absolute_range" uniformly samples + crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w + in range [crop_size[0], min(w, crop_size[1])]. Default "absolute". + allow_negative_crop (bool, optional): Whether to allow a crop that does + not contain any bbox area. Default False. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + + Note: + - If the image is smaller than the absolute crop size, return the + original image. + - The keys for bboxes, labels and masks must be aligned. That is, + `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and + `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and + `gt_masks_ignore`. + - If the crop does not contain any gt-bbox region and + `allow_negative_crop` is set to False, skip this image. + """ + + def __init__(self, + crop_size, + crop_type='absolute', + allow_negative_crop=False, + bbox_clip_border=True): + if crop_type not in [ + 'relative_range', 'relative', 'absolute', 'absolute_range' + ]: + raise ValueError(f'Invalid crop_type {crop_type}.') + if crop_type in ['absolute', 'absolute_range']: + assert crop_size[0] > 0 and crop_size[1] > 0 + assert isinstance(crop_size[0], int) and isinstance( + crop_size[1], int) + else: + assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1 + self.crop_size = crop_size + self.crop_type = crop_type + self.allow_negative_crop = allow_negative_crop + self.bbox_clip_border = bbox_clip_border + # The key correspondence from bboxes to labels and masks. 
+ self.bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + self.bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + + def _crop_data(self, results, crop_size, allow_negative_crop): + """Function to randomly crop images, bounding boxes, masks, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + crop_size (tuple): Expected absolute size after cropping, (h, w). + allow_negative_crop (bool): Whether to allow a crop that does not + contain any bbox area. Default to False. + + Returns: + dict: Randomly cropped results, 'img_shape' key in result dict is + updated according to crop size. + """ + assert crop_size[0] > 0 and crop_size[1] > 0 + for key in results.get('img_fields', ['img']): + img = results[key] + margin_h = max(img.shape[0] - crop_size[0], 0) + margin_w = max(img.shape[1] - crop_size[1], 0) + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] + crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] + + # crop the image + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] + img_shape = img.shape + results[key] = img + results['img_shape'] = img_shape + + # crop bboxes accordingly and clip to the image boundary + for key in results.get('bbox_fields', []): + # e.g. gt_bboxes and gt_bboxes_ignore + bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h], + dtype=np.float32) + bboxes = results[key] - bbox_offset + if self.bbox_clip_border: + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & ( + bboxes[:, 3] > bboxes[:, 1]) + # If the crop does not contain any gt-bbox area and + # allow_negative_crop is False, skip this image. 
+ if (key == 'gt_bboxes' and not valid_inds.any() + and not allow_negative_crop): + return None + results[key] = bboxes[valid_inds, :] + # label fields. e.g. gt_labels and gt_labels_ignore + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][valid_inds] + + # mask fields, e.g. gt_masks and gt_masks_ignore + mask_key = self.bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + + # crop semantic seg + for key in results.get('seg_fields', []): + results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2] + + return results + + def _get_crop_size(self, image_size): + """Randomly generates the absolute crop size based on `crop_type` and + `image_size`. + + Args: + image_size (tuple): (h, w). + + Returns: + crop_size (tuple): (crop_h, crop_w) in absolute pixels. + """ + h, w = image_size + if self.crop_type == 'absolute': + return (min(self.crop_size[0], h), min(self.crop_size[1], w)) + elif self.crop_type == 'absolute_range': + assert self.crop_size[0] <= self.crop_size[1] + crop_h = np.random.randint( + min(h, self.crop_size[0]), + min(h, self.crop_size[1]) + 1) + crop_w = np.random.randint( + min(w, self.crop_size[0]), + min(w, self.crop_size[1]) + 1) + return crop_h, crop_w + elif self.crop_type == 'relative': + crop_h, crop_w = self.crop_size + return int(h * crop_h + 0.5), int(w * crop_w + 0.5) + elif self.crop_type == 'relative_range': + crop_size = np.asarray(self.crop_size, dtype=np.float32) + crop_h, crop_w = crop_size + np.random.rand(2) * (1 - crop_size) + return int(h * crop_h + 0.5), int(w * crop_w + 0.5) + + def __call__(self, results): + """Call function to randomly crop images, bounding boxes, masks, + semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. 
+ + Returns: + dict: Randomly cropped results, 'img_shape' key in result dict is + updated according to crop size. + """ + image_size = results['img'].shape[:2] + crop_size = self._get_crop_size(image_size) + results = self._crop_data(results, crop_size, self.allow_negative_crop) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(crop_size={self.crop_size}, ' + repr_str += f'crop_type={self.crop_type}, ' + repr_str += f'allow_negative_crop={self.allow_negative_crop}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + +@PIPELINES.register_module() +class RandomSquareCrop(object): + """Random crop the image & bboxes, the cropped patches have minimum IoU + requirement with original image & bboxes, the IoU threshold is randomly + selected from min_ious. + + Args: + min_ious (tuple): minimum IoU threshold for all intersections with + bounding boxes + min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, + where a >= min_crop_size). + + Note: + The keys for bboxes, labels and masks should be paired. That is, \ + `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ + `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. + """ + + def __init__(self, crop_ratio_range=None, crop_choice=None, bbox_clip_border=True): + + self.crop_ratio_range = crop_ratio_range + self.crop_choice = crop_choice + self.bbox_clip_border = bbox_clip_border + + assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) + if self.crop_ratio_range is not None: + self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range + + self.bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + self.bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + + def __call__(self, results): + """Call function to crop images and bounding boxes with minimum IoU + constraint. + + Args: + results (dict): Result dict from loading pipeline. 
+ + Returns: + dict: Result dict with images and bounding boxes cropped, \ + 'img_shape' key is updated. + """ + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + assert 'bbox_fields' in results + assert 'gt_bboxes' in results + boxes = results['gt_bboxes'] + #boxes = [results[key] for key in results['bbox_fields']] + #boxes = np.concatenate(boxes, 0) + h, w, c = img.shape + scale_retry = 0 + if self.crop_ratio_range is not None: + max_scale = self.crop_ratio_max + else: + max_scale = np.amax(self.crop_choice) + #max_scale = max(max_scale, float(max(w,h))/min(w,h)) + + while True: + scale_retry += 1 + + if scale_retry==1 or max_scale>1.0: + if self.crop_ratio_range is not None: + scale = np.random.uniform(self.crop_ratio_min, + self.crop_ratio_max) + elif self.crop_choice is not None: + scale = np.random.choice(self.crop_choice) + else: + #scale = min(scale*1.2, max_scale) + scale = scale*1.2 + + # print(scale, img.shape[:2], boxes) + # import cv2 + # cv2.imwrite('aaa.png', img) + + for i in range(250): + short_side = min(w, h) + cw = int(scale * short_side) + ch = cw + + # TODO +1 + if w==cw: + left = 0 + elif w>cw: + #left = random.uniform(w - cw) + left = random.randint(0, w - cw) + else: + left = random.randint(w - cw, 0) + if h==ch: + top = 0 + elif h>ch: + #top = random.uniform(h - ch) + top = random.randint(0, h - ch) + else: + top = random.randint(h - ch, 0) + + patch = np.array( + (int(left), int(top), int(left + cw), int(top + ch)), dtype=np.int) + + # center of boxes should inside the crop img + # only adjust boxes and instance masks when the gt is not empty + # adjust boxes + def is_center_of_bboxes_in_patch(boxes, patch): + # TODO >= + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = ((center[:, 0] > patch[0]) * + (center[:, 1] > patch[1]) * + (center[:, 0] < patch[2]) * + (center[:, 1] < patch[3])) + return mask + + mask = is_center_of_bboxes_in_patch(boxes, 
patch) + if not mask.any(): + continue + for key in results.get('bbox_fields', []): + boxes = results[key].copy() + #print('BBB', key, boxes.shape) + mask = is_center_of_bboxes_in_patch(boxes, patch) + boxes = boxes[mask] + if self.bbox_clip_border: + boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) + boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) + boxes -= np.tile(patch[:2], 2) + + results[key] = boxes + # labels + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][mask] + + # keypoints field + if key=='gt_bboxes': + for kps_key in results.get('keypoints_fields', []): + keypointss = results[kps_key].copy() + #print('AAAA', kps_key, keypointss.shape, mask.shape) + keypointss = keypointss[mask,:,:] + if self.bbox_clip_border: + keypointss[:,:,:2] = keypointss[:,:,:2].clip(max=patch[2:]) + keypointss[:,:,:2] = keypointss[:,:,:2].clip(min=patch[:2]) + #keypointss[:,:,:2] -= np.tile(patch[:2], 2) + keypointss[:,:,0] -= patch[0] + keypointss[:,:,1] -= patch[1] + results[kps_key] = keypointss + + # mask fields + mask_key = self.bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][mask.nonzero() + [0]].crop(patch) + + # adjust the img no matter whether the gt is empty before crop + #img = img[patch[1]:patch[3], patch[0]:patch[2]] + rimg = np.ones( (ch, cw, 3), dtype=img.dtype) * 128 + patch_from = patch.copy() + patch_from[0] = max(0, patch_from[0]) + patch_from[1] = max(0, patch_from[1]) + patch_from[2] = min(img.shape[1], patch_from[2]) + patch_from[3] = min(img.shape[0], patch_from[3]) + patch_to = patch.copy() + patch_to[0] = max(0, patch_to[0]*-1) + patch_to[1] = max(0, patch_to[1]*-1) + patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) + patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) + rimg[patch_to[1]:patch_to[3], patch_to[0]:patch_to[2],:] = img[patch_from[1]:patch_from[3], patch_from[0]:patch_from[2], :] + #print(img.shape, scale, patch, patch_from, 
patch_to, rimg.shape) + img = rimg + results['img'] = img + results['img_shape'] = img.shape + + # seg fields + #for key in results.get('seg_fields', []): + # results[key] = results[key][patch[1]:patch[3], + # patch[0]:patch[2]] + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(min_ious={self.min_iou}, ' + repr_str += f'crop_size={self.crop_size})' + return repr_str + +@PIPELINES.register_module() +class SegRescale(object): + """Rescale semantic segmentation maps. + + Args: + scale_factor (float): The scale factor of the final output. + backend (str): Image rescale backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + """ + + def __init__(self, scale_factor=1, backend='cv2'): + self.scale_factor = scale_factor + self.backend = backend + + def __call__(self, results): + """Call function to scale the semantic segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with semantic segmentation map scaled. + """ + + for key in results.get('seg_fields', []): + if self.scale_factor != 1: + results[key] = mmcv.imrescale( + results[key], + self.scale_factor, + interpolation='nearest', + backend=self.backend) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(scale_factor={self.scale_factor})' + + +@PIPELINES.register_module() +class PhotoMetricDistortion(object): + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + 9. random grayscale + + Args: + brightness_delta (int): delta of brightness. 
+ contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. + gray_prob (float): prob of grayscale. + """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18, + gray_prob=0.0): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + self.gray_prob = gray_prob + + def __call__(self, results): + """Call function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. + """ + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + assert img.dtype == np.float32, \ + 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ + ' please set "to_float32=True" in "LoadImageFromFile" pipeline' + # random brightness + if random.randint(2): + delta = random.uniform(-self.brightness_delta, + self.brightness_delta) + img += delta + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # convert color from BGR to HSV + img = mmcv.bgr2hsv(img) + + # random saturation + if random.randint(2): + img[..., 1] *= random.uniform(self.saturation_lower, + self.saturation_upper) + + # random hue + if random.randint(2): + img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = mmcv.hsv2bgr(img) + + # random contrast + if mode == 0: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + 
self.contrast_upper) + img *= alpha + + # randomly swap channels + if random.randint(2): + img = img[..., random.permutation(3)] + + if self.gray_prob>0.0: + if random.random() self.prob: + return results + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + + h, w, c = img.shape + ratio = random.uniform(self.min_ratio, self.max_ratio) + # speedup expand when meets large image + if np.all(self.mean == self.mean[0]): + expand_img = np.empty((int(h * ratio), int(w * ratio), c), + img.dtype) + expand_img.fill(self.mean[0]) + else: + expand_img = np.full((int(h * ratio), int(w * ratio), c), + self.mean, + dtype=img.dtype) + left = int(random.uniform(0, w * ratio - w)) + top = int(random.uniform(0, h * ratio - h)) + expand_img[top:top + h, left:left + w] = img + + results['img'] = expand_img + # expand bboxes + for key in results.get('bbox_fields', []): + results[key] = results[key] + np.tile( + (left, top), 2).astype(results[key].dtype) + + # expand masks + for key in results.get('mask_fields', []): + results[key] = results[key].expand( + int(h * ratio), int(w * ratio), top, left) + + # expand segs + for key in results.get('seg_fields', []): + gt_seg = results[key] + expand_gt_seg = np.full((int(h * ratio), int(w * ratio)), + self.seg_ignore_label, + dtype=gt_seg.dtype) + expand_gt_seg[top:top + h, left:left + w] = gt_seg + results[key] = expand_gt_seg + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label})' + return repr_str + + +@PIPELINES.register_module() +class MinIoURandomCrop(object): + """Random crop the image & bboxes, the cropped patches have minimum IoU + requirement with original image & bboxes, the IoU threshold is randomly + selected from min_ious. 
+ + Args: + min_ious (tuple): minimum IoU threshold for all intersections with + bounding boxes + min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, + where a >= min_crop_size). + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + + Note: + The keys for bboxes, labels and masks should be paired. That is, \ + `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ + `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. + """ + + def __init__(self, + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3, + bbox_clip_border=True): + # 1: return ori img + self.min_ious = min_ious + self.sample_mode = (1, *min_ious, 0) + self.min_crop_size = min_crop_size + self.bbox_clip_border = bbox_clip_border + self.bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + self.bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + + def __call__(self, results): + """Call function to crop images and bounding boxes with minimum IoU + constraint. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images and bounding boxes cropped, \ + 'img_shape' key is updated. 
+ """ + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + assert 'bbox_fields' in results + boxes = [results[key] for key in results['bbox_fields']] + boxes = np.concatenate(boxes, 0) + h, w, c = img.shape + while True: + mode = random.choice(self.sample_mode) + self.mode = mode + if mode == 1: + return results + + min_iou = mode + for i in range(50): + new_w = random.uniform(self.min_crop_size * w, w) + new_h = random.uniform(self.min_crop_size * h, h) + + # h / w in [0.5, 2] + if new_h / new_w < 0.5 or new_h / new_w > 2: + continue + + left = random.uniform(w - new_w) + top = random.uniform(h - new_h) + + patch = np.array( + (int(left), int(top), int(left + new_w), int(top + new_h))) + # Line or point crop is not allowed + if patch[2] == patch[0] or patch[3] == patch[1]: + continue + overlaps = bbox_overlaps( + patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1) + if len(overlaps) > 0 and overlaps.min() < min_iou: + continue + + # center of boxes should inside the crop img + # only adjust boxes and instance masks when the gt is not empty + if len(overlaps) > 0: + # adjust boxes + def is_center_of_bboxes_in_patch(boxes, patch): + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = ((center[:, 0] > patch[0]) * + (center[:, 1] > patch[1]) * + (center[:, 0] < patch[2]) * + (center[:, 1] < patch[3])) + return mask + + mask = is_center_of_bboxes_in_patch(boxes, patch) + if not mask.any(): + continue + for key in results.get('bbox_fields', []): + boxes = results[key].copy() + mask = is_center_of_bboxes_in_patch(boxes, patch) + boxes = boxes[mask] + if self.bbox_clip_border: + boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) + boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) + boxes -= np.tile(patch[:2], 2) + + results[key] = boxes + # labels + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][mask] + + # mask fields + mask_key 
= self.bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][ + mask.nonzero()[0]].crop(patch) + # adjust the img no matter whether the gt is empty before crop + img = img[patch[1]:patch[3], patch[0]:patch[2]] + results['img'] = img + results['img_shape'] = img.shape + + # seg fields + for key in results.get('seg_fields', []): + results[key] = results[key][patch[1]:patch[3], + patch[0]:patch[2]] + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(min_ious={self.min_ious}, ' + repr_str += f'min_crop_size={self.min_crop_size}), ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class Corrupt(object): + """Corruption augmentation. + + Corruption transforms implemented based on + `imagecorruptions `_. + + Args: + corruption (str): Corruption name. + severity (int, optional): The severity of corruption. Default: 1. + """ + + def __init__(self, corruption, severity=1): + self.corruption = corruption + self.severity = severity + + def __call__(self, results): + """Call function to corrupt image. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images corrupted. + """ + + if corrupt is None: + raise RuntimeError('imagecorruptions is not installed') + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + results['img'] = corrupt( + results['img'].astype(np.uint8), + corruption_name=self.corruption, + severity=self.severity) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(corruption={self.corruption}, ' + repr_str += f'severity={self.severity})' + return repr_str + + +@PIPELINES.register_module() +class Albu(object): + """Albumentation augmentation. + + Adds custom transformations from Albumentations library. 
+ Please, visit `https://albumentations.readthedocs.io` + to get more information. + + An example of ``transforms`` is as followed: + + .. code-block:: + + [ + dict( + type='ShiftScaleRotate', + shift_limit=0.0625, + scale_limit=0.0, + rotate_limit=0, + interpolation=1, + p=0.5), + dict( + type='RandomBrightnessContrast', + brightness_limit=[0.1, 0.3], + contrast_limit=[0.1, 0.3], + p=0.2), + dict(type='ChannelShuffle', p=0.1), + dict( + type='OneOf', + transforms=[ + dict(type='Blur', blur_limit=3, p=1.0), + dict(type='MedianBlur', blur_limit=3, p=1.0) + ], + p=0.1), + ] + + Args: + transforms (list[dict]): A list of albu transformations + bbox_params (dict): Bbox_params for albumentation `Compose` + keymap (dict): Contains {'input key':'albumentation-style key'} + skip_img_without_anno (bool): Whether to skip the image if no ann left + after aug + """ + + def __init__(self, + transforms, + bbox_params=None, + keymap=None, + update_pad_shape=False, + skip_img_without_anno=False): + if Compose is None: + raise RuntimeError('albumentations is not installed') + + self.transforms = transforms + self.filter_lost_elements = False + self.update_pad_shape = update_pad_shape + self.skip_img_without_anno = skip_img_without_anno + + # A simple workaround to remove masks without boxes + if (isinstance(bbox_params, dict) and 'label_fields' in bbox_params + and 'filter_lost_elements' in bbox_params): + self.filter_lost_elements = True + self.origin_label_fields = bbox_params['label_fields'] + bbox_params['label_fields'] = ['idx_mapper'] + del bbox_params['filter_lost_elements'] + + self.bbox_params = ( + self.albu_builder(bbox_params) if bbox_params else None) + self.aug = Compose([self.albu_builder(t) for t in self.transforms], + bbox_params=self.bbox_params) + + if not keymap: + self.keymap_to_albu = { + 'img': 'image', + 'gt_masks': 'masks', + 'gt_bboxes': 'bboxes' + } + else: + self.keymap_to_albu = keymap + self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()} 
+ + def albu_builder(self, cfg): + """Import a module from albumentations. + + It inherits some of :func:`build_from_cfg` logic. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + + Returns: + obj: The constructed object. + """ + + assert isinstance(cfg, dict) and 'type' in cfg + args = cfg.copy() + + obj_type = args.pop('type') + if mmcv.is_str(obj_type): + if albumentations is None: + raise RuntimeError('albumentations is not installed') + obj_cls = getattr(albumentations, obj_type) + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + + if 'transforms' in args: + args['transforms'] = [ + self.albu_builder(transform) + for transform in args['transforms'] + ] + + return obj_cls(**args) + + @staticmethod + def mapper(d, keymap): + """Dictionary mapper. Renames keys according to keymap provided. + + Args: + d (dict): old dict + keymap (dict): {'old_key':'new_key'} + Returns: + dict: new dict. 
+ """ + + updated_dict = {} + for k, v in zip(d.keys(), d.values()): + new_k = keymap.get(k, k) + updated_dict[new_k] = d[k] + return updated_dict + + def __call__(self, results): + # dict to albumentations format + results = self.mapper(results, self.keymap_to_albu) + # TODO: add bbox_fields + if 'bboxes' in results: + # to list of boxes + if isinstance(results['bboxes'], np.ndarray): + results['bboxes'] = [x for x in results['bboxes']] + # add pseudo-field for filtration + if self.filter_lost_elements: + results['idx_mapper'] = np.arange(len(results['bboxes'])) + + # TODO: Support mask structure in albu + if 'masks' in results: + if isinstance(results['masks'], PolygonMasks): + raise NotImplementedError( + 'Albu only supports BitMap masks now') + ori_masks = results['masks'] + if albumentations.__version__ < '0.5': + results['masks'] = results['masks'].masks + else: + results['masks'] = [mask for mask in results['masks'].masks] + + results = self.aug(**results) + + if 'bboxes' in results: + if isinstance(results['bboxes'], list): + results['bboxes'] = np.array( + results['bboxes'], dtype=np.float32) + results['bboxes'] = results['bboxes'].reshape(-1, 4) + + # filter label_fields + if self.filter_lost_elements: + + for label in self.origin_label_fields: + results[label] = np.array( + [results[label][i] for i in results['idx_mapper']]) + if 'masks' in results: + results['masks'] = np.array( + [results['masks'][i] for i in results['idx_mapper']]) + results['masks'] = ori_masks.__class__( + results['masks'], results['image'].shape[0], + results['image'].shape[1]) + + if (not len(results['idx_mapper']) + and self.skip_img_without_anno): + return None + + if 'gt_labels' in results: + if isinstance(results['gt_labels'], list): + results['gt_labels'] = np.array(results['gt_labels']) + results['gt_labels'] = results['gt_labels'].astype(np.int64) + + # back to the original format + results = self.mapper(results, self.keymap_back) + + # update final shape + if 
self.update_pad_shape: + results['pad_shape'] = results['img'].shape + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + f'(transforms={self.transforms})' + return repr_str + + +@PIPELINES.register_module() +class RandomCenterCropPad(object): + """Random center crop and random around padding for CornerNet. + + This operation generates randomly cropped image from the original image and + pads it simultaneously. Different from :class:`RandomCrop`, the output + shape may not equal to ``crop_size`` strictly. We choose a random value + from ``ratios`` and the output shape could be larger or smaller than + ``crop_size``. The padding operation is also different from :class:`Pad`, + here we use around padding instead of right-bottom padding. + + The relation between output image (padding image) and original image: + + .. code:: text + + output image + + +----------------------------+ + | padded area | + +------|----------------------------|----------+ + | | cropped area | | + | | +---------------+ | | + | | | . center | | | original image + | | | range | | | + | | +---------------+ | | + +------|----------------------------|----------+ + | padded area | + +----------------------------+ + + There are 5 main areas in the figure: + + - output image: output image of this operation, also called padding + image in following instruction. + - original image: input image of this operation. + - padded area: non-intersect area of output image and original image. + - cropped area: the overlap of output image and original image. + - center range: a smaller area where random center chosen from. + center range is computed by ``border`` and original image's shape + to avoid our random center is too close to original image's border. + + Also this operation act differently in train and test mode, the summary + pipeline is listed below. + + Train pipeline: + + 1. 
Choose a ``random_ratio`` from ``ratios``, the shape of padding image + will be ``random_ratio * crop_size``. + 2. Choose a ``random_center`` in center range. + 3. Generate padding image with center matches the ``random_center``. + 4. Initialize the padding image with pixel value equals to ``mean``. + 5. Copy the cropped area to padding image. + 6. Refine annotations. + + Test pipeline: + + 1. Compute output shape according to ``test_pad_mode``. + 2. Generate padding image with center matches the original image + center. + 3. Initialize the padding image with pixel value equals to ``mean``. + 4. Copy the ``cropped area`` to padding image. + + Args: + crop_size (tuple | None): expected size after crop, final size will + computed according to ratio. Requires (h, w) in train mode, and + None in test mode. + ratios (tuple): random select a ratio from tuple and crop image to + (crop_size[0] * ratio) * (crop_size[1] * ratio). + Only available in train mode. + border (int): max distance from center select area to image border. + Only available in train mode. + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB. + test_mode (bool): whether involve random variables in transform. + In train mode, crop_size is fixed, center coords and ratio is + random selected from predefined lists. In test mode, crop_size + is image's original shape, center coords and ratio is fixed. + test_pad_mode (tuple): padding method and padding shape value, only + available in test mode. Default is using 'logical_or' with + 127 as padding shape value. + + - 'logical_or': final_shape = input_shape | padding_shape_value + - 'size_divisor': final_shape = int( + ceil(input_shape / padding_shape_value) * padding_shape_value) + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. 
+ """ + + def __init__(self, + crop_size=None, + ratios=(0.9, 1.0, 1.1), + border=128, + mean=None, + std=None, + to_rgb=None, + test_mode=False, + test_pad_mode=('logical_or', 127), + bbox_clip_border=True): + if test_mode: + assert crop_size is None, 'crop_size must be None in test mode' + assert ratios is None, 'ratios must be None in test mode' + assert border is None, 'border must be None in test mode' + assert isinstance(test_pad_mode, (list, tuple)) + assert test_pad_mode[0] in ['logical_or', 'size_divisor'] + else: + assert isinstance(crop_size, (list, tuple)) + assert crop_size[0] > 0 and crop_size[1] > 0, ( + 'crop_size must > 0 in train mode') + assert isinstance(ratios, (list, tuple)) + assert test_pad_mode is None, ( + 'test_pad_mode must be None in train mode') + + self.crop_size = crop_size + self.ratios = ratios + self.border = border + # We do not set default value to mean, std and to_rgb because these + # hyper-parameters are easy to forget but could affect the performance. + # Please use the same setting as Normalize for performance assurance. + assert mean is not None and std is not None and to_rgb is not None + self.to_rgb = to_rgb + self.input_mean = mean + self.input_std = std + if to_rgb: + self.mean = mean[::-1] + self.std = std[::-1] + else: + self.mean = mean + self.std = std + self.test_mode = test_mode + self.test_pad_mode = test_pad_mode + self.bbox_clip_border = bbox_clip_border + + def _get_border(self, border, size): + """Get final border for the target size. + + This function generates a ``final_border`` according to image's shape. + The area between ``final_border`` and ``size - final_border`` is the + ``center range``. We randomly choose center from the ``center range`` + to avoid our random center is too close to original image's border. + Also ``center range`` should be larger than 0. + + Args: + border (int): The initial border, default is 128. + size (int): The width or height of original image. 
+ Returns: + int: The final border. + """ + k = 2 * border / size + i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k))) + return border // i + + def _filter_boxes(self, patch, boxes): + """Check whether the center of each box is in the patch. + + Args: + patch (list[int]): The cropped area, [left, top, right, bottom]. + boxes (numpy array, (N x 4)): Ground truth boxes. + + Returns: + mask (numpy array, (N,)): Each box is inside or outside the patch. + """ + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * ( + center[:, 0] < patch[2]) * ( + center[:, 1] < patch[3]) + return mask + + def _crop_image_and_paste(self, image, center, size): + """Crop image with a given center and size, then paste the cropped + image to a blank image with two centers align. + + This function is equivalent to generating a blank image with ``size`` + as its shape. Then cover it on the original image with two centers ( + the center of blank image and the random center of original image) + aligned. The overlap area is paste from the original image and the + outside area is filled with ``mean pixel``. + + Args: + image (np array, H x W x C): Original image. + center (list[int]): Target crop center coord. + size (list[int]): Target crop size. [target_h, target_w] + + Returns: + cropped_img (np array, target_h x target_w x C): Cropped image. + border (np array, 4): The distance of four border of + ``cropped_img`` to the original image area, [top, bottom, + left, right] + patch (list[int]): The cropped area, [left, top, right, bottom]. 
+ """ + center_y, center_x = center + target_h, target_w = size + img_h, img_w, img_c = image.shape + + x0 = max(0, center_x - target_w // 2) + x1 = min(center_x + target_w // 2, img_w) + y0 = max(0, center_y - target_h // 2) + y1 = min(center_y + target_h // 2, img_h) + patch = np.array((int(x0), int(y0), int(x1), int(y1))) + + left, right = center_x - x0, x1 - center_x + top, bottom = center_y - y0, y1 - center_y + + cropped_center_y, cropped_center_x = target_h // 2, target_w // 2 + cropped_img = np.zeros((target_h, target_w, img_c), dtype=image.dtype) + for i in range(img_c): + cropped_img[:, :, i] += self.mean[i] + y_slice = slice(cropped_center_y - top, cropped_center_y + bottom) + x_slice = slice(cropped_center_x - left, cropped_center_x + right) + cropped_img[y_slice, x_slice, :] = image[y0:y1, x0:x1, :] + + border = np.array([ + cropped_center_y - top, cropped_center_y + bottom, + cropped_center_x - left, cropped_center_x + right + ], + dtype=np.float32) + + return cropped_img, border, patch + + def _train_aug(self, results): + """Random crop and around padding the original image. + + Args: + results (dict): Image infomations in the augment pipeline. + + Returns: + results (dict): The updated dict. + """ + img = results['img'] + h, w, c = img.shape + boxes = results['gt_bboxes'] + while True: + scale = random.choice(self.ratios) + new_h = int(self.crop_size[0] * scale) + new_w = int(self.crop_size[1] * scale) + h_border = self._get_border(self.border, h) + w_border = self._get_border(self.border, w) + + for i in range(50): + center_x = random.randint(low=w_border, high=w - w_border) + center_y = random.randint(low=h_border, high=h - h_border) + + cropped_img, border, patch = self._crop_image_and_paste( + img, [center_y, center_x], [new_h, new_w]) + + mask = self._filter_boxes(patch, boxes) + # if image do not have valid bbox, any crop patch is valid. 
+ if not mask.any() and len(boxes) > 0: + continue + + results['img'] = cropped_img + results['img_shape'] = cropped_img.shape + results['pad_shape'] = cropped_img.shape + + x0, y0, x1, y1 = patch + + left_w, top_h = center_x - x0, center_y - y0 + cropped_center_x, cropped_center_y = new_w // 2, new_h // 2 + + # crop bboxes accordingly and clip to the image boundary + for key in results.get('bbox_fields', []): + mask = self._filter_boxes(patch, results[key]) + bboxes = results[key][mask] + bboxes[:, 0:4:2] += cropped_center_x - left_w - x0 + bboxes[:, 1:4:2] += cropped_center_y - top_h - y0 + if self.bbox_clip_border: + bboxes[:, 0:4:2] = np.clip(bboxes[:, 0:4:2], 0, new_w) + bboxes[:, 1:4:2] = np.clip(bboxes[:, 1:4:2], 0, new_h) + keep = (bboxes[:, 2] > bboxes[:, 0]) & ( + bboxes[:, 3] > bboxes[:, 1]) + bboxes = bboxes[keep] + results[key] = bboxes + if key in ['gt_bboxes']: + if 'gt_labels' in results: + labels = results['gt_labels'][mask] + labels = labels[keep] + results['gt_labels'] = labels + if 'gt_masks' in results: + raise NotImplementedError( + 'RandomCenterCropPad only supports bbox.') + + # crop semantic seg + for key in results.get('seg_fields', []): + raise NotImplementedError( + 'RandomCenterCropPad only supports bbox.') + return results + + def _test_aug(self, results): + """Around padding the original image without cropping. + + The padding mode and value are from ``test_pad_mode``. + + Args: + results (dict): Image infomations in the augment pipeline. + + Returns: + results (dict): The updated dict. 
+ """ + img = results['img'] + h, w, c = img.shape + results['img_shape'] = img.shape + if self.test_pad_mode[0] in ['logical_or']: + target_h = h | self.test_pad_mode[1] + target_w = w | self.test_pad_mode[1] + elif self.test_pad_mode[0] in ['size_divisor']: + divisor = self.test_pad_mode[1] + target_h = int(np.ceil(h / divisor)) * divisor + target_w = int(np.ceil(w / divisor)) * divisor + else: + raise NotImplementedError( + 'RandomCenterCropPad only support two testing pad mode:' + 'logical-or and size_divisor.') + + cropped_img, border, _ = self._crop_image_and_paste( + img, [h // 2, w // 2], [target_h, target_w]) + results['img'] = cropped_img + results['pad_shape'] = cropped_img.shape + results['border'] = border + return results + + def __call__(self, results): + img = results['img'] + assert img.dtype == np.float32, ( + 'RandomCenterCropPad needs the input image of dtype np.float32,' + ' please set "to_float32=True" in "LoadImageFromFile" pipeline') + h, w, c = img.shape + assert c == len(self.mean) + if self.test_mode: + return self._test_aug(results) + else: + return self._train_aug(results) + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(crop_size={self.crop_size}, ' + repr_str += f'ratios={self.ratios}, ' + repr_str += f'border={self.border}, ' + repr_str += f'mean={self.input_mean}, ' + repr_str += f'std={self.input_std}, ' + repr_str += f'to_rgb={self.to_rgb}, ' + repr_str += f'test_mode={self.test_mode}, ' + repr_str += f'test_pad_mode={self.test_pad_mode}), ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class CutOut(object): + """CutOut operation. + + Randomly drop some regions of image used in + `Cutout `_. + + Args: + n_holes (int | tuple[int, int]): Number of regions to be dropped. + If it is given as a list, number of holes will be randomly + selected from the closed interval [`n_holes[0]`, `n_holes[1]`]. 
+ cutout_shape (tuple[int, int] | list[tuple[int, int]]): The candidate + shape of dropped regions. It can be `tuple[int, int]` to use a + fixed cutout shape, or `list[tuple[int, int]]` to randomly choose + shape from the list. + cutout_ratio (tuple[float, float] | list[tuple[float, float]]): The + candidate ratio of dropped regions. It can be `tuple[float, float]` + to use a fixed ratio or `list[tuple[float, float]]` to randomly + choose ratio from the list. Please note that `cutout_shape` + and `cutout_ratio` cannot be both given at the same time. + fill_in (tuple[float, float, float] | tuple[int, int, int]): The value + of pixel to fill in the dropped regions. Default: (0, 0, 0). + """ + + def __init__(self, + n_holes, + cutout_shape=None, + cutout_ratio=None, + fill_in=(0, 0, 0)): + + assert (cutout_shape is None) ^ (cutout_ratio is None), \ + 'Either cutout_shape or cutout_ratio should be specified.' + assert (isinstance(cutout_shape, (list, tuple)) + or isinstance(cutout_ratio, (list, tuple))) + if isinstance(n_holes, tuple): + assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1] + else: + n_holes = (n_holes, n_holes) + self.n_holes = n_holes + self.fill_in = fill_in + self.with_ratio = cutout_ratio is not None + self.candidates = cutout_ratio if self.with_ratio else cutout_shape + if not isinstance(self.candidates, list): + self.candidates = [self.candidates] + + def __call__(self, results): + """Call function to drop some regions of image.""" + h, w, c = results['img'].shape + n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1) + for _ in range(n_holes): + x1 = np.random.randint(0, w) + y1 = np.random.randint(0, h) + index = np.random.randint(0, len(self.candidates)) + if not self.with_ratio: + cutout_w, cutout_h = self.candidates[index] + else: + cutout_w = int(self.candidates[index][0] * w) + cutout_h = int(self.candidates[index][1] * h) + + x2 = np.clip(x1 + cutout_w, 0, w) + y2 = np.clip(y1 + cutout_h, 0, h) + results['img'][y1:y2, 
x1:x2, :] = self.fill_in + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(n_holes={self.n_holes}, ' + repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio + else f'cutout_shape={self.candidates}, ') + repr_str += f'fill_in={self.fill_in})' + return repr_str diff --git a/insightface/detection/scrfd/mmdet/datasets/retinaface.py b/insightface/detection/scrfd/mmdet/datasets/retinaface.py new file mode 100755 index 0000000000000000000000000000000000000000..ed9c8b4f0c883cd6318418df424d844b7b9f6423 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/retinaface.py @@ -0,0 +1,169 @@ +import itertools +import logging +import os.path as osp +import tempfile +from collections import OrderedDict + +import mmcv +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable + +from mmdet.core import eval_recalls +from .builder import DATASETS +from .custom import CustomDataset + +try: + import pycocotools + if not hasattr(pycocotools, '__sphinx_mock__'): # for doc generation + assert pycocotools.__version__ >= '12.0.2' +except AssertionError: + raise AssertionError('Incompatible version of pycocotools is installed. ' + 'Run pip uninstall pycocotools first. 
Then run pip ' + 'install mmpycocotools to install open-mmlab forked ' + 'pycocotools.') + +@DATASETS.register_module() +class RetinaFaceDataset(CustomDataset): + + CLASSES = ('FG', ) + def __init__(self, min_size=None, **kwargs): + self.NK = 5 + self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)} + self.min_size = min_size + self.gt_path = kwargs.get('gt_path') + super(RetinaFaceDataset, self).__init__(**kwargs) + #print(self.cat2label) + + def _parse_ann_line(self, line): + values = [float(x) for x in line.strip().split()] + bbox = np.array(values[0:4], dtype=np.float32 ) + kps = np.zeros( (self.NK,3), dtype=np.float32 ) + ignore = False + if self.min_size is not None: + assert not self.test_mode + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + if w < self.min_size or h < self.min_size: + ignore = True + if len(values)>4: + if len(values)>5: + #print(values) + kps = np.array( values[4:19], dtype=np.float32 ).reshape((self.NK,3)) + for li in range(kps.shape[0]): + if (kps[li,:]==-1).all(): + #assert kps[li][2]==-1 + kps[li][2] = 0.0 #weight = 0, ignore + else: + assert kps[li][2]>=0 + kps[li][2] = 1.0 #weight + #if li==0: + # landmark_num+=1 + #if kps[li][2]==0.0:#visible + # kps[li][2] = 1.0 + #else: + # kps[li][2] = 0.0 + else: #len(values)==5 + if not ignore: + ignore = (values[4]==1) + else: + assert self.test_mode + + return dict(bbox=bbox, kps=kps, ignore=ignore, cat='FG') + + + def load_annotations(self, ann_file): + """Load annotation from COCO style annotation file. + + Args: + ann_file (str): Path of annotation file. + + Returns: + list[dict]: Annotation info from COCO api. 
+ """ + name = None + bbox_map = {} + for line in open(ann_file, 'r'): + line = line.strip() + if line.startswith('#'): + value = line[1:].strip().split() + name = value[0] + width = int(value[1]) + height = int(value[2]) + + bbox_map[name] = dict(width=width, height=height, objs=[]) + continue + assert name is not None + assert name in bbox_map + bbox_map[name]['objs'].append(line) + print('origin image size', len(bbox_map)) + data_infos = [] + for name in bbox_map: + item = bbox_map[name] + width = item['width'] + height = item['height'] + vals = item['objs'] + objs = [] + for line in vals: + data = self._parse_ann_line(line) + if data is None: + continue + objs.append( data ) #data is (bbox, kps, cat) + if len(objs)==0 and not self.test_mode: + continue + data_infos.append(dict(filename=name, width = width, height=height, objs = objs)) + return data_infos + + + def get_ann_info(self, idx): + """Get COCO annotation by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. 
+ """ + data_info = self.data_infos[idx] + + bboxes = [] + keypointss = [] + labels = [] + bboxes_ignore = [] + labels_ignore = [] + for obj in data_info['objs']: + label = self.cat2label[obj['cat']] + bbox = obj['bbox'] + keypoints = obj['kps'] + ignore = obj['ignore'] + if ignore: + bboxes_ignore.append(bbox) + labels_ignore.append(label) + else: + bboxes.append(bbox) + labels.append(label) + keypointss.append(keypoints) + if not bboxes: + bboxes = np.zeros((0, 4)) + labels = np.zeros((0, )) + keypointss = np.zeros((0, self.NK, 3)) + else: + #bboxes = np.array(bboxes, ndmin=2) - 1 + bboxes = np.array(bboxes, ndmin=2) + labels = np.array(labels) + keypointss = np.array(keypointss, ndmin=3) + if not bboxes_ignore: + bboxes_ignore = np.zeros((0, 4)) + labels_ignore = np.zeros((0, )) + else: + #bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1 + bboxes_ignore = np.array(bboxes_ignore, ndmin=2) + labels_ignore = np.array(labels_ignore) + ann = dict( + bboxes=bboxes.astype(np.float32), + labels=labels.astype(np.int64), + keypointss = keypointss.astype(np.float32), + bboxes_ignore=bboxes_ignore.astype(np.float32), + labels_ignore=labels_ignore.astype(np.int64)) + return ann + diff --git a/insightface/detection/scrfd/mmdet/datasets/samplers/__init__.py b/insightface/detection/scrfd/mmdet/datasets/samplers/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..2596aeb2ccfc85b58624713c04453d34e94a4062 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/samplers/__init__.py @@ -0,0 +1,4 @@ +from .distributed_sampler import DistributedSampler +from .group_sampler import DistributedGroupSampler, GroupSampler + +__all__ = ['DistributedSampler', 'DistributedGroupSampler', 'GroupSampler'] diff --git a/insightface/detection/scrfd/mmdet/datasets/samplers/distributed_sampler.py b/insightface/detection/scrfd/mmdet/datasets/samplers/distributed_sampler.py new file mode 100755 index 
0000000000000000000000000000000000000000..a9a1fc0b13ebc31d1f1cb45d8d1ed3cee4574310 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/samplers/distributed_sampler.py @@ -0,0 +1,32 @@ +import math + +import torch +from torch.utils.data import DistributedSampler as _DistributedSampler + + +class DistributedSampler(_DistributedSampler): + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + super().__init__(dataset, num_replicas=num_replicas, rank=rank) + self.shuffle = shuffle + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + # in case that indices is shorter than half of total_size + indices = (indices * + math.ceil(self.total_size / len(indices)))[:self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) diff --git a/insightface/detection/scrfd/mmdet/datasets/samplers/group_sampler.py b/insightface/detection/scrfd/mmdet/datasets/samplers/group_sampler.py new file mode 100755 index 0000000000000000000000000000000000000000..a691b949d73b067fcfa95f192e91d28195425a98 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/samplers/group_sampler.py @@ -0,0 +1,143 @@ +from __future__ import division +import math + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from torch.utils.data import Sampler + + +class GroupSampler(Sampler): + + def __init__(self, dataset, samples_per_gpu=1): + assert hasattr(dataset, 'flag') + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.flag = dataset.flag.astype(np.int64) + self.group_sizes = np.bincount(self.flag) + self.num_samples = 0 + 
for i, size in enumerate(self.group_sizes): + self.num_samples += int(np.ceil( + size / self.samples_per_gpu)) * self.samples_per_gpu + + def __iter__(self): + indices = [] + for i, size in enumerate(self.group_sizes): + if size == 0: + continue + indice = np.where(self.flag == i)[0] + assert len(indice) == size + np.random.shuffle(indice) + num_extra = int(np.ceil(size / self.samples_per_gpu) + ) * self.samples_per_gpu - len(indice) + indice = np.concatenate( + [indice, np.random.choice(indice, num_extra)]) + indices.append(indice) + indices = np.concatenate(indices) + indices = [ + indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu] + for i in np.random.permutation( + range(len(indices) // self.samples_per_gpu)) + ] + indices = np.concatenate(indices) + indices = indices.astype(np.int64).tolist() + assert len(indices) == self.num_samples + return iter(indices) + + def __len__(self): + return self.num_samples + + +class DistributedGroupSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + + .. note:: + Dataset is assumed to be of constant size. + + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. 
+ """ + + def __init__(self, + dataset, + samples_per_gpu=1, + num_replicas=None, + rank=None): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + + assert hasattr(self.dataset, 'flag') + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += int( + math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / + self.num_replicas)) * self.samples_per_gpu + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + # add .numpy() to avoid bug when selecting indice in parrots. + # TODO: check whether torch.randperm() can be replaced by + # numpy.random.permutation(). 
+ indice = indice[list( + torch.randperm(int(size), generator=g).numpy())].tolist() + extra = int( + math.ceil( + size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + # pad indice + tmp = indice.copy() + for _ in range(extra // size): + indice.extend(tmp) + indice.extend(tmp[:extra % size]) + indices.extend(indice) + + assert len(indices) == self.total_size + + indices = [ + indices[j] for i in list( + torch.randperm( + len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * + self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/insightface/detection/scrfd/mmdet/datasets/utils.py b/insightface/detection/scrfd/mmdet/datasets/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..04a4c4c926a0ea82adf807d2614e62cc29a78df9 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/utils.py @@ -0,0 +1,62 @@ +import copy +import warnings + + +def replace_ImageToTensor(pipelines): + """Replace the ImageToTensor transform in a data pipeline to + DefaultFormatBundle, which is normally useful in batch inference. + + Args: + pipelines (list[dict]): Data pipeline configs. + + Returns: + list: The new pipeline list with all ImageToTensor replaced by + DefaultFormatBundle. + + Examples: + >>> pipelines = [ + ... dict(type='LoadImageFromFile'), + ... dict( + ... type='MultiScaleFlipAug', + ... img_scale=(1333, 800), + ... flip=False, + ... transforms=[ + ... dict(type='Resize', keep_ratio=True), + ... dict(type='RandomFlip'), + ... dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]), + ... dict(type='Pad', size_divisor=32), + ... dict(type='ImageToTensor', keys=['img']), + ... 
dict(type='Collect', keys=['img']), + ... ]) + ... ] + >>> expected_pipelines = [ + ... dict(type='LoadImageFromFile'), + ... dict( + ... type='MultiScaleFlipAug', + ... img_scale=(1333, 800), + ... flip=False, + ... transforms=[ + ... dict(type='Resize', keep_ratio=True), + ... dict(type='RandomFlip'), + ... dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]), + ... dict(type='Pad', size_divisor=32), + ... dict(type='DefaultFormatBundle'), + ... dict(type='Collect', keys=['img']), + ... ]) + ... ] + >>> assert expected_pipelines == replace_ImageToTensor(pipelines) + """ + pipelines = copy.deepcopy(pipelines) + for i, pipeline in enumerate(pipelines): + if pipeline['type'] == 'MultiScaleFlipAug': + assert 'transforms' in pipeline + pipeline['transforms'] = replace_ImageToTensor( + pipeline['transforms']) + elif pipeline['type'] == 'ImageToTensor': + warnings.warn( + '"ImageToTensor" pipeline is replaced by ' + '"DefaultFormatBundle" for batch inference. It is ' + 'recommended to manually replace it in the test ' + 'data pipeline in your config file.', UserWarning) + pipelines[i] = {'type': 'DefaultFormatBundle'} + return pipelines diff --git a/insightface/detection/scrfd/mmdet/datasets/voc.py b/insightface/detection/scrfd/mmdet/datasets/voc.py new file mode 100755 index 0000000000000000000000000000000000000000..4d1e861ea09d67739512991ba0166c93cd2b7d55 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/voc.py @@ -0,0 +1,89 @@ +from collections import OrderedDict + +from mmdet.core import eval_map, eval_recalls +from .builder import DATASETS +from .xml_style import XMLDataset + + +@DATASETS.register_module() +class VOCDataset(XMLDataset): + + CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', + 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', + 'tvmonitor') + + def __init__(self, **kwargs): + super(VOCDataset, self).__init__(**kwargs) + if 'VOC2007' in 
self.img_prefix: + self.year = 2007 + elif 'VOC2012' in self.img_prefix: + self.year = 2012 + else: + raise ValueError('Cannot infer dataset year from img_prefix') + + def evaluate(self, + results, + metric='mAP', + logger=None, + proposal_nums=(100, 300, 1000), + iou_thr=0.5, + scale_ranges=None): + """Evaluate in VOC protocol. + + Args: + results (list[list | tuple]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. Options are + 'mAP', 'recall'. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + proposal_nums (Sequence[int]): Proposal number used for evaluating + recalls, such as recall@100, recall@1000. + Default: (100, 300, 1000). + iou_thr (float | list[float]): IoU threshold. It must be a float + when evaluating mAP, and can be a list when evaluating recall. + Default: 0.5. + scale_ranges (list[tuple], optional): Scale ranges for evaluating + mAP. If not specified, all bounding boxes would be included in + evaluation. Default: None. + + Returns: + dict[str, float]: AP/recall metrics. 
+ """ + + if not isinstance(metric, str): + assert len(metric) == 1 + metric = metric[0] + allowed_metrics = ['mAP', 'recall'] + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + annotations = [self.get_ann_info(i) for i in range(len(self))] + eval_results = OrderedDict() + if metric == 'mAP': + assert isinstance(iou_thr, float) + if self.year == 2007: + ds_name = 'voc07' + else: + ds_name = self.CLASSES + mean_ap, _ = eval_map( + results, + annotations, + scale_ranges=None, + iou_thr=iou_thr, + dataset=ds_name, + logger=logger) + eval_results['mAP'] = mean_ap + elif metric == 'recall': + gt_bboxes = [ann['bboxes'] for ann in annotations] + if isinstance(iou_thr, float): + iou_thr = [iou_thr] + recalls = eval_recalls( + gt_bboxes, results, proposal_nums, iou_thr, logger=logger) + for i, num in enumerate(proposal_nums): + for j, iou in enumerate(iou_thr): + eval_results[f'recall@{num}@{iou}'] = recalls[i, j] + if recalls.shape[1] > 1: + ar = recalls.mean(axis=1) + for i, num in enumerate(proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + return eval_results diff --git a/insightface/detection/scrfd/mmdet/datasets/wider_face.py b/insightface/detection/scrfd/mmdet/datasets/wider_face.py new file mode 100755 index 0000000000000000000000000000000000000000..3a13907db87a9986a7d701837259a0b712fc9dca --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/wider_face.py @@ -0,0 +1,51 @@ +import os.path as osp +import xml.etree.ElementTree as ET + +import mmcv + +from .builder import DATASETS +from .xml_style import XMLDataset + + +@DATASETS.register_module() +class WIDERFaceDataset(XMLDataset): + """Reader for the WIDER Face dataset in PASCAL VOC format. 
+ + Conversion scripts can be found in + https://github.com/sovrasov/wider-face-pascal-voc-annotations + """ + CLASSES = ('face', ) + + def __init__(self, **kwargs): + super(WIDERFaceDataset, self).__init__(**kwargs) + + def load_annotations(self, ann_file): + """Load annotation from WIDERFace XML style annotation file. + + Args: + ann_file (str): Path of XML file. + + Returns: + list[dict]: Annotation info from XML file. + """ + + data_infos = [] + img_ids = mmcv.list_from_file(ann_file) + for img_id in img_ids: + filename = f'{img_id}.jpg' + xml_path = osp.join(self.img_prefix, 'Annotations', + f'{img_id}.xml') + tree = ET.parse(xml_path) + root = tree.getroot() + size = root.find('size') + width = int(size.find('width').text) + height = int(size.find('height').text) + folder = root.find('folder').text + data_infos.append( + dict( + id=img_id, + filename=osp.join(folder, filename), + width=width, + height=height)) + + return data_infos diff --git a/insightface/detection/scrfd/mmdet/datasets/xml_style.py b/insightface/detection/scrfd/mmdet/datasets/xml_style.py new file mode 100755 index 0000000000000000000000000000000000000000..b912de38d12d1e146e34eac61ff2e09c4a989706 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/datasets/xml_style.py @@ -0,0 +1,169 @@ +import os.path as osp +import xml.etree.ElementTree as ET + +import mmcv +import numpy as np +from PIL import Image + +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class XMLDataset(CustomDataset): + """XML dataset for detection. + + Args: + min_size (int | float, optional): The minimum size of bounding + boxes in the images. If the size of a bounding box is less than + ``min_size``, it would be add to ignored field. 
+ """ + + def __init__(self, min_size=None, **kwargs): + super(XMLDataset, self).__init__(**kwargs) + self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)} + self.min_size = min_size + + def load_annotations(self, ann_file): + """Load annotation from XML style ann_file. + + Args: + ann_file (str): Path of XML file. + + Returns: + list[dict]: Annotation info from XML file. + """ + + data_infos = [] + img_ids = mmcv.list_from_file(ann_file) + for img_id in img_ids: + filename = f'JPEGImages/{img_id}.jpg' + xml_path = osp.join(self.img_prefix, 'Annotations', + f'{img_id}.xml') + tree = ET.parse(xml_path) + root = tree.getroot() + size = root.find('size') + width = 0 + height = 0 + if size is not None: + width = int(size.find('width').text) + height = int(size.find('height').text) + else: + img_path = osp.join(self.img_prefix, 'JPEGImages', + '{}.jpg'.format(img_id)) + img = Image.open(img_path) + width, height = img.size + data_infos.append( + dict(id=img_id, filename=filename, width=width, height=height)) + + return data_infos + + def _filter_imgs(self, min_size=32): + """Filter images too small or without annotation.""" + valid_inds = [] + for i, img_info in enumerate(self.data_infos): + if min(img_info['width'], img_info['height']) < min_size: + continue + if self.filter_empty_gt: + img_id = img_info['id'] + xml_path = osp.join(self.img_prefix, 'Annotations', + f'{img_id}.xml') + tree = ET.parse(xml_path) + root = tree.getroot() + for obj in root.findall('object'): + name = obj.find('name').text + if name in self.CLASSES: + valid_inds.append(i) + break + else: + valid_inds.append(i) + return valid_inds + + def get_ann_info(self, idx): + """Get annotation from XML file by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. 
+ """ + + img_id = self.data_infos[idx]['id'] + xml_path = osp.join(self.img_prefix, 'Annotations', f'{img_id}.xml') + tree = ET.parse(xml_path) + root = tree.getroot() + bboxes = [] + labels = [] + bboxes_ignore = [] + labels_ignore = [] + for obj in root.findall('object'): + name = obj.find('name').text + if name not in self.CLASSES: + continue + label = self.cat2label[name] + difficult = int(obj.find('difficult').text) + bnd_box = obj.find('bndbox') + # TODO: check whether it is necessary to use int + # Coordinates may be float type + bbox = [ + int(float(bnd_box.find('xmin').text)), + int(float(bnd_box.find('ymin').text)), + int(float(bnd_box.find('xmax').text)), + int(float(bnd_box.find('ymax').text)) + ] + ignore = False + if self.min_size: + assert not self.test_mode + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + if w < self.min_size or h < self.min_size: + ignore = True + if difficult or ignore: + bboxes_ignore.append(bbox) + labels_ignore.append(label) + else: + bboxes.append(bbox) + labels.append(label) + if not bboxes: + bboxes = np.zeros((0, 4)) + labels = np.zeros((0, )) + else: + bboxes = np.array(bboxes, ndmin=2) - 1 + labels = np.array(labels) + if not bboxes_ignore: + bboxes_ignore = np.zeros((0, 4)) + labels_ignore = np.zeros((0, )) + else: + bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1 + labels_ignore = np.array(labels_ignore) + ann = dict( + bboxes=bboxes.astype(np.float32), + labels=labels.astype(np.int64), + bboxes_ignore=bboxes_ignore.astype(np.float32), + labels_ignore=labels_ignore.astype(np.int64)) + return ann + + def get_cat_ids(self, idx): + """Get category ids in XML file by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. 
+ """ + + cat_ids = [] + img_id = self.data_infos[idx]['id'] + xml_path = osp.join(self.img_prefix, 'Annotations', f'{img_id}.xml') + tree = ET.parse(xml_path) + root = tree.getroot() + for obj in root.findall('object'): + name = obj.find('name').text + if name not in self.CLASSES: + continue + label = self.cat2label[name] + cat_ids.append(label) + + return cat_ids diff --git a/insightface/detection/scrfd/mmdet/models/__init__.py b/insightface/detection/scrfd/mmdet/models/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..44ac99855ae52101c91be167fa78d8219fc47259 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/__init__.py @@ -0,0 +1,16 @@ +from .backbones import * # noqa: F401,F403 +from .builder import (BACKBONES, DETECTORS, HEADS, LOSSES, NECKS, + ROI_EXTRACTORS, SHARED_HEADS, build_backbone, + build_detector, build_head, build_loss, build_neck, + build_roi_extractor, build_shared_head) +from .dense_heads import * # noqa: F401,F403 +from .detectors import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .roi_heads import * # noqa: F401,F403 + +__all__ = [ + 'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES', + 'DETECTORS', 'build_backbone', 'build_neck', 'build_roi_extractor', + 'build_shared_head', 'build_head', 'build_loss', 'build_detector' +] diff --git a/insightface/detection/scrfd/mmdet/models/backbones/__init__.py b/insightface/detection/scrfd/mmdet/models/backbones/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..2dc4665de4512da59a1755e715ee56f1386186ec --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/backbones/__init__.py @@ -0,0 +1,20 @@ +from .darknet import Darknet +from .detectors_resnet import DetectoRS_ResNet +from .detectors_resnext import DetectoRS_ResNeXt +from .hourglass import HourglassNet +from .hrnet import HRNet +from .regnet import RegNet +from .res2net import Res2Net +from 
.resnest import ResNeSt +from .resnet import ResNet, ResNetV1d, ResNetV1e +from .resnext import ResNeXt +from .ssd_vgg import SSDVGG +from .trident_resnet import TridentResNet + +from .mobilenet import MobileNetV1 + +__all__ = [ + 'RegNet', 'ResNet', 'ResNetV1d', 'ResNetV1e', 'ResNeXt', 'SSDVGG', 'HRNet', 'Res2Net', + 'HourglassNet', 'DetectoRS_ResNet', 'DetectoRS_ResNeXt', 'Darknet', + 'ResNeSt', 'TridentResNet', 'MobileNetV1' +] diff --git a/insightface/detection/scrfd/mmdet/models/backbones/darknet.py b/insightface/detection/scrfd/mmdet/models/backbones/darknet.py new file mode 100755 index 0000000000000000000000000000000000000000..517fe26259217792e0dad80ca3824d914cfe3904 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/backbones/darknet.py @@ -0,0 +1,199 @@ +# Copyright (c) 2019 Western Digital Corporation or its affiliates. + +import logging + +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, kaiming_init +from mmcv.runner import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES + + +class ResBlock(nn.Module): + """The basic residual block used in Darknet. Each ResBlock consists of two + ConvModules and the input is added to the final output. Each ConvModule is + composed of Conv, BN, and LeakyReLU. In YoloV3 paper, the first convLayer + has half of the number of the filters as much as the second convLayer. The + first convLayer has filter size of 1x1 and the second one has the filter + size of 3x3. + + Args: + in_channels (int): The input channels. Must be even. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). 
+ """ + + def __init__(self, + in_channels, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1)): + super(ResBlock, self).__init__() + assert in_channels % 2 == 0 # ensure the in_channels is even + half_in_channels = in_channels // 2 + + # shortcut + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + self.conv1 = ConvModule(in_channels, half_in_channels, 1, **cfg) + self.conv2 = ConvModule( + half_in_channels, in_channels, 3, padding=1, **cfg) + + def forward(self, x): + residual = x + out = self.conv1(x) + out = self.conv2(out) + out = out + residual + + return out + + +@BACKBONES.register_module() +class Darknet(nn.Module): + """Darknet backbone. + + Args: + depth (int): Depth of Darknet. Currently only support 53. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + + Example: + >>> from mmdet.models import Darknet + >>> import torch + >>> self = Darknet(depth=53) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + + # Dict(depth: (layers, channels)) + arch_settings = { + 53: ((1, 2, 8, 8, 4), ((32, 64), (64, 128), (128, 256), (256, 512), + (512, 1024))) + } + + def __init__(self, + depth=53, + out_indices=(3, 4, 5), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + norm_eval=True): + super(Darknet, self).__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for darknet') + self.depth = depth + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.layers, self.channels = self.arch_settings[depth] + + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + self.conv1 = ConvModule(3, 32, 3, padding=1, **cfg) + + self.cr_blocks = ['conv1'] + for i, n_layers in enumerate(self.layers): + layer_name = f'conv_res_block{i + 1}' + in_c, out_c = self.channels[i] + self.add_module( + layer_name, + self.make_conv_res_block(in_c, out_c, n_layers, **cfg)) + self.cr_blocks.append(layer_name) + + self.norm_eval = norm_eval + + def forward(self, x): + outs = [] + for i, layer_name in enumerate(self.cr_blocks): + cr_block = getattr(self, layer_name) + x = cr_block(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + else: + raise TypeError('pretrained must be a str or None') + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for i in range(self.frozen_stages): + m = getattr(self, self.cr_blocks[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, 
mode=True): + super(Darknet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + @staticmethod + def make_conv_res_block(in_channels, + out_channels, + res_repeat, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', + negative_slope=0.1)): + """In Darknet backbone, ConvLayer is usually followed by ResBlock. This + function will make that. The Conv layers always have 3x3 filters with + stride=2. The number of the filters in Conv layer is the same as the + out channels of the ResBlock. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + res_repeat (int): The number of ResBlocks. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). 
+ """ + + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + model = nn.Sequential() + model.add_module( + 'conv', + ConvModule( + in_channels, out_channels, 3, stride=2, padding=1, **cfg)) + for idx in range(res_repeat): + model.add_module('res{}'.format(idx), + ResBlock(out_channels, **cfg)) + return model diff --git a/insightface/detection/scrfd/mmdet/models/backbones/detectors_resnet.py b/insightface/detection/scrfd/mmdet/models/backbones/detectors_resnet.py new file mode 100755 index 0000000000000000000000000000000000000000..324e737ded5858dddd074539a67aacf25dc0c77f --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/backbones/detectors_resnet.py @@ -0,0 +1,305 @@ +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer, constant_init + +from ..builder import BACKBONES +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottleneck(_Bottleneck): + r"""Bottleneck for the ResNet backbone in `DetectoRS + `_. + + This bottleneck allows the users to specify whether to use + SAC (Switchable Atrous Convolution) and RFP (Recursive Feature Pyramid). + + Args: + inplanes (int): The number of input channels. + planes (int): The number of output channels before expansion. + rfp_inplanes (int, optional): The number of channels from RFP. + Default: None. If specified, an additional conv layer will be + added for ``rfp_feat``. Otherwise, the structure is the same as + base class. + sac (dict, optional): Dictionary to construct SAC. Default: None. 
+ """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + rfp_inplanes=None, + sac=None, + **kwargs): + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + assert sac is None or isinstance(sac, dict) + self.sac = sac + self.with_sac = sac is not None + if self.with_sac: + self.conv2 = build_conv_layer( + self.sac, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + bias=False) + + self.rfp_inplanes = rfp_inplanes + if self.rfp_inplanes: + self.rfp_conv = build_conv_layer( + None, + self.rfp_inplanes, + planes * self.expansion, + 1, + stride=1, + bias=True) + self.init_weights() + + def init_weights(self): + """Initialize the weights.""" + if self.rfp_inplanes: + constant_init(self.rfp_conv, 0) + + def rfp_forward(self, x, rfp_feat): + """The forward function that also takes the RFP features as input.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + if self.rfp_inplanes: + rfp_feat = self.rfp_conv(rfp_feat) + out = out + rfp_feat + + out = self.relu(out) + + return out + + +class ResLayer(nn.Sequential): + """ResLayer to build ResNet style backbone for RPF in detectoRS. + + The difference between this module and base class is that we pass + ``rfp_inplanes`` to the first block. 
class ResLayer(nn.Sequential):
    """ResLayer to build ResNet style backbone for RFP in DetectoRS.

    The difference between this module and base class is that we pass
    ``rfp_inplanes`` to the first block.

    Args:
        block (nn.Module): block used to build ResLayer.
        inplanes (int): inplanes of block.
        planes (int): planes of block.
        num_blocks (int): number of blocks.
        stride (int): stride of the first block. Default: 1
        avg_down (bool): Use AvgPool instead of stride conv when
            downsampling in the bottleneck. Default: False
        conv_cfg (dict): dictionary to construct and config conv layer.
            Default: None
        norm_cfg (dict): dictionary to construct and config norm layer.
            Default: dict(type='BN')
        downsample_first (bool): Downsample at the first block or last block.
            False for Hourglass, True for ResNet. Default: True
        rfp_inplanes (int, optional): The number of channels from RFP.
            Default: None. If specified, an additional conv layer will be
            added for ``rfp_feat``. Otherwise, the structure is the same as
            base class.
    """

    def __init__(self,
                 block,
                 inplanes,
                 planes,
                 num_blocks,
                 stride=1,
                 avg_down=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 downsample_first=True,
                 rfp_inplanes=None,
                 **kwargs):
        self.block = block
        # Fix: the assertion message previously misspelled the parameter
        # name as 'downsampel_first'.
        assert downsample_first, f'downsample_first={downsample_first} is ' \
            'not supported in DetectoRS'

        downsample = None
        if stride != 1 or inplanes != planes * block.expansion:
            downsample = []
            conv_stride = stride
            if avg_down and stride != 1:
                # AvgPool + 1x1 conv shortcut instead of a strided conv.
                conv_stride = 1
                downsample.append(
                    nn.AvgPool2d(
                        kernel_size=stride,
                        stride=stride,
                        ceil_mode=True,
                        count_include_pad=False))
            downsample.extend([
                build_conv_layer(
                    conv_cfg,
                    inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=conv_stride,
                    bias=False),
                build_norm_layer(norm_cfg, planes * block.expansion)[1]
            ])
            downsample = nn.Sequential(*downsample)

        layers = []
        # Only the first block takes the stride, downsample shortcut and
        # the RFP feature input.
        layers.append(
            block(
                inplanes=inplanes,
                planes=planes,
                stride=stride,
                downsample=downsample,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                rfp_inplanes=rfp_inplanes,
                **kwargs))
        inplanes = planes * block.expansion
        for _ in range(1, num_blocks):
            layers.append(
                block(
                    inplanes=inplanes,
                    planes=planes,
                    stride=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    **kwargs))

        super(ResLayer, self).__init__(*layers)
@BACKBONES.register_module()
class DetectoRS_ResNet(ResNet):
    """ResNet backbone for DetectoRS.

    Args:
        sac (dict, optional): Dictionary to construct SAC (Switchable Atrous
            Convolution). Default: None.
        stage_with_sac (list): Which stage to use sac. Default: (False, False,
            False, False).
        rfp_inplanes (int, optional): The number of channels from RFP.
            Default: None. If specified, an additional conv layer will be
            added for ``rfp_feat``. Otherwise, the structure is the same as
            base class.
        output_img (bool): If ``True``, the input image will be inserted into
            the starting position of output. Default: False.
        pretrained (str, optional): The pretrained model to load.
    """

    arch_settings = {
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3))
    }

    def __init__(self,
                 sac=None,
                 stage_with_sac=(False, False, False, False),
                 rfp_inplanes=None,
                 output_img=False,
                 pretrained=None,
                 **kwargs):
        self.sac = sac
        self.stage_with_sac = stage_with_sac
        self.rfp_inplanes = rfp_inplanes
        self.output_img = output_img
        self.pretrained = pretrained
        super(DetectoRS_ResNet, self).__init__(**kwargs)

        # Rebuild the res layers built by the base class so that the
        # per-stage SAC / RFP options can be threaded into each stage.
        self.inplanes = self.stem_channels
        self.res_layers = []
        for i, num_blocks in enumerate(self.stage_blocks):
            stride = self.strides[i]
            dilation = self.dilations[i]
            dcn = self.dcn if self.stage_with_dcn[i] else None
            sac = self.sac if self.stage_with_sac[i] else None
            if self.plugins is not None:
                stage_plugins = self.make_stage_plugins(self.plugins, i)
            else:
                stage_plugins = None
            planes = self.base_channels * 2**i
            res_layer = self.make_res_layer(
                block=self.block,
                inplanes=self.inplanes,
                planes=planes,
                num_blocks=num_blocks,
                stride=stride,
                dilation=dilation,
                style=self.style,
                avg_down=self.avg_down,
                with_cp=self.with_cp,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                dcn=dcn,
                sac=sac,
                # RFP features are only fused from the second stage on.
                rfp_inplanes=rfp_inplanes if i > 0 else None,
                plugins=stage_plugins)
            self.inplanes = planes * self.block.expansion
            layer_name = f'layer{i + 1}'
            self.add_module(layer_name, res_layer)
            self.res_layers.append(layer_name)

        self._freeze_stages()

    def make_res_layer(self, **kwargs):
        """Pack all blocks in a stage into a ``ResLayer`` for DetectoRS."""
        return ResLayer(**kwargs)

    def forward(self, x):
        """Forward function."""
        outs = list(super(DetectoRS_ResNet, self).forward(x))
        if self.output_img:
            # Optionally expose the raw input image as the first output
            # (used by the recursive feature pyramid).
            outs.insert(0, x)
        return tuple(outs)

    def rfp_forward(self, x, rfp_feats):
        """Forward function for RFP."""
        if self.deep_stem:
            x = self.stem(x)
        else:
            x = self.conv1(x)
            x = self.norm1(x)
            x = self.relu(x)
        x = self.maxpool(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            # The first stage receives no RFP feature.
            rfp_feat = rfp_feats[i] if i > 0 else None
            for layer in res_layer:
                x = layer.rfp_forward(x, rfp_feat)
            if i in self.out_indices:
                outs.append(x)
        return tuple(outs)
+ + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, width, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if self.with_sac: + self.conv2 = build_conv_layer( + self.sac, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + elif not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class DetectoRS_ResNeXt(DetectoRS_ResNet): + """ResNeXt backbone for DetectoRS. 
@BACKBONES.register_module()
class DetectoRS_ResNeXt(DetectoRS_ResNet):
    """ResNeXt backbone for DetectoRS.

    Args:
        groups (int): The number of groups in ResNeXt.
        base_width (int): The base width of ResNeXt.
    """

    arch_settings = {
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3)),
    }

    def __init__(self, groups=1, base_width=4, **kwargs):
        self.groups = groups
        self.base_width = base_width
        super().__init__(**kwargs)

    def make_res_layer(self, **kwargs):
        """Build a ResLayer, forwarding the ResNeXt grouping parameters."""
        kwargs.update(
            groups=self.groups,
            base_width=self.base_width,
            base_channels=self.base_channels)
        return super().make_res_layer(**kwargs)
+ """ + + def __init__(self, + depth, + stage_channels, + stage_blocks, + norm_cfg=dict(type='BN', requires_grad=True)): + super(HourglassModule, self).__init__() + + self.depth = depth + + cur_block = stage_blocks[0] + next_block = stage_blocks[1] + + cur_channel = stage_channels[0] + next_channel = stage_channels[1] + + self.up1 = ResLayer( + BasicBlock, cur_channel, cur_channel, cur_block, norm_cfg=norm_cfg) + + self.low1 = ResLayer( + BasicBlock, + cur_channel, + next_channel, + cur_block, + stride=2, + norm_cfg=norm_cfg) + + if self.depth > 1: + self.low2 = HourglassModule(depth - 1, stage_channels[1:], + stage_blocks[1:]) + else: + self.low2 = ResLayer( + BasicBlock, + next_channel, + next_channel, + next_block, + norm_cfg=norm_cfg) + + self.low3 = ResLayer( + BasicBlock, + next_channel, + cur_channel, + cur_block, + norm_cfg=norm_cfg, + downsample_first=False) + + self.up2 = nn.Upsample(scale_factor=2) + + def forward(self, x): + """Forward function.""" + up1 = self.up1(x) + low1 = self.low1(x) + low2 = self.low2(low1) + low3 = self.low3(low2) + up2 = self.up2(low3) + return up1 + up2 + + +@BACKBONES.register_module() +class HourglassNet(nn.Module): + """HourglassNet backbone. + + Stacked Hourglass Networks for Human Pose Estimation. + More details can be found in the `paper + `_ . + + Args: + downsample_times (int): Downsample times in a HourglassModule. + num_stacks (int): Number of HourglassModule modules stacked, + 1 for Hourglass-52, 2 for Hourglass-104. + stage_channels (list[int]): Feature channel of each sub-module in a + HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in a + HourglassModule. + feat_channel (int): Feature channel of conv after a HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. 
@BACKBONES.register_module()
class HourglassNet(nn.Module):
    """HourglassNet backbone.

    Stacked Hourglass Networks for Human Pose Estimation.

    Args:
        downsample_times (int): Downsample times in a HourglassModule.
        num_stacks (int): Number of HourglassModule modules stacked,
            1 for Hourglass-52, 2 for Hourglass-104.
        stage_channels (list[int]): Feature channel of each sub-module in a
            HourglassModule.
        stage_blocks (list[int]): Number of sub-modules stacked in a
            HourglassModule.
        feat_channel (int): Feature channel of conv after a HourglassModule.
        norm_cfg (dict): Dictionary to construct and config norm layer.

    Example:
        >>> from mmdet.models import HourglassNet
        >>> import torch
        >>> self = HourglassNet()
        >>> self.eval()
        >>> inputs = torch.rand(1, 3, 511, 511)
        >>> level_outputs = self.forward(inputs)
        >>> for level_output in level_outputs:
        ...     print(tuple(level_output.shape))
        (1, 256, 128, 128)
        (1, 256, 128, 128)
    """

    def __init__(self,
                 downsample_times=5,
                 num_stacks=2,
                 stage_channels=(256, 256, 384, 384, 384, 512),
                 stage_blocks=(2, 2, 2, 2, 2, 4),
                 feat_channel=256,
                 norm_cfg=dict(type='BN', requires_grad=True)):
        super(HourglassNet, self).__init__()

        self.num_stacks = num_stacks
        assert self.num_stacks >= 1
        assert len(stage_channels) == len(stage_blocks)
        # Need one channel entry per downsample level plus the bottom.
        assert len(stage_channels) > downsample_times

        cur_channel = stage_channels[0]

        # Stem downsamples the input by 4x before the first hourglass.
        self.stem = nn.Sequential(
            ConvModule(3, 128, 7, padding=3, stride=2, norm_cfg=norm_cfg),
            ResLayer(BasicBlock, 128, 256, 1, stride=2, norm_cfg=norm_cfg))

        self.hourglass_modules = nn.ModuleList([
            HourglassModule(downsample_times, stage_channels, stage_blocks)
            for _ in range(num_stacks)
        ])

        # Intermediate ResLayers linking consecutive stacks.
        self.inters = ResLayer(
            BasicBlock,
            cur_channel,
            cur_channel,
            num_stacks - 1,
            norm_cfg=norm_cfg)

        self.conv1x1s = nn.ModuleList([
            ConvModule(
                cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None)
            for _ in range(num_stacks - 1)
        ])

        self.out_convs = nn.ModuleList([
            ConvModule(
                cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg)
            for _ in range(num_stacks)
        ])

        # Project each stack's output back to ``cur_channel`` so it can be
        # added to the inter-stack feature.
        self.remap_convs = nn.ModuleList([
            ConvModule(
                feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None)
            for _ in range(num_stacks - 1)
        ])

        self.relu = nn.ReLU(inplace=True)

    def init_weights(self, pretrained=None):
        """Init module weights.

        We do nothing in this function because all modules we used
        (ConvModule, BasicBlock and etc.) have default initialization, and
        currently we don't provide pretrained model of HourglassNet.

        Detector's __init__() will call backbone's init_weights() with
        pretrained as input, so we keep this function.
        """
        # Training Centripetal Model needs to reset parameters for Conv2d
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                m.reset_parameters()

    def forward(self, x):
        """Forward function."""
        inter_feat = self.stem(x)
        out_feats = []

        for ind in range(self.num_stacks):
            single_hourglass = self.hourglass_modules[ind]
            out_conv = self.out_convs[ind]

            hourglass_feat = single_hourglass(inter_feat)
            out_feat = out_conv(hourglass_feat)
            out_feats.append(out_feat)

            # Between stacks: fuse the previous inter feature with the
            # remapped stack output, then refine with a ResLayer.
            if ind < self.num_stacks - 1:
                inter_feat = self.conv1x1s[ind](
                    inter_feat) + self.remap_convs[ind](
                        out_feat)
                inter_feat = self.inters[ind](self.relu(inter_feat))

        return out_feats
+ """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=True, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + super(HRModule, self).__init__() + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=False) + + def _check_branches(self, num_branches, num_blocks, in_channels, + num_channels): + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_BLOCKS({len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_CHANNELS({len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, num_channels[branch_index] * + block.expansion)[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + self.in_channels[branch_index] = 
\ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), mode='nearest'))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=False))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return 
[self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@BACKBONES.register_module() +class HRNet(nn.Module): + """HRNet backbone. + + High-Resolution Representations for Labeling Pixels and Regions + arXiv: https://arxiv.org/abs/1904.04514 + + Args: + extra (dict): detailed configuration for each stage of HRNet. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. 
@BACKBONES.register_module()
class HRNet(nn.Module):
    """HRNet backbone.

    High-Resolution Representations for Labeling Pixels and Regions
    arXiv: https://arxiv.org/abs/1904.04514

    Args:
        extra (dict): detailed configuration for each stage of HRNet.
        in_channels (int): Number of input image channels. Default: 3.
        conv_cfg (dict): dictionary to construct and config conv layer.
        norm_cfg (dict): dictionary to construct and config norm layer.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
        zero_init_residual (bool): whether to use zero init for last norm layer
            in resblocks to let them behave as identity.

    Example:
        >>> from mmdet.models import HRNet
        >>> import torch
        >>> extra = dict(
        >>>     stage1=dict(
        >>>         num_modules=1,
        >>>         num_branches=1,
        >>>         block='BOTTLENECK',
        >>>         num_blocks=(4, ),
        >>>         num_channels=(64, )),
        >>>     stage2=dict(
        >>>         num_modules=1,
        >>>         num_branches=2,
        >>>         block='BASIC',
        >>>         num_blocks=(4, 4),
        >>>         num_channels=(32, 64)),
        >>>     stage3=dict(
        >>>         num_modules=4,
        >>>         num_branches=3,
        >>>         block='BASIC',
        >>>         num_blocks=(4, 4, 4),
        >>>         num_channels=(32, 64, 128)),
        >>>     stage4=dict(
        >>>         num_modules=3,
        >>>         num_branches=4,
        >>>         block='BASIC',
        >>>         num_blocks=(4, 4, 4, 4),
        >>>         num_channels=(32, 64, 128, 256)))
        >>> self = HRNet(extra, in_channels=1)
        >>> self.eval()
        >>> inputs = torch.rand(1, 1, 32, 32)
        >>> level_outputs = self.forward(inputs)
        >>> for level_out in level_outputs:
        ...     print(tuple(level_out.shape))
        (1, 32, 8, 8)
        (1, 64, 4, 4)
        (1, 128, 2, 2)
        (1, 256, 1, 1)
    """

    blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck}

    def __init__(self,
                 extra,
                 in_channels=3,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 norm_eval=True,
                 with_cp=False,
                 zero_init_residual=False):
        super(HRNet, self).__init__()
        self.extra = extra
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.norm_eval = norm_eval
        self.with_cp = with_cp
        self.zero_init_residual = zero_init_residual

        # stem net: two stride-2 3x3 convs -> 4x downsampled, 64 channels.
        self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1)
        self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2)

        self.conv1 = build_conv_layer(
            self.conv_cfg,
            in_channels,
            64,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False)

        self.add_module(self.norm1_name, norm1)
        self.conv2 = build_conv_layer(
            self.conv_cfg,
            64,
            64,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False)

        self.add_module(self.norm2_name, norm2)
        self.relu = nn.ReLU(inplace=True)

        # stage 1
        self.stage1_cfg = self.extra['stage1']
        num_channels = self.stage1_cfg['num_channels'][0]
        block_type = self.stage1_cfg['block']
        num_blocks = self.stage1_cfg['num_blocks'][0]

        block = self.blocks_dict[block_type]
        stage1_out_channels = num_channels * block.expansion
        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)

        # stage 2
        self.stage2_cfg = self.extra['stage2']
        num_channels = self.stage2_cfg['num_channels']
        block_type = self.stage2_cfg['block']

        block = self.blocks_dict[block_type]
        num_channels = [channel * block.expansion for channel in num_channels]
        self.transition1 = self._make_transition_layer([stage1_out_channels],
                                                       num_channels)
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels)

        # stage 3
        self.stage3_cfg = self.extra['stage3']
        num_channels = self.stage3_cfg['num_channels']
        block_type = self.stage3_cfg['block']

        block = self.blocks_dict[block_type]
        num_channels = [channel * block.expansion for channel in num_channels]
        self.transition2 = self._make_transition_layer(pre_stage_channels,
                                                       num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)

        # stage 4
        self.stage4_cfg = self.extra['stage4']
        num_channels = self.stage4_cfg['num_channels']
        block_type = self.stage4_cfg['block']

        block = self.blocks_dict[block_type]
        num_channels = [channel * block.expansion for channel in num_channels]
        self.transition3 = self._make_transition_layer(pre_stage_channels,
                                                       num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels)

    @property
    def norm1(self):
        """nn.Module: the normalization layer named "norm1" """
        return getattr(self, self.norm1_name)

    @property
    def norm2(self):
        """nn.Module: the normalization layer named "norm2" """
        return getattr(self, self.norm2_name)

    def _make_transition_layer(self, num_channels_pre_layer,
                               num_channels_cur_layer):
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                # Existing branch: only adapt channels if they differ.
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(
                        nn.Sequential(
                            build_conv_layer(
                                self.conv_cfg,
                                num_channels_pre_layer[i],
                                num_channels_cur_layer[i],
                                kernel_size=3,
                                stride=1,
                                padding=1,
                                bias=False),
                            build_norm_layer(self.norm_cfg,
                                             num_channels_cur_layer[i])[1],
                            nn.ReLU(inplace=True)))
                else:
                    transition_layers.append(None)
            else:
                # New (lower-resolution) branch: downsample from the last
                # previous branch with stride-2 convs.
                conv_downsamples = []
                for j in range(i + 1 - num_branches_pre):
                    in_channels = num_channels_pre_layer[-1]
                    out_channels = num_channels_cur_layer[i] \
                        if j == i - num_branches_pre else in_channels
                    conv_downsamples.append(
                        nn.Sequential(
                            build_conv_layer(
                                self.conv_cfg,
                                in_channels,
                                out_channels,
                                kernel_size=3,
                                stride=2,
                                padding=1,
                                bias=False),
                            build_norm_layer(self.norm_cfg, out_channels)[1],
                            nn.ReLU(inplace=True)))
                transition_layers.append(nn.Sequential(*conv_downsamples))

        return nn.ModuleList(transition_layers)

    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                build_conv_layer(
                    self.conv_cfg,
                    inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False),
                build_norm_layer(self.norm_cfg, planes * block.expansion)[1])

        layers = []
        layers.append(
            block(
                inplanes,
                planes,
                stride,
                downsample=downsample,
                with_cp=self.with_cp,
                norm_cfg=self.norm_cfg,
                conv_cfg=self.conv_cfg))
        inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(
                block(
                    inplanes,
                    planes,
                    with_cp=self.with_cp,
                    norm_cfg=self.norm_cfg,
                    conv_cfg=self.conv_cfg))

        return nn.Sequential(*layers)

    def _make_stage(self, layer_config, in_channels, multiscale_output=True):
        num_modules = layer_config['num_modules']
        num_branches = layer_config['num_branches']
        num_blocks = layer_config['num_blocks']
        num_channels = layer_config['num_channels']
        block = self.blocks_dict[layer_config['block']]

        hr_modules = []
        for i in range(num_modules):
            # multi_scale_output is only used for the last module
            if not multiscale_output and i == num_modules - 1:
                reset_multiscale_output = False
            else:
                reset_multiscale_output = True

            hr_modules.append(
                HRModule(
                    num_branches,
                    block,
                    num_blocks,
                    in_channels,
                    num_channels,
                    reset_multiscale_output,
                    with_cp=self.with_cp,
                    norm_cfg=self.norm_cfg,
                    conv_cfg=self.conv_cfg))

        return nn.Sequential(*hr_modules), in_channels

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
                    constant_init(m, 1)

            if self.zero_init_residual:
                # Zero-init last norm so each residual block starts as
                # identity.
                for m in self.modules():
                    if isinstance(m, Bottleneck):
                        constant_init(m.norm3, 0)
                    elif isinstance(m, BasicBlock):
                        constant_init(m.norm2, 0)
        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x):
        """Forward function."""
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.layer1(x)

        x_list = []
        for i in range(self.stage2_cfg['num_branches']):
            if self.transition1[i] is not None:
                x_list.append(self.transition1[i](x))
            else:
                x_list.append(x)
        y_list = self.stage2(x_list)

        x_list = []
        for i in range(self.stage3_cfg['num_branches']):
            if self.transition2[i] is not None:
                # New branches are spawned from the lowest-resolution output.
                x_list.append(self.transition2[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage3(x_list)

        x_list = []
        for i in range(self.stage4_cfg['num_branches']):
            if self.transition3[i] is not None:
                x_list.append(self.transition3[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage4(x_list)

        return y_list

    def train(self, mode=True):
        """Convert the model into training mode while keeping the
        normalization layer frozen."""
        super(HRNet, self).train(mode)
        if mode and self.norm_eval:
            for m in self.modules():
                # trick: eval have effect on BatchNorm only
                if isinstance(m, _BatchNorm):
                    m.eval()
@BACKBONES.register_module()
class MobileNetV1(nn.Module):
    """MobileNetV1 backbone built from depthwise-separable conv blocks.

    Args:
        in_channels (int): Number of input image channels. Default: 3.
        block_cfg (dict, optional): Optional dict with keys ``stage_planes``
            (6 ints) and ``stage_blocks`` (4 ints) overriding the default
            0.25-width configuration. Default: None.
        num_stages (int): Number of stages. Default: 4.
        out_indices (Sequence[int]): Indices of stages whose outputs are
            returned by ``forward``. Default: (0, 1, 2, 3).
    """

    def __init__(self,
                 in_channels=3,
                 block_cfg=None,
                 num_stages=4,
                 out_indices=(0, 1, 2, 3)):
        super(MobileNetV1, self).__init__()
        self.out_indices = out_indices

        def conv_bn(inp, oup, stride):
            # Standard 3x3 conv + BN + ReLU.
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True))

        def conv_dw(inp, oup, stride):
            # Depthwise 3x3 followed by pointwise 1x1 (separable conv).
            return nn.Sequential(
                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                nn.ReLU(inplace=True),
                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True),
            )

        if block_cfg is None:
            stage_planes = [8, 16, 32, 64, 128, 256]  # 0.25 default
            stage_blocks = [2, 4, 4, 2]
        else:
            stage_planes = block_cfg['stage_planes']
            stage_blocks = block_cfg['stage_blocks']
        assert len(stage_planes) == 6
        assert len(stage_blocks) == 4

        # BUGFIX: the stem previously hard-coded 3 input channels, silently
        # ignoring ``in_channels``; use the parameter (default 3 keeps the
        # old behavior).
        self.stem = nn.Sequential(
            conv_bn(in_channels, stage_planes[0], 2),
            conv_dw(stage_planes[0], stage_planes[1], 1),
        )
        self.stage_layers = []
        for i, num_blocks in enumerate(stage_blocks):
            _layers = []
            for n in range(num_blocks):
                # First block of each stage downsamples (stride 2) and
                # widens; the rest keep resolution and width.
                if n == 0:
                    _layer = conv_dw(stage_planes[i + 1], stage_planes[i + 2],
                                     2)
                else:
                    _layer = conv_dw(stage_planes[i + 2], stage_planes[i + 2],
                                     1)
                _layers.append(_layer)

            _block = nn.Sequential(*_layers)
            layer_name = f'layer{i + 1}'
            self.add_module(layer_name, _block)
            self.stage_layers.append(layer_name)

    def forward(self, x):
        """Forward function returning the outputs of ``out_indices`` stages."""
        output = []
        x = self.stem(x)
        for i, layer_name in enumerate(self.stage_layers):
            stage_layer = getattr(self, layer_name)
            x = stage_layer(x)
            if i in self.out_indices:
                output.append(x)

        return tuple(output)

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
                    constant_init(m, 1)
        else:
            raise TypeError('pretrained must be a str or None')
+ norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from mmdet.models import RegNet + >>> import torch + >>> self = RegNet( + arch=dict( + w0=88, + wa=26.31, + wm=2.25, + group_w=48, + depth=25, + bot_mul=1.0)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 96, 8, 8) + (1, 192, 4, 4) + (1, 432, 2, 2) + (1, 1008, 1, 1) + """ + arch_settings = { + 'regnetx_400mf': + dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), + 'regnetx_800mf': + dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0), + 'regnetx_1.6gf': + dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0), + 'regnetx_3.2gf': + dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0), + 'regnetx_4.0gf': + dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0), + 'regnetx_6.4gf': + dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0), + 'regnetx_8.0gf': + dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0), + 'regnetx_12gf': + dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), + } + + def __init__(self, + arch, + in_channels=3, + stem_channels=32, + base_channels=32, + strides=(2, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + with_cp=False, + 
zero_init_residual=True): + super(ResNet, self).__init__() + + # Generate RegNet parameters first + if isinstance(arch, str): + assert arch in self.arch_settings, \ + f'"arch": "{arch}" is not one of the' \ + ' arch_settings' + arch = self.arch_settings[arch] + elif not isinstance(arch, dict): + raise ValueError('Expect "arch" to be either a string ' + f'or a dict, got {type(arch)}') + + widths, num_stages = self.generate_regnet( + arch['w0'], + arch['wa'], + arch['wm'], + arch['depth'], + ) + # Convert to per stage format + stage_widths, stage_blocks = self.get_stages_from_blocks(widths) + # Generate group widths and bot muls + group_widths = [arch['group_w'] for _ in range(num_stages)] + self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)] + # Adjust the compatibility of stage_widths and group_widths + stage_widths, group_widths = self.adjust_width_group( + stage_widths, self.bottleneck_ratio, group_widths) + + # Group params by stage + self.stage_widths = stage_widths + self.group_widths = group_widths + self.depth = sum(stage_blocks) + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.zero_init_residual = zero_init_residual + self.block = Bottleneck + expansion_bak = self.block.expansion + self.block.expansion = 1 + self.stage_blocks = stage_blocks[:num_stages] + + self._make_stem_layer(in_channels, 
stem_channels) + + self.inplanes = stem_channels + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = self.strides[i] + dilation = self.dilations[i] + group_width = self.group_widths[i] + width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i])) + stage_groups = width // group_width + + dcn = self.dcn if self.stage_with_dcn[i] else None + if self.plugins is not None: + stage_plugins = self.make_stage_plugins(self.plugins, i) + else: + stage_plugins = None + + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=self.stage_widths[i], + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=dcn, + plugins=stage_plugins, + groups=stage_groups, + base_width=group_width, + base_channels=self.stage_widths[i]) + self.inplanes = self.stage_widths[i] + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = stage_widths[-1] + self.block.expansion = expansion_bak + + def _make_stem_layer(self, in_channels, base_channels): + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + base_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, base_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + + def generate_regnet(self, + initial_width, + width_slope, + width_parameter, + depth, + divisor=8): + """Generates per block width from RegNet parameters. + + Args: + initial_width ([int]): Initial width of the backbone + width_slope ([float]): Slope of the quantized linear function + width_parameter ([int]): Parameter used to quantize the width. + depth ([int]): Depth of the backbone. + divisor (int, optional): The divisor of channels. Defaults to 8. 
+ + Returns: + list, int: return a list of widths of each stage and the number \ + of stages + """ + assert width_slope >= 0 + assert initial_width > 0 + assert width_parameter > 1 + assert initial_width % divisor == 0 + widths_cont = np.arange(depth) * width_slope + initial_width + ks = np.round( + np.log(widths_cont / initial_width) / np.log(width_parameter)) + widths = initial_width * np.power(width_parameter, ks) + widths = np.round(np.divide(widths, divisor)) * divisor + num_stages = len(np.unique(widths)) + widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() + return widths, num_stages + + @staticmethod + def quantize_float(number, divisor): + """Converts a float to closest non-zero int divisible by divior. + + Args: + number (int): Original number to be quantized. + divisor (int): Divisor used to quantize the number. + + Returns: + int: quantized number that is divisible by devisor. + """ + return int(round(number / divisor) * divisor) + + def adjust_width_group(self, widths, bottleneck_ratio, groups): + """Adjusts the compatibility of widths and groups. + + Args: + widths (list[int]): Width of each stage. + bottleneck_ratio (float): Bottleneck ratio. + groups (int): number of groups in each stage + + Returns: + tuple(list): The adjusted widths and groups of each stage. + """ + bottleneck_width = [ + int(w * b) for w, b in zip(widths, bottleneck_ratio) + ] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)] + bottleneck_width = [ + self.quantize_float(w_bot, g) + for w_bot, g in zip(bottleneck_width, groups) + ] + widths = [ + int(w_bot / b) + for w_bot, b in zip(bottleneck_width, bottleneck_ratio) + ] + return widths, groups + + def get_stages_from_blocks(self, widths): + """Gets widths/stage_blocks of network at each stage. + + Args: + widths (list[int]): Width in each stage. 
+ + Returns: + tuple(list): width and depth of each stage + """ + width_diff = [ + width != width_prev + for width, width_prev in zip(widths + [0], [0] + widths) + ] + stage_widths = [ + width for width, diff in zip(widths, width_diff[:-1]) if diff + ] + stage_blocks = np.diff([ + depth for depth, diff in zip(range(len(width_diff)), width_diff) + if diff + ]).tolist() + return stage_widths, stage_blocks + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/insightface/detection/scrfd/mmdet/models/backbones/res2net.py b/insightface/detection/scrfd/mmdet/models/backbones/res2net.py new file mode 100755 index 0000000000000000000000000000000000000000..7901b7f2fa29741d72328bdbdbf92fc4d5c5f847 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/backbones/res2net.py @@ -0,0 +1,351 @@ +import math + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, + kaiming_init) +from mmcv.runner import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.utils import get_root_logger +from ..builder import BACKBONES +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottle2neck(_Bottleneck): + expansion = 4 + + def __init__(self, + inplanes, + planes, + scales=4, + base_width=26, + base_channels=64, + stage_type='normal', + **kwargs): + """Bottle2neck block for Res2Net. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottle2neck, self).__init__(inplanes, planes, **kwargs) + assert scales > 1, 'Res2Net degenerates to ResNet when scales = 1.' 
+ width = int(math.floor(self.planes * (base_width / base_channels))) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width * scales, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width * scales, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + + if stage_type == 'stage' and self.conv2_stride != 1: + self.pool = nn.AvgPool2d( + kernel_size=3, stride=self.conv2_stride, padding=1) + convs = [] + bns = [] + + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + for i in range(scales - 1): + convs.append( + build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + bias=False)) + bns.append( + build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1]) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + for i in range(scales - 1): + convs.append( + build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + bias=False)) + bns.append( + build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1]) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + + self.conv3 = build_conv_layer( + self.conv_cfg, + width * scales, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.stage_type = stage_type + self.scales = scales + self.width = width + delattr(self, 'conv2') + delattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = 
self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + spx = torch.split(out, self.width, 1) + sp = self.convs[0](spx[0].contiguous()) + sp = self.relu(self.bns[0](sp)) + out = sp + for i in range(1, self.scales - 1): + if self.stage_type == 'stage': + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp.contiguous()) + sp = self.relu(self.bns[i](sp)) + out = torch.cat((out, sp), 1) + + if self.stage_type == 'normal' or self.conv2_stride == 1: + out = torch.cat((out, spx[self.scales - 1]), 1) + elif self.stage_type == 'stage': + out = torch.cat((out, self.pool(spx[self.scales - 1])), 1) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Res2Layer(nn.Sequential): + """Res2Layer to build Res2Net style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottle2neck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + scales (int): Scales used in Res2Net. Default: 4 + base_width (int): Basic width of each scale. 
@BACKBONES.register_module()
class Res2Net(ResNet):
    """Res2Net backbone.

    A ResNet variant whose residual unit is ``Bottle2neck`` (multi-scale
    feature splits inside each bottleneck).

    Args:
        scales (int): Scales used in Res2Net. Default: 4
        base_width (int): Basic width of each scale. Default: 26
        **kwargs: Remaining arguments are forwarded to ``ResNet``
            (``depth``, ``num_stages``, ``strides``, ``dilations``,
            ``out_indices``, ``frozen_stages``, ``norm_cfg``, ...).

    NOTE(review): ``style``, ``deep_stem`` and ``avg_down`` are accepted
    for config compatibility, but caller-supplied values are ignored --
    the parent is always constructed with ``style='pytorch'``,
    ``deep_stem=True`` and ``avg_down=True``. Confirm this is intended.

    Example:
        >>> from mmdet.models import Res2Net
        >>> import torch
        >>> self = Res2Net(depth=50, scales=4, base_width=26)
        >>> self.eval()
        >>> inputs = torch.rand(1, 3, 32, 32)
        >>> level_outputs = self.forward(inputs)
        >>> for level_out in level_outputs:
        ...     print(tuple(level_out.shape))
        (1, 256, 8, 8)
        (1, 512, 4, 4)
        (1, 1024, 2, 2)
        (1, 2048, 1, 1)
    """

    # Depth -> (block class, blocks per stage).
    arch_settings = {
        50: (Bottle2neck, (3, 4, 6, 3)),
        101: (Bottle2neck, (3, 4, 23, 3)),
        152: (Bottle2neck, (3, 8, 36, 3)),
    }

    def __init__(self,
                 scales=4,
                 base_width=26,
                 style='pytorch',
                 deep_stem=True,
                 avg_down=True,
                 **kwargs):
        # Stash Res2Net-specific hyper-parameters before the parent
        # constructor builds the residual layers via make_res_layer().
        self.scales = scales
        self.base_width = base_width
        super(Res2Net, self).__init__(
            style='pytorch', deep_stem=True, avg_down=True, **kwargs)

    def make_res_layer(self, **kwargs):
        """Build one stage of ``Bottle2neck`` blocks as a ``Res2Layer``."""
        return Res2Layer(
            scales=self.scales,
            base_width=self.base_width,
            base_channels=self.base_channels,
            **kwargs)

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for module in self.modules():
                if isinstance(module, nn.Conv2d):
                    kaiming_init(module)
                elif isinstance(module, (_BatchNorm, nn.GroupNorm)):
                    constant_init(module, 1)

            if self.dcn is not None:
                # DCN convs inside Bottle2neck live in a ModuleList, so the
                # offset layers must be zero-initialized one by one.
                for module in self.modules():
                    if isinstance(module, Bottle2neck):
                        for conv in module.convs:
                            if hasattr(conv, 'conv_offset'):
                                constant_init(conv.conv_offset, 0)

            if self.zero_init_residual:
                # Zero-init the last norm so each block starts as identity.
                for module in self.modules():
                    if isinstance(module, Bottle2neck):
                        constant_init(module.norm3, 0)
        else:
            raise TypeError('pretrained must be a str or None')
class RSoftmax(nn.Module):
    """Radix-wise softmax used by ``SplitAttentionConv2d``.

    Normalizes attention logits across the radix dimension when
    ``radix > 1``; with a single radix it degenerates to a sigmoid gate.

    Args:
        radix (int): Radix of input.
        groups (int): Groups of input.
    """

    def __init__(self, radix, groups):
        super().__init__()
        self.radix = radix
        self.groups = groups

    def forward(self, x):
        batch_size = x.size(0)
        if self.radix <= 1:
            # Single split: gate each channel independently.
            return torch.sigmoid(x)
        # (B, groups, radix, -1) -> (B, radix, groups, -1), softmax over
        # the radix axis, then flatten back to (B, -1).
        reshaped = x.view(batch_size, self.groups, self.radix, -1)
        reshaped = reshaped.transpose(1, 2)
        normalized = F.softmax(reshaped, dim=1)
        return normalized.reshape(batch_size, -1)
+ """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + radix=2, + reduction_factor=4, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None): + super(SplitAttentionConv2d, self).__init__() + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.groups = groups + self.channels = channels + self.with_dcn = dcn is not None + self.dcn = dcn + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if self.with_dcn and not fallback_on_stride: + assert conv_cfg is None, 'conv_cfg must be None for DCN' + conv_cfg = dcn + self.conv = build_conv_layer( + conv_cfg, + in_channels, + channels * radix, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups * radix, + bias=False) + # To be consistent with original implementation, starting from 0 + self.norm0_name, norm0 = build_norm_layer( + norm_cfg, channels * radix, postfix=0) + self.add_module(self.norm0_name, norm0) + self.relu = nn.ReLU(inplace=True) + self.fc1 = build_conv_layer( + None, channels, inter_channels, 1, groups=self.groups) + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, inter_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.fc2 = build_conv_layer( + None, inter_channels, channels * radix, 1, groups=self.groups) + self.rsoftmax = RSoftmax(radix, groups) + + @property + def norm0(self): + """nn.Module: the normalization layer named "norm0" """ + return getattr(self, self.norm0_name) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def forward(self, x): + x = self.conv(x) + x = self.norm0(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + batch = x.size(0) + if self.radix > 1: + splits = x.view(batch, self.radix, -1, *x.shape[2:]) + gap = splits.sum(dim=1) + else: + gap = x + gap = 
F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + gap = self.norm1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) + out = torch.sum(attens * splits, dim=1) + else: + out = atten * x + return out.contiguous() + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeSt. + + Args: + inplane (int): Input planes of this block. + planes (int): Middle planes of this block. + groups (int): Groups of conv2. + base_width (int): Base of width in terms of base channels. Default: 4. + base_channels (int): Base of channels for calculating width. + Default: 64. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Key word arguments for base class. 
+ """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + """Bottleneck block for ResNeSt.""" + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.with_modulated_dcn = False + self.conv2 = SplitAttentionConv2d( + width, + width, + kernel_size=3, + stride=1 if self.avg_down_stride else self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + radix=radix, + reduction_factor=reduction_factor, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=self.dcn) + delattr(self, self.norm2_name) + + if self.avg_down_stride: + self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) + + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + + if self.avg_down_stride: + out = self.avd_layer(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = 
@BACKBONES.register_module()
class ResNeSt(ResNetV1d):
    """ResNeSt backbone.

    A ResNet-D variant whose bottleneck uses split-attention convolutions.

    Args:
        groups (int): Number of groups of Bottleneck. Default: 1
        base_width (int): Base width of Bottleneck. Default: 4
        radix (int): Radix of SplitAttentionConv2d. Default: 2
        reduction_factor (int): Reduction factor of inter_channels in
            SplitAttentionConv2d. Default: 4.
        avg_down_stride (bool): Whether to use average pool for stride in
            Bottleneck. Default: True.
        kwargs (dict): Keyword arguments for ResNet.
    """

    # Depth -> (block class, blocks per stage).
    arch_settings = {
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3)),
        200: (Bottleneck, (3, 24, 36, 3)),
    }

    def __init__(self,
                 groups=1,
                 base_width=4,
                 radix=2,
                 reduction_factor=4,
                 avg_down_stride=True,
                 **kwargs):
        # Stash split-attention hyper-parameters before the parent
        # constructor builds the residual layers via make_res_layer().
        self.groups = groups
        self.base_width = base_width
        self.radix = radix
        self.reduction_factor = reduction_factor
        self.avg_down_stride = avg_down_stride
        super(ResNeSt, self).__init__(**kwargs)

    def make_res_layer(self, **kwargs):
        """Pack all blocks in a stage into a ``ResLayer``."""
        return ResLayer(
            groups=self.groups,
            base_width=self.base_width,
            base_channels=self.base_channels,
            radix=self.radix,
            reduction_factor=self.reduction_factor,
            avg_down_stride=self.avg_down_stride,
            **kwargs)
torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer, + constant_init, kaiming_init) +from mmcv.runner import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.utils import get_root_logger +from ..builder import BACKBONES +from ..utils import ResLayer + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None): + super(BasicBlock, self).__init__() + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + #print('init basic block:', inplanes, planes) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + 
class Bottleneck(nn.Module):
    # Output channels of the block are ``planes * expansion``.
    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 stride=1,
                 dilation=1,
                 downsample=None,
                 style='pytorch',
                 with_cp=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 dcn=None,
                 plugins=None):
        """Bottleneck block for ResNet.

        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
        it is "caffe", the stride-two layer is the first 1x1 conv layer.

        Args:
            inplanes (int): Input channels of the block.
            planes (int): Mid channels; output is ``planes * expansion``.
            stride (int): Stride of the stride-carrying conv. Default: 1.
            dilation (int): Dilation of the 3x3 conv. Default: 1.
            downsample (nn.Module | None): Projection for the identity
                branch when shape changes.
            style (str): 'pytorch' or 'caffe' (see above).
            with_cp (bool): Use torch.utils.checkpoint in forward.
            conv_cfg (dict | None): Conv layer config.
            norm_cfg (dict): Norm layer config.
            dcn (dict | None): Deformable-conv config for conv2.
            plugins (list[dict] | None): Plugins inserted after
                conv1/conv2/conv3.
        """
        super(Bottleneck, self).__init__()
        assert style in ['pytorch', 'caffe']
        assert dcn is None or isinstance(dcn, dict)
        assert plugins is None or isinstance(plugins, list)
        if plugins is not None:
            allowed_position = ['after_conv1', 'after_conv2', 'after_conv3']
            assert all(p['position'] in allowed_position for p in plugins)

        self.inplanes = inplanes
        self.planes = planes
        self.stride = stride
        self.dilation = dilation
        self.style = style
        self.with_cp = with_cp
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.dcn = dcn
        self.with_dcn = dcn is not None
        self.plugins = plugins
        self.with_plugins = plugins is not None

        if self.with_plugins:
            # Collect plugin cfgs for each insertion point.
            self.after_conv1_plugins = [
                plugin['cfg'] for plugin in plugins
                if plugin['position'] == 'after_conv1'
            ]
            self.after_conv2_plugins = [
                plugin['cfg'] for plugin in plugins
                if plugin['position'] == 'after_conv2'
            ]
            self.after_conv3_plugins = [
                plugin['cfg'] for plugin in plugins
                if plugin['position'] == 'after_conv3'
            ]

        if self.style == 'pytorch':
            self.conv1_stride = 1
            self.conv2_stride = stride
        else:
            self.conv1_stride = stride
            self.conv2_stride = 1

        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
        self.norm3_name, norm3 = build_norm_layer(
            norm_cfg, planes * self.expansion, postfix=3)

        self.conv1 = build_conv_layer(
            conv_cfg,
            inplanes,
            planes,
            kernel_size=1,
            stride=self.conv1_stride,
            bias=False)
        self.add_module(self.norm1_name, norm1)
        fallback_on_stride = False
        if self.with_dcn:
            # NOTE(review): pop mutates the caller-supplied dcn dict;
            # kept as-is because callers may rely on this behavior.
            fallback_on_stride = dcn.pop('fallback_on_stride', False)
        if not self.with_dcn or fallback_on_stride:
            self.conv2 = build_conv_layer(
                conv_cfg,
                planes,
                planes,
                kernel_size=3,
                stride=self.conv2_stride,
                padding=dilation,
                dilation=dilation,
                bias=False)
        else:
            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
            self.conv2 = build_conv_layer(
                dcn,
                planes,
                planes,
                kernel_size=3,
                stride=self.conv2_stride,
                padding=dilation,
                dilation=dilation,
                bias=False)

        self.add_module(self.norm2_name, norm2)
        self.conv3 = build_conv_layer(
            conv_cfg,
            planes,
            planes * self.expansion,
            kernel_size=1,
            bias=False)
        self.add_module(self.norm3_name, norm3)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

        if self.with_plugins:
            self.after_conv1_plugin_names = self.make_block_plugins(
                planes, self.after_conv1_plugins)
            self.after_conv2_plugin_names = self.make_block_plugins(
                planes, self.after_conv2_plugins)
            self.after_conv3_plugin_names = self.make_block_plugins(
                planes * self.expansion, self.after_conv3_plugins)

    def make_block_plugins(self, in_channels, plugins):
        """make plugins for block.

        Args:
            in_channels (int): Input channels of plugin.
            plugins (list[dict]): List of plugins cfg to build.

        Returns:
            list[str]: List of the names of plugin.
        """
        assert isinstance(plugins, list)
        plugin_names = []
        for plugin in plugins:
            plugin = plugin.copy()  # don't mutate the caller's cfg
            name, layer = build_plugin_layer(
                plugin,
                in_channels=in_channels,
                postfix=plugin.pop('postfix', ''))
            assert not hasattr(self, name), f'duplicate plugin {name}'
            self.add_module(name, layer)
            plugin_names.append(name)
        return plugin_names

    def forward_plugin(self, x, plugin_names):
        """Apply the named plugins to ``x`` sequentially.

        BUGFIX: plugins are now chained -- each plugin receives the output
        of the previous one. The old code called every plugin on the
        original input ``x``, so with multiple plugins at one position all
        results but the last were silently discarded.
        """
        out = x
        for name in plugin_names:
            out = getattr(self, name)(out)
        return out

    @property
    def norm1(self):
        """nn.Module: normalization layer after the first convolution layer"""
        return getattr(self, self.norm1_name)

    @property
    def norm2(self):
        """nn.Module: normalization layer after the second convolution layer"""
        return getattr(self, self.norm2_name)

    @property
    def norm3(self):
        """nn.Module: normalization layer after the third convolution layer"""
        return getattr(self, self.norm3_name)

    def forward(self, x):
        """Forward function."""

        def _inner_forward(x):
            identity = x
            out = self.conv1(x)
            out = self.norm1(out)
            out = self.relu(out)

            if self.with_plugins:
                out = self.forward_plugin(out, self.after_conv1_plugin_names)

            out = self.conv2(out)
            out = self.norm2(out)
            out = self.relu(out)

            if self.with_plugins:
                out = self.forward_plugin(out, self.after_conv2_plugin_names)

            out = self.conv3(out)
            out = self.norm3(out)

            if self.with_plugins:
                out = self.forward_plugin(out, self.after_conv3_plugin_names)

            if self.downsample is not None:
                identity = self.downsample(x)

            out += identity

            return out

        # Gradient checkpointing trades compute for memory during training.
        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)

        out = self.relu(out)

        return out
If not specified, + it will be the same as `base_channels`. Default: None. + base_channels (int): Number of base channels of res layer. Default: 64. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Resnet stages. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - position (str, required): Position inside block to insert + plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from mmdet.models import ResNet + >>> import torch + >>> self = ResNet(depth=18) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 0: (BasicBlock, (2, 2, 2, 2)), + 18: (BasicBlock, (2, 2, 2, 2)), + 19: (BasicBlock, (2, 4, 4, 1)), + 20: (BasicBlock, (2, 3, 2, 2)), + 22: (BasicBlock, (2, 4, 3, 1)), + 24: (BasicBlock, (2, 4, 4, 1)), + 26: (BasicBlock, (2, 4, 4, 2)), + 28: (BasicBlock, (2, 5, 4, 2)), + 29: (BasicBlock, (2, 6, 3, 2)), + 30: (BasicBlock, (2, 5, 5, 2)), + 32: (BasicBlock, (2, 6, 5, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 35: (BasicBlock, (3, 6, 4, 3)), + 38: (BasicBlock, (3, 8, 4, 3)), + 40: (BasicBlock, (3, 8, 5, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 56: (Bottleneck, (3, 8, 4, 3)), + 68: (Bottleneck, (3, 10, 6, 3)), + 74: (Bottleneck, (3, 12, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=None, + base_channels=64, + num_stages=4, + block_cfg = None, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + no_pool33=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + with_cp=False, + zero_init_residual=True): + super(ResNet, self).__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + if stem_channels is None: + stem_channels = base_channels + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.no_pool33 = no_pool33 + 
self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.zero_init_residual = zero_init_residual + if block_cfg is None: + self.block, stage_blocks = self.arch_settings[depth] + else: + self.block = BasicBlock if block_cfg['block']=='BasicBlock' else Bottleneck + stage_blocks = block_cfg['stage_blocks'] + assert len(stage_blocks)>=num_stages + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = stem_channels + + self._make_stem_layer(in_channels, stem_channels) + if block_cfg is not None and 'stage_planes' in block_cfg: + stage_planes = block_cfg['stage_planes'] + else: + stage_planes = [base_channels * 2**i for i in range(num_stages)] + + #print('resnet cfg:', stage_blocks, stage_planes) + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + if plugins is not None: + stage_plugins = self.make_stage_plugins(plugins, i) + else: + stage_plugins = None + #planes = base_channels * 2**i + planes = stage_planes[i] + #print('block detail:', i, self.inplanes, planes, stride) + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=stage_plugins) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = self.block.expansion * base_channels * 2**( + len(self.stage_blocks) - 1) + + def make_stage_plugins(self, plugins, stage_idx): + 
"""Make plugins for ResNet ``stage_idx`` th stage. + + Currently we support to insert ``context_block``, + ``empirical_attention_block``, ``nonlocal_block`` into the backbone + like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of + Bottleneck. + + An example of plugins format could be: + + Examples: + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True), + ... position='after_conv2'), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='1'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='2'), + ... stages=(True, True, True, True), + ... position='after_conv3') + ... ] + >>> self = ResNet(depth=18) + >>> stage_plugins = self.make_stage_plugins(plugins, 0) + >>> assert len(stage_plugins) == 3 + + Suppose ``stage_idx=0``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->conv3->yyy->zzz1->zzz2 + + Suppose 'stage_idx=1', the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 + + If stages is missing, the plugin would be applied to all stages. + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. 
+ stage_idx (int): Index of stage to build + + Returns: + list[dict]: Plugins for current stage + """ + stage_plugins = [] + for plugin in plugins: + plugin = plugin.copy() + stages = plugin.pop('stages', None) + assert stages is None or len(stages) == self.num_stages + # whether to insert plugin into current stage + if stages is None or stages[stage_idx]: + stage_plugins.append(plugin) + + return stage_plugins + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels): + if self.deep_stem: + self.stem = nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels)[1], + nn.ReLU(inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + if self.no_pool33: + assert self.deep_stem + self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + else: + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + if 
self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.dcn is not None: + for m in self.modules(): + if isinstance(m, Bottleneck) and hasattr( + m.conv2, 'conv_offset'): + constant_init(m.conv2.conv_offset, 0) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(ResNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + 
@BACKBONES.register_module()
class ResNetV1d(ResNet):
    r"""ResNetV1d variant described in `Bag of Tricks
    <https://arxiv.org/abs/1812.01187>`_.

    Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in
    the input stem with three 3x3 convs. And in the downsampling block, a 2x2
    avg_pool with stride 2 is added before conv, whose stride is changed to 1.
    """

    def __init__(self, **kwargs):
        super(ResNetV1d, self).__init__(
            deep_stem=True, avg_down=True, **kwargs)

@BACKBONES.register_module()
class ResNetV1e(ResNet):
    r"""ResNetV1e variant, an extension of ResNetV1d (`Bag of Tricks
    <https://arxiv.org/abs/1812.01187>`_).

    Like ResNetV1d, it replaces the 7x7 conv in the input stem with three
    3x3 convs and uses a 2x2 avg_pool before the downsampling conv. It
    additionally sets ``no_pool33=True``, which swaps the stem's 3x3/stride-2
    max-pool for a 2x2/stride-2 max-pool (see ``ResNet._make_stem_layer``).
    """

    def __init__(self, **kwargs):
        super(ResNetV1e, self).__init__(
            deep_stem=True, avg_down=True, no_pool33=True, **kwargs)
diff --git a/insightface/detection/scrfd/mmdet/models/backbones/resnext.py b/insightface/detection/scrfd/mmdet/models/backbones/resnext.py
new file mode 100755
index 0000000000000000000000000000000000000000..bf0360ea7e67d475bb4e10ae87d7accc5e9988c6
--- /dev/null
+++ b/insightface/detection/scrfd/mmdet/models/backbones/resnext.py
@@ -0,0 +1,132 @@
import math

from mmcv.cnn import build_conv_layer, build_norm_layer

from ..builder import BACKBONES
from ..utils import ResLayer
from .resnet import Bottleneck as _Bottleneck
from .resnet import ResNet


class Bottleneck(_Bottleneck):
    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 groups=1,
                 base_width=4,
                 base_channels=64,
                 **kwargs):
        """Bottleneck block for ResNeXt.

        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
        it is "caffe", the stride-two layer is the first 1x1 conv layer.
+ """ + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, width, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class ResNeXt(ResNet): + """ResNeXt backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Resnet stages. Default: 4. + groups (int): Group of resnext. + base_width (int): Base width of resnext. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. 
+ out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, groups=1, base_width=4, **kwargs): + self.groups = groups + self.base_width = base_width + super(ResNeXt, self).__init__(**kwargs) + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``""" + return ResLayer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + **kwargs) diff --git a/insightface/detection/scrfd/mmdet/models/backbones/ssd_vgg.py b/insightface/detection/scrfd/mmdet/models/backbones/ssd_vgg.py new file mode 100755 index 0000000000000000000000000000000000000000..cbc4fbb2301afc002f47abb9ed133a500d6cf23f --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/backbones/ssd_vgg.py @@ -0,0 +1,169 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import VGG, constant_init, kaiming_init, normal_init, xavier_init +from mmcv.runner import load_checkpoint + +from mmdet.utils import get_root_logger +from ..builder import BACKBONES + + +@BACKBONES.register_module() +class SSDVGG(VGG): + """VGG Backbone network for 
single-shot-detection. + + Args: + input_size (int): width and height of input, from {300, 512}. + depth (int): Depth of vgg, from {11, 13, 16, 19}. + out_indices (Sequence[int]): Output from which stages. + + Example: + >>> self = SSDVGG(input_size=300, depth=11) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 300, 300) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 1024, 19, 19) + (1, 512, 10, 10) + (1, 256, 5, 5) + (1, 256, 3, 3) + (1, 256, 1, 1) + """ + extra_setting = { + 300: (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256), + 512: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128), + } + + def __init__(self, + input_size, + depth, + with_last_pool=False, + ceil_mode=True, + out_indices=(3, 4), + out_feature_indices=(22, 34), + l2_norm_scale=20.): + # TODO: in_channels for mmcv.VGG + super(SSDVGG, self).__init__( + depth, + with_last_pool=with_last_pool, + ceil_mode=ceil_mode, + out_indices=out_indices) + assert input_size in (300, 512) + self.input_size = input_size + + self.features.add_module( + str(len(self.features)), + nn.MaxPool2d(kernel_size=3, stride=1, padding=1)) + self.features.add_module( + str(len(self.features)), + nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)) + self.features.add_module( + str(len(self.features)), nn.ReLU(inplace=True)) + self.features.add_module( + str(len(self.features)), nn.Conv2d(1024, 1024, kernel_size=1)) + self.features.add_module( + str(len(self.features)), nn.ReLU(inplace=True)) + self.out_feature_indices = out_feature_indices + + self.inplanes = 1024 + self.extra = self._make_extra_layers(self.extra_setting[input_size]) + self.l2_norm = L2Norm( + self.features[out_feature_indices[0] - 1].out_channels, + l2_norm_scale) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.features.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + else: + raise TypeError('pretrained must be a str or None') + + for m in self.extra.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + constant_init(self.l2_norm, self.l2_norm.scale) + + def forward(self, x): + """Forward function.""" + outs = [] + for i, layer in enumerate(self.features): + x = layer(x) + if i in self.out_feature_indices: + outs.append(x) + for i, layer in enumerate(self.extra): + x = F.relu(layer(x), inplace=True) + if i % 2 == 1: + outs.append(x) + outs[0] = self.l2_norm(outs[0]) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def _make_extra_layers(self, outplanes): + layers = [] + kernel_sizes = (1, 3) + num_layers = 0 + outplane = None + for i in range(len(outplanes)): + if self.inplanes == 'S': + self.inplanes = outplane + continue + k = kernel_sizes[num_layers % 2] + if outplanes[i] == 'S': + outplane = outplanes[i + 1] + conv = nn.Conv2d( + self.inplanes, outplane, k, stride=2, padding=1) + else: + outplane = outplanes[i] + conv = nn.Conv2d( + self.inplanes, outplane, k, stride=1, padding=0) + layers.append(conv) + self.inplanes = outplanes[i] + num_layers += 1 + if self.input_size == 512: + layers.append(nn.Conv2d(self.inplanes, 256, 4, padding=1)) + + return nn.Sequential(*layers) + + +class L2Norm(nn.Module): + + def __init__(self, n_dims, scale=20., eps=1e-10): + """L2 normalization layer. + + Args: + n_dims (int): Number of dimensions to be normalized + scale (float, optional): Defaults to 20.. + eps (float, optional): Used to avoid division by zero. + Defaults to 1e-10. 
+ """ + super(L2Norm, self).__init__() + self.n_dims = n_dims + self.weight = nn.Parameter(torch.Tensor(self.n_dims)) + self.eps = eps + self.scale = scale + + def forward(self, x): + """Forward function.""" + # normalization layer convert to FP32 in FP16 training + x_float = x.float() + norm = x_float.pow(2).sum(1, keepdim=True).sqrt() + self.eps + return (self.weight[None, :, None, None].float().expand_as(x_float) * + x_float / norm).type_as(x) diff --git a/insightface/detection/scrfd/mmdet/models/backbones/trident_resnet.py b/insightface/detection/scrfd/mmdet/models/backbones/trident_resnet.py new file mode 100755 index 0000000000000000000000000000000000000000..e6100132b0f4120585da8a309cba4488b4b0ea72 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/backbones/trident_resnet.py @@ -0,0 +1,292 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer, kaiming_init +from torch.nn.modules.utils import _pair + +from mmdet.models.backbones.resnet import Bottleneck, ResNet +from mmdet.models.builder import BACKBONES + + +class TridentConv(nn.Module): + """Trident Convolution Module. + + Args: + in_channels (int): Number of channels in input. + out_channels (int): Number of channels in output. + kernel_size (int): Size of convolution kernel. + stride (int, optional): Convolution stride. Default: 1. + trident_dilations (tuple[int, int, int], optional): Dilations of + different trident branch. Default: (1, 2, 3). + test_branch_idx (int, optional): In inference, all 3 branches will + be used if `test_branch_idx==-1`, otherwise only branch with + index `test_branch_idx` will be used. Default: 1. + bias (bool, optional): Whether to use bias in convolution or not. + Default: False. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + trident_dilations=(1, 2, 3), + test_branch_idx=1, + bias=False): + super(TridentConv, self).__init__() + self.num_branch = len(trident_dilations) + self.with_bias = bias + self.test_branch_idx = test_branch_idx + self.stride = _pair(stride) + self.kernel_size = _pair(kernel_size) + self.paddings = _pair(trident_dilations) + self.dilations = trident_dilations + self.in_channels = in_channels + self.out_channels = out_channels + self.bias = bias + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels, *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.bias = None + self.init_weights() + + def init_weights(self): + kaiming_init(self, distribution='uniform', mode='fan_in') + + def extra_repr(self): + tmpstr = f'in_channels={self.in_channels}' + tmpstr += f', out_channels={self.out_channels}' + tmpstr += f', kernel_size={self.kernel_size}' + tmpstr += f', num_branch={self.num_branch}' + tmpstr += f', test_branch_idx={self.test_branch_idx}' + tmpstr += f', stride={self.stride}' + tmpstr += f', paddings={self.paddings}' + tmpstr += f', dilations={self.dilations}' + tmpstr += f', bias={self.bias}' + return tmpstr + + def forward(self, inputs): + if self.training or self.test_branch_idx == -1: + outputs = [ + F.conv2d(input, self.weight, self.bias, self.stride, padding, + dilation) for input, dilation, padding in zip( + inputs, self.dilations, self.paddings) + ] + else: + assert len(inputs) == 1 + outputs = [ + F.conv2d(inputs[0], self.weight, self.bias, self.stride, + self.paddings[self.test_branch_idx], + self.dilations[self.test_branch_idx]) + ] + + return outputs + + +# Since TridentNet is defined over ResNet50 and ResNet101, here we +# only support TridentBottleneckBlock. +class TridentBottleneck(Bottleneck): + """BottleBlock for TridentResNet. 
+ + Args: + trident_dilations (tuple[int, int, int]): Dilations of different + trident branch. + test_branch_idx (int): In inference, all 3 branches will be used + if `test_branch_idx==-1`, otherwise only branch with index + `test_branch_idx` will be used. + concat_output (bool): Whether to concat the output list to a Tensor. + `True` only in the last Block. + """ + + def __init__(self, trident_dilations, test_branch_idx, concat_output, + **kwargs): + + super(TridentBottleneck, self).__init__(**kwargs) + self.trident_dilations = trident_dilations + self.num_branch = len(trident_dilations) + self.concat_output = concat_output + self.test_branch_idx = test_branch_idx + self.conv2 = TridentConv( + self.planes, + self.planes, + kernel_size=3, + stride=self.conv2_stride, + bias=False, + trident_dilations=self.trident_dilations, + test_branch_idx=test_branch_idx) + + def forward(self, x): + + def _inner_forward(x): + num_branch = ( + self.num_branch + if self.training or self.test_branch_idx == -1 else 1) + identity = x + if not isinstance(x, list): + x = (x, ) * num_branch + identity = x + if self.downsample is not None: + identity = [self.downsample(b) for b in x] + + out = [self.conv1(b) for b in x] + out = [self.norm1(b) for b in out] + out = [self.relu(b) for b in out] + + if self.with_plugins: + for k in range(len(out)): + out[k] = self.forward_plugin(out[k], + self.after_conv1_plugin_names) + + out = self.conv2(out) + out = [self.norm2(b) for b in out] + out = [self.relu(b) for b in out] + if self.with_plugins: + for k in range(len(out)): + out[k] = self.forward_plugin(out[k], + self.after_conv2_plugin_names) + + out = [self.conv3(b) for b in out] + out = [self.norm3(b) for b in out] + + if self.with_plugins: + for k in range(len(out)): + out[k] = self.forward_plugin(out[k], + self.after_conv3_plugin_names) + + out = [ + out_b + identity_b for out_b, identity_b in zip(out, identity) + ] + return out + + if self.with_cp and x.requires_grad: + out = 
cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = [self.relu(b) for b in out] + if self.concat_output: + out = torch.cat(out, dim=0) + return out + + +def make_trident_res_layer(block, + inplanes, + planes, + num_blocks, + stride=1, + trident_dilations=(1, 2, 3), + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + test_branch_idx=-1): + """Build Trident Res Layers.""" + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + for i in range(num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride if i == 0 else 1, + trident_dilations=trident_dilations, + downsample=downsample if i == 0 else None, + style=style, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=plugins, + test_branch_idx=test_branch_idx, + concat_output=True if i == num_blocks - 1 else False)) + inplanes = planes * block.expansion + return nn.Sequential(*layers) + + +@BACKBONES.register_module() +class TridentResNet(ResNet): + """The stem layer, stage 1 and stage 2 in Trident ResNet are identical to + ResNet, while in stage 3, Trident BottleBlock is utilized to replace the + normal BottleBlock to yield trident output. Different branch shares the + convolution weight but uses different dilations to achieve multi-scale + output. + + / stage3(b0) \ + x - stem - stage1 - stage2 - stage3(b1) - output + \ stage3(b2) / + + Args: + depth (int): Depth of resnet, from {50, 101, 152}. + num_branch (int): Number of branches in TridentNet. 
+ test_branch_idx (int): In inference, all 3 branches will be used + if `test_branch_idx==-1`, otherwise only branch with index + `test_branch_idx` will be used. + trident_dilations (tuple[int]): Dilations of different trident branch. + len(trident_dilations) should be equal to num_branch. + """ # noqa + + def __init__(self, depth, num_branch, test_branch_idx, trident_dilations, + **kwargs): + + assert num_branch == len(trident_dilations) + assert depth in (50, 101, 152) + super(TridentResNet, self).__init__(depth, **kwargs) + assert self.num_stages == 3 + self.test_branch_idx = test_branch_idx + self.num_branch = num_branch + + last_stage_idx = self.num_stages - 1 + stride = self.strides[last_stage_idx] + dilation = trident_dilations + dcn = self.dcn if self.stage_with_dcn[last_stage_idx] else None + if self.plugins is not None: + stage_plugins = self.make_stage_plugins(self.plugins, + last_stage_idx) + else: + stage_plugins = None + planes = self.base_channels * 2**last_stage_idx + res_layer = make_trident_res_layer( + TridentBottleneck, + inplanes=(self.block.expansion * self.base_channels * + 2**(last_stage_idx - 1)), + planes=planes, + num_blocks=self.stage_blocks[last_stage_idx], + stride=stride, + trident_dilations=dilation, + style=self.style, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=dcn, + plugins=stage_plugins, + test_branch_idx=self.test_branch_idx) + + layer_name = f'layer{last_stage_idx + 1}' + + self.__setattr__(layer_name, res_layer) + self.res_layers.pop(last_stage_idx) + self.res_layers.insert(last_stage_idx, layer_name) + + self._freeze_stages() diff --git a/insightface/detection/scrfd/mmdet/models/builder.py b/insightface/detection/scrfd/mmdet/models/builder.py new file mode 100755 index 0000000000000000000000000000000000000000..899e787449d735cde42c0e2e717007a9778cda85 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/builder.py @@ -0,0 +1,67 @@ +from mmcv.utils import Registry, build_from_cfg 
from mmcv.utils import Registry, build_from_cfg
from torch import nn

# One registry per pluggable detector component.
BACKBONES = Registry('backbone')
NECKS = Registry('neck')
ROI_EXTRACTORS = Registry('roi_extractor')
SHARED_HEADS = Registry('shared_head')
HEADS = Registry('head')
LOSSES = Registry('loss')
DETECTORS = Registry('detector')


def build(cfg, registry, default_args=None):
    """Build a module from config.

    Args:
        cfg (dict, list[dict]): The config of modules; either a single
            dict or a list of config dicts.
        registry (:obj:`Registry`): A registry the module belongs to.
        default_args (dict, optional): Default arguments to build the module.
            Defaults to None.

    Returns:
        nn.Module: A built nn module (an ``nn.Sequential`` when ``cfg`` is
        a list of configs).
    """
    if not isinstance(cfg, list):
        return build_from_cfg(cfg, registry, default_args)
    modules = (build_from_cfg(item, registry, default_args) for item in cfg)
    return nn.Sequential(*modules)


def build_backbone(cfg):
    """Build backbone."""
    return build(cfg, BACKBONES)


def build_neck(cfg):
    """Build neck."""
    return build(cfg, NECKS)


def build_roi_extractor(cfg):
    """Build roi extractor."""
    return build(cfg, ROI_EXTRACTORS)


def build_shared_head(cfg):
    """Build shared head."""
    return build(cfg, SHARED_HEADS)


def build_head(cfg):
    """Build head."""
    return build(cfg, HEADS)


def build_loss(cfg):
    """Build loss."""
    return build(cfg, LOSSES)


def build_detector(cfg, train_cfg=None, test_cfg=None):
    """Build detector, forwarding train/test configs to its constructor."""
    defaults = dict(train_cfg=train_cfg, test_cfg=test_cfg)
    return build(cfg, DETECTORS, defaults)
import CentripetalHead +from .corner_head import CornerHead +from .fcos_head import FCOSHead +from .fovea_head import FoveaHead +from .free_anchor_retina_head import FreeAnchorRetinaHead +from .fsaf_head import FSAFHead +from .ga_retina_head import GARetinaHead +from .ga_rpn_head import GARPNHead +from .gfl_head import GFLHead +from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead +from .nasfcos_head import NASFCOSHead +from .paa_head import PAAHead +from .pisa_retinanet_head import PISARetinaHead +from .pisa_ssd_head import PISASSDHead +from .reppoints_head import RepPointsHead +from .retina_head import RetinaHead +from .retina_sepbn_head import RetinaSepBNHead +from .rpn_head import RPNHead +from .sabl_retina_head import SABLRetinaHead +from .ssd_head import SSDHead +from .transformer_head import TransformerHead +from .vfnet_head import VFNetHead +from .yolact_head import YOLACTHead, YOLACTProtonet, YOLACTSegmHead +from .yolo_head import YOLOV3Head +from .scrfd_head import SCRFDHead + +__all__ = [ + 'AnchorFreeHead', 'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption', + 'RPNHead', 'GARPNHead', 'RetinaHead', 'RetinaSepBNHead', 'GARetinaHead', + 'SSDHead', 'FCOSHead', 'RepPointsHead', 'FoveaHead', + 'FreeAnchorRetinaHead', 'ATSSHead', 'FSAFHead', 'NASFCOSHead', + 'PISARetinaHead', 'PISASSDHead', 'GFLHead', 'CornerHead', 'YOLACTHead', + 'YOLACTSegmHead', 'YOLACTProtonet', 'YOLOV3Head', 'PAAHead', + 'SABLRetinaHead', 'CentripetalHead', 'VFNetHead', 'TransformerHead', + 'SCRFDHead' +] diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/anchor_free_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/anchor_free_head.py new file mode 100755 index 0000000000000000000000000000000000000000..917acde637ab723dbee91eb8a74aca036380180f --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/anchor_free_head.py @@ -0,0 +1,340 @@ +from abc import abstractmethod + +import torch +import torch.nn as nn +from mmcv.cnn import 
from abc import abstractmethod

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32

from mmdet.core import multi_apply
from ..builder import HEADS, build_loss
from .base_dense_head import BaseDenseHead
from .dense_test_mixins import BBoxTestMixin


@HEADS.register_module()
class AnchorFreeHead(BaseDenseHead, BBoxTestMixin):
    """Anchor-free head (FCOS, Fovea, RepPoints, etc.).

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        feat_channels (int): Number of hidden channels. Used in child classes.
        stacked_convs (int): Number of stacking convs of the head.
        strides (tuple): Downsample factor of each feature map.
        dcn_on_last_conv (bool): If true, use dcn in the last layer of
            towers. Default: False.
        conv_bias (bool | str): If specified as `auto`, it will be decided by
            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
            None, otherwise False. Default: "auto".
        loss_cls (dict): Config of classification loss.
        loss_bbox (dict): Config of localization loss.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        train_cfg (dict): Training config of anchor head.
        test_cfg (dict): Testing config of anchor head.
    """  # noqa: W605

    # Bumped when the checkpoint key layout changes; consumed by
    # ``_load_from_state_dict`` to upgrade old checkpoints.
    _version = 1

    def __init__(self,
                 num_classes,
                 in_channels,
                 feat_channels=256,
                 stacked_convs=4,
                 strides=(4, 8, 16, 32, 64),
                 dcn_on_last_conv=False,
                 conv_bias='auto',
                 loss_cls=dict(
                     type='FocalLoss',
                     use_sigmoid=True,
                     gamma=2.0,
                     alpha=0.25,
                     loss_weight=1.0),
                 loss_bbox=dict(type='IoULoss', loss_weight=1.0),
                 conv_cfg=None,
                 norm_cfg=None,
                 train_cfg=None,
                 test_cfg=None):
        super(AnchorFreeHead, self).__init__()
        self.num_classes = num_classes
        # Sigmoid-based classification: no extra background channel.
        self.cls_out_channels = num_classes
        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.stacked_convs = stacked_convs
        self.strides = strides
        self.dcn_on_last_conv = dcn_on_last_conv
        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
        self.conv_bias = conv_bias
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.fp16_enabled = False

        self._init_layers()

    def _init_layers(self):
        """Initialize layers of the head."""
        self._init_cls_convs()
        self._init_reg_convs()
        self._init_predictor()

    def _init_cls_convs(self):
        """Initialize classification conv layers of the head."""
        self.cls_convs = nn.ModuleList()
        for i in range(self.stacked_convs):
            chn = self.in_channels if i == 0 else self.feat_channels
            # Optionally use deformable conv only on the last tower layer.
            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
                conv_cfg = dict(type='DCNv2')
            else:
                conv_cfg = self.conv_cfg
            self.cls_convs.append(
                ConvModule(
                    chn,
                    self.feat_channels,
                    3,
                    stride=1,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=self.norm_cfg,
                    bias=self.conv_bias))

    def _init_reg_convs(self):
        """Initialize bbox regression conv layers of the head."""
        self.reg_convs = nn.ModuleList()
        for i in range(self.stacked_convs):
            chn = self.in_channels if i == 0 else self.feat_channels
            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
                conv_cfg = dict(type='DCNv2')
            else:
                conv_cfg = self.conv_cfg
            self.reg_convs.append(
                ConvModule(
                    chn,
                    self.feat_channels,
                    3,
                    stride=1,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=self.norm_cfg,
                    bias=self.conv_bias))

    def _init_predictor(self):
        """Initialize predictor layers of the head."""
        self.conv_cls = nn.Conv2d(
            self.feat_channels, self.cls_out_channels, 3, padding=1)
        self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)

    def init_weights(self):
        """Initialize weights of the head."""
        for m in self.cls_convs:
            if isinstance(m.conv, nn.Conv2d):
                normal_init(m.conv, std=0.01)
        for m in self.reg_convs:
            if isinstance(m.conv, nn.Conv2d):
                normal_init(m.conv, std=0.01)
        # Bias init so that the initial classification prior is ~0.01,
        # as in the RetinaNet/focal-loss recipe.
        bias_cls = bias_init_with_prob(0.01)
        normal_init(self.conv_cls, std=0.01, bias=bias_cls)
        normal_init(self.conv_reg, std=0.01)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """Hack some keys of the model state dict so that can load checkpoints
        of previous version."""
        version = local_metadata.get('version', None)
        if version is None:
            # the key is different in early versions
            # for example, 'fcos_cls' become 'conv_cls' now
            bbox_head_keys = [
                k for k in state_dict.keys() if k.startswith(prefix)
            ]
            ori_predictor_keys = []
            new_predictor_keys = []
            # e.g. 'fcos_cls' or 'fcos_reg'
            for key in bbox_head_keys:
                ori_predictor_keys.append(key)
                key = key.split('.')
                conv_name = None
                if key[1].endswith('cls'):
                    conv_name = 'conv_cls'
                elif key[1].endswith('reg'):
                    conv_name = 'conv_reg'
                elif key[1].endswith('centerness'):
                    conv_name = 'conv_centerness'
                # NOTE(review): the original code had ``assert
                # NotImplementedError`` here, which is a no-op (asserting a
                # class object is always truthy) and therefore dead code.
                # Keys matching none of the suffixes are intentionally left
                # untouched via the ``conv_name is None`` branch below.
                if conv_name is not None:
                    key[1] = conv_name
                    new_predictor_keys.append('.'.join(key))
                else:
                    # Not a predictor key: drop it from the rename list so it
                    # is kept under its original name.
                    ori_predictor_keys.pop(-1)
            for i in range(len(new_predictor_keys)):
                state_dict[new_predictor_keys[i]] = state_dict.pop(
                    ori_predictor_keys[i])
        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs)

    def forward(self, feats):
        """Forward features from the upstream network.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.

        Returns:
            tuple: Usually contain classification scores and bbox predictions.
                cls_scores (list[Tensor]): Box scores for each scale level,
                    each is a 4D-tensor, the channel number is
                    num_points * num_classes.
                bbox_preds (list[Tensor]): Box energies / deltas for each scale
                    level, each is a 4D-tensor, the channel number is
                    num_points * 4.
        """
        # forward_single also returns cls_feat/reg_feat; only the first two
        # outputs (scores and bbox preds) are exposed here.
        return multi_apply(self.forward_single, feats)[:2]

    def forward_single(self, x):
        """Forward features of a single scale level.

        Args:
            x (Tensor): FPN feature maps of the specified stride.

        Returns:
            tuple: Scores for each class, bbox predictions, features
                after classification and regression conv layers, some
                models needs these features like FCOS.
        """
        cls_feat = x
        reg_feat = x

        for cls_layer in self.cls_convs:
            cls_feat = cls_layer(cls_feat)
        cls_score = self.conv_cls(cls_feat)

        for reg_layer in self.reg_convs:
            reg_feat = reg_layer(reg_feat)
        bbox_pred = self.conv_reg(reg_feat)
        return cls_score, bbox_pred, cls_feat, reg_feat

    @abstractmethod
    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
    def loss(self,
             cls_scores,
             bbox_preds,
             gt_bboxes,
             gt_labels,
             img_metas,
             gt_bboxes_ignore=None):
        """Compute loss of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_points * num_classes.
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_points * 4.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): class indices corresponding to each box
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
                boxes can be ignored when computing the loss.
        """

        raise NotImplementedError

    @abstractmethod
    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
    def get_bboxes(self,
                   cls_scores,
                   bbox_preds,
                   img_metas,
                   cfg=None,
                   rescale=None):
        """Transform network output for a batch into bbox predictions.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                Has shape (N, num_points * num_classes, H, W)
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_points * 4, H, W)
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            cfg (mmcv.Config): Test / postprocessing configuration,
                if None, test_cfg would be used
            rescale (bool): If True, return boxes in original image space
        """

        raise NotImplementedError

    @abstractmethod
    def get_targets(self, points, gt_bboxes_list, gt_labels_list):
        """Compute regression, classification and centerness targets for
        points in multiple images.

        Args:
            points (list[Tensor]): Points of each fpn level, each has shape
                (num_points, 2).
            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
                each has shape (num_gt, 4).
            gt_labels_list (list[Tensor]): Ground truth labels of each box,
                each has shape (num_gt,).
        """
        raise NotImplementedError

    def _get_points_single(self,
                           featmap_size,
                           stride,
                           dtype,
                           device,
                           flatten=False):
        """Get points of a single scale level.

        Returns the per-pixel (y, x) index grids; subclasses combine them
        into actual point coordinates.
        """
        h, w = featmap_size
        x_range = torch.arange(w, dtype=dtype, device=device)
        y_range = torch.arange(h, dtype=dtype, device=device)
        # NOTE: relies on torch.meshgrid's legacy default ('ij') indexing.
        y, x = torch.meshgrid(y_range, x_range)
        if flatten:
            y = y.flatten()
            x = x.flatten()
        return y, x

    def get_points(self, featmap_sizes, dtype, device, flatten=False):
        """Get points according to feature map sizes.

        Args:
            featmap_sizes (list[tuple]): Multi-level feature map sizes.
            dtype (torch.dtype): Type of points.
            device (torch.device): Device of points.

        Returns:
            tuple: points of each image.
        """
        mlvl_points = []
        for i in range(len(featmap_sizes)):
            mlvl_points.append(
                self._get_points_single(featmap_sizes[i], self.strides[i],
                                        dtype, device, flatten))
        return mlvl_points

    def aug_test(self, feats, img_metas, rescale=False):
        """Test function with test time augmentation.

        Args:
            feats (list[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains features for all images in the batch.
            img_metas (list[list[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch. each dict has image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[ndarray]: bbox results of each class
        """
        return self.aug_test_bboxes(feats, img_metas, rescale=rescale)
+ """ # noqa: W605 + + def __init__(self, + num_classes, + in_channels, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + clip_border=True, + target_means=(.0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0)), + reg_decoded_bbox=False, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + train_cfg=None, + test_cfg=None): + super(AnchorHead, self).__init__() + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + # TODO better way to determine whether sample or not + self.sampling = loss_cls['type'] not in [ + 'FocalLoss', 'GHMC', 'QualityFocalLoss' + ] + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + if self.cls_out_channels <= 0: + raise ValueError(f'num_classes={num_classes} is too small') + self.reg_decoded_bbox = reg_decoded_bbox + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # use PseudoSampler when sampling is False + if self.sampling and hasattr(self.train_cfg, 'sampler'): + sampler_cfg = self.train_cfg.sampler + else: + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + self.fp16_enabled = False + + self.anchor_generator = build_anchor_generator(anchor_generator) + # usually the numbers of anchors for each level are the same + # except SSD detectors + self.num_anchors = self.anchor_generator.num_base_anchors[0] + self._init_layers() + + def _init_layers(self): + 
"""Initialize layers of the head.""" + self.conv_cls = nn.Conv2d(self.in_channels, + self.num_anchors * self.cls_out_channels, 1) + self.conv_reg = nn.Conv2d(self.in_channels, self.num_anchors * 4, 1) + + def init_weights(self): + """Initialize weights of the head.""" + normal_init(self.conv_cls, std=0.01) + normal_init(self.conv_reg, std=0.01) + + def forward_single(self, x): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level \ + the channels number is num_anchors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale \ + level, the channels number is num_anchors * 4. + """ + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + return cls_score, bbox_pred + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_scores (list[Tensor]): Classification scores for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_anchors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_anchors * 4. + """ + return multi_apply(self.forward_single, feats) + + def get_anchors(self, featmap_sizes, img_metas, device='cuda'): + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + img_metas (list[dict]): Image meta info. + device (torch.device | str): Device for returned tensors + + Returns: + tuple: + anchor_list (list[Tensor]): Anchors of each image. + valid_flag_list (list[Tensor]): Valid flags of each image. 
+ """ + num_imgs = len(img_metas) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for img_id, img_meta in enumerate(img_metas): + multi_level_flags = self.anchor_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device) + valid_flag_list.append(multi_level_flags) + + return anchor_list, valid_flag_list + + def _get_targets_single(self, + flat_anchors, + valid_flags, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + img_meta, + label_channels=1, + unmap_outputs=True): + """Compute regression and classification targets for anchors in a + single image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + img_meta (dict): Meta info of the image. + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + img_meta (dict): Meta info of the image. + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + label_channels (int): Channel of label. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. 
+ + Returns: + tuple: + labels_list (list[Tensor]): Labels of each level + label_weights_list (list[Tensor]): Label weights of each level + bbox_targets_list (list[Tensor]): BBox targets of each level + bbox_weights_list (list[Tensor]): BBox weights of each level + num_total_pos (int): Number of positive samples in all images + num_total_neg (int): Number of negative samples in all images + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg.allowed_border) + if not inside_flags.any(): + return (None, ) * 7 + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + assign_result = self.assigner.assign( + anchors, gt_bboxes, gt_bboxes_ignore, + None if self.sampling else gt_labels) + sampling_result = self.sampler.sample(assign_result, anchors, + gt_bboxes) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + else: + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + if gt_labels is None: + # Only rpn gives gt_labels as None + # Foreground is the first class since v2.5.0 + labels[pos_inds] = 0 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors 
= flat_anchors.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds, sampling_result) + + def get_targets(self, + anchor_list, + valid_flag_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + unmap_outputs=True, + return_sampling_results=False): + """Compute regression and classification targets for anchors in + multiple images. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be + ignored. + gt_labels_list (list[Tensor]): Ground truth labels of each box. + label_channels (int): Channel of label. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each \ + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - bbox_weights_list (list[Tensor]): BBox weights of each level. 
+ - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + additional_returns: This function enables user-defined returns from + `self._get_targets_single`. These returns are currently refined + to properties at each feature map (i.e. having HxW dimension). + The results will be concatenated after the end + """ + num_imgs = len(img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors to a single tensor + concat_anchor_list = [] + concat_valid_flag_list = [] + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + concat_anchor_list.append(torch.cat(anchor_list[i])) + concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + results = multi_apply( + self._get_targets_single, + concat_anchor_list, + concat_valid_flag_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + img_metas, + label_channels=label_channels, + unmap_outputs=unmap_outputs) + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + pos_inds_list, neg_inds_list, sampling_results_list) = results[:7] + rest_results = list(results[7:]) # user-added return values + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. 
multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + res = (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + if return_sampling_results: + res = res + (sampling_results_list, ) + for i, r in enumerate(rest_results): # user-added return values + rest_results[i] = images_to_levels(r, num_level_anchors) + + return res + tuple(rest_results) + + def loss_single(self, cls_score, bbox_pred, anchors, labels, label_weights, + bbox_targets, bbox_weights, num_total_samples): + """Compute loss of a single scale level. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor wight + shape (N, num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (N, num_total_anchors, 4). + num_total_samples (int): If sampling, num total samples equal to + the number of total anchors; Otherwise, it is the number of + positive anchors. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=num_total_samples) + # regression loss + bbox_targets = bbox_targets.reshape(-1, 4) + bbox_weights = bbox_weights.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + if self.reg_decoded_bbox: + anchors = anchors.reshape(-1, 4) + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + loss_bbox = self.loss_bbox( + bbox_pred, + bbox_targets, + bbox_weights, + avg_factor=num_total_samples) + return loss_cls, loss_bbox + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. Default: None + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + losses_cls, losses_bbox = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_samples=num_total_samples) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + img_metas, + cfg=None, + rescale=False, + with_nms=True): + """Transform network output for a batch into bbox predictions. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class labelof the + corresponding box. + + Example: + >>> import mmcv + >>> self = AnchorHead( + >>> num_classes=9, + >>> in_channels=1, + >>> anchor_generator=dict( + >>> type='AnchorGenerator', + >>> scales=[8], + >>> ratios=[0.5, 1.0, 2.0], + >>> strides=[4,])) + >>> img_metas = [{'img_shape': (32, 32, 3), 'scale_factor': 1}] + >>> cfg = mmcv.Config(dict( + >>> score_thr=0.00, + >>> nms=dict(type='nms', iou_thr=1.0), + >>> max_per_img=10)) + >>> feat = torch.rand(1, 1, 3, 3) + >>> cls_score, bbox_pred = self.forward_single(feat) + >>> # note the input lists are over different levels, not images + >>> cls_scores, bbox_preds = [cls_score], [bbox_pred] + >>> result_list = self.get_bboxes(cls_scores, bbox_preds, + >>> img_metas, cfg) + >>> det_bboxes, det_labels = result_list[0] + >>> assert len(result_list) == 1 + >>> assert det_bboxes.shape[1] == 5 + >>> assert len(det_bboxes) == len(det_labels) == cfg.max_per_img + """ + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + + device = cls_scores[0].device + featmap_sizes = [cls_scores[i].shape[-2:] for i in 
range(num_levels)] + mlvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + if with_nms: + # some heads don't support with_nms argument + proposals = self._get_bboxes_single(cls_score_list, + bbox_pred_list, + mlvl_anchors, img_shape, + scale_factor, cfg, rescale) + else: + proposals = self._get_bboxes_single(cls_score_list, + bbox_pred_list, + mlvl_anchors, img_shape, + scale_factor, cfg, rescale, + with_nms) + result_list.append(proposals) + return result_list + + def _get_bboxes_single(self, + cls_score_list, + bbox_pred_list, + mlvl_anchors, + img_shape, + scale_factor, + cfg, + rescale=False, + with_nms=True): + """Transform outputs for a single batch item into bbox predictions. + + Args: + cls_score_list (list[Tensor]): Box scores for a single scale level + Has shape (num_anchors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas for a single + scale level with shape (num_anchors * 4, H, W). + mlvl_anchors (list[Tensor]): Box reference for a single scale level + with shape (num_total_anchors, 4). + img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arange as + (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. 
+ + Returns: + Tensor: Labeled boxes in shape (n, 5), where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + for cls_score, bbox_pred, anchors in zip(cls_score_list, + bbox_pred_list, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + # Get maximum scores for foreground classes. + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + bboxes = self.bbox_coder.decode( + anchors, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + + if with_nms: + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels + else: + return mlvl_bboxes, 
mlvl_scores + + def aug_test(self, feats, img_metas, rescale=False): + """Test function with test time augmentation. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[ndarray]: bbox results of each class + """ + return self.aug_test_bboxes(feats, img_metas, rescale=rescale) diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/atss_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/atss_head.py new file mode 100755 index 0000000000000000000000000000000000000000..7649386293557dd14f7cb70a5b681a2609b9c97b --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/atss_head.py @@ -0,0 +1,650 @@ +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, Scale, bias_init_with_prob, normal_init +from mmcv.runner import force_fp32 + +from mmdet.core import (anchor_inside_flags, build_assigner, build_sampler, + images_to_levels, multi_apply, multiclass_nms, + reduce_mean, unmap) +from ..builder import HEADS, build_loss +from .anchor_head import AnchorHead + +EPS = 1e-12 + + +@HEADS.register_module() +class ATSSHead(AnchorHead): + """Bridging the Gap Between Anchor-based and Anchor-free Detection via + Adaptive Training Sample Selection. + + ATSS head structure is similar with FCOS, however ATSS use anchor boxes + and assign label by Adaptive Training Sample Selection instead max-iou. 
+ + https://arxiv.org/abs/1912.02424 + """ + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + conv_cfg=None, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + loss_centerness=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + **kwargs): + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super(ATSSHead, self).__init__(num_classes, in_channels, **kwargs) + + self.sampling = False + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # SSD sampling=False so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + self.loss_centerness = build_loss(loss_centerness) + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.atss_cls = nn.Conv2d( + self.feat_channels, + self.num_anchors * self.cls_out_channels, + 3, + padding=1) + self.atss_reg = nn.Conv2d( + self.feat_channels, self.num_anchors * 4, 3, padding=1) + self.atss_centerness = nn.Conv2d( + self.feat_channels, self.num_anchors * 1, 3, padding=1) + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.anchor_generator.strides]) + + def init_weights(self): + """Initialize weights of the head.""" + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.atss_cls, std=0.01, bias=bias_cls) + 
normal_init(self.atss_reg, std=0.01) + normal_init(self.atss_centerness, std=0.01) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + return multi_apply(self.forward_single, feats, self.scales) + + def forward_single(self, x, scale): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level + the channels number is num_anchors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale + level, the channels number is num_anchors * 4. + centerness (Tensor): Centerness for a single scale level, the + channel number is (N, num_anchors * 1, H, W). + """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.atss_cls(cls_feat) + # we just follow atss, not apply exp in bbox_pred + bbox_pred = scale(self.atss_reg(reg_feat)).float() + centerness = self.atss_centerness(reg_feat) + return cls_score, bbox_pred, centerness + + def loss_single(self, anchors, cls_score, bbox_pred, centerness, labels, + label_weights, bbox_targets, num_total_samples): + """Compute loss of a single scale level. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). 
+ bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor wight + shape (N, num_total_anchors, 4). + num_total_samples (int): Number os positive samples that is + reduced over all GPUs. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, 1).reshape( + -1, self.cls_out_channels).contiguous() + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + centerness = centerness.permute(0, 2, 3, 1).reshape(-1) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # classification loss + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=num_total_samples) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_centerness = centerness[pos_inds] + + centerness_targets = self.centerness_target( + pos_anchors, pos_bbox_targets) + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchors, pos_bbox_pred) + pos_decode_bbox_targets = self.bbox_coder.decode( + pos_anchors, pos_bbox_targets) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=centerness_targets, + avg_factor=1.0) + + # centerness loss + loss_centerness = self.loss_centerness( + pos_centerness, + centerness_targets, + 
avg_factor=num_total_samples) + + else: + loss_bbox = bbox_pred.sum() * 0 + loss_centerness = centerness.sum() * 0 + centerness_targets = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum() + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'centernesses')) + def loss(self, + cls_scores, + bbox_preds, + centernesses, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + centernesses (list[Tensor]): Centerness for each scale + level with shape (N, num_anchors * 1, H, W) + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor] | None): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels) + if cls_reg_targets is None: + return None + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets + + num_total_samples = reduce_mean( + torch.tensor(num_total_pos).cuda()).item() + num_total_samples = max(num_total_samples, 1.0) + + losses_cls, losses_bbox, loss_centerness,\ + bbox_avg_factor = multi_apply( + self.loss_single, + anchor_list, + cls_scores, + bbox_preds, + centernesses, + labels_list, + label_weights_list, + bbox_targets_list, + num_total_samples=num_total_samples) + + bbox_avg_factor = sum(bbox_avg_factor) + bbox_avg_factor = reduce_mean(bbox_avg_factor).item() + if bbox_avg_factor < EPS: + bbox_avg_factor = 1 + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_centerness=loss_centerness) + + def centerness_target(self, anchors, bbox_targets): + # only calculate pos centerness targets, otherwise there may be nan + gts = self.bbox_coder.decode(anchors, bbox_targets) + anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 + anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 + l_ = anchors_cx - gts[:, 0] + t_ = anchors_cy - gts[:, 1] + r_ = gts[:, 2] - anchors_cx + b_ = gts[:, 3] - anchors_cy + + left_right = torch.stack([l_, r_], dim=1) + top_bottom = torch.stack([t_, b_], dim=1) + centerness = torch.sqrt( + (left_right.min(dim=-1)[0] / 
left_right.max(dim=-1)[0]) * + (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])) + assert not torch.isnan(centerness).any() + return centerness + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'centernesses')) + def get_bboxes(self, + cls_scores, + bbox_preds, + centernesses, + img_metas, + cfg=None, + rescale=False, + with_nms=True): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + with shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + centernesses (list[Tensor]): Centerness for each scale level with + shape (N, num_anchors * 1, H, W). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used. Default: None. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class label of the + corresponding box. 
+ """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + device = cls_scores[0].device + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + centerness_pred_list = [ + centernesses[i][img_id].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list, + centerness_pred_list, + mlvl_anchors, img_shape, + scale_factor, cfg, rescale, + with_nms) + result_list.append(proposals) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + centernesses, + mlvl_anchors, + img_shape, + scale_factor, + cfg, + rescale=False, + with_nms=True): + """Transform outputs for a single batch item into labeled boxes. + + Args: + cls_scores (list[Tensor]): Box scores for a single scale level + with shape (num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for a single + scale level with shape (num_anchors * 4, H, W). + centernesses (list[Tensor]): Centerness for a single scale level + with shape (num_anchors * 1, H, W). + mlvl_anchors (list[Tensor]): Box reference for a single scale level + with shape (num_total_anchors, 4). + img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arrange as + (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. 
+ with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + tuple(Tensor): + det_bboxes (Tensor): BBox predictions in shape (n, 5), where + the first 4 columns are bounding box positions + (tl_x, tl_y, br_x, br_y) and the 5-th column is a score + between 0 and 1. + det_labels (Tensor): A (n,) tensor where each item is the + predicted class label of the corresponding box. + """ + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_centerness = [] + for cls_score, bbox_pred, centerness, anchors in zip( + cls_scores, bbox_preds, centernesses, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() + + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + max_scores, _ = (scores * centerness[:, None]).max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + centerness = centerness[topk_inds] + + bboxes = self.bbox_coder.decode( + anchors, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_centerness.append(centerness) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + mlvl_centerness = torch.cat(mlvl_centerness) + + if with_nms: + det_bboxes, det_labels = multiclass_nms( + mlvl_bboxes, + mlvl_scores, + cfg.score_thr, + 
cfg.nms, + cfg.max_per_img, + score_factors=mlvl_centerness) + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores, mlvl_centerness + + def get_targets(self, + anchor_list, + valid_flag_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + unmap_outputs=True): + """Get targets for ATSS head. + + This method is almost the same as `AnchorHead.get_targets()`. Besides + returning the targets as the parent method does, it also returns the + anchors as the first element of the returned tuple. + """ + num_imgs = len(img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + img_metas, + label_channels=label_channels, + unmap_outputs=unmap_outputs) + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. 
multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, num_total_pos, + num_total_neg) + + def _get_target_single(self, + flat_anchors, + valid_flags, + num_level_anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + img_meta, + label_channels=1, + unmap_outputs=True): + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + num_level_anchors Tensor): Number of anchors of each scale level. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + img_meta (dict): Meta info of the image. + label_channels (int): Channel of label. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: N is the number of total anchors in the image. + labels (Tensor): Labels of all anchors in the image with shape + (N,). + label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). 
+ bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4) + pos_inds (Tensor): Indices of postive anchor with shape + (num_pos,). + neg_inds (Tensor): Indices of negative anchor with shape + (num_neg,). + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg.allowed_border) + if not inside_flags.any(): + return (None, ) * 7 + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + num_level_anchors_inside = self.get_num_level_anchors_inside( + num_level_anchors, inside_flags) + assign_result = self.assigner.assign(anchors, num_level_anchors_inside, + gt_bboxes, gt_bboxes_ignore, + gt_labels) + + sampling_result = self.sampler.sample(assign_result, anchors, + gt_bboxes) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + if hasattr(self, 'bbox_coder'): + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + else: + # used in VFNetHead + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + if gt_labels is None: + # Only rpn gives gt_labels as None + # Foreground is the first class since v2.5.0 + labels[pos_inds] = 0 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = 
unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def get_num_level_anchors_inside(self, num_level_anchors, inside_flags): + split_inside_flags = torch.split(inside_flags, num_level_anchors) + num_level_anchors_inside = [ + int(flags.sum()) for flags in split_inside_flags + ] + return num_level_anchors_inside diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/base_dense_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/base_dense_head.py new file mode 100755 index 0000000000000000000000000000000000000000..de11e4a2197b1dfe241ce7a66daa1907a8fc5661 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/base_dense_head.py @@ -0,0 +1,59 @@ +from abc import ABCMeta, abstractmethod + +import torch.nn as nn + + +class BaseDenseHead(nn.Module, metaclass=ABCMeta): + """Base class for DenseHeads.""" + + def __init__(self): + super(BaseDenseHead, self).__init__() + + @abstractmethod + def loss(self, **kwargs): + """Compute losses of the head.""" + pass + + @abstractmethod + def get_bboxes(self, **kwargs): + """Transform network output for a batch into bbox predictions.""" + pass + + def forward_train(self, + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_ignore=None, + proposal_cfg=None, + **kwargs): + """ + Args: + x (list[Tensor]): Features from FPN. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). 
+ gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + + Returns: + tuple: + losses: (dict[str, Tensor]): A dictionary of loss components. + proposal_list (list[Tensor]): Proposals of each image. + """ + outs = self(x) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + if proposal_cfg is None: + return losses + else: + proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg) + return losses, proposal_list diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/centripetal_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/centripetal_head.py new file mode 100755 index 0000000000000000000000000000000000000000..6728218b60539a71f6353645635f741a1ad7263d --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/centripetal_head.py @@ -0,0 +1,421 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule, normal_init +from mmcv.ops import DeformConv2d + +from mmdet.core import multi_apply +from ..builder import HEADS, build_loss +from .corner_head import CornerHead + + +@HEADS.register_module() +class CentripetalHead(CornerHead): + """Head of CentripetalNet: Pursuing High-quality Keypoint Pairs for Object + Detection. + + CentripetalHead inherits from :class:`CornerHead`. It removes the + embedding branch and adds guiding shift and centripetal shift branches. + More details can be found in the `paper + `_ . + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + num_feat_levels (int): Levels of feature from the previous module. 2 + for HourglassNet-104 and 1 for HourglassNet-52. 
HourglassNet-104 + outputs the final feature and intermediate supervision feature and + HourglassNet-52 only outputs the final feature. Default: 2. + corner_emb_channels (int): Channel of embedding vector. Default: 1. + train_cfg (dict | None): Training config. Useless in CornerHead, + but we keep this variable for SingleStageDetector. Default: None. + test_cfg (dict | None): Testing config of CornerHead. Default: None. + loss_heatmap (dict | None): Config of corner heatmap loss. Default: + GaussianFocalLoss. + loss_embedding (dict | None): Config of corner embedding loss. Default: + AssociativeEmbeddingLoss. + loss_offset (dict | None): Config of corner offset loss. Default: + SmoothL1Loss. + loss_guiding_shift (dict): Config of guiding shift loss. Default: + SmoothL1Loss. + loss_centripetal_shift (dict): Config of centripetal shift loss. + Default: SmoothL1Loss. + """ + + def __init__(self, + *args, + centripetal_shift_channels=2, + guiding_shift_channels=2, + feat_adaption_conv_kernel=3, + loss_guiding_shift=dict( + type='SmoothL1Loss', beta=1.0, loss_weight=0.05), + loss_centripetal_shift=dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1), + **kwargs): + assert centripetal_shift_channels == 2, ( + 'CentripetalHead only support centripetal_shift_channels == 2') + self.centripetal_shift_channels = centripetal_shift_channels + assert guiding_shift_channels == 2, ( + 'CentripetalHead only support guiding_shift_channels == 2') + self.guiding_shift_channels = guiding_shift_channels + self.feat_adaption_conv_kernel = feat_adaption_conv_kernel + super(CentripetalHead, self).__init__(*args, **kwargs) + self.loss_guiding_shift = build_loss(loss_guiding_shift) + self.loss_centripetal_shift = build_loss(loss_centripetal_shift) + + def _init_centripetal_layers(self): + """Initialize centripetal layers. 
+ + Including feature adaption deform convs (feat_adaption), deform offset + prediction convs (dcn_off), guiding shift (guiding_shift) and + centripetal shift ( centripetal_shift). Each branch has two parts: + prefix `tl_` for top-left and `br_` for bottom-right. + """ + self.tl_feat_adaption = nn.ModuleList() + self.br_feat_adaption = nn.ModuleList() + self.tl_dcn_offset = nn.ModuleList() + self.br_dcn_offset = nn.ModuleList() + self.tl_guiding_shift = nn.ModuleList() + self.br_guiding_shift = nn.ModuleList() + self.tl_centripetal_shift = nn.ModuleList() + self.br_centripetal_shift = nn.ModuleList() + + for _ in range(self.num_feat_levels): + self.tl_feat_adaption.append( + DeformConv2d(self.in_channels, self.in_channels, + self.feat_adaption_conv_kernel, 1, 1)) + self.br_feat_adaption.append( + DeformConv2d(self.in_channels, self.in_channels, + self.feat_adaption_conv_kernel, 1, 1)) + + self.tl_guiding_shift.append( + self._make_layers( + out_channels=self.guiding_shift_channels, + in_channels=self.in_channels)) + self.br_guiding_shift.append( + self._make_layers( + out_channels=self.guiding_shift_channels, + in_channels=self.in_channels)) + + self.tl_dcn_offset.append( + ConvModule( + self.guiding_shift_channels, + self.feat_adaption_conv_kernel**2 * + self.guiding_shift_channels, + 1, + bias=False, + act_cfg=None)) + self.br_dcn_offset.append( + ConvModule( + self.guiding_shift_channels, + self.feat_adaption_conv_kernel**2 * + self.guiding_shift_channels, + 1, + bias=False, + act_cfg=None)) + + self.tl_centripetal_shift.append( + self._make_layers( + out_channels=self.centripetal_shift_channels, + in_channels=self.in_channels)) + self.br_centripetal_shift.append( + self._make_layers( + out_channels=self.centripetal_shift_channels, + in_channels=self.in_channels)) + + def _init_layers(self): + """Initialize layers for CentripetalHead. 
+ + Including two parts: CornerHead layers and CentripetalHead layers + """ + super()._init_layers() # using _init_layers in CornerHead + self._init_centripetal_layers() + + def init_weights(self): + """Initialize weights of the head.""" + super().init_weights() + for i in range(self.num_feat_levels): + normal_init(self.tl_feat_adaption[i], std=0.01) + normal_init(self.br_feat_adaption[i], std=0.01) + normal_init(self.tl_dcn_offset[i].conv, std=0.1) + normal_init(self.br_dcn_offset[i].conv, std=0.1) + _ = [x.conv.reset_parameters() for x in self.tl_guiding_shift[i]] + _ = [x.conv.reset_parameters() for x in self.br_guiding_shift[i]] + _ = [ + x.conv.reset_parameters() for x in self.tl_centripetal_shift[i] + ] + _ = [ + x.conv.reset_parameters() for x in self.br_centripetal_shift[i] + ] + + def forward_single(self, x, lvl_ind): + """Forward feature of a single level. + + Args: + x (Tensor): Feature of a single level. + lvl_ind (int): Level index of current feature. + + Returns: + tuple[Tensor]: A tuple of CentripetalHead's output for current + feature level. Containing the following Tensors: + + - tl_heat (Tensor): Predicted top-left corner heatmap. + - br_heat (Tensor): Predicted bottom-right corner heatmap. + - tl_off (Tensor): Predicted top-left offset heatmap. + - br_off (Tensor): Predicted bottom-right offset heatmap. + - tl_guiding_shift (Tensor): Predicted top-left guiding shift + heatmap. + - br_guiding_shift (Tensor): Predicted bottom-right guiding + shift heatmap. + - tl_centripetal_shift (Tensor): Predicted top-left centripetal + shift heatmap. + - br_centripetal_shift (Tensor): Predicted bottom-right + centripetal shift heatmap. 
+ """ + tl_heat, br_heat, _, _, tl_off, br_off, tl_pool, br_pool = super( + ).forward_single( + x, lvl_ind, return_pool=True) + + tl_guiding_shift = self.tl_guiding_shift[lvl_ind](tl_pool) + br_guiding_shift = self.br_guiding_shift[lvl_ind](br_pool) + + tl_dcn_offset = self.tl_dcn_offset[lvl_ind](tl_guiding_shift.detach()) + br_dcn_offset = self.br_dcn_offset[lvl_ind](br_guiding_shift.detach()) + + tl_feat_adaption = self.tl_feat_adaption[lvl_ind](tl_pool, + tl_dcn_offset) + br_feat_adaption = self.br_feat_adaption[lvl_ind](br_pool, + br_dcn_offset) + + tl_centripetal_shift = self.tl_centripetal_shift[lvl_ind]( + tl_feat_adaption) + br_centripetal_shift = self.br_centripetal_shift[lvl_ind]( + br_feat_adaption) + + result_list = [ + tl_heat, br_heat, tl_off, br_off, tl_guiding_shift, + br_guiding_shift, tl_centripetal_shift, br_centripetal_shift + ] + return result_list + + def loss(self, + tl_heats, + br_heats, + tl_offs, + br_offs, + tl_guiding_shifts, + br_guiding_shifts, + tl_centripetal_shifts, + br_centripetal_shifts, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each + level with shape (N, guiding_shift_channels, H, W). + br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for + each level with shape (N, guiding_shift_channels, H, W). + tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts + for each level with shape (N, centripetal_shift_channels, H, + W). 
+ br_centripetal_shifts (list[Tensor]): Bottom-right centripetal + shifts for each level with shape (N, + centripetal_shift_channels, H, W). + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [left, top, right, bottom] format. + gt_labels (list[Tensor]): Class indices corresponding to each box. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor] | None): Specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. Containing the + following losses: + + - det_loss (list[Tensor]): Corner keypoint losses of all + feature levels. + - off_loss (list[Tensor]): Corner offset losses of all feature + levels. + - guiding_loss (list[Tensor]): Guiding shift losses of all + feature levels. + - centripetal_loss (list[Tensor]): Centripetal shift losses of + all feature levels. + """ + targets = self.get_targets( + gt_bboxes, + gt_labels, + tl_heats[-1].shape, + img_metas[0]['pad_shape'], + with_corner_emb=self.with_corner_emb, + with_guiding_shift=True, + with_centripetal_shift=True) + mlvl_targets = [targets for _ in range(self.num_feat_levels)] + [det_losses, off_losses, guiding_losses, centripetal_losses + ] = multi_apply(self.loss_single, tl_heats, br_heats, tl_offs, + br_offs, tl_guiding_shifts, br_guiding_shifts, + tl_centripetal_shifts, br_centripetal_shifts, + mlvl_targets) + loss_dict = dict( + det_loss=det_losses, + off_loss=off_losses, + guiding_loss=guiding_losses, + centripetal_loss=centripetal_losses) + return loss_dict + + def loss_single(self, tl_hmp, br_hmp, tl_off, br_off, tl_guiding_shift, + br_guiding_shift, tl_centripetal_shift, + br_centripetal_shift, targets): + """Compute losses for single level. + + Args: + tl_hmp (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). 
+ br_hmp (Tensor): Bottom-right corner heatmap for current level with + shape (N, num_classes, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + tl_guiding_shift (Tensor): Top-left guiding shift for current level + with shape (N, guiding_shift_channels, H, W). + br_guiding_shift (Tensor): Bottom-right guiding shift for current + level with shape (N, guiding_shift_channels, H, W). + tl_centripetal_shift (Tensor): Top-left centripetal shift for + current level with shape (N, centripetal_shift_channels, H, W). + br_centripetal_shift (Tensor): Bottom-right centripetal shift for + current level with shape (N, centripetal_shift_channels, H, W). + targets (dict): Corner target generated by `get_targets`. + + Returns: + tuple[torch.Tensor]: Losses of the head's differnet branches + containing the following losses: + + - det_loss (Tensor): Corner keypoint loss. + - off_loss (Tensor): Corner offset loss. + - guiding_loss (Tensor): Guiding shift loss. + - centripetal_loss (Tensor): Centripetal shift loss. + """ + targets['corner_embedding'] = None + + det_loss, _, _, off_loss = super().loss_single(tl_hmp, br_hmp, None, + None, tl_off, br_off, + targets) + + gt_tl_guiding_shift = targets['topleft_guiding_shift'] + gt_br_guiding_shift = targets['bottomright_guiding_shift'] + gt_tl_centripetal_shift = targets['topleft_centripetal_shift'] + gt_br_centripetal_shift = targets['bottomright_centripetal_shift'] + + gt_tl_heatmap = targets['topleft_heatmap'] + gt_br_heatmap = targets['bottomright_heatmap'] + # We only compute the offset loss at the real corner position. + # The value of real corner would be 1 in heatmap ground truth. + # The mask is computed in class agnostic mode and its shape is + # batch * 1 * width * height. 
+ tl_mask = gt_tl_heatmap.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_tl_heatmap) + br_mask = gt_br_heatmap.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_br_heatmap) + + # Guiding shift loss + tl_guiding_loss = self.loss_guiding_shift( + tl_guiding_shift, + gt_tl_guiding_shift, + tl_mask, + avg_factor=tl_mask.sum()) + br_guiding_loss = self.loss_guiding_shift( + br_guiding_shift, + gt_br_guiding_shift, + br_mask, + avg_factor=br_mask.sum()) + guiding_loss = (tl_guiding_loss + br_guiding_loss) / 2.0 + # Centripetal shift loss + tl_centripetal_loss = self.loss_centripetal_shift( + tl_centripetal_shift, + gt_tl_centripetal_shift, + tl_mask, + avg_factor=tl_mask.sum()) + br_centripetal_loss = self.loss_centripetal_shift( + br_centripetal_shift, + gt_br_centripetal_shift, + br_mask, + avg_factor=br_mask.sum()) + centripetal_loss = (tl_centripetal_loss + br_centripetal_loss) / 2.0 + + return det_loss, off_loss, guiding_loss, centripetal_loss + + def get_bboxes(self, + tl_heats, + br_heats, + tl_offs, + br_offs, + tl_guiding_shifts, + br_guiding_shifts, + tl_centripetal_shifts, + br_centripetal_shifts, + img_metas, + rescale=False, + with_nms=True): + """Transform network output for a batch into bbox predictions. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each + level with shape (N, guiding_shift_channels, H, W). Useless in + this function, we keep this arg because it's the raw output + from CentripetalHead. 
+ br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for + each level with shape (N, guiding_shift_channels, H, W). + Useless in this function, we keep this arg because it's the + raw output from CentripetalHead. + tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts + for each level with shape (N, centripetal_shift_channels, H, + W). + br_centripetal_shifts (list[Tensor]): Bottom-right centripetal + shifts for each level with shape (N, + centripetal_shift_channels, H, W). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + """ + assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len(img_metas) + result_list = [] + for img_id in range(len(img_metas)): + result_list.append( + self._get_bboxes_single( + tl_heats[-1][img_id:img_id + 1, :], + br_heats[-1][img_id:img_id + 1, :], + tl_offs[-1][img_id:img_id + 1, :], + br_offs[-1][img_id:img_id + 1, :], + img_metas[img_id], + tl_emb=None, + br_emb=None, + tl_centripetal_shift=tl_centripetal_shifts[-1][ + img_id:img_id + 1, :], + br_centripetal_shift=br_centripetal_shifts[-1][ + img_id:img_id + 1, :], + rescale=rescale, + with_nms=with_nms)) + + return result_list diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/corner_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/corner_head.py new file mode 100755 index 0000000000000000000000000000000000000000..cdaeca43188e0eeb8302bf9cc66933ed12a8e801 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/corner_head.py @@ -0,0 +1,1064 @@ +from math import ceil, log + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, bias_init_with_prob +from mmcv.ops import CornerPool, batched_nms + +from mmdet.core import multi_apply +from ..builder import HEADS, 
build_loss +from ..utils import gaussian_radius, gen_gaussian_target +from .base_dense_head import BaseDenseHead + + +class BiCornerPool(nn.Module): + """Bidirectional Corner Pooling Module (TopLeft, BottomRight, etc.) + + Args: + in_channels (int): Input channels of module. + out_channels (int): Output channels of module. + feat_channels (int): Feature channels of module. + directions (list[str]): Directions of two CornerPools. + norm_cfg (dict): Dictionary to construct and config norm layer. + """ + + def __init__(self, + in_channels, + directions, + feat_channels=128, + out_channels=128, + norm_cfg=dict(type='BN', requires_grad=True)): + super(BiCornerPool, self).__init__() + self.direction1_conv = ConvModule( + in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg) + self.direction2_conv = ConvModule( + in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg) + + self.aftpool_conv = ConvModule( + feat_channels, + out_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=None) + + self.conv1 = ConvModule( + in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + self.conv2 = ConvModule( + in_channels, out_channels, 3, padding=1, norm_cfg=norm_cfg) + + self.direction1_pool = CornerPool(directions[0]) + self.direction2_pool = CornerPool(directions[1]) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward features from the upstream network. + + Args: + x (tensor): Input feature of BiCornerPool. + + Returns: + conv2 (tensor): Output feature of BiCornerPool. 
+ """ + direction1_conv = self.direction1_conv(x) + direction2_conv = self.direction2_conv(x) + direction1_feat = self.direction1_pool(direction1_conv) + direction2_feat = self.direction2_pool(direction2_conv) + aftpool_conv = self.aftpool_conv(direction1_feat + direction2_feat) + conv1 = self.conv1(x) + relu = self.relu(aftpool_conv + conv1) + conv2 = self.conv2(relu) + return conv2 + + +@HEADS.register_module() +class CornerHead(BaseDenseHead): + """Head of CornerNet: Detecting Objects as Paired Keypoints. + + Code is modified from the `official github repo + `_ . + + More details can be found in the `paper + `_ . + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + num_feat_levels (int): Levels of feature from the previous module. 2 + for HourglassNet-104 and 1 for HourglassNet-52. Because + HourglassNet-104 outputs the final feature and intermediate + supervision feature and HourglassNet-52 only outputs the final + feature. Default: 2. + corner_emb_channels (int): Channel of embedding vector. Default: 1. + train_cfg (dict | None): Training config. Useless in CornerHead, + but we keep this variable for SingleStageDetector. Default: None. + test_cfg (dict | None): Testing config of CornerHead. Default: None. + loss_heatmap (dict | None): Config of corner heatmap loss. Default: + GaussianFocalLoss. + loss_embedding (dict | None): Config of corner embedding loss. Default: + AssociativeEmbeddingLoss. + loss_offset (dict | None): Config of corner offset loss. Default: + SmoothL1Loss. 
+ """ + + def __init__(self, + num_classes, + in_channels, + num_feat_levels=2, + corner_emb_channels=1, + train_cfg=None, + test_cfg=None, + loss_heatmap=dict( + type='GaussianFocalLoss', + alpha=2.0, + gamma=4.0, + loss_weight=1), + loss_embedding=dict( + type='AssociativeEmbeddingLoss', + pull_weight=0.25, + push_weight=0.25), + loss_offset=dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1)): + super(CornerHead, self).__init__() + self.num_classes = num_classes + self.in_channels = in_channels + self.corner_emb_channels = corner_emb_channels + self.with_corner_emb = self.corner_emb_channels > 0 + self.corner_offset_channels = 2 + self.num_feat_levels = num_feat_levels + self.loss_heatmap = build_loss( + loss_heatmap) if loss_heatmap is not None else None + self.loss_embedding = build_loss( + loss_embedding) if loss_embedding is not None else None + self.loss_offset = build_loss( + loss_offset) if loss_offset is not None else None + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self._init_layers() + + def _make_layers(self, out_channels, in_channels=256, feat_channels=256): + """Initialize conv sequential for CornerHead.""" + return nn.Sequential( + ConvModule(in_channels, feat_channels, 3, padding=1), + ConvModule( + feat_channels, out_channels, 1, norm_cfg=None, act_cfg=None)) + + def _init_corner_kpt_layers(self): + """Initialize corner keypoint layers. + + Including corner heatmap branch and corner offset branch. Each branch + has two parts: prefix `tl_` for top-left and `br_` for bottom-right. 
+ """ + self.tl_pool, self.br_pool = nn.ModuleList(), nn.ModuleList() + self.tl_heat, self.br_heat = nn.ModuleList(), nn.ModuleList() + self.tl_off, self.br_off = nn.ModuleList(), nn.ModuleList() + + for _ in range(self.num_feat_levels): + self.tl_pool.append( + BiCornerPool( + self.in_channels, ['top', 'left'], + out_channels=self.in_channels)) + self.br_pool.append( + BiCornerPool( + self.in_channels, ['bottom', 'right'], + out_channels=self.in_channels)) + + self.tl_heat.append( + self._make_layers( + out_channels=self.num_classes, + in_channels=self.in_channels)) + self.br_heat.append( + self._make_layers( + out_channels=self.num_classes, + in_channels=self.in_channels)) + + self.tl_off.append( + self._make_layers( + out_channels=self.corner_offset_channels, + in_channels=self.in_channels)) + self.br_off.append( + self._make_layers( + out_channels=self.corner_offset_channels, + in_channels=self.in_channels)) + + def _init_corner_emb_layers(self): + """Initialize corner embedding layers. + + Only include corner embedding branch with two parts: prefix `tl_` for + top-left and `br_` for bottom-right. + """ + self.tl_emb, self.br_emb = nn.ModuleList(), nn.ModuleList() + + for _ in range(self.num_feat_levels): + self.tl_emb.append( + self._make_layers( + out_channels=self.corner_emb_channels, + in_channels=self.in_channels)) + self.br_emb.append( + self._make_layers( + out_channels=self.corner_emb_channels, + in_channels=self.in_channels)) + + def _init_layers(self): + """Initialize layers for CornerHead. + + Including two parts: corner keypoint layers and corner embedding layers + """ + self._init_corner_kpt_layers() + if self.with_corner_emb: + self._init_corner_emb_layers() + + def init_weights(self): + """Initialize weights of the head.""" + bias_init = bias_init_with_prob(0.1) + for i in range(self.num_feat_levels): + # The initialization of parameters are different between nn.Conv2d + # and ConvModule. 
Our experiments show that using the original + # initialization of nn.Conv2d increases the final mAP by about 0.2% + self.tl_heat[i][-1].conv.reset_parameters() + self.tl_heat[i][-1].conv.bias.data.fill_(bias_init) + self.br_heat[i][-1].conv.reset_parameters() + self.br_heat[i][-1].conv.bias.data.fill_(bias_init) + self.tl_off[i][-1].conv.reset_parameters() + self.br_off[i][-1].conv.reset_parameters() + if self.with_corner_emb: + self.tl_emb[i][-1].conv.reset_parameters() + self.br_emb[i][-1].conv.reset_parameters() + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of corner heatmaps, offset heatmaps and + embedding heatmaps. + - tl_heats (list[Tensor]): Top-left corner heatmaps for all + levels, each is a 4D-tensor, the channels number is + num_classes. + - br_heats (list[Tensor]): Bottom-right corner heatmaps for all + levels, each is a 4D-tensor, the channels number is + num_classes. + - tl_embs (list[Tensor] | list[None]): Top-left embedding + heatmaps for all levels, each is a 4D-tensor or None. + If not None, the channels number is corner_emb_channels. + - br_embs (list[Tensor] | list[None]): Bottom-right embedding + heatmaps for all levels, each is a 4D-tensor or None. + If not None, the channels number is corner_emb_channels. + - tl_offs (list[Tensor]): Top-left offset heatmaps for all + levels, each is a 4D-tensor. The channels number is + corner_offset_channels. + - br_offs (list[Tensor]): Bottom-right offset heatmaps for all + levels, each is a 4D-tensor. The channels number is + corner_offset_channels. + """ + lvl_ind = list(range(self.num_feat_levels)) + return multi_apply(self.forward_single, feats, lvl_ind) + + def forward_single(self, x, lvl_ind, return_pool=False): + """Forward feature of a single level. + + Args: + x (Tensor): Feature of a single level. 
+ lvl_ind (int): Level index of current feature. + return_pool (bool): Return corner pool feature or not. + + Returns: + tuple[Tensor]: A tuple of CornerHead's output for current feature + level. Containing the following Tensors: + + - tl_heat (Tensor): Predicted top-left corner heatmap. + - br_heat (Tensor): Predicted bottom-right corner heatmap. + - tl_emb (Tensor | None): Predicted top-left embedding heatmap. + None for `self.with_corner_emb == False`. + - br_emb (Tensor | None): Predicted bottom-right embedding + heatmap. None for `self.with_corner_emb == False`. + - tl_off (Tensor): Predicted top-left offset heatmap. + - br_off (Tensor): Predicted bottom-right offset heatmap. + - tl_pool (Tensor): Top-left corner pool feature. Not must + have. + - br_pool (Tensor): Bottom-right corner pool feature. Not must + have. + """ + tl_pool = self.tl_pool[lvl_ind](x) + tl_heat = self.tl_heat[lvl_ind](tl_pool) + br_pool = self.br_pool[lvl_ind](x) + br_heat = self.br_heat[lvl_ind](br_pool) + + tl_emb, br_emb = None, None + if self.with_corner_emb: + tl_emb = self.tl_emb[lvl_ind](tl_pool) + br_emb = self.br_emb[lvl_ind](br_pool) + + tl_off = self.tl_off[lvl_ind](tl_pool) + br_off = self.br_off[lvl_ind](br_pool) + + result_list = [tl_heat, br_heat, tl_emb, br_emb, tl_off, br_off] + if return_pool: + result_list.append(tl_pool) + result_list.append(br_pool) + + return result_list + + def get_targets(self, + gt_bboxes, + gt_labels, + feat_shape, + img_shape, + with_corner_emb=False, + with_guiding_shift=False, + with_centripetal_shift=False): + """Generate corner targets. + + Including corner heatmap, corner offset. + + Optional: corner embedding, corner guiding shift, centripetal shift. + + For CornerNet, we generate corner heatmap, corner offset and corner + embedding from this function. + + For CentripetalNet, we generate corner heatmap, corner offset, guiding + shift and centripetal shift from this function. 
+ + Args: + gt_bboxes (list[Tensor]): Ground truth bboxes of each image, each + has shape (num_gt, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, each has + shape (num_gt,). + feat_shape (list[int]): Shape of output feature, + [batch, channel, height, width]. + img_shape (list[int]): Shape of input image, + [height, width, channel]. + with_corner_emb (bool): Generate corner embedding target or not. + Default: False. + with_guiding_shift (bool): Generate guiding shift target or not. + Default: False. + with_centripetal_shift (bool): Generate centripetal shift target or + not. Default: False. + + Returns: + dict: Ground truth of corner heatmap, corner offset, corner + embedding, guiding shift and centripetal shift. Containing the + following keys: + + - topleft_heatmap (Tensor): Ground truth top-left corner + heatmap. + - bottomright_heatmap (Tensor): Ground truth bottom-right + corner heatmap. + - topleft_offset (Tensor): Ground truth top-left corner offset. + - bottomright_offset (Tensor): Ground truth bottom-right corner + offset. + - corner_embedding (list[list[list[int]]]): Ground truth corner + embedding. Not must have. + - topleft_guiding_shift (Tensor): Ground truth top-left corner + guiding shift. Not must have. + - bottomright_guiding_shift (Tensor): Ground truth bottom-right + corner guiding shift. Not must have. + - topleft_centripetal_shift (Tensor): Ground truth top-left + corner centripetal shift. Not must have. + - bottomright_centripetal_shift (Tensor): Ground truth + bottom-right corner centripetal shift. Not must have. 
+ """ + batch_size, _, height, width = feat_shape + img_h, img_w = img_shape[:2] + + width_ratio = float(width / img_w) + height_ratio = float(height / img_h) + + gt_tl_heatmap = gt_bboxes[-1].new_zeros( + [batch_size, self.num_classes, height, width]) + gt_br_heatmap = gt_bboxes[-1].new_zeros( + [batch_size, self.num_classes, height, width]) + gt_tl_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width]) + gt_br_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width]) + + if with_corner_emb: + match = [] + + # Guiding shift is a kind of offset, from center to corner + if with_guiding_shift: + gt_tl_guiding_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + gt_br_guiding_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + # Centripetal shift is also a kind of offset, from center to corner + # and normalized by log. + if with_centripetal_shift: + gt_tl_centripetal_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + gt_br_centripetal_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + + for batch_id in range(batch_size): + # Ground truth of corner embedding per image is a list of coord set + corner_match = [] + for box_id in range(len(gt_labels[batch_id])): + left, top, right, bottom = gt_bboxes[batch_id][box_id] + center_x = (left + right) / 2.0 + center_y = (top + bottom) / 2.0 + label = gt_labels[batch_id][box_id] + + # Use coords in the feature level to generate ground truth + scale_left = left * width_ratio + scale_right = right * width_ratio + scale_top = top * height_ratio + scale_bottom = bottom * height_ratio + scale_center_x = center_x * width_ratio + scale_center_y = center_y * height_ratio + + # Int coords on feature map/ground truth tensor + left_idx = int(min(scale_left, width - 1)) + right_idx = int(min(scale_right, width - 1)) + top_idx = int(min(scale_top, height - 1)) + bottom_idx = int(min(scale_bottom, height - 1)) + + # Generate gaussian heatmap + 
scale_box_width = ceil(scale_right - scale_left) + scale_box_height = ceil(scale_bottom - scale_top) + radius = gaussian_radius((scale_box_height, scale_box_width), + min_overlap=0.3) + radius = max(0, int(radius)) + gt_tl_heatmap[batch_id, label] = gen_gaussian_target( + gt_tl_heatmap[batch_id, label], [left_idx, top_idx], + radius) + gt_br_heatmap[batch_id, label] = gen_gaussian_target( + gt_br_heatmap[batch_id, label], [right_idx, bottom_idx], + radius) + + # Generate corner offset + left_offset = scale_left - left_idx + top_offset = scale_top - top_idx + right_offset = scale_right - right_idx + bottom_offset = scale_bottom - bottom_idx + gt_tl_offset[batch_id, 0, top_idx, left_idx] = left_offset + gt_tl_offset[batch_id, 1, top_idx, left_idx] = top_offset + gt_br_offset[batch_id, 0, bottom_idx, right_idx] = right_offset + gt_br_offset[batch_id, 1, bottom_idx, + right_idx] = bottom_offset + + # Generate corner embedding + if with_corner_emb: + corner_match.append([[top_idx, left_idx], + [bottom_idx, right_idx]]) + # Generate guiding shift + if with_guiding_shift: + gt_tl_guiding_shift[batch_id, 0, top_idx, + left_idx] = scale_center_x - left_idx + gt_tl_guiding_shift[batch_id, 1, top_idx, + left_idx] = scale_center_y - top_idx + gt_br_guiding_shift[batch_id, 0, bottom_idx, + right_idx] = right_idx - scale_center_x + gt_br_guiding_shift[ + batch_id, 1, bottom_idx, + right_idx] = bottom_idx - scale_center_y + # Generate centripetal shift + if with_centripetal_shift: + gt_tl_centripetal_shift[batch_id, 0, top_idx, + left_idx] = log(scale_center_x - + scale_left) + gt_tl_centripetal_shift[batch_id, 1, top_idx, + left_idx] = log(scale_center_y - + scale_top) + gt_br_centripetal_shift[batch_id, 0, bottom_idx, + right_idx] = log(scale_right - + scale_center_x) + gt_br_centripetal_shift[batch_id, 1, bottom_idx, + right_idx] = log(scale_bottom - + scale_center_y) + + if with_corner_emb: + match.append(corner_match) + + target_result = dict( + 
topleft_heatmap=gt_tl_heatmap, + topleft_offset=gt_tl_offset, + bottomright_heatmap=gt_br_heatmap, + bottomright_offset=gt_br_offset) + + if with_corner_emb: + target_result.update(corner_embedding=match) + if with_guiding_shift: + target_result.update( + topleft_guiding_shift=gt_tl_guiding_shift, + bottomright_guiding_shift=gt_br_guiding_shift) + if with_centripetal_shift: + target_result.update( + topleft_centripetal_shift=gt_tl_centripetal_shift, + bottomright_centripetal_shift=gt_br_centripetal_shift) + + return target_result + + def loss(self, + tl_heats, + br_heats, + tl_embs, + br_embs, + tl_offs, + br_offs, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_embs (list[Tensor]): Top-left corner embeddings for each level + with shape (N, corner_emb_channels, H, W). + br_embs (list[Tensor]): Bottom-right corner embeddings for each + level with shape (N, corner_emb_channels, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [left, top, right, bottom] format. + gt_labels (list[Tensor]): Class indices corresponding to each box. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor] | None): Specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. Containing the + following losses: + + - det_loss (list[Tensor]): Corner keypoint losses of all + feature levels. 
+ - pull_loss (list[Tensor]): Part one of AssociativeEmbedding + losses of all feature levels. + - push_loss (list[Tensor]): Part two of AssociativeEmbedding + losses of all feature levels. + - off_loss (list[Tensor]): Corner offset losses of all feature + levels. + """ + targets = self.get_targets( + gt_bboxes, + gt_labels, + tl_heats[-1].shape, + img_metas[0]['pad_shape'], + with_corner_emb=self.with_corner_emb) + mlvl_targets = [targets for _ in range(self.num_feat_levels)] + det_losses, pull_losses, push_losses, off_losses = multi_apply( + self.loss_single, tl_heats, br_heats, tl_embs, br_embs, tl_offs, + br_offs, mlvl_targets) + loss_dict = dict(det_loss=det_losses, off_loss=off_losses) + if self.with_corner_emb: + loss_dict.update(pull_loss=pull_losses, push_loss=push_losses) + return loss_dict + + def loss_single(self, tl_hmp, br_hmp, tl_emb, br_emb, tl_off, br_off, + targets): + """Compute losses for single level. + + Args: + tl_hmp (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). + br_hmp (Tensor): Bottom-right corner heatmap for current level with + shape (N, num_classes, H, W). + tl_emb (Tensor): Top-left corner embedding for current level with + shape (N, corner_emb_channels, H, W). + br_emb (Tensor): Bottom-right corner embedding for current level + with shape (N, corner_emb_channels, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + targets (dict): Corner target generated by `get_targets`. + + Returns: + tuple[torch.Tensor]: Losses of the head's differnet branches + containing the following losses: + + - det_loss (Tensor): Corner keypoint loss. + - pull_loss (Tensor): Part one of AssociativeEmbedding loss. + - push_loss (Tensor): Part two of AssociativeEmbedding loss. + - off_loss (Tensor): Corner offset loss. 
+ """ + gt_tl_hmp = targets['topleft_heatmap'] + gt_br_hmp = targets['bottomright_heatmap'] + gt_tl_off = targets['topleft_offset'] + gt_br_off = targets['bottomright_offset'] + gt_embedding = targets['corner_embedding'] + + # Detection loss + tl_det_loss = self.loss_heatmap( + tl_hmp.sigmoid(), + gt_tl_hmp, + avg_factor=max(1, + gt_tl_hmp.eq(1).sum())) + br_det_loss = self.loss_heatmap( + br_hmp.sigmoid(), + gt_br_hmp, + avg_factor=max(1, + gt_br_hmp.eq(1).sum())) + det_loss = (tl_det_loss + br_det_loss) / 2.0 + + # AssociativeEmbedding loss + if self.with_corner_emb and self.loss_embedding is not None: + pull_loss, push_loss = self.loss_embedding(tl_emb, br_emb, + gt_embedding) + else: + pull_loss, push_loss = None, None + + # Offset loss + # We only compute the offset loss at the real corner position. + # The value of real corner would be 1 in heatmap ground truth. + # The mask is computed in class agnostic mode and its shape is + # batch * 1 * width * height. + tl_off_mask = gt_tl_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_tl_hmp) + br_off_mask = gt_br_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_br_hmp) + tl_off_loss = self.loss_offset( + tl_off, + gt_tl_off, + tl_off_mask, + avg_factor=max(1, tl_off_mask.sum())) + br_off_loss = self.loss_offset( + br_off, + gt_br_off, + br_off_mask, + avg_factor=max(1, br_off_mask.sum())) + + off_loss = (tl_off_loss + br_off_loss) / 2.0 + + return det_loss, pull_loss, push_loss, off_loss + + def get_bboxes(self, + tl_heats, + br_heats, + tl_embs, + br_embs, + tl_offs, + br_offs, + img_metas, + rescale=False, + with_nms=True): + """Transform network output for a batch into bbox predictions. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). 
+ tl_embs (list[Tensor]): Top-left corner embeddings for each level + with shape (N, corner_emb_channels, H, W). + br_embs (list[Tensor]): Bottom-right corner embeddings for each + level with shape (N, corner_emb_channels, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + """ + assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len(img_metas) + result_list = [] + for img_id in range(len(img_metas)): + result_list.append( + self._get_bboxes_single( + tl_heats[-1][img_id:img_id + 1, :], + br_heats[-1][img_id:img_id + 1, :], + tl_offs[-1][img_id:img_id + 1, :], + br_offs[-1][img_id:img_id + 1, :], + img_metas[img_id], + tl_emb=tl_embs[-1][img_id:img_id + 1, :], + br_emb=br_embs[-1][img_id:img_id + 1, :], + rescale=rescale, + with_nms=with_nms)) + + return result_list + + def _get_bboxes_single(self, + tl_heat, + br_heat, + tl_off, + br_off, + img_meta, + tl_emb=None, + br_emb=None, + tl_centripetal_shift=None, + br_centripetal_shift=None, + rescale=False, + with_nms=True): + """Transform outputs for a single batch item into bbox predictions. + + Args: + tl_heat (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). + br_heat (Tensor): Bottom-right corner heatmap for current level + with shape (N, num_classes, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). 
+ img_meta (dict): Meta information of current image, e.g., + image size, scaling factor, etc. + tl_emb (Tensor): Top-left corner embedding for current level with + shape (N, corner_emb_channels, H, W). + br_emb (Tensor): Bottom-right corner embedding for current level + with shape (N, corner_emb_channels, H, W). + tl_centripetal_shift: Top-left corner's centripetal shift for + current level with shape (N, 2, H, W). + br_centripetal_shift: Bottom-right corner's centripetal shift for + current level with shape (N, 2, H, W). + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + """ + if isinstance(img_meta, (list, tuple)): + img_meta = img_meta[0] + + batch_bboxes, batch_scores, batch_clses = self.decode_heatmap( + tl_heat=tl_heat.sigmoid(), + br_heat=br_heat.sigmoid(), + tl_off=tl_off, + br_off=br_off, + tl_emb=tl_emb, + br_emb=br_emb, + tl_centripetal_shift=tl_centripetal_shift, + br_centripetal_shift=br_centripetal_shift, + img_meta=img_meta, + k=self.test_cfg.corner_topk, + kernel=self.test_cfg.local_maximum_kernel, + distance_threshold=self.test_cfg.distance_threshold) + + if rescale: + batch_bboxes /= img_meta['scale_factor'] + + bboxes = batch_bboxes.view([-1, 4]) + scores = batch_scores.view([-1, 1]) + clses = batch_clses.view([-1, 1]) + + idx = scores.argsort(dim=0, descending=True) + bboxes = bboxes[idx].view([-1, 4]) + scores = scores[idx].view(-1) + clses = clses[idx].view(-1) + + detections = torch.cat([bboxes, scores.unsqueeze(-1)], -1) + keepinds = (detections[:, -1] > -0.1) + detections = detections[keepinds] + labels = clses[keepinds] + + if with_nms: + detections, labels = self._bboxes_nms(detections, labels, + self.test_cfg) + + return detections, labels + + def _bboxes_nms(self, bboxes, labels, cfg): + out_bboxes, keep = batched_nms(bboxes[:, :4], bboxes[:, -1], labels, + cfg.nms_cfg) + out_labels = labels[keep] + + if len(out_bboxes) > 0: + idx 
= torch.argsort(out_bboxes[:, -1], descending=True) + idx = idx[:cfg.max_per_img] + out_bboxes = out_bboxes[idx] + out_labels = out_labels[idx] + + return out_bboxes, out_labels + + def _gather_feat(self, feat, ind, mask=None): + """Gather feature according to index. + + Args: + feat (Tensor): Target feature map. + ind (Tensor): Target coord index. + mask (Tensor | None): Mask of featuremap. Default: None. + + Returns: + feat (Tensor): Gathered feature. + """ + dim = feat.size(2) + ind = ind.unsqueeze(2).repeat(1, 1, dim) + feat = feat.gather(1, ind) + if mask is not None: + mask = mask.unsqueeze(2).expand_as(feat) + feat = feat[mask] + feat = feat.view(-1, dim) + return feat + + def _local_maximum(self, heat, kernel=3): + """Extract local maximum pixel with given kernal. + + Args: + heat (Tensor): Target heatmap. + kernel (int): Kernel size of max pooling. Default: 3. + + Returns: + heat (Tensor): A heatmap where local maximum pixels maintain its + own value and other positions are 0. + """ + pad = (kernel - 1) // 2 + hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) + keep = (hmax == heat).float() + return heat * keep + + def _transpose_and_gather_feat(self, feat, ind): + """Transpose and gather feature according to index. + + Args: + feat (Tensor): Target feature map. + ind (Tensor): Target coord index. + + Returns: + feat (Tensor): Transposed and gathered feature. + """ + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = self._gather_feat(feat, ind) + return feat + + def _topk(self, scores, k=20): + """Get top k positions from heatmap. + + Args: + scores (Tensor): Target heatmap with shape + [batch, num_classes, height, width]. + k (int): Target number. Default: 20. + + Returns: + tuple[torch.Tensor]: Scores, indexes, categories and coords of + topk keypoint. Containing following Tensors: + + - topk_scores (Tensor): Max scores of each topk keypoint. 
+ - topk_inds (Tensor): Indexes of each topk keypoint. + - topk_clses (Tensor): Categories of each topk keypoint. + - topk_ys (Tensor): Y-coord of each topk keypoint. + - topk_xs (Tensor): X-coord of each topk keypoint. + """ + batch, _, height, width = scores.size() + topk_scores, topk_inds = torch.topk(scores.view(batch, -1), k) + topk_clses = topk_inds // (height * width) + topk_inds = topk_inds % (height * width) + topk_ys = topk_inds // width + topk_xs = (topk_inds % width).int().float() + return topk_scores, topk_inds, topk_clses, topk_ys, topk_xs + + def decode_heatmap(self, + tl_heat, + br_heat, + tl_off, + br_off, + tl_emb=None, + br_emb=None, + tl_centripetal_shift=None, + br_centripetal_shift=None, + img_meta=None, + k=100, + kernel=3, + distance_threshold=0.5, + num_dets=1000): + """Transform outputs for a single batch item into raw bbox predictions. + + Args: + tl_heat (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). + br_heat (Tensor): Bottom-right corner heatmap for current level + with shape (N, num_classes, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + tl_emb (Tensor | None): Top-left corner embedding for current + level with shape (N, corner_emb_channels, H, W). + br_emb (Tensor | None): Bottom-right corner embedding for current + level with shape (N, corner_emb_channels, H, W). + tl_centripetal_shift (Tensor | None): Top-left centripetal shift + for current level with shape (N, 2, H, W). + br_centripetal_shift (Tensor | None): Bottom-right centripetal + shift for current level with shape (N, 2, H, W). + img_meta (dict): Meta information of current image, e.g., + image size, scaling factor, etc. + k (int): Get top k corner keypoints from heatmap. + kernel (int): Max pooling kernel for extract local maximum pixels. 
+ distance_threshold (float): Distance threshold. Top-left and + bottom-right corner keypoints with feature distance less than + the threshold will be regarded as keypoints from same object. + num_dets (int): Num of raw boxes before doing nms. + + Returns: + tuple[torch.Tensor]: Decoded output of CornerHead, containing the + following Tensors: + + - bboxes (Tensor): Coords of each box. + - scores (Tensor): Scores of each box. + - clses (Tensor): Categories of each box. + """ + with_embedding = tl_emb is not None and br_emb is not None + with_centripetal_shift = ( + tl_centripetal_shift is not None + and br_centripetal_shift is not None) + assert with_embedding + with_centripetal_shift == 1 + batch, _, height, width = tl_heat.size() + inp_h, inp_w, _ = img_meta['pad_shape'] + + # perform nms on heatmaps + tl_heat = self._local_maximum(tl_heat, kernel=kernel) + br_heat = self._local_maximum(br_heat, kernel=kernel) + + tl_scores, tl_inds, tl_clses, tl_ys, tl_xs = self._topk(tl_heat, k=k) + br_scores, br_inds, br_clses, br_ys, br_xs = self._topk(br_heat, k=k) + + # We use repeat instead of expand here because expand is a + # shallow-copy function. Thus it could cause unexpected testing result + # sometimes. Using expand will decrease about 10% mAP during testing + # compared to repeat. 
+ tl_ys = tl_ys.view(batch, k, 1).repeat(1, 1, k) + tl_xs = tl_xs.view(batch, k, 1).repeat(1, 1, k) + br_ys = br_ys.view(batch, 1, k).repeat(1, k, 1) + br_xs = br_xs.view(batch, 1, k).repeat(1, k, 1) + + tl_off = self._transpose_and_gather_feat(tl_off, tl_inds) + tl_off = tl_off.view(batch, k, 1, 2) + br_off = self._transpose_and_gather_feat(br_off, br_inds) + br_off = br_off.view(batch, 1, k, 2) + + tl_xs = tl_xs + tl_off[..., 0] + tl_ys = tl_ys + tl_off[..., 1] + br_xs = br_xs + br_off[..., 0] + br_ys = br_ys + br_off[..., 1] + + if with_centripetal_shift: + tl_centripetal_shift = self._transpose_and_gather_feat( + tl_centripetal_shift, tl_inds).view(batch, k, 1, 2).exp() + br_centripetal_shift = self._transpose_and_gather_feat( + br_centripetal_shift, br_inds).view(batch, 1, k, 2).exp() + + tl_ctxs = tl_xs + tl_centripetal_shift[..., 0] + tl_ctys = tl_ys + tl_centripetal_shift[..., 1] + br_ctxs = br_xs - br_centripetal_shift[..., 0] + br_ctys = br_ys - br_centripetal_shift[..., 1] + + # all possible boxes based on top k corners (ignoring class) + tl_xs *= (inp_w / width) + tl_ys *= (inp_h / height) + br_xs *= (inp_w / width) + br_ys *= (inp_h / height) + + if with_centripetal_shift: + tl_ctxs *= (inp_w / width) + tl_ctys *= (inp_h / height) + br_ctxs *= (inp_w / width) + br_ctys *= (inp_h / height) + + x_off = img_meta['border'][2] + y_off = img_meta['border'][0] + + tl_xs -= x_off + tl_ys -= y_off + br_xs -= x_off + br_ys -= y_off + + tl_xs *= tl_xs.gt(0.0).type_as(tl_xs) + tl_ys *= tl_ys.gt(0.0).type_as(tl_ys) + br_xs *= br_xs.gt(0.0).type_as(br_xs) + br_ys *= br_ys.gt(0.0).type_as(br_ys) + + bboxes = torch.stack((tl_xs, tl_ys, br_xs, br_ys), dim=3) + area_bboxes = ((br_xs - tl_xs) * (br_ys - tl_ys)).abs() + + if with_centripetal_shift: + tl_ctxs -= x_off + tl_ctys -= y_off + br_ctxs -= x_off + br_ctys -= y_off + + tl_ctxs *= tl_ctxs.gt(0.0).type_as(tl_ctxs) + tl_ctys *= tl_ctys.gt(0.0).type_as(tl_ctys) + br_ctxs *= br_ctxs.gt(0.0).type_as(br_ctxs) + br_ctys 
*= br_ctys.gt(0.0).type_as(br_ctys) + + ct_bboxes = torch.stack((tl_ctxs, tl_ctys, br_ctxs, br_ctys), + dim=3) + area_ct_bboxes = ((br_ctxs - tl_ctxs) * (br_ctys - tl_ctys)).abs() + + rcentral = torch.zeros_like(ct_bboxes) + # magic nums from paper section 4.1 + mu = torch.ones_like(area_bboxes) / 2.4 + mu[area_bboxes > 3500] = 1 / 2.1 # large bbox have smaller mu + + bboxes_center_x = (bboxes[..., 0] + bboxes[..., 2]) / 2 + bboxes_center_y = (bboxes[..., 1] + bboxes[..., 3]) / 2 + rcentral[..., 0] = bboxes_center_x - mu * (bboxes[..., 2] - + bboxes[..., 0]) / 2 + rcentral[..., 1] = bboxes_center_y - mu * (bboxes[..., 3] - + bboxes[..., 1]) / 2 + rcentral[..., 2] = bboxes_center_x + mu * (bboxes[..., 2] - + bboxes[..., 0]) / 2 + rcentral[..., 3] = bboxes_center_y + mu * (bboxes[..., 3] - + bboxes[..., 1]) / 2 + area_rcentral = ((rcentral[..., 2] - rcentral[..., 0]) * + (rcentral[..., 3] - rcentral[..., 1])).abs() + dists = area_ct_bboxes / area_rcentral + + tl_ctx_inds = (ct_bboxes[..., 0] <= rcentral[..., 0]) | ( + ct_bboxes[..., 0] >= rcentral[..., 2]) + tl_cty_inds = (ct_bboxes[..., 1] <= rcentral[..., 1]) | ( + ct_bboxes[..., 1] >= rcentral[..., 3]) + br_ctx_inds = (ct_bboxes[..., 2] <= rcentral[..., 0]) | ( + ct_bboxes[..., 2] >= rcentral[..., 2]) + br_cty_inds = (ct_bboxes[..., 3] <= rcentral[..., 1]) | ( + ct_bboxes[..., 3] >= rcentral[..., 3]) + + if with_embedding: + tl_emb = self._transpose_and_gather_feat(tl_emb, tl_inds) + tl_emb = tl_emb.view(batch, k, 1) + br_emb = self._transpose_and_gather_feat(br_emb, br_inds) + br_emb = br_emb.view(batch, 1, k) + dists = torch.abs(tl_emb - br_emb) + + tl_scores = tl_scores.view(batch, k, 1).repeat(1, 1, k) + br_scores = br_scores.view(batch, 1, k).repeat(1, k, 1) + + scores = (tl_scores + br_scores) / 2 # scores for all possible boxes + + # tl and br should have same class + tl_clses = tl_clses.view(batch, k, 1).repeat(1, 1, k) + br_clses = br_clses.view(batch, 1, k).repeat(1, k, 1) + cls_inds = (tl_clses != 
br_clses) + + # reject boxes based on distances + dist_inds = dists > distance_threshold + + # reject boxes based on widths and heights + width_inds = (br_xs <= tl_xs) + height_inds = (br_ys <= tl_ys) + + scores[cls_inds] = -1 + scores[width_inds] = -1 + scores[height_inds] = -1 + scores[dist_inds] = -1 + if with_centripetal_shift: + scores[tl_ctx_inds] = -1 + scores[tl_cty_inds] = -1 + scores[br_ctx_inds] = -1 + scores[br_cty_inds] = -1 + + scores = scores.view(batch, -1) + scores, inds = torch.topk(scores, num_dets) + scores = scores.unsqueeze(2) + + bboxes = bboxes.view(batch, -1, 4) + bboxes = self._gather_feat(bboxes, inds) + + clses = tl_clses.contiguous().view(batch, -1, 1) + clses = self._gather_feat(clses, inds).float() + + return bboxes, scores, clses diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/dense_test_mixins.py b/insightface/detection/scrfd/mmdet/models/dense_heads/dense_test_mixins.py new file mode 100755 index 0000000000000000000000000000000000000000..a07c9d4236a1f1f823cb3d659ea1f04c64524745 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/dense_test_mixins.py @@ -0,0 +1,97 @@ +from inspect import signature + +import torch + +from mmdet.core import bbox2result, bbox_mapping_back, multiclass_nms + + +class BBoxTestMixin(object): + """Mixin class for test time augmentation of bboxes.""" + + def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). 
+ + Returns: + tuple: (bboxes, scores) + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + flip_direction = img_info[0]['flip_direction'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip, + flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.cat(recovered_bboxes, dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.cat(aug_scores, dim=0) + return bboxes, scores + + def aug_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes with test time augmentation. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. 
+ + Returns: + list[ndarray]: bbox results of each class + """ + # check with_nms argument + gb_sig = signature(self.get_bboxes) + gb_args = [p.name for p in gb_sig.parameters.values()] + gbs_sig = signature(self._get_bboxes_single) + gbs_args = [p.name for p in gbs_sig.parameters.values()] + assert ('with_nms' in gb_args) and ('with_nms' in gbs_args), \ + f'{self.__class__.__name__}' \ + ' does not support test-time augmentation' + + aug_bboxes = [] + aug_scores = [] + aug_factors = [] # score_factors for NMS + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + outs = self.forward(x) + bbox_inputs = outs + (img_meta, self.test_cfg, False, False) + bbox_outputs = self.get_bboxes(*bbox_inputs)[0] + aug_bboxes.append(bbox_outputs[0]) + aug_scores.append(bbox_outputs[1]) + # bbox_outputs of some detectors (e.g., ATSS, FCOS, YOLOv3) + # contains additional element to adjust scores before NMS + if len(bbox_outputs) >= 3: + aug_factors.append(bbox_outputs[2]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = self.merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas) + merged_factors = torch.cat(aug_factors, dim=0) if aug_factors else None + det_bboxes, det_labels = multiclass_nms( + merged_bboxes, + merged_scores, + self.test_cfg.score_thr, + self.test_cfg.nms, + self.test_cfg.max_per_img, + score_factors=merged_factors) + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= det_bboxes.new_tensor( + img_metas[0][0]['scale_factor']) + bbox_results = bbox2result(_det_bboxes, det_labels, self.num_classes) + return bbox_results diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/fcos_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/fcos_head.py new file mode 100755 index 0000000000000000000000000000000000000000..51639c984c2c44a29b41655fd55030722b6736d7 --- /dev/null +++ 
b/insightface/detection/scrfd/mmdet/models/dense_heads/fcos_head.py @@ -0,0 +1,574 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Scale, normal_init +from mmcv.runner import force_fp32 + +from mmdet.core import distance2bbox, multi_apply, multiclass_nms +from ..builder import HEADS, build_loss +from .anchor_free_head import AnchorFreeHead + +INF = 1e8 + + +@HEADS.register_module() +class FCOSHead(AnchorFreeHead): + """Anchor-free head used in `FCOS `_. + + The FCOS head does not use anchor boxes. Instead bounding boxes are + predicted at each pixel and a centerness measure is used to supress + low-quality predictions. + Here norm_on_bbox, centerness_on_reg, dcn_on_last_conv are training + tricks used in official repo, which will bring remarkable mAP gains + of up to 4.9. Please see https://github.com/tianzhi0549/FCOS for + more detail. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + strides (list[int] | list[tuple[int, int]]): Strides of points + in multiple feature levels. Default: (4, 8, 16, 32, 64). + regress_ranges (tuple[tuple[int, int]]): Regress range of multiple + level points. + center_sampling (bool): If true, use center sampling. Default: False. + center_sample_radius (float): Radius of center sampling. Default: 1.5. + norm_on_bbox (bool): If true, normalize the regression targets + with FPN strides. Default: False. + centerness_on_reg (bool): If true, position centerness on the + regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. + Default: False. + conv_bias (bool | str): If specified as `auto`, it will be decided by the + norm_cfg. Bias of conv will be set as True if `norm_cfg` is None, otherwise + False. Default: "auto". + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. 
+ loss_centerness (dict): Config of centerness loss. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). + + Example: + >>> self = FCOSHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_score, bbox_pred, centerness = self.forward(feats) + >>> assert len(cls_score) == len(self.scales) + """ # noqa: E501 + + def __init__(self, + num_classes, + in_channels, + regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, INF)), + center_sampling=False, + center_sample_radius=1.5, + norm_on_bbox=False, + centerness_on_reg=False, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + **kwargs): + self.regress_ranges = regress_ranges + self.center_sampling = center_sampling + self.center_sample_radius = center_sample_radius + self.norm_on_bbox = norm_on_bbox + self.centerness_on_reg = centerness_on_reg + super().__init__( + num_classes, + in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + norm_cfg=norm_cfg, + **kwargs) + self.loss_centerness = build_loss(loss_centerness) + + def _init_layers(self): + """Initialize layers of the head.""" + super()._init_layers() + self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1) + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def init_weights(self): + """Initialize weights of the head.""" + super().init_weights() + normal_init(self.conv_centerness, std=0.01) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
+ + Returns: + tuple: + cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is \ + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each \ + scale level, each is a 4D-tensor, the channel number is \ + num_points * 4. + centernesses (list[Tensor]): Centerss for each scale level, \ + each is a 4D-tensor, the channel number is num_points * 1. + """ + return multi_apply(self.forward_single, feats, self.scales, + self.strides) + + def forward_single(self, x, scale, stride): + """Forward features of a single scale levle. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox predictions and centerness \ + predictions of input feature maps. + """ + cls_score, bbox_pred, cls_feat, reg_feat = super().forward_single(x) + if self.centerness_on_reg: + centerness = self.conv_centerness(reg_feat) + else: + centerness = self.conv_centerness(cls_feat) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + if self.norm_on_bbox: + bbox_pred = F.relu(bbox_pred) + if not self.training: + bbox_pred *= stride + else: + bbox_pred = bbox_pred.exp() + return cls_score, bbox_pred, centerness + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'centernesses')) + def loss(self, + cls_scores, + bbox_preds, + centernesses, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. 
+ bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + centernesses (list[Tensor]): Centerss for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) == len(centernesses) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + labels, bbox_targets = self.get_targets(all_level_points, gt_bboxes, + gt_labels) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and centerness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_centerness = torch.cat(flatten_centerness) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels >= 0) + & 
(flatten_labels < bg_class_ind)).nonzero().reshape(-1) + num_pos = len(pos_inds) + loss_cls = self.loss_cls( + flatten_cls_scores, flatten_labels, + avg_factor=num_pos + num_imgs) # avoid num_pos is 0 + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + + if num_pos > 0: + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_centerness_targets = self.centerness_target(pos_bbox_targets) + pos_points = flatten_points[pos_inds] + pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds) + pos_decoded_target_preds = distance2bbox(pos_points, + pos_bbox_targets) + # centerness weighted iou loss + loss_bbox = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + weight=pos_centerness_targets, + avg_factor=pos_centerness_targets.sum()) + loss_centerness = self.loss_centerness(pos_centerness, + pos_centerness_targets) + else: + loss_bbox = pos_bbox_preds.sum() + loss_centerness = pos_centerness.sum() + + return dict( + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_centerness=loss_centerness) + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'centernesses')) + def get_bboxes(self, + cls_scores, + bbox_preds, + centernesses, + img_metas, + cfg=None, + rescale=False, + with_nms=True): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + with shape (N, num_points * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W). + centernesses (list[Tensor]): Centerness for each scale level with + shape (N, num_points * 1, H, W). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used. Default: None. + rescale (bool): If True, return boxes in original image space. + Default: False. 
+ with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class label of the + corresponding box. + """ + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + centerness_pred_list = [ + centernesses[i][img_id].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + det_bboxes = self._get_bboxes_single( + cls_score_list, bbox_pred_list, centerness_pred_list, + mlvl_points, img_shape, scale_factor, cfg, rescale, with_nms) + result_list.append(det_bboxes) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + centernesses, + mlvl_points, + img_shape, + scale_factor, + cfg, + rescale=False, + with_nms=True): + """Transform outputs for a single batch item into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for a single scale level + with shape (num_points * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for a single scale + level with shape (num_points * 4, H, W). + centernesses (list[Tensor]): Centerness for a single scale level + with shape (num_points * 4, H, W). + mlvl_points (list[Tensor]): Box reference for a single scale level + with shape (num_total_points, 4). 
+ img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arrange as + (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + tuple(Tensor): + det_bboxes (Tensor): BBox predictions in shape (n, 5), where + the first 4 columns are bounding box positions + (tl_x, tl_y, br_x, br_y) and the 5-th column is a score + between 0 and 1. + det_labels (Tensor): A (n,) tensor where each item is the + predicted class label of the corresponding box. + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_centerness = [] + for cls_score, bbox_pred, centerness, points in zip( + cls_scores, bbox_preds, centernesses, mlvl_points): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() + + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + max_scores, _ = (scores * centerness[:, None]).max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + points = points[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + centerness = centerness[topk_inds] + bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_centerness.append(centerness) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 
1) + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + mlvl_centerness = torch.cat(mlvl_centerness) + + if with_nms: + det_bboxes, det_labels = multiclass_nms( + mlvl_bboxes, + mlvl_scores, + cfg.score_thr, + cfg.nms, + cfg.max_per_img, + score_factors=mlvl_centerness) + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores, mlvl_centerness + + def _get_points_single(self, + featmap_size, + stride, + dtype, + device, + flatten=False): + """Get points according to feature map sizes.""" + y, x = super()._get_points_single(featmap_size, stride, dtype, device) + points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride), + dim=-1) + stride // 2 + return points + + def get_targets(self, points, gt_bboxes_list, gt_labels_list): + """Compute regression, classification and centerss targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + + Returns: + tuple: + concat_lvl_labels (list[Tensor]): Labels of each level. \ + concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + level. 
+ """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + # get labels and bbox_targets of each image + labels_list, bbox_targets_list = multi_apply( + self._get_target_single, + gt_bboxes_list, + gt_labels_list, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + labels_list = [labels.split(num_points, 0) for labels in labels_list] + bbox_targets_list = [ + bbox_targets.split(num_points, 0) + for bbox_targets in bbox_targets_list + ] + + # concat per level image + concat_lvl_labels = [] + concat_lvl_bbox_targets = [] + for i in range(num_levels): + concat_lvl_labels.append( + torch.cat([labels[i] for labels in labels_list])) + bbox_targets = torch.cat( + [bbox_targets[i] for bbox_targets in bbox_targets_list]) + if self.norm_on_bbox: + bbox_targets = bbox_targets / self.strides[i] + concat_lvl_bbox_targets.append(bbox_targets) + return concat_lvl_labels, concat_lvl_bbox_targets + + def _get_target_single(self, gt_bboxes, gt_labels, points, regress_ranges, + num_points_per_lvl): + """Compute regression and classification targets for a single image.""" + num_points = points.size(0) + num_gts = gt_labels.size(0) + if num_gts == 0: + return gt_labels.new_full((num_points,), self.num_classes), \ + gt_bboxes.new_zeros((num_points, 4)) + + areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + # TODO: figure out why these two are different + # areas = areas[None].expand(num_points, num_gts) + areas = 
areas[None].repeat(num_points, 1) + regress_ranges = regress_ranges[:, None, :].expand( + num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None].expand(num_points, num_gts) + ys = ys[:, None].expand(num_points, num_gts) + + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + + if self.center_sampling: + # condition1: inside a `center bbox` + radius = self.center_sample_radius + center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2 + center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2 + center_gts = torch.zeros_like(gt_bboxes) + stride = center_xs.new_zeros(center_xs.shape) + + # project the points on current lvl back to the `original` sizes + lvl_begin = 0 + for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): + lvl_end = lvl_begin + num_points_lvl + stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius + lvl_begin = lvl_end + + x_mins = center_xs - stride + y_mins = center_ys - stride + x_maxs = center_xs + stride + y_maxs = center_ys + stride + center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0], + x_mins, gt_bboxes[..., 0]) + center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1], + y_mins, gt_bboxes[..., 1]) + center_gts[..., 2] = torch.where(x_maxs > gt_bboxes[..., 2], + gt_bboxes[..., 2], x_maxs) + center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3], + gt_bboxes[..., 3], y_maxs) + + cb_dist_left = xs - center_gts[..., 0] + cb_dist_right = center_gts[..., 2] - xs + cb_dist_top = ys - center_gts[..., 1] + cb_dist_bottom = center_gts[..., 3] - ys + center_bbox = torch.stack( + (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) + inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 + else: + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0 + + # condition2: 
limit the regression range for each location + max_regress_distance = bbox_targets.max(-1)[0] + inside_regress_range = ( + (max_regress_distance >= regress_ranges[..., 0]) + & (max_regress_distance <= regress_ranges[..., 1])) + + # if there are still more than one objects for a location, + # we choose the one with minimal area + areas[inside_gt_bbox_mask == 0] = INF + areas[inside_regress_range == 0] = INF + min_area, min_area_inds = areas.min(dim=1) + + labels = gt_labels[min_area_inds] + labels[min_area == INF] = self.num_classes # set as BG + bbox_targets = bbox_targets[range(num_points), min_area_inds] + + return labels, bbox_targets + + def centerness_target(self, pos_bbox_targets): + """Compute centerness targets. + + Args: + pos_bbox_targets (Tensor): BBox targets of positive bboxes in shape + (num_pos, 4) + + Returns: + Tensor: Centerness target. + """ + # only calculate pos centerness targets, otherwise there may be nan + left_right = pos_bbox_targets[:, [0, 2]] + top_bottom = pos_bbox_targets[:, [1, 3]] + centerness_targets = ( + left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * ( + top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) + return torch.sqrt(centerness_targets) diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/fovea_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/fovea_head.py new file mode 100755 index 0000000000000000000000000000000000000000..c8ccea787cba3d092284d4a5e209adaf6521c86a --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/fovea_head.py @@ -0,0 +1,341 @@ +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, normal_init +from mmcv.ops import DeformConv2d + +from mmdet.core import multi_apply, multiclass_nms +from ..builder import HEADS +from .anchor_free_head import AnchorFreeHead + +INF = 1e8 + + +class FeatureAlign(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + deform_groups=4): + super(FeatureAlign, 
self).__init__() + offset_channels = kernel_size * kernel_size * 2 + self.conv_offset = nn.Conv2d( + 4, deform_groups * offset_channels, 1, bias=False) + self.conv_adaption = DeformConv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + deform_groups=deform_groups) + self.relu = nn.ReLU(inplace=True) + + def init_weights(self): + normal_init(self.conv_offset, std=0.1) + normal_init(self.conv_adaption, std=0.01) + + def forward(self, x, shape): + offset = self.conv_offset(shape) + x = self.relu(self.conv_adaption(x, offset)) + return x + + +@HEADS.register_module() +class FoveaHead(AnchorFreeHead): + """FoveaBox: Beyond Anchor-based Object Detector + https://arxiv.org/abs/1904.03797 + """ + + def __init__(self, + num_classes, + in_channels, + base_edge_list=(16, 32, 64, 128, 256), + scale_ranges=((8, 32), (16, 64), (32, 128), (64, 256), (128, + 512)), + sigma=0.4, + with_deform=False, + deform_groups=4, + **kwargs): + self.base_edge_list = base_edge_list + self.scale_ranges = scale_ranges + self.sigma = sigma + self.with_deform = with_deform + self.deform_groups = deform_groups + super().__init__(num_classes, in_channels, **kwargs) + + def _init_layers(self): + # box branch + super()._init_reg_convs() + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + # cls branch + if not self.with_deform: + super()._init_cls_convs() + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + else: + self.cls_convs = nn.ModuleList() + self.cls_convs.append( + ConvModule( + self.feat_channels, (self.feat_channels * 4), + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + self.cls_convs.append( + ConvModule((self.feat_channels * 4), (self.feat_channels * 4), + 1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + self.feature_adaption = FeatureAlign( + self.feat_channels, 
+ self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.conv_cls = nn.Conv2d( + int(self.feat_channels * 4), + self.cls_out_channels, + 3, + padding=1) + + def init_weights(self): + super().init_weights() + if self.with_deform: + self.feature_adaption.init_weights() + + def forward_single(self, x): + cls_feat = x + reg_feat = x + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = self.conv_reg(reg_feat) + if self.with_deform: + cls_feat = self.feature_adaption(cls_feat, bbox_pred.exp()) + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.conv_cls(cls_feat) + return cls_score, bbox_pred + + def _get_points_single(self, *args, **kwargs): + y, x = super()._get_points_single(*args, **kwargs) + return y + 0.5, x + 0.5 + + def loss(self, + cls_scores, + bbox_preds, + gt_bbox_list, + gt_label_list, + img_metas, + gt_bboxes_ignore=None): + assert len(cls_scores) == len(bbox_preds) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + num_imgs = cls_scores[0].size(0) + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_labels, flatten_bbox_targets = self.get_targets( + gt_bbox_list, gt_label_list, featmap_sizes, points) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((flatten_labels >= 0) + & (flatten_labels < self.num_classes)).nonzero().view(-1) + num_pos = len(pos_inds) + + loss_cls = self.loss_cls( + flatten_cls_scores, flatten_labels, avg_factor=num_pos + num_imgs) + if num_pos > 0: + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_bbox_targets = 
flatten_bbox_targets[pos_inds] + pos_weights = pos_bbox_targets.new_zeros( + pos_bbox_targets.size()) + 1.0 + loss_bbox = self.loss_bbox( + pos_bbox_preds, + pos_bbox_targets, + pos_weights, + avg_factor=num_pos) + else: + loss_bbox = torch.tensor( + 0, + dtype=flatten_bbox_preds.dtype, + device=flatten_bbox_preds.device) + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox) + + def get_targets(self, gt_bbox_list, gt_label_list, featmap_sizes, points): + label_list, bbox_target_list = multi_apply( + self._get_target_single, + gt_bbox_list, + gt_label_list, + featmap_size_list=featmap_sizes, + point_list=points) + flatten_labels = [ + torch.cat([ + labels_level_img.flatten() for labels_level_img in labels_level + ]) for labels_level in zip(*label_list) + ] + flatten_bbox_targets = [ + torch.cat([ + bbox_targets_level_img.reshape(-1, 4) + for bbox_targets_level_img in bbox_targets_level + ]) for bbox_targets_level in zip(*bbox_target_list) + ] + flatten_labels = torch.cat(flatten_labels) + flatten_bbox_targets = torch.cat(flatten_bbox_targets) + return flatten_labels, flatten_bbox_targets + + def _get_target_single(self, + gt_bboxes_raw, + gt_labels_raw, + featmap_size_list=None, + point_list=None): + + gt_areas = torch.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) * + (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1])) + label_list = [] + bbox_target_list = [] + # for each pyramid, find the cls and box target + for base_len, (lower_bound, upper_bound), stride, featmap_size, \ + (y, x) in zip(self.base_edge_list, self.scale_ranges, + self.strides, featmap_size_list, point_list): + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + labels = gt_labels_raw.new_zeros(featmap_size) + self.num_classes + bbox_targets = gt_bboxes_raw.new(featmap_size[0], featmap_size[1], + 4) + 1 + # scale assignment + hit_indices = ((gt_areas >= lower_bound) & + (gt_areas <= upper_bound)).nonzero().flatten() + if len(hit_indices) == 0: + label_list.append(labels) + 
bbox_target_list.append(torch.log(bbox_targets)) + continue + _, hit_index_order = torch.sort(-gt_areas[hit_indices]) + hit_indices = hit_indices[hit_index_order] + gt_bboxes = gt_bboxes_raw[hit_indices, :] / stride + gt_labels = gt_labels_raw[hit_indices] + half_w = 0.5 * (gt_bboxes[:, 2] - gt_bboxes[:, 0]) + half_h = 0.5 * (gt_bboxes[:, 3] - gt_bboxes[:, 1]) + # valid fovea area: left, right, top, down + pos_left = torch.ceil( + gt_bboxes[:, 0] + (1 - self.sigma) * half_w - 0.5).long().\ + clamp(0, featmap_size[1] - 1) + pos_right = torch.floor( + gt_bboxes[:, 0] + (1 + self.sigma) * half_w - 0.5).long().\ + clamp(0, featmap_size[1] - 1) + pos_top = torch.ceil( + gt_bboxes[:, 1] + (1 - self.sigma) * half_h - 0.5).long().\ + clamp(0, featmap_size[0] - 1) + pos_down = torch.floor( + gt_bboxes[:, 1] + (1 + self.sigma) * half_h - 0.5).long().\ + clamp(0, featmap_size[0] - 1) + for px1, py1, px2, py2, label, (gt_x1, gt_y1, gt_x2, gt_y2) in \ + zip(pos_left, pos_top, pos_right, pos_down, gt_labels, + gt_bboxes_raw[hit_indices, :]): + labels[py1:py2 + 1, px1:px2 + 1] = label + bbox_targets[py1:py2 + 1, px1:px2 + 1, 0] = \ + (stride * x[py1:py2 + 1, px1:px2 + 1] - gt_x1) / base_len + bbox_targets[py1:py2 + 1, px1:px2 + 1, 1] = \ + (stride * y[py1:py2 + 1, px1:px2 + 1] - gt_y1) / base_len + bbox_targets[py1:py2 + 1, px1:px2 + 1, 2] = \ + (gt_x2 - stride * x[py1:py2 + 1, px1:px2 + 1]) / base_len + bbox_targets[py1:py2 + 1, px1:px2 + 1, 3] = \ + (gt_y2 - stride * y[py1:py2 + 1, px1:px2 + 1]) / base_len + bbox_targets = bbox_targets.clamp(min=1. / 16, max=16.) 
+ label_list.append(labels) + bbox_target_list.append(torch.log(bbox_targets)) + return label_list, bbox_target_list + + def get_bboxes(self, + cls_scores, + bbox_preds, + img_metas, + cfg=None, + rescale=None): + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + points = self.get_points( + featmap_sizes, + bbox_preds[0].dtype, + bbox_preds[0].device, + flatten=True) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + det_bboxes = self._get_bboxes_single(cls_score_list, + bbox_pred_list, featmap_sizes, + points, img_shape, + scale_factor, cfg, rescale) + result_list.append(det_bboxes) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + featmap_sizes, + point_list, + img_shape, + scale_factor, + cfg, + rescale=False): + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(point_list) + det_bboxes = [] + det_scores = [] + for cls_score, bbox_pred, featmap_size, stride, base_len, (y, x) \ + in zip(cls_scores, bbox_preds, featmap_sizes, self.strides, + self.base_edge_list, point_list): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4).exp() + nms_pre = cfg.get('nms_pre', -1) + if (nms_pre > 0) and (scores.shape[0] > nms_pre): + max_scores, _ = scores.max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + y = y[topk_inds] + x = x[topk_inds] + x1 = (stride * x - base_len * bbox_pred[:, 0]).\ + clamp(min=0, max=img_shape[1] - 
1) + y1 = (stride * y - base_len * bbox_pred[:, 1]).\ + clamp(min=0, max=img_shape[0] - 1) + x2 = (stride * x + base_len * bbox_pred[:, 2]).\ + clamp(min=0, max=img_shape[1] - 1) + y2 = (stride * y + base_len * bbox_pred[:, 3]).\ + clamp(min=0, max=img_shape[0] - 1) + bboxes = torch.stack([x1, y1, x2, y2], -1) + det_bboxes.append(bboxes) + det_scores.append(scores) + det_bboxes = torch.cat(det_bboxes) + if rescale: + det_bboxes /= det_bboxes.new_tensor(scale_factor) + det_scores = torch.cat(det_scores) + padding = det_scores.new_zeros(det_scores.shape[0], 1) + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + det_scores = torch.cat([det_scores, padding], dim=1) + det_bboxes, det_labels = multiclass_nms(det_bboxes, det_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/free_anchor_retina_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/free_anchor_retina_head.py new file mode 100755 index 0000000000000000000000000000000000000000..79879fdc3171b8e34b606b27eb1ceb67f4473e3e --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/free_anchor_retina_head.py @@ -0,0 +1,270 @@ +import torch +import torch.nn.functional as F + +from mmdet.core import bbox_overlaps +from ..builder import HEADS +from .retina_head import RetinaHead + +EPS = 1e-12 + + +@HEADS.register_module() +class FreeAnchorRetinaHead(RetinaHead): + """FreeAnchor RetinaHead used in https://arxiv.org/abs/1909.02466. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Default: 4. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: norm_cfg=dict(type='GN', num_groups=32, + requires_grad=True). + pre_anchor_topk (int): Number of boxes that be token in each bag. + bbox_thr (float): The threshold of the saturated linear function. It is + usually the same with the IoU threshold used in NMS. + gamma (float): Gamma parameter in focal loss. + alpha (float): Alpha parameter in focal loss. + """ # noqa: W605 + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + conv_cfg=None, + norm_cfg=None, + pre_anchor_topk=50, + bbox_thr=0.6, + gamma=2.0, + alpha=0.5, + **kwargs): + super(FreeAnchorRetinaHead, + self).__init__(num_classes, in_channels, stacked_convs, conv_cfg, + norm_cfg, **kwargs) + + self.pre_anchor_topk = pre_anchor_topk + self.bbox_thr = bbox_thr + self.gamma = gamma + self.alpha = alpha + + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + gt_bboxes (list[Tensor]): each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == len(self.anchor_generator.base_anchors) + + anchor_list, _ = self.get_anchors(featmap_sizes, img_metas) + anchors = [torch.cat(anchor) for anchor in anchor_list] + + # concatenate each level + cls_scores = [ + cls.permute(0, 2, 3, + 1).reshape(cls.size(0), -1, self.cls_out_channels) + for cls in cls_scores + ] + bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(bbox_pred.size(0), -1, 4) + for bbox_pred in bbox_preds + ] + cls_scores = torch.cat(cls_scores, dim=1) + bbox_preds = torch.cat(bbox_preds, dim=1) + + cls_prob = torch.sigmoid(cls_scores) + box_prob = [] + num_pos = 0 + positive_losses = [] + for _, (anchors_, gt_labels_, gt_bboxes_, cls_prob_, + bbox_preds_) in enumerate( + zip(anchors, gt_labels, gt_bboxes, cls_prob, bbox_preds)): + + with torch.no_grad(): + if len(gt_bboxes_) == 0: + image_box_prob = torch.zeros( + anchors_.size(0), + self.cls_out_channels).type_as(bbox_preds_) + else: + # box_localization: a_{j}^{loc}, shape: [j, 4] + pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_) + + # object_box_iou: IoU_{ij}^{loc}, shape: [i, j] + object_box_iou = bbox_overlaps(gt_bboxes_, pred_boxes) + + # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j] + t1 = self.bbox_thr + t2 = object_box_iou.max( + dim=1, keepdim=True).values.clamp(min=t1 + 1e-12) + object_box_prob = ((object_box_iou - t1) / + (t2 - t1)).clamp( + min=0, max=1) + + # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j] + num_obj = gt_labels_.size(0) + indices = torch.stack([ + torch.arange(num_obj).type_as(gt_labels_), gt_labels_ + ], + dim=0) + object_cls_box_prob = torch.sparse_coo_tensor( + indices, object_box_prob) + + # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j] + """ + from "start" to "end" implement: + image_box_iou = torch.sparse.max(object_cls_box_prob, + dim=0).t() + + """ + # start + box_cls_prob = torch.sparse.sum( + object_cls_box_prob, dim=0).to_dense() 
+ + indices = torch.nonzero(box_cls_prob, as_tuple=False).t_() + if indices.numel() == 0: + image_box_prob = torch.zeros( + anchors_.size(0), + self.cls_out_channels).type_as(object_box_prob) + else: + nonzero_box_prob = torch.where( + (gt_labels_.unsqueeze(dim=-1) == indices[0]), + object_box_prob[:, indices[1]], + torch.tensor([ + 0 + ]).type_as(object_box_prob)).max(dim=0).values + + # upmap to shape [j, c] + image_box_prob = torch.sparse_coo_tensor( + indices.flip([0]), + nonzero_box_prob, + size=(anchors_.size(0), + self.cls_out_channels)).to_dense() + # end + + box_prob.append(image_box_prob) + + # construct bags for objects + match_quality_matrix = bbox_overlaps(gt_bboxes_, anchors_) + _, matched = torch.topk( + match_quality_matrix, + self.pre_anchor_topk, + dim=1, + sorted=False) + del match_quality_matrix + + # matched_cls_prob: P_{ij}^{cls} + matched_cls_prob = torch.gather( + cls_prob_[matched], 2, + gt_labels_.view(-1, 1, 1).repeat(1, self.pre_anchor_topk, + 1)).squeeze(2) + + # matched_box_prob: P_{ij}^{loc} + matched_anchors = anchors_[matched] + matched_object_targets = self.bbox_coder.encode( + matched_anchors, + gt_bboxes_.unsqueeze(dim=1).expand_as(matched_anchors)) + loss_bbox = self.loss_bbox( + bbox_preds_[matched], + matched_object_targets, + reduction_override='none').sum(-1) + matched_box_prob = torch.exp(-loss_bbox) + + # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )} + num_pos += len(gt_bboxes_) + positive_losses.append( + self.positive_bag_loss(matched_cls_prob, matched_box_prob)) + positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos) + + # box_prob: P{a_{j} \in A_{+}} + box_prob = torch.stack(box_prob, dim=0) + + # negative_loss: + # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B|| + negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max( + 1, num_pos * self.pre_anchor_topk) + + # avoid the absence of gradients in regression subnet + # when no ground-truth in a batch + 
if num_pos == 0: + positive_loss = bbox_preds.sum() * 0 + + losses = { + 'positive_bag_loss': positive_loss, + 'negative_bag_loss': negative_loss + } + return losses + + def positive_bag_loss(self, matched_cls_prob, matched_box_prob): + """Compute positive bag loss. + + :math:`-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )`. + + :math:`P_{ij}^{cls}`: matched_cls_prob, classification probability of matched samples. + + :math:`P_{ij}^{loc}`: matched_box_prob, box probability of matched samples. + + Args: + matched_cls_prob (Tensor): Classification probabilty of matched + samples in shape (num_gt, pre_anchor_topk). + matched_box_prob (Tensor): BBox probability of matched samples, + in shape (num_gt, pre_anchor_topk). + + Returns: + Tensor: Positive bag loss in shape (num_gt,). + """ # noqa: E501, W605 + # bag_prob = Mean-max(matched_prob) + matched_prob = matched_cls_prob * matched_box_prob + weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None) + weight /= weight.sum(dim=1).unsqueeze(dim=-1) + bag_prob = (weight * matched_prob).sum(dim=1) + # positive_bag_loss = -self.alpha * log(bag_prob) + return self.alpha * F.binary_cross_entropy( + bag_prob, torch.ones_like(bag_prob), reduction='none') + + def negative_bag_loss(self, cls_prob, box_prob): + """Compute negative bag loss. + + :math:`FL((1 - P_{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}))`. + + :math:`P_{a_{j} \in A_{+}}`: Box_probability of matched samples. + + :math:`P_{j}^{bg}`: Classification probability of negative samples. + + Args: + cls_prob (Tensor): Classification probability, in shape + (num_img, num_anchors, num_classes). + box_prob (Tensor): Box probability, in shape + (num_img, num_anchors, num_classes). + + Returns: + Tensor: Negative bag loss in shape (num_img, num_anchors, num_classes). + """ # noqa: E501, W605 + prob = cls_prob * (1 - box_prob) + # There are some cases when neg_prob = 0. + # This will cause the neg_prob.log() to be inf without clamp. 
+ prob = prob.clamp(min=EPS, max=1 - EPS) + negative_bag_loss = prob**self.gamma * F.binary_cross_entropy( + prob, torch.zeros_like(prob), reduction='none') + return (1 - self.alpha) * negative_bag_loss diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/fsaf_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/fsaf_head.py new file mode 100755 index 0000000000000000000000000000000000000000..c23e3699f76892989bb06bba1fb25cd43c39da12 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/fsaf_head.py @@ -0,0 +1,418 @@ +import numpy as np +import torch +from mmcv.cnn import normal_init +from mmcv.runner import force_fp32 + +from mmdet.core import (anchor_inside_flags, images_to_levels, multi_apply, + unmap) +from ..builder import HEADS +from ..losses.accuracy import accuracy +from ..losses.utils import weight_reduce_loss +from .retina_head import RetinaHead + + +@HEADS.register_module() +class FSAFHead(RetinaHead): + """Anchor-free head used in `FSAF `_. + + The head contains two subnetworks. The first classifies anchor boxes and + the second regresses deltas for the anchors (num_anchors is 1 for anchor- + free methods) + + Args: + *args: Same as its base class in :class:`RetinaHead` + score_threshold (float, optional): The score_threshold to calculate + positive recall. If given, prediction scores lower than this value + is counted as incorrect prediction. Default to None. 
+ **kwargs: Same as its base class in :class:`RetinaHead` + + Example: + >>> import torch + >>> self = FSAFHead(11, 7) + >>> x = torch.rand(1, 7, 32, 32) + >>> cls_score, bbox_pred = self.forward_single(x) + >>> # Each anchor predicts a score for each class except background + >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors + >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors + >>> assert cls_per_anchor == self.num_classes + >>> assert box_per_anchor == 4 + """ + + def __init__(self, *args, score_threshold=None, **kwargs): + super().__init__(*args, **kwargs) + self.score_threshold = score_threshold + + def forward_single(self, x): + """Forward feature map of a single scale level. + + Args: + x (Tensor): Feature map of a single scale level. + + Returns: + tuple (Tensor): + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_points * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W). + """ + cls_score, bbox_pred = super().forward_single(x) + # relu: TBLR encoder only accepts positive bbox_pred + return cls_score, self.relu(bbox_pred) + + def init_weights(self): + """Initialize weights of the head.""" + super(FSAFHead, self).init_weights() + # The positive bias in self.retina_reg conv is to prevent predicted \ + # bbox with 0 area + normal_init(self.retina_reg, std=0.01, bias=0.25) + + def _get_targets_single(self, + flat_anchors, + valid_flags, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + img_meta, + label_channels=1, + unmap_outputs=True): + """Compute regression and classification targets for anchors in a + single image. + + Most of the codes are the same with the base class + :obj: `AnchorHead`, except that it also collects and returns + the matched gt index in the image (from 0 to num_gt-1). If the + anchor bbox is not matched to any gt, the corresponding value in + pos_gt_inds is -1. 
+ """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg.allowed_border) + if not inside_flags.any(): + return (None, ) * 7 + # Assign gt and sample anchors + anchors = flat_anchors[inside_flags.type(torch.bool), :] + assign_result = self.assigner.assign( + anchors, gt_bboxes, gt_bboxes_ignore, + None if self.sampling else gt_labels) + + sampling_result = self.sampler.sample(assign_result, anchors, + gt_bboxes) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros((num_valid_anchors, label_channels), + dtype=torch.float) + pos_gt_inds = anchors.new_full((num_valid_anchors, ), + -1, + dtype=torch.long) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + if len(pos_inds) > 0: + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + else: + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + # The assigned gt_index for each anchor. (0-based) + pos_gt_inds[pos_inds] = sampling_result.pos_assigned_gt_inds + if gt_labels is None: + # Only rpn gives gt_labels as None + # Foreground is the first class + labels[pos_inds] = 0 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # shadowed_labels is a tensor composed of tuples + # (anchor_inds, class_label) that indicate those anchors lying in the + # outer region of a gt or overlapped by another gt with a smaller + # area. 
+ # + # Therefore, only the shadowed labels are ignored for loss calculation. + # the key `shadowed_labels` is defined in :obj:`CenterRegionAssigner` + shadowed_labels = assign_result.get_extra_property('shadowed_labels') + if shadowed_labels is not None and shadowed_labels.numel(): + if len(shadowed_labels.shape) == 2: + idx_, label_ = shadowed_labels[:, 0], shadowed_labels[:, 1] + assert (labels[idx_] != label_).all(), \ + 'One label cannot be both positive and ignored' + label_weights[idx_, label_] = 0 + else: + label_weights[shadowed_labels] = 0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + labels = unmap(labels, num_total_anchors, inside_flags) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + pos_gt_inds = unmap( + pos_gt_inds, num_total_anchors, inside_flags, fill=-1) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds, sampling_result, pos_gt_inds) + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_points * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W). + gt_bboxes (list[Tensor]): each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + for i in range(len(bbox_preds)): # loop over fpn level + # avoid 0 area of the predicted bbox + bbox_preds[i] = bbox_preds[i].clamp(min=1e-4) + # TODO: It may directly use the base-class loss function. + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + batch_size = len(gt_bboxes) + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg, + pos_assigned_gt_inds_list) = cls_reg_targets + + num_gts = np.array(list(map(len, gt_labels))) + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + losses_cls, losses_bbox = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_samples=num_total_samples) + + # `pos_assigned_gt_inds_list` (length: fpn_levels) stores the assigned + # gt index of each anchor bbox in each fpn level. 
+ cum_num_gts = list(np.cumsum(num_gts)) # length of batch_size + for i, assign in enumerate(pos_assigned_gt_inds_list): + # loop over fpn levels + for j in range(1, batch_size): + # loop over batch size + # Convert gt indices in each img to those in the batch + assign[j][assign[j] >= 0] += int(cum_num_gts[j - 1]) + pos_assigned_gt_inds_list[i] = assign.flatten() + labels_list[i] = labels_list[i].flatten() + num_gts = sum(map(len, gt_labels)) # total number of gt in the batch + # The unique label index of each gt in the batch + label_sequence = torch.arange(num_gts, device=device) + # Collect the average loss of each gt in each level + with torch.no_grad(): + loss_levels, = multi_apply( + self.collect_loss_level_single, + losses_cls, + losses_bbox, + pos_assigned_gt_inds_list, + labels_seq=label_sequence) + # Shape: (fpn_levels, num_gts). Loss of each gt at each fpn level + loss_levels = torch.stack(loss_levels, dim=0) + # Locate the best fpn level for loss back-propagation + if loss_levels.numel() == 0: # zero gt + argmin = loss_levels.new_empty((num_gts, ), dtype=torch.long) + else: + _, argmin = loss_levels.min(dim=0) + + # Reweight the loss of each (anchor, label) pair, so that only those + # at the best gt level are back-propagated. 
+ losses_cls, losses_bbox, pos_inds = multi_apply( + self.reweight_loss_single, + losses_cls, + losses_bbox, + pos_assigned_gt_inds_list, + labels_list, + list(range(len(losses_cls))), + min_levels=argmin) + num_pos = torch.cat(pos_inds, 0).sum().float() + pos_recall = self.calculate_pos_recall(cls_scores, labels_list, + pos_inds) + + if num_pos == 0: # No gt + avg_factor = num_pos + float(num_total_neg) + else: + avg_factor = num_pos + for i in range(len(losses_cls)): + losses_cls[i] /= avg_factor + losses_bbox[i] /= avg_factor + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + num_pos=num_pos / batch_size, + pos_recall=pos_recall) + + def calculate_pos_recall(self, cls_scores, labels_list, pos_inds): + """Calculate positive recall with score threshold. + + Args: + cls_scores (list[Tensor]): Classification scores at all fpn levels. + Each tensor is in shape (N, num_classes * num_anchors, H, W) + labels_list (list[Tensor]): The label that each anchor is assigned + to. Shape (N * H * W * num_anchors, ) + pos_inds (list[Tensor]): List of bool tensors indicating whether + the anchor is assigned to a positive label. + Shape (N * H * W * num_anchors, ) + + Returns: + Tensor: A single float number indicating the positive recall. + """ + with torch.no_grad(): + num_class = self.num_classes + scores = [ + cls.permute(0, 2, 3, 1).reshape(-1, num_class)[pos] + for cls, pos in zip(cls_scores, pos_inds) + ] + labels = [ + label.reshape(-1)[pos] + for label, pos in zip(labels_list, pos_inds) + ] + scores = torch.cat(scores, dim=0) + labels = torch.cat(labels, dim=0) + if self.use_sigmoid_cls: + scores = scores.sigmoid() + else: + scores = scores.softmax(dim=1) + + return accuracy(scores, labels, thresh=self.score_threshold) + + def collect_loss_level_single(self, cls_loss, reg_loss, assigned_gt_inds, + labels_seq): + """Get the average loss in each FPN level w.r.t. each gt label. 
+ + Args: + cls_loss (Tensor): Classification loss of each feature map pixel, + shape (num_anchor, num_class) + reg_loss (Tensor): Regression loss of each feature map pixel, + shape (num_anchor, 4) + assigned_gt_inds (Tensor): It indicates which gt the prior is + assigned to (0-based, -1: no assignment). shape (num_anchor), + labels_seq: The rank of labels. shape (num_gt) + + Returns: + shape: (num_gt), average loss of each gt in this level + """ + if len(reg_loss.shape) == 2: # iou loss has shape (num_prior, 4) + reg_loss = reg_loss.sum(dim=-1) # sum loss in tblr dims + if len(cls_loss.shape) == 2: + cls_loss = cls_loss.sum(dim=-1) # sum loss in class dims + loss = cls_loss + reg_loss + assert loss.size(0) == assigned_gt_inds.size(0) + # Default loss value is 1e6 for a layer where no anchor is positive + # to ensure it will not be chosen to back-propagate gradient + losses_ = loss.new_full(labels_seq.shape, 1e6) + for i, l in enumerate(labels_seq): + match = assigned_gt_inds == l + if match.any(): + losses_[i] = loss[match].mean() + return losses_, + + def reweight_loss_single(self, cls_loss, reg_loss, assigned_gt_inds, + labels, level, min_levels): + """Reweight loss values at each level. + + Reassign loss values at each level by masking those where the + pre-calculated loss is too large. Then return the reduced losses. + + Args: + cls_loss (Tensor): Element-wise classification loss. + Shape: (num_anchors, num_classes) + reg_loss (Tensor): Element-wise regression loss. + Shape: (num_anchors, 4) + assigned_gt_inds (Tensor): The gt indices that each anchor bbox + is assigned to. -1 denotes a negative anchor, otherwise it is the + gt index (0-based). Shape: (num_anchors, ), + labels (Tensor): Label assigned to anchors. Shape: (num_anchors, ). + level (int): The current level index in the pyramid + (0-4 for RetinaNet) + min_levels (Tensor): The best-matching level for each gt. 
+ Shape: (num_gts, ), + + Returns: + tuple: + - cls_loss: Reduced corrected classification loss. Scalar. + - reg_loss: Reduced corrected regression loss. Scalar. + - pos_flags (Tensor): Corrected bool tensor indicating the + final postive anchors. Shape: (num_anchors, ). + """ + loc_weight = torch.ones_like(reg_loss) + cls_weight = torch.ones_like(cls_loss) + pos_flags = assigned_gt_inds >= 0 # positive pixel flag + pos_indices = torch.nonzero(pos_flags, as_tuple=False).flatten() + + if pos_flags.any(): # pos pixels exist + pos_assigned_gt_inds = assigned_gt_inds[pos_flags] + zeroing_indices = (min_levels[pos_assigned_gt_inds] != level) + neg_indices = pos_indices[zeroing_indices] + + if neg_indices.numel(): + pos_flags[neg_indices] = 0 + loc_weight[neg_indices] = 0 + # Only the weight corresponding to the label is + # zeroed out if not selected + zeroing_labels = labels[neg_indices] + assert (zeroing_labels >= 0).all() + cls_weight[neg_indices, zeroing_labels] = 0 + + # Weighted loss for both cls and reg loss + cls_loss = weight_reduce_loss(cls_loss, cls_weight, reduction='sum') + reg_loss = weight_reduce_loss(reg_loss, loc_weight, reduction='sum') + + return cls_loss, reg_loss, pos_flags diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/ga_retina_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/ga_retina_head.py new file mode 100755 index 0000000000000000000000000000000000000000..8822d1ca78ee2fa2f304a0649e81274830383533 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/ga_retina_head.py @@ -0,0 +1,109 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init +from mmcv.ops import MaskedConv2d + +from ..builder import HEADS +from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead + + +@HEADS.register_module() +class GARetinaHead(GuidedAnchorHead): + """Guided-Anchor-based RetinaNet head.""" + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + 
conv_cfg=None, + norm_cfg=None, + **kwargs): + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super(GARetinaHead, self).__init__(num_classes, in_channels, **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + + self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1) + self.conv_shape = nn.Conv2d(self.feat_channels, self.num_anchors * 2, + 1) + self.feature_adaption_cls = FeatureAdaption( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.feature_adaption_reg = FeatureAdaption( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.retina_cls = MaskedConv2d( + self.feat_channels, + self.num_anchors * self.cls_out_channels, + 3, + padding=1) + self.retina_reg = MaskedConv2d( + self.feat_channels, self.num_anchors * 4, 3, padding=1) + + def init_weights(self): + """Initialize weights of the layer.""" + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + + self.feature_adaption_cls.init_weights() + self.feature_adaption_reg.init_weights() + + bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_loc, std=0.01, bias=bias_cls) + normal_init(self.conv_shape, std=0.01) + normal_init(self.retina_cls, std=0.01, bias=bias_cls) + normal_init(self.retina_reg, std=0.01) + + def forward_single(self, x): + """Forward feature map of a single scale 
level.""" + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + + loc_pred = self.conv_loc(cls_feat) + shape_pred = self.conv_shape(reg_feat) + + cls_feat = self.feature_adaption_cls(cls_feat, shape_pred) + reg_feat = self.feature_adaption_reg(reg_feat, shape_pred) + + if not self.training: + mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr + else: + mask = None + cls_score = self.retina_cls(cls_feat, mask) + bbox_pred = self.retina_reg(reg_feat, mask) + return cls_score, bbox_pred, shape_pred, loc_pred diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/ga_rpn_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/ga_rpn_head.py new file mode 100755 index 0000000000000000000000000000000000000000..d3c3a84b24ee1057198f3c3c581d5887608ff48e --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/ga_rpn_head.py @@ -0,0 +1,133 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import normal_init +from mmcv.ops import nms + +from ..builder import HEADS +from .guided_anchor_head import GuidedAnchorHead +from .rpn_test_mixin import RPNTestMixin + + +@HEADS.register_module() +class GARPNHead(RPNTestMixin, GuidedAnchorHead): + """Guided-Anchor-based RPN head.""" + + def __init__(self, in_channels, **kwargs): + super(GARPNHead, self).__init__(1, in_channels, **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.rpn_conv = nn.Conv2d( + self.in_channels, self.feat_channels, 3, padding=1) + super(GARPNHead, self)._init_layers() + + def init_weights(self): + """Initialize weights of the head.""" + normal_init(self.rpn_conv, std=0.01) + super(GARPNHead, self).init_weights() + + def forward_single(self, x): + """Forward feature of a single scale level.""" + + x = self.rpn_conv(x) + x = F.relu(x, inplace=True) + (cls_score, bbox_pred, shape_pred, + loc_pred) = 
super(GARPNHead, self).forward_single(x) + return cls_score, bbox_pred, shape_pred, loc_pred + + def loss(self, + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + gt_bboxes, + img_metas, + gt_bboxes_ignore=None): + losses = super(GARPNHead, self).loss( + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + gt_bboxes, + None, + img_metas, + gt_bboxes_ignore=gt_bboxes_ignore) + return dict( + loss_rpn_cls=losses['loss_cls'], + loss_rpn_bbox=losses['loss_bbox'], + loss_anchor_shape=losses['loss_shape'], + loss_anchor_loc=losses['loss_loc']) + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + mlvl_masks, + img_shape, + scale_factor, + cfg, + rescale=False): + cfg = self.test_cfg if cfg is None else cfg + mlvl_proposals = [] + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + anchors = mlvl_anchors[idx] + mask = mlvl_masks[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + rpn_cls_score = rpn_cls_score.permute(1, 2, 0) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(-1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(-1, 2) + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = rpn_cls_score.softmax(dim=1)[:, :-1] + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, + 4)[mask, :] + if scores.dim() == 0: + rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0) + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. 
scores + if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre: + _, topk_inds = scores.topk(cfg.nms_pre) + rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] + anchors = anchors[topk_inds, :] + scores = scores[topk_inds] + # get proposals w.r.t. anchors and rpn_bbox_pred + proposals = self.bbox_coder.decode( + anchors, rpn_bbox_pred, max_shape=img_shape) + # filter out too small bboxes + if cfg.min_bbox_size > 0: + w = proposals[:, 2] - proposals[:, 0] + h = proposals[:, 3] - proposals[:, 1] + valid_inds = torch.nonzero( + (w >= cfg.min_bbox_size) & (h >= cfg.min_bbox_size), + as_tuple=False).squeeze() + proposals = proposals[valid_inds, :] + scores = scores[valid_inds] + # NMS in current level + proposals, _ = nms(proposals, scores, cfg.nms_thr) + proposals = proposals[:cfg.nms_post, :] + mlvl_proposals.append(proposals) + proposals = torch.cat(mlvl_proposals, 0) + if cfg.nms_across_levels: + # NMS across multi levels + proposals, _ = nms(proposals[:, :4], proposals[:, -1], cfg.nms_thr) + proposals = proposals[:cfg.max_num, :] + else: + scores = proposals[:, 4] + num = min(cfg.max_num, proposals.shape[0]) + _, topk_inds = scores.topk(num) + proposals = proposals[topk_inds, :] + return proposals diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/gfl_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/gfl_head.py new file mode 100755 index 0000000000000000000000000000000000000000..ab7a69abc2d4e37748f4cff671cf2ec5d1360e38 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/gfl_head.py @@ -0,0 +1,638 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, Scale, bias_init_with_prob, normal_init +from mmcv.runner import force_fp32 + +from mmdet.core import (anchor_inside_flags, bbox2distance, bbox_overlaps, + build_assigner, build_sampler, distance2bbox, + images_to_levels, multi_apply, multiclass_nms, + reduce_mean, unmap) +from ..builder import HEADS, build_loss +from 
.anchor_head import AnchorHead + + +class Integral(nn.Module): + """A fixed layer for calculating integral result from distribution. + + This layer calculates the target location by :math: `sum{P(y_i) * y_i}`, + P(y_i) denotes the softmax vector that represents the discrete distribution + y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max} + + Args: + reg_max (int): The maximal value of the discrete set. Default: 16. You + may want to reset it according to your new dataset or related + settings. + """ + + def __init__(self, reg_max=16): + super(Integral, self).__init__() + self.reg_max = reg_max + self.register_buffer('project', + torch.linspace(0, self.reg_max, self.reg_max + 1)) + + def forward(self, x): + """Forward feature from the regression head to get integral result of + bounding box location. + + Args: + x (Tensor): Features of the regression head, shape (N, 4*(n+1)), + n is self.reg_max. + + Returns: + x (Tensor): Integral result of box locations, i.e., distance + offsets from the box center in four directions, shape (N, 4). + """ + x = F.softmax(x.reshape(-1, self.reg_max + 1), dim=1) + x = F.linear(x, self.project.type_as(x)).reshape(-1, 4) + return x + + +@HEADS.register_module() +class GFLHead(AnchorHead): + """Generalized Focal Loss: Learning Qualified and Distributed Bounding + Boxes for Dense Object Detection. + + GFL head structure is similar with ATSS, however GFL uses + 1) joint representation for classification and localization quality, and + 2) flexible General distribution for bounding box locations, + which are supervised by + Quality Focal Loss (QFL) and Distribution Focal Loss (DFL), respectively + + https://arxiv.org/abs/2006.04388 + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Default: 4. + conv_cfg (dict): dictionary to construct and config conv layer. 
+ Default: None. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='GN', num_groups=32, requires_grad=True). + loss_qfl (dict): Config of Quality Focal Loss (QFL). + reg_max (int): Max value of integral set :math: `{0, ..., reg_max}` + in QFL setting. Default: 16. + Example: + >>> self = GFLHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_quality_score, bbox_pred = self.forward(feats) + >>> assert len(cls_quality_score) == len(self.scales) + """ + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + conv_cfg=None, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + reg_max=16, + **kwargs): + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.reg_max = reg_max + super(GFLHead, self).__init__(num_classes, in_channels, **kwargs) + + self.sampling = False + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # SSD sampling=False so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + + self.integral = Integral(self.reg_max) + self.loss_dfl = build_loss(loss_dfl) + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + assert self.num_anchors == 1, 'anchor free version' + self.gfl_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) 
+ self.gfl_reg = nn.Conv2d( + self.feat_channels, 4 * (self.reg_max + 1), 3, padding=1) + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.anchor_generator.strides]) + + def init_weights(self): + """Initialize weights of the head.""" + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.gfl_cls, std=0.01, bias=bias_cls) + normal_init(self.gfl_reg, std=0.01) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + cls_scores (list[Tensor]): Classification and quality (IoU) + joint scores for all scale levels, each is a 4D-tensor, + the channel number is num_classes. + bbox_preds (list[Tensor]): Box distribution logits for all + scale levels, each is a 4D-tensor, the channel number is + 4*(n+1), n is max value of integral set. + """ + return multi_apply(self.forward_single, feats, self.scales) + + def forward_single(self, x, scale): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + + Returns: + tuple: + cls_score (Tensor): Cls and quality joint scores for a single + scale level the channel number is num_classes. + bbox_pred (Tensor): Box distribution logits for a single scale + level, the channel number is 4*(n+1), n is max value of + integral set. 
+ """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.gfl_cls(cls_feat) + bbox_pred = scale(self.gfl_reg(reg_feat)).float() + return cls_score, bbox_pred + + def anchor_center(self, anchors): + """Get anchor centers from anchors. + + Args: + anchors (Tensor): Anchor list with shape (N, 4), "xyxy" format. + + Returns: + Tensor: Anchor centers with shape (N, 2), "xy" format. + """ + anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 + anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 + return torch.stack([anchors_cx, anchors_cy], dim=-1) + + def loss_single(self, anchors, cls_score, bbox_pred, labels, label_weights, + bbox_targets, stride, num_total_samples): + """Compute loss of a single scale level. + + Args: + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + cls_score (Tensor): Cls and quality joint scores for each scale + level has shape (N, num_classes, H, W). + bbox_pred (Tensor): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor wight + shape (N, num_total_anchors, 4). + stride (tuple): Stride in this scale level. + num_total_samples (int): Number of positive samples that is + reduced over all GPUs. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert stride[0] == stride[1], 'h stride is not equal to w stride!' 
+ anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, 4 * (self.reg_max + 1)) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + score = label_weights.new_zeros(labels.shape) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0] + + weight_targets = cls_score.detach().sigmoid() + weight_targets = weight_targets.max(dim=1)[0][pos_inds] + pos_bbox_pred_corners = self.integral(pos_bbox_pred) + pos_decode_bbox_pred = distance2bbox(pos_anchor_centers, + pos_bbox_pred_corners) + pos_decode_bbox_targets = pos_bbox_targets / stride[0] + score[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1) + target_corners = bbox2distance(pos_anchor_centers, + pos_decode_bbox_targets, + self.reg_max).reshape(-1) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=weight_targets, + avg_factor=1.0) + + # dfl loss + loss_dfl = self.loss_dfl( + pred_corners, + target_corners, + weight=weight_targets[:, None].expand(-1, 4).reshape(-1), + avg_factor=4.0) + else: + loss_bbox = bbox_pred.sum() * 0 + loss_dfl = bbox_pred.sum() * 0 + weight_targets = torch.tensor(0).cuda() + + # cls (qfl) loss + loss_cls = self.loss_cls( + cls_score, (labels, score), + weight=label_weights, + avg_factor=num_total_samples) + + return loss_cls, loss_bbox, loss_dfl, weight_targets.sum() + + @force_fp32(apply_to=('cls_scores', 
'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Cls and quality scores for each scale + level has shape (N, num_classes, H, W). + bbox_preds (list[Tensor]): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor] | None): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels) + if cls_reg_targets is None: + return None + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets + + num_total_samples = reduce_mean( + torch.tensor(num_total_pos, dtype=torch.float, + device=device)).item() + num_total_samples = max(num_total_samples, 1.0) + + losses_cls, losses_bbox, losses_dfl,\ + avg_factor = multi_apply( + self.loss_single, + anchor_list, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + self.anchor_generator.strides, 
+ num_total_samples=num_total_samples) + + avg_factor = sum(avg_factor) + avg_factor = reduce_mean(avg_factor).item() + losses_bbox = list(map(lambda x: x / avg_factor, losses_bbox)) + losses_dfl = list(map(lambda x: x / avg_factor, losses_dfl)) + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dfl=losses_dfl) + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + img_shape, + scale_factor, + cfg, + rescale=False, + with_nms=True): + """Transform outputs for a single batch item into labeled boxes. + + Args: + cls_scores (list[Tensor]): Box scores for a single scale level + has shape (num_classes, H, W). + bbox_preds (list[Tensor]): Box distribution logits for a single + scale level with shape (4*(n+1), H, W), n is max value of + integral set. + mlvl_anchors (list[Tensor]): Box reference for a single scale level + with shape (num_total_anchors, 4). + img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arange as + (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + tuple(Tensor): + det_bboxes (Tensor): Bbox predictions in shape (N, 5), where + the first 4 columns are bounding box positions + (tl_x, tl_y, br_x, br_y) and the 5-th column is a score + between 0 and 1. + det_labels (Tensor): A (N,) tensor where each item is the + predicted class label of the corresponding box. 
+ """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + for cls_score, bbox_pred, stride, anchors in zip( + cls_scores, bbox_preds, self.anchor_generator.strides, + mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + assert stride[0] == stride[1] + + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + bbox_pred = bbox_pred.permute(1, 2, 0) + bbox_pred = self.integral(bbox_pred) * stride[0] + + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + max_scores, _ = scores.max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + + bboxes = distance2bbox( + self.anchor_center(anchors), bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + + mlvl_scores = torch.cat(mlvl_scores) + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + + if with_nms: + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores + + def get_targets(self, + anchor_list, + valid_flag_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + unmap_outputs=True): + """Get targets for GFL head. + + This method is almost the same as `AnchorHead.get_targets()`. 
Besides + returning the targets as the parent method does, it also returns the + anchors as the first element of the returned tuple. + """ + num_imgs = len(img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + img_metas, + label_channels=label_channels, + unmap_outputs=unmap_outputs) + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. 
multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, num_total_pos, + num_total_neg) + + def _get_target_single(self, + flat_anchors, + valid_flags, + num_level_anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + img_meta, + label_channels=1, + unmap_outputs=True): + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors, 4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + num_level_anchors Tensor): Number of anchors of each scale level. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + img_meta (dict): Meta info of the image. + label_channels (int): Channel of label. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: N is the number of total anchors in the image. + anchors (Tensor): All anchors in the image with shape (N, 4). + labels (Tensor): Labels of all anchors in the image with shape + (N,). + label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). 
+ bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4). + pos_inds (Tensor): Indices of postive anchor with shape + (num_pos,). + neg_inds (Tensor): Indices of negative anchor with shape + (num_neg,). + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg.allowed_border) + if not inside_flags.any(): + return (None, ) * 7 + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + num_level_anchors_inside = self.get_num_level_anchors_inside( + num_level_anchors, inside_flags) + #print('NNN:', self.assigner.__class__.__name__) + if self.assigner.__class__.__name__=='ATSSAssigner': + assign_result = self.assigner.assign(anchors, num_level_anchors_inside, + gt_bboxes, gt_bboxes_ignore, + gt_labels) + else: + assign_result = self.assigner.assign(anchors, + gt_bboxes, gt_bboxes_ignore, + gt_labels) + + sampling_result = self.sampler.sample(assign_result, anchors, + gt_bboxes) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + if gt_labels is None: + # Only rpn gives gt_labels as None + # Foreground is the first class + labels[pos_inds] = 0 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = 
flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def get_num_level_anchors_inside(self, num_level_anchors, inside_flags): + split_inside_flags = torch.split(inside_flags, num_level_anchors) + num_level_anchors_inside = [ + int(flags.sum()) for flags in split_inside_flags + ] + return num_level_anchors_inside diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/guided_anchor_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/guided_anchor_head.py new file mode 100755 index 0000000000000000000000000000000000000000..062df486495acdaf01160a5ed7514c1e77f28741 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/guided_anchor_head.py @@ -0,0 +1,855 @@ +import torch +import torch.nn as nn +from mmcv.cnn import bias_init_with_prob, normal_init +from mmcv.ops import DeformConv2d, MaskedConv2d +from mmcv.runner import force_fp32 + +from mmdet.core import (anchor_inside_flags, build_anchor_generator, + build_assigner, build_bbox_coder, build_sampler, + calc_region, images_to_levels, multi_apply, + multiclass_nms, unmap) +from ..builder import HEADS, build_loss +from .anchor_head import AnchorHead + + +class FeatureAdaption(nn.Module): + """Feature Adaption Module. + + Feature Adaption Module is implemented based on DCN v1. + It uses anchor shape prediction rather than feature map to + predict offsets of deform conv layer. + + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels in the output feature map. + kernel_size (int): Deformable conv kernel size. 
+ deform_groups (int): Deformable conv group size. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + deform_groups=4): + super(FeatureAdaption, self).__init__() + offset_channels = kernel_size * kernel_size * 2 + self.conv_offset = nn.Conv2d( + 2, deform_groups * offset_channels, 1, bias=False) + self.conv_adaption = DeformConv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + deform_groups=deform_groups) + self.relu = nn.ReLU(inplace=True) + + def init_weights(self): + normal_init(self.conv_offset, std=0.1) + normal_init(self.conv_adaption, std=0.01) + + def forward(self, x, shape): + offset = self.conv_offset(shape.detach()) + x = self.relu(self.conv_adaption(x, offset)) + return x + + +@HEADS.register_module() +class GuidedAnchorHead(AnchorHead): + """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.). + + This GuidedAnchorHead will predict high-quality feature guided + anchors and locations where anchors will be kept in inference. + There are mainly 3 categories of bounding-boxes. + + - Sampled 9 pairs for target assignment. (approxes) + - The square boxes where the predicted anchors are based on. (squares) + - Guided anchors. + + Please refer to https://arxiv.org/abs/1901.03278 for more details. + + Args: + num_classes (int): Number of classes. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. + approx_anchor_generator (dict): Config dict for approx generator + square_anchor_generator (dict): Config dict for square generator + anchor_coder (dict): Config dict for anchor coder + bbox_coder (dict): Config dict for bbox coder + deform_groups: (int): Group number of DCN in + FeatureAdaption module. + loc_filter_thr (float): Threshold to filter out unconcerned regions. + loss_loc (dict): Config of location loss. + loss_shape (dict): Config of anchor shape loss. + loss_cls (dict): Config of classification loss. 
+ loss_bbox (dict): Config of bbox regression loss. + """ + + def __init__( + self, + num_classes, + in_channels, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0] + ), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0] + ), + reg_decoded_bbox=False, + deform_groups=4, + loc_filter_thr=0.01, + train_cfg=None, + test_cfg=None, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)): # yapf: disable + super(AnchorHead, self).__init__() + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.deform_groups = deform_groups + self.loc_filter_thr = loc_filter_thr + + # build approx_anchor_generator and square_anchor_generator + assert (approx_anchor_generator['octave_base_scale'] == + square_anchor_generator['scales'][0]) + assert (approx_anchor_generator['strides'] == + square_anchor_generator['strides']) + self.approx_anchor_generator = build_anchor_generator( + approx_anchor_generator) + self.square_anchor_generator = build_anchor_generator( + square_anchor_generator) + self.approxs_per_octave = self.approx_anchor_generator \ + .num_base_anchors[0] + + self.reg_decoded_bbox = reg_decoded_bbox + + # one anchor per location + self.num_anchors = 1 + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + self.loc_focal_loss = 
loss_loc['type'] in ['FocalLoss'] + self.sampling = loss_cls['type'] not in ['FocalLoss'] + self.ga_sampling = train_cfg is not None and hasattr( + train_cfg, 'ga_sampler') + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes + else: + self.cls_out_channels = self.num_classes + 1 + + # build bbox_coder + self.anchor_coder = build_bbox_coder(anchor_coder) + self.bbox_coder = build_bbox_coder(bbox_coder) + + # build losses + self.loss_loc = build_loss(loss_loc) + self.loss_shape = build_loss(loss_shape) + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # use PseudoSampler when sampling is False + if self.sampling and hasattr(self.train_cfg, 'sampler'): + sampler_cfg = self.train_cfg.sampler + else: + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + + self.ga_assigner = build_assigner(self.train_cfg.ga_assigner) + if self.ga_sampling: + ga_sampler_cfg = self.train_cfg.ga_sampler + else: + ga_sampler_cfg = dict(type='PseudoSampler') + self.ga_sampler = build_sampler(ga_sampler_cfg, context=self) + + self.fp16_enabled = False + + self._init_layers() + + def _init_layers(self): + self.relu = nn.ReLU(inplace=True) + self.conv_loc = nn.Conv2d(self.in_channels, 1, 1) + self.conv_shape = nn.Conv2d(self.in_channels, self.num_anchors * 2, 1) + self.feature_adaption = FeatureAdaption( + self.in_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.conv_cls = MaskedConv2d(self.feat_channels, + self.num_anchors * self.cls_out_channels, + 1) + self.conv_reg = MaskedConv2d(self.feat_channels, self.num_anchors * 4, + 1) + + def init_weights(self): + normal_init(self.conv_cls, std=0.01) + normal_init(self.conv_reg, std=0.01) + + bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_loc, std=0.01, 
bias=bias_cls) + normal_init(self.conv_shape, std=0.01) + + self.feature_adaption.init_weights() + + def forward_single(self, x): + loc_pred = self.conv_loc(x) + shape_pred = self.conv_shape(x) + x = self.feature_adaption(x, shape_pred) + # masked conv is only used during inference for speed-up + if not self.training: + mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr + else: + mask = None + cls_score = self.conv_cls(x, mask) + bbox_pred = self.conv_reg(x, mask) + return cls_score, bbox_pred, shape_pred, loc_pred + + def forward(self, feats): + return multi_apply(self.forward_single, feats) + + def get_sampled_approxs(self, featmap_sizes, img_metas, device='cuda'): + """Get sampled approxs and inside flags according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + img_metas (list[dict]): Image meta info. + device (torch.device | str): device for returned tensors + + Returns: + tuple: approxes of each image, inside flags of each image + """ + num_imgs = len(img_metas) + + # since feature map sizes of all images are the same, we only compute + # approxes for one time + multi_level_approxs = self.approx_anchor_generator.grid_anchors( + featmap_sizes, device=device) + approxs_list = [multi_level_approxs for _ in range(num_imgs)] + + # for each image, we compute inside flags of multi level approxes + inside_flag_list = [] + for img_id, img_meta in enumerate(img_metas): + multi_level_flags = [] + multi_level_approxs = approxs_list[img_id] + + # obtain valid flags for each approx first + multi_level_approx_flags = self.approx_anchor_generator \ + .valid_flags(featmap_sizes, + img_meta['pad_shape'], + device=device) + + for i, flags in enumerate(multi_level_approx_flags): + approxs = multi_level_approxs[i] + inside_flags_list = [] + for i in range(self.approxs_per_octave): + split_valid_flags = flags[i::self.approxs_per_octave] + split_approxs = approxs[i::self.approxs_per_octave, :] + inside_flags = anchor_inside_flags( + 
split_approxs, split_valid_flags, + img_meta['img_shape'][:2], + self.train_cfg.allowed_border) + inside_flags_list.append(inside_flags) + # inside_flag for a position is true if any anchor in this + # position is true + inside_flags = ( + torch.stack(inside_flags_list, 0).sum(dim=0) > 0) + multi_level_flags.append(inside_flags) + inside_flag_list.append(multi_level_flags) + return approxs_list, inside_flag_list + + def get_anchors(self, + featmap_sizes, + shape_preds, + loc_preds, + img_metas, + use_loc_filter=False, + device='cuda'): + """Get squares according to feature map sizes and guided anchors. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + shape_preds (list[tensor]): Multi-level shape predictions. + loc_preds (list[tensor]): Multi-level location predictions. + img_metas (list[dict]): Image meta info. + use_loc_filter (bool): Use loc filter or not. + device (torch.device | str): device for returned tensors + + Returns: + tuple: square approxs of each image, guided anchors of each image, + loc masks of each image + """ + num_imgs = len(img_metas) + num_levels = len(featmap_sizes) + + # since feature map sizes of all images are the same, we only compute + # squares for one time + multi_level_squares = self.square_anchor_generator.grid_anchors( + featmap_sizes, device=device) + squares_list = [multi_level_squares for _ in range(num_imgs)] + + # for each image, we compute multi level guided anchors + guided_anchors_list = [] + loc_mask_list = [] + for img_id, img_meta in enumerate(img_metas): + multi_level_guided_anchors = [] + multi_level_loc_mask = [] + for i in range(num_levels): + squares = squares_list[img_id][i] + shape_pred = shape_preds[i][img_id] + loc_pred = loc_preds[i][img_id] + guided_anchors, loc_mask = self._get_guided_anchors_single( + squares, + shape_pred, + loc_pred, + use_loc_filter=use_loc_filter) + multi_level_guided_anchors.append(guided_anchors) + multi_level_loc_mask.append(loc_mask) + 
guided_anchors_list.append(multi_level_guided_anchors) + loc_mask_list.append(multi_level_loc_mask) + return squares_list, guided_anchors_list, loc_mask_list + + def _get_guided_anchors_single(self, + squares, + shape_pred, + loc_pred, + use_loc_filter=False): + """Get guided anchors and loc masks for a single level. + + Args: + square (tensor): Squares of a single level. + shape_pred (tensor): Shape predections of a single level. + loc_pred (tensor): Loc predections of a single level. + use_loc_filter (list[tensor]): Use loc filter or not. + + Returns: + tuple: guided anchors, location masks + """ + # calculate location filtering mask + loc_pred = loc_pred.sigmoid().detach() + if use_loc_filter: + loc_mask = loc_pred >= self.loc_filter_thr + else: + loc_mask = loc_pred >= 0.0 + mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_anchors) + mask = mask.contiguous().view(-1) + # calculate guided anchors + squares = squares[mask] + anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view( + -1, 2).detach()[mask] + bbox_deltas = anchor_deltas.new_full(squares.size(), 0) + bbox_deltas[:, 2:] = anchor_deltas + guided_anchors = self.anchor_coder.decode( + squares, bbox_deltas, wh_ratio_clip=1e-6) + return guided_anchors, mask + + def ga_loc_targets(self, gt_bboxes_list, featmap_sizes): + """Compute location targets for guided anchoring. + + Each feature map is divided into positive, negative and ignore regions. + - positive regions: target 1, weight 1 + - ignore regions: target 0, weight 0 + - negative regions: target 0, weight 0.1 + + Args: + gt_bboxes_list (list[Tensor]): Gt bboxes of each image. + featmap_sizes (list[tuple]): Multi level sizes of each feature + maps. + + Returns: + tuple + """ + anchor_scale = self.approx_anchor_generator.octave_base_scale + anchor_strides = self.approx_anchor_generator.strides + # Currently only supports same stride in x and y direction. 
+ for stride in anchor_strides: + assert (stride[0] == stride[1]) + anchor_strides = [stride[0] for stride in anchor_strides] + + center_ratio = self.train_cfg.center_ratio + ignore_ratio = self.train_cfg.ignore_ratio + img_per_gpu = len(gt_bboxes_list) + num_lvls = len(featmap_sizes) + r1 = (1 - center_ratio) / 2 + r2 = (1 - ignore_ratio) / 2 + all_loc_targets = [] + all_loc_weights = [] + all_ignore_map = [] + for lvl_id in range(num_lvls): + h, w = featmap_sizes[lvl_id] + loc_targets = torch.zeros( + img_per_gpu, + 1, + h, + w, + device=gt_bboxes_list[0].device, + dtype=torch.float32) + loc_weights = torch.full_like(loc_targets, -1) + ignore_map = torch.zeros_like(loc_targets) + all_loc_targets.append(loc_targets) + all_loc_weights.append(loc_weights) + all_ignore_map.append(ignore_map) + for img_id in range(img_per_gpu): + gt_bboxes = gt_bboxes_list[img_id] + scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + min_anchor_size = scale.new_full( + (1, ), float(anchor_scale * anchor_strides[0])) + # assign gt bboxes to different feature levels w.r.t. 
their scales + target_lvls = torch.floor( + torch.log2(scale) - torch.log2(min_anchor_size) + 0.5) + target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long() + for gt_id in range(gt_bboxes.size(0)): + lvl = target_lvls[gt_id].item() + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl] + # calculate ignore regions + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[lvl]) + # calculate positive (center) regions + ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region( + gt_, r1, featmap_sizes[lvl]) + all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, + ctr_x1:ctr_x2 + 1] = 1 + all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 0 + all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, + ctr_x1:ctr_x2 + 1] = 1 + # calculate ignore map on nearby low level feature + if lvl > 0: + d_lvl = lvl - 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[d_lvl]) + all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 1 + # calculate ignore map on nearby high level feature + if lvl < num_lvls - 1: + u_lvl = lvl + 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[u_lvl]) + all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 1 + for lvl_id in range(num_lvls): + # ignore negative regions w.r.t. 
ignore map + all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0) + & (all_ignore_map[lvl_id] > 0)] = 0 + # set negative regions with weight 0.1 + all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1 + # loc average factor to balance loss + loc_avg_factor = sum( + [t.size(0) * t.size(-1) * t.size(-2) + for t in all_loc_targets]) / 200 + return all_loc_targets, all_loc_weights, loc_avg_factor + + def _ga_shape_target_single(self, + flat_approxs, + inside_flags, + flat_squares, + gt_bboxes, + gt_bboxes_ignore, + img_meta, + unmap_outputs=True): + """Compute guided anchoring targets. + + This function returns sampled anchors and gt bboxes directly + rather than calculates regression targets. + + Args: + flat_approxs (Tensor): flat approxs of a single image, + shape (n, 4) + inside_flags (Tensor): inside flags of a single image, + shape (n, ). + flat_squares (Tensor): flat squares of a single image, + shape (approxs_per_octave * n, 4) + gt_bboxes (Tensor): Ground truth bboxes of a single image. + img_meta (dict): Meta info of a single image. + approxs_per_octave (int): number of approxs per octave + cfg (dict): RPN train configs. + unmap_outputs (bool): unmap outputs or not. 
+ + Returns: + tuple + """ + if not inside_flags.any(): + return (None, ) * 5 + # assign gt and sample anchors + expand_inside_flags = inside_flags[:, None].expand( + -1, self.approxs_per_octave).reshape(-1) + approxs = flat_approxs[expand_inside_flags, :] + squares = flat_squares[inside_flags, :] + + assign_result = self.ga_assigner.assign(approxs, squares, + self.approxs_per_octave, + gt_bboxes, gt_bboxes_ignore) + sampling_result = self.ga_sampler.sample(assign_result, squares, + gt_bboxes) + + bbox_anchors = torch.zeros_like(squares) + bbox_gts = torch.zeros_like(squares) + bbox_weights = torch.zeros_like(squares) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes + bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes + bbox_weights[pos_inds, :] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_squares.size(0) + bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags) + bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds) + + def ga_shape_targets(self, + approx_list, + inside_flag_list, + square_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + unmap_outputs=True): + """Compute guided anchoring targets. + + Args: + approx_list (list[list]): Multi level approxs of each image. + inside_flag_list (list[list]): Multi level inside flags of each + image. + square_list (list[list]): Multi level squares of each image. + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + gt_bboxes_ignore_list (list[Tensor]): ignore list of gt bboxes. + unmap_outputs (bool): unmap outputs or not. 
+ + Returns: + tuple + """ + num_imgs = len(img_metas) + assert len(approx_list) == len(inside_flag_list) == len( + square_list) == num_imgs + # anchor number of multi levels + num_level_squares = [squares.size(0) for squares in square_list[0]] + # concat all level anchors and flags to a single tensor + inside_flag_flat_list = [] + approx_flat_list = [] + square_flat_list = [] + for i in range(num_imgs): + assert len(square_list[i]) == len(inside_flag_list[i]) + inside_flag_flat_list.append(torch.cat(inside_flag_list[i])) + approx_flat_list.append(torch.cat(approx_list[i])) + square_flat_list.append(torch.cat(square_list[i])) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list, + neg_inds_list) = multi_apply( + self._ga_shape_target_single, + approx_flat_list, + inside_flag_flat_list, + square_flat_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + img_metas, + unmap_outputs=unmap_outputs) + # no valid anchors + if any([bbox_anchors is None for bbox_anchors in all_bbox_anchors]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. 
multiple levels + bbox_anchors_list = images_to_levels(all_bbox_anchors, + num_level_squares) + bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_squares) + return (bbox_anchors_list, bbox_gts_list, bbox_weights_list, + num_total_pos, num_total_neg) + + def loss_shape_single(self, shape_pred, bbox_anchors, bbox_gts, + anchor_weights, anchor_total_num): + shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2) + bbox_anchors = bbox_anchors.contiguous().view(-1, 4) + bbox_gts = bbox_gts.contiguous().view(-1, 4) + anchor_weights = anchor_weights.contiguous().view(-1, 4) + bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0) + bbox_deltas[:, 2:] += shape_pred + # filter out negative samples to speed-up weighted_bounded_iou_loss + inds = torch.nonzero( + anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1) + bbox_deltas_ = bbox_deltas[inds] + bbox_anchors_ = bbox_anchors[inds] + bbox_gts_ = bbox_gts[inds] + anchor_weights_ = anchor_weights[inds] + pred_anchors_ = self.anchor_coder.decode( + bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6) + loss_shape = self.loss_shape( + pred_anchors_, + bbox_gts_, + anchor_weights_, + avg_factor=anchor_total_num) + return loss_shape + + def loss_loc_single(self, loc_pred, loc_target, loc_weight, + loc_avg_factor): + loss_loc = self.loss_loc( + loc_pred.reshape(-1, 1), + loc_target.reshape(-1).long(), + loc_weight.reshape(-1), + avg_factor=loc_avg_factor) + return loss_loc + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'shape_preds', 'loc_preds')) + def loss(self, + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.approx_anchor_generator.num_levels + + device = cls_scores[0].device + + # get loc targets + loc_targets, loc_weights, loc_avg_factor = 
self.ga_loc_targets( + gt_bboxes, featmap_sizes) + + # get sampled approxes + approxs_list, inside_flag_list = self.get_sampled_approxs( + featmap_sizes, img_metas, device=device) + # get squares and guided anchors + squares_list, guided_anchors_list, _ = self.get_anchors( + featmap_sizes, shape_preds, loc_preds, img_metas, device=device) + + # get shape targets + shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list, + squares_list, gt_bboxes, + img_metas) + if shape_targets is None: + return None + (bbox_anchors_list, bbox_gts_list, anchor_weights_list, anchor_fg_num, + anchor_bg_num) = shape_targets + anchor_total_num = ( + anchor_fg_num if not self.ga_sampling else anchor_fg_num + + anchor_bg_num) + + # get anchor targets + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.get_targets( + guided_anchors_list, + inside_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # anchor number of multi levels + num_level_anchors = [ + anchors.size(0) for anchors in guided_anchors_list[0] + ] + # concat all level anchors to a single tensor + concat_anchor_list = [] + for i in range(len(guided_anchors_list)): + concat_anchor_list.append(torch.cat(guided_anchors_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + # get classification and bbox regression losses + losses_cls, losses_bbox = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_samples=num_total_samples) + + # get anchor location loss + losses_loc = [] 
+ for i in range(len(loc_preds)): + loss_loc = self.loss_loc_single( + loc_preds[i], + loc_targets[i], + loc_weights[i], + loc_avg_factor=loc_avg_factor) + losses_loc.append(loss_loc) + + # get anchor shape loss + losses_shape = [] + for i in range(len(shape_preds)): + loss_shape = self.loss_shape_single( + shape_preds[i], + bbox_anchors_list[i], + bbox_gts_list[i], + anchor_weights_list[i], + anchor_total_num=anchor_total_num) + losses_shape.append(loss_shape) + + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_shape=losses_shape, + loss_loc=losses_loc) + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'shape_preds', 'loc_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + img_metas, + cfg=None, + rescale=False): + assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len( + loc_preds) + num_levels = len(cls_scores) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + device = cls_scores[0].device + # get guided anchors + _, guided_anchors, loc_masks = self.get_anchors( + featmap_sizes, + shape_preds, + loc_preds, + img_metas, + use_loc_filter=not self.training, + device=device) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + guided_anchor_list = [ + guided_anchors[img_id][i].detach() for i in range(num_levels) + ] + loc_mask_list = [ + loc_masks[img_id][i].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list, + guided_anchor_list, + loc_mask_list, img_shape, + scale_factor, cfg, rescale) + result_list.append(proposals) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + mlvl_masks, + img_shape, + 
scale_factor, + cfg, + rescale=False): + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds, + mlvl_anchors, + mlvl_masks): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + # reshape scores and bbox_pred + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask, :] + bbox_pred = bbox_pred[mask, :] + if scores.dim() == 0: + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + bbox_pred = bbox_pred.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. 
scores + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + bboxes = self.bbox_coder.decode( + anchors, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + # multi class NMS + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/nasfcos_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/nasfcos_head.py new file mode 100755 index 0000000000000000000000000000000000000000..994ce0455e1982110f237b3958a81394c319bb47 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/nasfcos_head.py @@ -0,0 +1,75 @@ +import copy + +import torch.nn as nn +from mmcv.cnn import (ConvModule, Scale, bias_init_with_prob, + caffe2_xavier_init, normal_init) + +from mmdet.models.dense_heads.fcos_head import FCOSHead +from ..builder import HEADS + + +@HEADS.register_module() +class NASFCOSHead(FCOSHead): + """Anchor-free head used in `NASFCOS `_. 
+ + It is quite similar with FCOS head, except for the searched structure of + classification branch and bbox regression branch, where a structure of + "dconv3x3, conv3x3, dconv3x3, conv1x1" is utilized instead. + """ + + def _init_layers(self): + """Initialize layers of the head.""" + dconv3x3_config = dict( + type='DCNv2', + kernel_size=3, + use_bias=True, + deform_groups=2, + padding=1) + conv3x3_config = dict(type='Conv', kernel_size=3, padding=1) + conv1x1_config = dict(type='Conv', kernel_size=1) + + self.arch_config = [ + dconv3x3_config, conv3x3_config, dconv3x3_config, conv1x1_config + ] + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i, op_ in enumerate(self.arch_config): + op = copy.deepcopy(op_) + chn = self.in_channels if i == 0 else self.feat_channels + assert isinstance(op, dict) + use_bias = op.pop('use_bias', False) + padding = op.pop('padding', 0) + kernel_size = op.pop('kernel_size') + module = ConvModule( + chn, + self.feat_channels, + kernel_size, + stride=1, + padding=padding, + norm_cfg=self.norm_cfg, + bias=use_bias, + conv_cfg=op) + + self.cls_convs.append(copy.deepcopy(module)) + self.reg_convs.append(copy.deepcopy(module)) + + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1) + + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def init_weights(self): + """Initialize weights of the head.""" + # retinanet_bias_init + bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_reg, std=0.01) + normal_init(self.conv_centerness, std=0.01) + normal_init(self.conv_cls, std=0.01, bias=bias_cls) + + for branch in [self.cls_convs, self.reg_convs]: + for module in branch.modules(): + if isinstance(module, ConvModule) \ + and isinstance(module.conv, nn.Conv2d): + caffe2_xavier_init(module.conv) diff --git 
a/insightface/detection/scrfd/mmdet/models/dense_heads/paa_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/paa_head.py new file mode 100755 index 0000000000000000000000000000000000000000..db28abc7e62c3eaf8305289b760df1c33bf8bf05 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/paa_head.py @@ -0,0 +1,653 @@ +import numpy as np +import torch +from mmcv.runner import force_fp32 + +from mmdet.core import multi_apply, multiclass_nms +from mmdet.core.bbox.iou_calculators import bbox_overlaps +from mmdet.models import HEADS +from mmdet.models.dense_heads import ATSSHead + +EPS = 1e-12 +try: + import sklearn.mixture as skm +except ImportError: + skm = None + + +def levels_to_images(mlvl_tensor): + """Concat multi-level feature maps by image. + + [feature_level0, feature_level1...] -> [feature_image0, feature_image1...] + Convert the shape of each element in mlvl_tensor from (N, C, H, W) to + (N, H*W , C), then split the element to N elements with shape (H*W, C), and + concat elements in same image of all level along first dimension. + + Args: + mlvl_tensor (list[torch.Tensor]): list of Tensor which collect from + corresponding level. Each element is of shape (N, C, H, W) + + Returns: + list[torch.Tensor]: A list that contains N tensors and each tensor is + of shape (num_elements, C) + """ + batch_size = mlvl_tensor[0].size(0) + batch_list = [[] for _ in range(batch_size)] + channels = mlvl_tensor[0].size(1) + for t in mlvl_tensor: + t = t.permute(0, 2, 3, 1) + t = t.view(batch_size, -1, channels).contiguous() + for img in range(batch_size): + batch_list[img].append(t[img]) + return [torch.cat(item, 0) for item in batch_list] + + +@HEADS.register_module() +class PAAHead(ATSSHead): + """Head of PAAAssignment: Probabilistic Anchor Assignment with IoU + Prediction for Object Detection. + + Code is modified from the `official github repo + `_. + + More details can be found in the `paper + `_ . 
+ + Args: + topk (int): Select topk samples with smallest loss in + each level. + score_voting (bool): Whether to use score voting in post-process. + covariance_type : String describing the type of covariance parameters + to be used in :class:`sklearn.mixture.GaussianMixture`. + It must be one of: + + - 'full': each component has its own general covariance matrix + - 'tied': all components share the same general covariance matrix + - 'diag': each component has its own diagonal covariance matrix + - 'spherical': each component has its own single variance + Default: 'diag'. From 'full' to 'spherical', the gmm fitting + process is faster yet the performance could be influenced. For most + cases, 'diag' should be a good choice. + """ + + def __init__(self, + *args, + topk=9, + score_voting=True, + covariance_type='diag', + **kwargs): + # topk used in paa reassign process + self.topk = topk + self.with_score_voting = score_voting + self.covariance_type = covariance_type + super(PAAHead, self).__init__(*args, **kwargs) + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'iou_preds')) + def loss(self, + cls_scores, + bbox_preds, + iou_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + iou_preds (list[Tensor]): iou_preds for each scale + level with shape (N, num_anchors * 1, H, W) + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor] | None): Specify which bounding + boxes can be ignored when are computing the loss. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss gmm_assignment. + """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels, + ) + (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds, + pos_gt_index) = cls_reg_targets + cls_scores = levels_to_images(cls_scores) + cls_scores = [ + item.reshape(-1, self.cls_out_channels) for item in cls_scores + ] + bbox_preds = levels_to_images(bbox_preds) + bbox_preds = [item.reshape(-1, 4) for item in bbox_preds] + iou_preds = levels_to_images(iou_preds) + iou_preds = [item.reshape(-1, 1) for item in iou_preds] + pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list, + cls_scores, bbox_preds, labels, + labels_weight, bboxes_target, + bboxes_weight, pos_inds) + + with torch.no_grad(): + labels, label_weights, bbox_weights, num_pos = multi_apply( + self.paa_reassign, + pos_losses_list, + labels, + labels_weight, + bboxes_weight, + pos_inds, + pos_gt_index, + anchor_list, + ) + num_pos = sum(num_pos) + # convert all tensor list to a flatten tensor + cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1)) + bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1)) + iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1)) + labels = torch.cat(labels, 0).view(-1) + flatten_anchors = torch.cat( + [torch.cat(item, 0) for item in anchor_list]) + labels_weight = torch.cat(labels_weight, 0).view(-1) + bboxes_target = torch.cat(bboxes_target, + 0).view(-1, bboxes_target[0].size(-1)) + + pos_inds_flatten = 
((labels >= 0) + & + (labels < self.num_classes)).nonzero().reshape(-1) + + losses_cls = self.loss_cls( + cls_scores, + labels, + labels_weight, + avg_factor=max(num_pos, len(img_metas))) # avoid num_pos=0 + if num_pos: + pos_bbox_pred = self.bbox_coder.decode( + flatten_anchors[pos_inds_flatten], + bbox_preds[pos_inds_flatten]) + pos_bbox_target = bboxes_target[pos_inds_flatten] + iou_target = bbox_overlaps( + pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True) + losses_iou = self.loss_centerness( + iou_preds[pos_inds_flatten], + iou_target.unsqueeze(-1), + avg_factor=num_pos) + losses_bbox = self.loss_bbox( + pos_bbox_pred, + pos_bbox_target, + iou_target.clamp(min=EPS), + avg_factor=iou_target.sum()) + else: + losses_iou = iou_preds.sum() * 0 + losses_bbox = bbox_preds.sum() * 0 + + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou) + + def get_pos_loss(self, anchors, cls_score, bbox_pred, label, label_weight, + bbox_target, bbox_weight, pos_inds): + """Calculate loss of all potential positive samples obtained from first + match process. + + Args: + anchors (list[Tensor]): Anchors of each scale. + cls_score (Tensor): Box scores of single image with shape + (num_anchors, num_classes) + bbox_pred (Tensor): Box energies / deltas of single image + with shape (num_anchors, 4) + label (Tensor): classification target of each anchor with + shape (num_anchors,) + label_weight (Tensor): Classification loss weight of each + anchor with shape (num_anchors). + bbox_target (dict): Regression target of each anchor with + shape (num_anchors, 4). + bbox_weight (Tensor): Bbox weight of each anchor with shape + (num_anchors, 4). + pos_inds (Tensor): Index of all positive samples got from + first assign process. + + Returns: + Tensor: Losses of all positive samples in single image. 
+ """ + if not len(pos_inds): + return cls_score.new([]), + anchors_all_level = torch.cat(anchors, 0) + pos_scores = cls_score[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_label = label[pos_inds] + pos_label_weight = label_weight[pos_inds] + pos_bbox_target = bbox_target[pos_inds] + pos_bbox_weight = bbox_weight[pos_inds] + pos_anchors = anchors_all_level[pos_inds] + pos_bbox_pred = self.bbox_coder.decode(pos_anchors, pos_bbox_pred) + + # to keep loss dimension + loss_cls = self.loss_cls( + pos_scores, + pos_label, + pos_label_weight, + avg_factor=self.loss_cls.loss_weight, + reduction_override='none') + + loss_bbox = self.loss_bbox( + pos_bbox_pred, + pos_bbox_target, + pos_bbox_weight, + avg_factor=self.loss_cls.loss_weight, + reduction_override='none') + + loss_cls = loss_cls.sum(-1) + pos_loss = loss_bbox + loss_cls + return pos_loss, + + def paa_reassign(self, pos_losses, label, label_weight, bbox_weight, + pos_inds, pos_gt_inds, anchors): + """Fit loss to GMM distribution and separate positive, ignore, negative + samples again with GMM model. + + Args: + pos_losses (Tensor): Losses of all positive samples in + single image. + label (Tensor): classification target of each anchor with + shape (num_anchors,) + label_weight (Tensor): Classification loss weight of each + anchor with shape (num_anchors). + bbox_weight (Tensor): Bbox weight of each anchor with shape + (num_anchors, 4). + pos_inds (Tensor): Index of all positive samples got from + first assign process. + pos_gt_inds (Tensor): Gt_index of all positive samples got + from first assign process. + anchors (list[Tensor]): Anchors of each scale. + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - label (Tensor): classification target of each anchor after + paa assign, with shape (num_anchors,) + - label_weight (Tensor): Classification loss weight of each + anchor after paa assign, with shape (num_anchors). 
+ - bbox_weight (Tensor): Bbox weight of each anchor with shape + (num_anchors, 4). + - num_pos (int): The number of positive samples after paa + assign. + """ + if not len(pos_inds): + return label, label_weight, bbox_weight, 0 + + num_gt = pos_gt_inds.max() + 1 + num_level = len(anchors) + num_anchors_each_level = [item.size(0) for item in anchors] + num_anchors_each_level.insert(0, 0) + inds_level_interval = np.cumsum(num_anchors_each_level) + pos_level_mask = [] + for i in range(num_level): + mask = (pos_inds >= inds_level_interval[i]) & ( + pos_inds < inds_level_interval[i + 1]) + pos_level_mask.append(mask) + pos_inds_after_paa = [label.new_tensor([])] + ignore_inds_after_paa = [label.new_tensor([])] + for gt_ind in range(num_gt): + pos_inds_gmm = [] + pos_loss_gmm = [] + gt_mask = pos_gt_inds == gt_ind + for level in range(num_level): + level_mask = pos_level_mask[level] + level_gt_mask = level_mask & gt_mask + value, topk_inds = pos_losses[level_gt_mask].topk( + min(level_gt_mask.sum(), self.topk), largest=False) + pos_inds_gmm.append(pos_inds[level_gt_mask][topk_inds]) + pos_loss_gmm.append(value) + pos_inds_gmm = torch.cat(pos_inds_gmm) + pos_loss_gmm = torch.cat(pos_loss_gmm) + # fix gmm need at least two sample + if len(pos_inds_gmm) < 2: + continue + device = pos_inds_gmm.device + pos_loss_gmm, sort_inds = pos_loss_gmm.sort() + pos_inds_gmm = pos_inds_gmm[sort_inds] + pos_loss_gmm = pos_loss_gmm.view(-1, 1).cpu().numpy() + min_loss, max_loss = pos_loss_gmm.min(), pos_loss_gmm.max() + means_init = np.array([min_loss, max_loss]).reshape(2, 1) + weights_init = np.array([0.5, 0.5]) + precisions_init = np.array([1.0, 1.0]).reshape(2, 1, 1) # full + if self.covariance_type == 'spherical': + precisions_init = precisions_init.reshape(2) + elif self.covariance_type == 'diag': + precisions_init = precisions_init.reshape(2, 1) + elif self.covariance_type == 'tied': + precisions_init = np.array([[1.0]]) + if skm is None: + raise ImportError('Please run "pip 
install sklearn" ' + 'to install sklearn first.') + gmm = skm.GaussianMixture( + 2, + weights_init=weights_init, + means_init=means_init, + precisions_init=precisions_init, + covariance_type=self.covariance_type) + gmm.fit(pos_loss_gmm) + gmm_assignment = gmm.predict(pos_loss_gmm) + scores = gmm.score_samples(pos_loss_gmm) + gmm_assignment = torch.from_numpy(gmm_assignment).to(device) + scores = torch.from_numpy(scores).to(device) + + pos_inds_temp, ignore_inds_temp = self.gmm_separation_scheme( + gmm_assignment, scores, pos_inds_gmm) + pos_inds_after_paa.append(pos_inds_temp) + ignore_inds_after_paa.append(ignore_inds_temp) + + pos_inds_after_paa = torch.cat(pos_inds_after_paa) + ignore_inds_after_paa = torch.cat(ignore_inds_after_paa) + reassign_mask = (pos_inds.unsqueeze(1) != pos_inds_after_paa).all(1) + reassign_ids = pos_inds[reassign_mask] + label[reassign_ids] = self.num_classes + label_weight[ignore_inds_after_paa] = 0 + bbox_weight[reassign_ids] = 0 + num_pos = len(pos_inds_after_paa) + return label, label_weight, bbox_weight, num_pos + + def gmm_separation_scheme(self, gmm_assignment, scores, pos_inds_gmm): + """A general separation scheme for gmm model. + + It separates a GMM distribution of candidate samples into three + parts, 0 1 and uncertain areas, and you can implement other + separation schemes by rewriting this function. + + Args: + gmm_assignment (Tensor): The prediction of GMM which is of shape + (num_samples,). The 0/1 value indicates the distribution + that each sample comes from. + scores (Tensor): The probability of sample coming from the + fit GMM distribution. The tensor is of shape (num_samples,). + pos_inds_gmm (Tensor): All the indexes of samples which are used + to fit GMM model. The tensor is of shape (num_samples,) + + Returns: + tuple[Tensor]: The indices of positive and ignored samples. + + - pos_inds_temp (Tensor): Indices of positive samples. + - ignore_inds_temp (Tensor): Indices of ignore samples. 
+ """ + # The implementation is (c) in Fig.3 in origin paper intead of (b). + # You can refer to issues such as + # https://github.com/kkhoot/PAA/issues/8 and + # https://github.com/kkhoot/PAA/issues/9. + fgs = gmm_assignment == 0 + pos_inds_temp = fgs.new_tensor([], dtype=torch.long) + ignore_inds_temp = fgs.new_tensor([], dtype=torch.long) + if fgs.nonzero().numel(): + _, pos_thr_ind = scores[fgs].topk(1) + pos_inds_temp = pos_inds_gmm[fgs][:pos_thr_ind + 1] + ignore_inds_temp = pos_inds_gmm.new_tensor([]) + return pos_inds_temp, ignore_inds_temp + + def get_targets( + self, + anchor_list, + valid_flag_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + unmap_outputs=True, + ): + """Get targets for PAA head. + + This method is almost the same as `AnchorHead.get_targets()`. We direct + return the results from _get_targets_single instead map it to levels + by images_to_levels function. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be + ignored. + gt_labels_list (list[Tensor]): Ground truth labels of each box. + label_channels (int): Channel of label. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: Usually returns a tuple containing learning targets. 
+ + - labels (list[Tensor]): Labels of all anchors, each with + shape (num_anchors,). + - label_weights (list[Tensor]): Label weights of all anchor. + each with shape (num_anchors,). + - bbox_targets (list[Tensor]): BBox targets of all anchors. + each with shape (num_anchors, 4). + - bbox_weights (list[Tensor]): BBox weights of all anchors. + each with shape (num_anchors, 4). + - pos_inds (list[Tensor]): Contains all index of positive + sample in all anchor. + - gt_inds (list[Tensor]): Contains all gt_index of positive + sample in all anchor. + """ + + num_imgs = len(img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + concat_anchor_list = [] + concat_valid_flag_list = [] + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + concat_anchor_list.append(torch.cat(anchor_list[i])) + concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + results = multi_apply( + self._get_targets_single, + concat_anchor_list, + concat_valid_flag_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + img_metas, + label_channels=label_channels, + unmap_outputs=unmap_outputs) + + (labels, label_weights, bbox_targets, bbox_weights, valid_pos_inds, + valid_neg_inds, sampling_result) = results + + # Due to valid flag of anchors, we have to calculate the real pos_inds + # in origin anchor set. 
+ pos_inds = [] + for i, single_labels in enumerate(labels): + pos_mask = (0 <= single_labels) & ( + single_labels < self.num_classes) + pos_inds.append(pos_mask.nonzero().view(-1)) + + gt_inds = [item.pos_assigned_gt_inds for item in sampling_result] + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + gt_inds) + + def _get_targets_single(self, + flat_anchors, + valid_flags, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + img_meta, + label_channels=1, + unmap_outputs=True): + """Compute regression and classification targets for anchors in a + single image. + + This method is same as `AnchorHead._get_targets_single()`. + """ + assert unmap_outputs, 'We must map outputs back to the original' \ + 'set of anchors in PAAhead' + return super(ATSSHead, self)._get_targets_single( + flat_anchors, + valid_flags, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + img_meta, + label_channels=1, + unmap_outputs=True) + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + iou_preds, + mlvl_anchors, + img_shape, + scale_factor, + cfg, + rescale=False, + with_nms=True): + """Transform outputs for a single batch item into labeled boxes. + + This method is almost same as `ATSSHead._get_bboxes_single()`. + We use sqrt(iou_preds * cls_scores) in NMS process instead of just + cls_scores. Besides, score voting is used when `` score_voting`` + is set to True. 
+ """ + assert with_nms, 'PAA only supports "with_nms=True" now' + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_iou_preds = [] + for cls_score, bbox_pred, iou_preds, anchors in zip( + cls_scores, bbox_preds, iou_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + iou_preds = iou_preds.permute(1, 2, 0).reshape(-1).sigmoid() + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + max_scores, _ = (scores * iou_preds[:, None]).sqrt().max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + iou_preds = iou_preds[topk_inds] + + bboxes = self.bbox_coder.decode( + anchors, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_iou_preds.append(iou_preds) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + mlvl_iou_preds = torch.cat(mlvl_iou_preds) + mlvl_nms_scores = (mlvl_scores * mlvl_iou_preds[:, None]).sqrt() + det_bboxes, det_labels = multiclass_nms( + mlvl_bboxes, + mlvl_nms_scores, + cfg.score_thr, + cfg.nms, + cfg.max_per_img, + score_factors=None) + if self.with_score_voting: + det_bboxes, det_labels = self.score_voting(det_bboxes, det_labels, + mlvl_bboxes, + mlvl_nms_scores, + cfg.score_thr) + + return det_bboxes, det_labels + + def score_voting(self, det_bboxes, det_labels, mlvl_bboxes, + 
mlvl_nms_scores, score_thr): + """Implementation of score voting method works on each remaining boxes + after NMS procedure. + + Args: + det_bboxes (Tensor): Remaining boxes after NMS procedure, + with shape (k, 5), each dimension means + (x1, y1, x2, y2, score). + det_labels (Tensor): The label of remaining boxes, with shape + (k, 1),Labels are 0-based. + mlvl_bboxes (Tensor): All boxes before the NMS procedure, + with shape (num_anchors,4). + mlvl_nms_scores (Tensor): The scores of all boxes which is used + in the NMS procedure, with shape (num_anchors, num_class) + mlvl_iou_preds (Tensot): The predictions of IOU of all boxes + before the NMS procedure, with shape (num_anchors, 1) + score_thr (float): The score threshold of bboxes. + + Returns: + tuple: Usually returns a tuple containing voting results. + + - det_bboxes_voted (Tensor): Remaining boxes after + score voting procedure, with shape (k, 5), each + dimension means (x1, y1, x2, y2, score). + - det_labels_voted (Tensor): Label of remaining bboxes + after voting, with shape (num_anchors,). 
+ """ + candidate_mask = mlvl_nms_scores > score_thr + candidate_mask_nozeros = candidate_mask.nonzero() + candidate_inds = candidate_mask_nozeros[:, 0] + candidate_labels = candidate_mask_nozeros[:, 1] + candidate_bboxes = mlvl_bboxes[candidate_inds] + candidate_scores = mlvl_nms_scores[candidate_mask] + det_bboxes_voted = [] + det_labels_voted = [] + for cls in range(self.cls_out_channels): + candidate_cls_mask = candidate_labels == cls + if not candidate_cls_mask.any(): + continue + candidate_cls_scores = candidate_scores[candidate_cls_mask] + candidate_cls_bboxes = candidate_bboxes[candidate_cls_mask] + det_cls_mask = det_labels == cls + det_cls_bboxes = det_bboxes[det_cls_mask].view( + -1, det_bboxes.size(-1)) + det_candidate_ious = bbox_overlaps(det_cls_bboxes[:, :4], + candidate_cls_bboxes) + for det_ind in range(len(det_cls_bboxes)): + single_det_ious = det_candidate_ious[det_ind] + pos_ious_mask = single_det_ious > 0.01 + pos_ious = single_det_ious[pos_ious_mask] + pos_bboxes = candidate_cls_bboxes[pos_ious_mask] + pos_scores = candidate_cls_scores[pos_ious_mask] + pis = (torch.exp(-(1 - pos_ious)**2 / 0.025) * + pos_scores)[:, None] + voted_box = torch.sum( + pis * pos_bboxes, dim=0) / torch.sum( + pis, dim=0) + voted_score = det_cls_bboxes[det_ind][-1:][None, :] + det_bboxes_voted.append( + torch.cat((voted_box[None, :], voted_score), dim=1)) + det_labels_voted.append(cls) + + det_bboxes_voted = torch.cat(det_bboxes_voted, dim=0) + det_labels_voted = det_labels.new_tensor(det_labels_voted) + return det_bboxes_voted, det_labels_voted diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/pisa_retinanet_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/pisa_retinanet_head.py new file mode 100755 index 0000000000000000000000000000000000000000..bd87b9aeb07e05ff94b444ac8999eca3f616711a --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/pisa_retinanet_head.py @@ -0,0 +1,154 @@ +import torch +from mmcv.runner import 
force_fp32 + +from mmdet.core import images_to_levels +from ..builder import HEADS +from ..losses import carl_loss, isr_p +from .retina_head import RetinaHead + + +@HEADS.register_module() +class PISARetinaHead(RetinaHead): + """PISA Retinanet Head. + + The head owns the same structure with Retinanet Head, but differs in two + aspects: + 1. Importance-based Sample Reweighting Positive (ISR-P) is applied to + change the positive loss weights. + 2. Classification-aware regression loss is adopted as a third loss. + """ + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + gt_bboxes (list[Tensor]): Ground truth bboxes of each image + with shape (num_obj, 4). + gt_labels (list[Tensor]): Ground truth labels of each image + with shape (num_obj, 4). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor]): Ignored gt bboxes of each image. + Default: None. + + Returns: + dict: Loss dict, comprise classification loss, regression loss and + carl loss. 
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels, + return_sampling_results=True) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg, sampling_results_list) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + num_imgs = len(img_metas) + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, label_channels) + for cls_score in cls_scores + ] + flatten_cls_scores = torch.cat( + flatten_cls_scores, dim=1).reshape(-1, + flatten_cls_scores[0].size(-1)) + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat( + flatten_bbox_preds, dim=1).view(-1, flatten_bbox_preds[0].size(-1)) + flatten_labels = torch.cat(labels_list, dim=1).reshape(-1) + flatten_label_weights = torch.cat( + label_weights_list, dim=1).reshape(-1) + flatten_anchors = torch.cat(all_anchor_list, dim=1).reshape(-1, 4) + flatten_bbox_targets = torch.cat( + bbox_targets_list, dim=1).reshape(-1, 4) + 
flatten_bbox_weights = torch.cat( + bbox_weights_list, dim=1).reshape(-1, 4) + + # Apply ISR-P + isr_cfg = self.train_cfg.get('isr', None) + if isr_cfg is not None: + all_targets = (flatten_labels, flatten_label_weights, + flatten_bbox_targets, flatten_bbox_weights) + with torch.no_grad(): + all_targets = isr_p( + flatten_cls_scores, + flatten_bbox_preds, + all_targets, + flatten_anchors, + sampling_results_list, + bbox_coder=self.bbox_coder, + loss_cls=self.loss_cls, + num_class=self.num_classes, + **self.train_cfg.isr) + (flatten_labels, flatten_label_weights, flatten_bbox_targets, + flatten_bbox_weights) = all_targets + + # For convenience we compute loss once instead separating by fpn level, + # so that we don't need to separate the weights by level again. + # The result should be the same + losses_cls = self.loss_cls( + flatten_cls_scores, + flatten_labels, + flatten_label_weights, + avg_factor=num_total_samples) + losses_bbox = self.loss_bbox( + flatten_bbox_preds, + flatten_bbox_targets, + flatten_bbox_weights, + avg_factor=num_total_samples) + loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + + # CARL Loss + carl_cfg = self.train_cfg.get('carl', None) + if carl_cfg is not None: + loss_carl = carl_loss( + flatten_cls_scores, + flatten_labels, + flatten_bbox_preds, + flatten_bbox_targets, + self.loss_bbox, + **self.train_cfg.carl, + avg_factor=num_total_pos, + sigmoid=True, + num_class=self.num_classes) + loss_dict.update(loss_carl) + + return loss_dict diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/pisa_ssd_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/pisa_ssd_head.py new file mode 100755 index 0000000000000000000000000000000000000000..90ef3c83ed62d8346c8daef01f18ad7bd236623c --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/pisa_ssd_head.py @@ -0,0 +1,139 @@ +import torch + +from mmdet.core import multi_apply +from ..builder import HEADS +from ..losses import CrossEntropyLoss, 
SmoothL1Loss, carl_loss, isr_p
+from .ssd_head import SSDHead
+
+
+# TODO: add loss evaluator for SSD
+@HEADS.register_module()
+class PISASSDHead(SSDHead):
+    """SSD head trained with PISA (ISR-P reweighting + CARL loss).
+
+    Inference behaviour is inherited unchanged from :class:`SSDHead`;
+    only the loss computation differs.
+    """
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             gt_bboxes_ignore=None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            gt_bboxes (list[Tensor]): Ground truth bboxes of each image
+                with shape (num_obj, 4).
+            gt_labels (list[Tensor]): Ground truth labels of each image
+                with shape (num_obj,).
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (list[Tensor]): Ignored gt bboxes of each image.
+                Default: None.
+
+        Returns:
+            dict: Loss dict comprising the classification loss, the
+                regression loss and (if configured) the CARL loss.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.anchor_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            gt_bboxes,
+            img_metas,
+            gt_bboxes_ignore_list=gt_bboxes_ignore,
+            gt_labels_list=gt_labels,
+            label_channels=1,
+            unmap_outputs=False,
+            return_sampling_results=True)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg, sampling_results_list) = cls_reg_targets
+
+        num_images = len(img_metas)
+        all_cls_scores = torch.cat([
+            s.permute(0, 2, 3, 1).reshape(
+                num_images, -1, self.cls_out_channels) for s in cls_scores
+        ], 1)
+        all_labels = torch.cat(labels_list, -1).view(num_images, -1)
+        all_label_weights = torch.cat(label_weights_list,
+                                      -1).view(num_images, -1)
+        all_bbox_preds = torch.cat([
+            b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
+            for b in bbox_preds
+        ], -2)
+        all_bbox_targets = torch.cat(bbox_targets_list,
+                                     -2).view(num_images, -1, 4)
+        all_bbox_weights = torch.cat(bbox_weights_list,
+                                     -2).view(num_images, -1, 4)
+
+        # concat all level anchors to a single tensor
+        all_anchors = []
+        for i in range(num_images):
+            all_anchors.append(torch.cat(anchor_list[i]))
+
+        isr_cfg = self.train_cfg.get('isr', None)
+        all_targets = (all_labels.view(-1), all_label_weights.view(-1),
+                       all_bbox_targets.view(-1,
+                                             4), all_bbox_weights.view(-1, 4))
+        # apply ISR-P: importance-based reweighting of the flattened
+        # classification/regression targets before the loss is computed
+        if isr_cfg is not None:
+            all_targets = isr_p(
+                all_cls_scores.view(-1, all_cls_scores.size(-1)),
+                all_bbox_preds.view(-1, 4),
+                all_targets,
+                torch.cat(all_anchors),
+                sampling_results_list,
+                loss_cls=CrossEntropyLoss(),
+                bbox_coder=self.bbox_coder,
+                **self.train_cfg.isr,
+                num_class=self.num_classes)
+            (new_labels, new_label_weights, new_bbox_targets,
+             new_bbox_weights) = all_targets
+            all_labels = new_labels.view(all_labels.shape)
+            all_label_weights = new_label_weights.view(all_label_weights.shape)
+            all_bbox_targets = new_bbox_targets.view(all_bbox_targets.shape)
+            all_bbox_weights = new_bbox_weights.view(all_bbox_weights.shape)
+
+        # add CARL loss
+        carl_loss_cfg = self.train_cfg.get('carl', None)
+        if carl_loss_cfg is not None:
+            loss_carl = carl_loss(
+                all_cls_scores.view(-1, all_cls_scores.size(-1)),
+                all_targets[0],
+                all_bbox_preds.view(-1, 4),
+                all_targets[2],
+                SmoothL1Loss(beta=1.),
+                **self.train_cfg.carl,
+                avg_factor=num_total_pos,
+                num_class=self.num_classes)
+
+        # check NaN and Inf
+        assert torch.isfinite(all_cls_scores).all().item(), \
+            'classification scores become infinite or NaN!'
+        assert torch.isfinite(all_bbox_preds).all().item(), \
+            'bbox predications become infinite or NaN!'
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_single,
+            all_cls_scores,
+            all_bbox_preds,
+            all_anchors,
+            all_labels,
+            all_label_weights,
+            all_bbox_targets,
+            all_bbox_weights,
+            num_total_samples=num_total_pos)
+        loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+        if carl_loss_cfg is not None:
+            loss_dict.update(loss_carl)
+        return loss_dict
diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/reppoints_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/reppoints_head.py
new file mode 100755
index 0000000000000000000000000000000000000000..03e3fa0f19e575ec22ee6edef8af5a0a7ccf345e
--- /dev/null
+++ b/insightface/detection/scrfd/mmdet/models/dense_heads/reppoints_head.py
@@ -0,0 +1,763 @@
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
+from mmcv.ops import DeformConv2d
+
+from mmdet.core import (PointGenerator, build_assigner, build_sampler,
+                        images_to_levels, multi_apply, multiclass_nms, unmap)
+from ..builder import HEADS, build_loss
+from .anchor_free_head import AnchorFreeHead
+
+
+@HEADS.register_module()
+class RepPointsHead(AnchorFreeHead):
+    """RepPoint head.
+
+    Args:
+        point_feat_channels (int): Number of channels of points features.
+        gradient_mul (float): The multiplier to gradients from
+            points refinement and recognition.
+        point_strides (Iterable): points strides.
+        point_base_scale (int): bbox scale for assigning labels.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox_init (dict): Config of initial points loss.
+        loss_bbox_refine (dict): Config of points loss in refinement.
+        use_grid_points (bool): If we use bounding box representation, the
+            reppoints is represented as grid points on the bounding box.
+        center_init (bool): Whether to use center point assignment.
+        transform_method (str): The methods to transform RepPoints to bbox.
+ """ # noqa: W605 + + def __init__(self, + num_classes, + in_channels, + point_feat_channels=256, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.5), + loss_bbox_refine=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + use_grid_points=False, + center_init=True, + transform_method='moment', + moment_mul=0.01, + **kwargs): + self.num_points = num_points + self.point_feat_channels = point_feat_channels + self.use_grid_points = use_grid_points + self.center_init = center_init + + # we use deform conv to extract points features + self.dcn_kernel = int(np.sqrt(num_points)) + self.dcn_pad = int((self.dcn_kernel - 1) / 2) + assert self.dcn_kernel * self.dcn_kernel == num_points, \ + 'The points number should be a square number.' + assert self.dcn_kernel % 2 == 1, \ + 'The points number should be an odd square number.' 
+ dcn_base = np.arange(-self.dcn_pad, + self.dcn_pad + 1).astype(np.float64) + dcn_base_y = np.repeat(dcn_base, self.dcn_kernel) + dcn_base_x = np.tile(dcn_base, self.dcn_kernel) + dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape( + (-1)) + self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1) + + super().__init__(num_classes, in_channels, loss_cls=loss_cls, **kwargs) + + self.gradient_mul = gradient_mul + self.point_base_scale = point_base_scale + self.point_strides = point_strides + self.point_generators = [PointGenerator() for _ in self.point_strides] + + self.sampling = loss_cls['type'] not in ['FocalLoss'] + if self.train_cfg: + self.init_assigner = build_assigner(self.train_cfg.init.assigner) + self.refine_assigner = build_assigner( + self.train_cfg.refine.assigner) + # use PseudoSampler when sampling is False + if self.sampling and hasattr(self.train_cfg, 'sampler'): + sampler_cfg = self.train_cfg.sampler + else: + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + self.transform_method = transform_method + if self.transform_method == 'moment': + self.moment_transfer = nn.Parameter( + data=torch.zeros(2), requires_grad=True) + self.moment_mul = moment_mul + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes + else: + self.cls_out_channels = self.num_classes + 1 + self.loss_bbox_init = build_loss(loss_bbox_init) + self.loss_bbox_refine = build_loss(loss_bbox_refine) + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + 
self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + pts_out_dim = 4 if self.use_grid_points else 2 * self.num_points + self.reppoints_cls_conv = DeformConv2d(self.feat_channels, + self.point_feat_channels, + self.dcn_kernel, 1, + self.dcn_pad) + self.reppoints_cls_out = nn.Conv2d(self.point_feat_channels, + self.cls_out_channels, 1, 1, 0) + self.reppoints_pts_init_conv = nn.Conv2d(self.feat_channels, + self.point_feat_channels, 3, + 1, 1) + self.reppoints_pts_init_out = nn.Conv2d(self.point_feat_channels, + pts_out_dim, 1, 1, 0) + self.reppoints_pts_refine_conv = DeformConv2d(self.feat_channels, + self.point_feat_channels, + self.dcn_kernel, 1, + self.dcn_pad) + self.reppoints_pts_refine_out = nn.Conv2d(self.point_feat_channels, + pts_out_dim, 1, 1, 0) + + def init_weights(self): + """Initialize weights of the head.""" + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.reppoints_cls_conv, std=0.01) + normal_init(self.reppoints_cls_out, std=0.01, bias=bias_cls) + normal_init(self.reppoints_pts_init_conv, std=0.01) + normal_init(self.reppoints_pts_init_out, std=0.01) + normal_init(self.reppoints_pts_refine_conv, std=0.01) + normal_init(self.reppoints_pts_refine_out, std=0.01) + + def points2bbox(self, pts, y_first=True): + """Converting the points set into bounding box. + + :param pts: the input points sets (fields), each points + set (fields) is represented as 2n scalar. + :param y_first: if y_fisrt=True, the point set is represented as + [y1, x1, y2, x2 ... yn, xn], otherwise the point set is + represented as [x1, y1, x2, y2 ... xn, yn]. + :return: each points set is converting to a bbox [x1, y1, x2, y2]. + """ + pts_reshape = pts.view(pts.shape[0], -1, 2, *pts.shape[2:]) + pts_y = pts_reshape[:, :, 0, ...] 
if y_first else pts_reshape[:, :, 1, + ...] + pts_x = pts_reshape[:, :, 1, ...] if y_first else pts_reshape[:, :, 0, + ...] + if self.transform_method == 'minmax': + bbox_left = pts_x.min(dim=1, keepdim=True)[0] + bbox_right = pts_x.max(dim=1, keepdim=True)[0] + bbox_up = pts_y.min(dim=1, keepdim=True)[0] + bbox_bottom = pts_y.max(dim=1, keepdim=True)[0] + bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom], + dim=1) + elif self.transform_method == 'partial_minmax': + pts_y = pts_y[:, :4, ...] + pts_x = pts_x[:, :4, ...] + bbox_left = pts_x.min(dim=1, keepdim=True)[0] + bbox_right = pts_x.max(dim=1, keepdim=True)[0] + bbox_up = pts_y.min(dim=1, keepdim=True)[0] + bbox_bottom = pts_y.max(dim=1, keepdim=True)[0] + bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom], + dim=1) + elif self.transform_method == 'moment': + pts_y_mean = pts_y.mean(dim=1, keepdim=True) + pts_x_mean = pts_x.mean(dim=1, keepdim=True) + pts_y_std = torch.std(pts_y - pts_y_mean, dim=1, keepdim=True) + pts_x_std = torch.std(pts_x - pts_x_mean, dim=1, keepdim=True) + moment_transfer = (self.moment_transfer * self.moment_mul) + ( + self.moment_transfer.detach() * (1 - self.moment_mul)) + moment_width_transfer = moment_transfer[0] + moment_height_transfer = moment_transfer[1] + half_width = pts_x_std * torch.exp(moment_width_transfer) + half_height = pts_y_std * torch.exp(moment_height_transfer) + bbox = torch.cat([ + pts_x_mean - half_width, pts_y_mean - half_height, + pts_x_mean + half_width, pts_y_mean + half_height + ], + dim=1) + else: + raise NotImplementedError + return bbox + + def gen_grid_from_reg(self, reg, previous_boxes): + """Base on the previous bboxes and regression values, we compute the + regressed bboxes and generate the grids on the bboxes. + + :param reg: the regression value to previous bboxes. + :param previous_boxes: previous bboxes. + :return: generate grids on the regressed bboxes. + """ + b, _, h, w = reg.shape + bxy = (previous_boxes[:, :2, ...] 
+ previous_boxes[:, 2:, ...]) / 2. + bwh = (previous_boxes[:, 2:, ...] - + previous_boxes[:, :2, ...]).clamp(min=1e-6) + grid_topleft = bxy + bwh * reg[:, :2, ...] - 0.5 * bwh * torch.exp( + reg[:, 2:, ...]) + grid_wh = bwh * torch.exp(reg[:, 2:, ...]) + grid_left = grid_topleft[:, [0], ...] + grid_top = grid_topleft[:, [1], ...] + grid_width = grid_wh[:, [0], ...] + grid_height = grid_wh[:, [1], ...] + intervel = torch.linspace(0., 1., self.dcn_kernel).view( + 1, self.dcn_kernel, 1, 1).type_as(reg) + grid_x = grid_left + grid_width * intervel + grid_x = grid_x.unsqueeze(1).repeat(1, self.dcn_kernel, 1, 1, 1) + grid_x = grid_x.view(b, -1, h, w) + grid_y = grid_top + grid_height * intervel + grid_y = grid_y.unsqueeze(2).repeat(1, 1, self.dcn_kernel, 1, 1) + grid_y = grid_y.view(b, -1, h, w) + grid_yx = torch.stack([grid_y, grid_x], dim=2) + grid_yx = grid_yx.view(b, -1, h, w) + regressed_bbox = torch.cat([ + grid_left, grid_top, grid_left + grid_width, grid_top + grid_height + ], 1) + return grid_yx, regressed_bbox + + def forward(self, feats): + return multi_apply(self.forward_single, feats) + + def forward_single(self, x): + """Forward feature map of a single FPN level.""" + dcn_base_offset = self.dcn_base_offset.type_as(x) + # If we use center_init, the initial reppoints is from center points. + # If we use bounding bbox representation, the initial reppoints is + # from regular grid placed on a pre-defined bbox. 
+ if self.use_grid_points or not self.center_init: + scale = self.point_base_scale / 2 + points_init = dcn_base_offset / dcn_base_offset.max() * scale + bbox_init = x.new_tensor([-scale, -scale, scale, + scale]).view(1, 4, 1, 1) + else: + points_init = 0 + cls_feat = x + pts_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + pts_feat = reg_conv(pts_feat) + # initialize reppoints + pts_out_init = self.reppoints_pts_init_out( + self.relu(self.reppoints_pts_init_conv(pts_feat))) + if self.use_grid_points: + pts_out_init, bbox_out_init = self.gen_grid_from_reg( + pts_out_init, bbox_init.detach()) + else: + pts_out_init = pts_out_init + points_init + # refine and classify reppoints + pts_out_init_grad_mul = (1 - self.gradient_mul) * pts_out_init.detach( + ) + self.gradient_mul * pts_out_init + dcn_offset = pts_out_init_grad_mul - dcn_base_offset + cls_out = self.reppoints_cls_out( + self.relu(self.reppoints_cls_conv(cls_feat, dcn_offset))) + pts_out_refine = self.reppoints_pts_refine_out( + self.relu(self.reppoints_pts_refine_conv(pts_feat, dcn_offset))) + if self.use_grid_points: + pts_out_refine, bbox_out_refine = self.gen_grid_from_reg( + pts_out_refine, bbox_out_init.detach()) + else: + pts_out_refine = pts_out_refine + pts_out_init.detach() + return cls_out, pts_out_init, pts_out_refine + + def get_points(self, featmap_sizes, img_metas, device): + """Get points according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + img_metas (list[dict]): Image meta info. 
+ + Returns: + tuple: points of each image, valid flags of each image + """ + num_imgs = len(img_metas) + num_levels = len(featmap_sizes) + + # since feature map sizes of all images are the same, we only compute + # points center for one time + multi_level_points = [] + for i in range(num_levels): + points = self.point_generators[i].grid_points( + featmap_sizes[i], self.point_strides[i], device) + multi_level_points.append(points) + points_list = [[point.clone() for point in multi_level_points] + for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level grids + valid_flag_list = [] + for img_id, img_meta in enumerate(img_metas): + multi_level_flags = [] + for i in range(num_levels): + point_stride = self.point_strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = img_meta['pad_shape'][:2] + valid_feat_h = min(int(np.ceil(h / point_stride)), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride)), feat_w) + flags = self.point_generators[i].valid_flags( + (feat_h, feat_w), (valid_feat_h, valid_feat_w), device) + multi_level_flags.append(flags) + valid_flag_list.append(multi_level_flags) + + return points_list, valid_flag_list + + def centers_to_bboxes(self, point_list): + """Get bboxes according to center points. + + Only used in :class:`MaxIoUAssigner`. 
+ """ + bbox_list = [] + for i_img, point in enumerate(point_list): + bbox = [] + for i_lvl in range(len(self.point_strides)): + scale = self.point_base_scale * self.point_strides[i_lvl] * 0.5 + bbox_shift = torch.Tensor([-scale, -scale, scale, + scale]).view(1, 4).type_as(point[0]) + bbox_center = torch.cat( + [point[i_lvl][:, :2], point[i_lvl][:, :2]], dim=1) + bbox.append(bbox_center + bbox_shift) + bbox_list.append(bbox) + return bbox_list + + def offset_to_pts(self, center_list, pred_list): + """Change from point offset to point coordinate.""" + pts_list = [] + for i_lvl in range(len(self.point_strides)): + pts_lvl = [] + for i_img in range(len(center_list)): + pts_center = center_list[i_img][i_lvl][:, :2].repeat( + 1, self.num_points) + pts_shift = pred_list[i_lvl][i_img] + yx_pts_shift = pts_shift.permute(1, 2, 0).view( + -1, 2 * self.num_points) + y_pts_shift = yx_pts_shift[..., 0::2] + x_pts_shift = yx_pts_shift[..., 1::2] + xy_pts_shift = torch.stack([x_pts_shift, y_pts_shift], -1) + xy_pts_shift = xy_pts_shift.view(*yx_pts_shift.shape[:-1], -1) + pts = xy_pts_shift * self.point_strides[i_lvl] + pts_center + pts_lvl.append(pts) + pts_lvl = torch.stack(pts_lvl, 0) + pts_list.append(pts_lvl) + return pts_list + + def _point_target_single(self, + flat_proposals, + valid_flags, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + label_channels=1, + stage='init', + unmap_outputs=True): + inside_flags = valid_flags + if not inside_flags.any(): + return (None, ) * 7 + # assign gt and sample proposals + proposals = flat_proposals[inside_flags, :] + + if stage == 'init': + assigner = self.init_assigner + pos_weight = self.train_cfg.init.pos_weight + else: + assigner = self.refine_assigner + pos_weight = self.train_cfg.refine.pos_weight + assign_result = assigner.assign(proposals, gt_bboxes, gt_bboxes_ignore, + None if self.sampling else gt_labels) + sampling_result = self.sampler.sample(assign_result, proposals, + gt_bboxes) + + num_valid_proposals = 
proposals.shape[0] + bbox_gt = proposals.new_zeros([num_valid_proposals, 4]) + pos_proposals = torch.zeros_like(proposals) + proposals_weights = proposals.new_zeros([num_valid_proposals, 4]) + labels = proposals.new_full((num_valid_proposals, ), + self.num_classes, + dtype=torch.long) + label_weights = proposals.new_zeros( + num_valid_proposals, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + pos_gt_bboxes = sampling_result.pos_gt_bboxes + bbox_gt[pos_inds, :] = pos_gt_bboxes + pos_proposals[pos_inds, :] = proposals[pos_inds, :] + proposals_weights[pos_inds, :] = 1.0 + if gt_labels is None: + # Only rpn gives gt_labels as None + # Foreground is the first class + labels[pos_inds] = 0 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = pos_weight + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of proposals + if unmap_outputs: + num_total_proposals = flat_proposals.size(0) + labels = unmap(labels, num_total_proposals, inside_flags) + label_weights = unmap(label_weights, num_total_proposals, + inside_flags) + bbox_gt = unmap(bbox_gt, num_total_proposals, inside_flags) + pos_proposals = unmap(pos_proposals, num_total_proposals, + inside_flags) + proposals_weights = unmap(proposals_weights, num_total_proposals, + inside_flags) + + return (labels, label_weights, bbox_gt, pos_proposals, + proposals_weights, pos_inds, neg_inds) + + def get_targets(self, + proposals_list, + valid_flag_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + stage='init', + label_channels=1, + unmap_outputs=True): + """Compute corresponding GT box and classification targets for + proposals. + + Args: + proposals_list (list[list]): Multi level points/bboxes of each + image. 
+ valid_flag_list (list[list]): Multi level valid flags of each + image. + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be + ignored. + gt_bboxes_list (list[Tensor]): Ground truth labels of each box. + stage (str): `init` or `refine`. Generate target for init stage or + refine stage + label_channels (int): Channel of label. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each level. # noqa: E501 + - bbox_gt_list (list[Tensor]): Ground truth bbox of each level. + - proposal_list (list[Tensor]): Proposals(points/bboxes) of each level. # noqa: E501 + - proposal_weights_list (list[Tensor]): Proposal weights of each level. # noqa: E501 + - num_total_pos (int): Number of positive samples in all images. # noqa: E501 + - num_total_neg (int): Number of negative samples in all images. 
# noqa: E501 + """ + assert stage in ['init', 'refine'] + num_imgs = len(img_metas) + assert len(proposals_list) == len(valid_flag_list) == num_imgs + + # points number of multi levels + num_level_proposals = [points.size(0) for points in proposals_list[0]] + + # concat all level points and flags to a single tensor + for i in range(num_imgs): + assert len(proposals_list[i]) == len(valid_flag_list[i]) + proposals_list[i] = torch.cat(proposals_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + (all_labels, all_label_weights, all_bbox_gt, all_proposals, + all_proposal_weights, pos_inds_list, neg_inds_list) = multi_apply( + self._point_target_single, + proposals_list, + valid_flag_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + stage=stage, + label_channels=label_channels, + unmap_outputs=unmap_outputs) + # no valid points + if any([labels is None for labels in all_labels]): + return None + # sampled points of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + labels_list = images_to_levels(all_labels, num_level_proposals) + label_weights_list = images_to_levels(all_label_weights, + num_level_proposals) + bbox_gt_list = images_to_levels(all_bbox_gt, num_level_proposals) + proposals_list = images_to_levels(all_proposals, num_level_proposals) + proposal_weights_list = images_to_levels(all_proposal_weights, + num_level_proposals) + return (labels_list, label_weights_list, bbox_gt_list, proposals_list, + proposal_weights_list, num_total_pos, num_total_neg) + + def loss_single(self, cls_score, pts_pred_init, pts_pred_refine, labels, + label_weights, bbox_gt_init, bbox_weights_init, + bbox_gt_refine, bbox_weights_refine, stride, + 
num_total_samples_init, num_total_samples_refine): + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + cls_score = cls_score.contiguous() + loss_cls = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=num_total_samples_refine) + + # points loss + bbox_gt_init = bbox_gt_init.reshape(-1, 4) + bbox_weights_init = bbox_weights_init.reshape(-1, 4) + bbox_pred_init = self.points2bbox( + pts_pred_init.reshape(-1, 2 * self.num_points), y_first=False) + bbox_gt_refine = bbox_gt_refine.reshape(-1, 4) + bbox_weights_refine = bbox_weights_refine.reshape(-1, 4) + bbox_pred_refine = self.points2bbox( + pts_pred_refine.reshape(-1, 2 * self.num_points), y_first=False) + normalize_term = self.point_base_scale * stride + loss_pts_init = self.loss_bbox_init( + bbox_pred_init / normalize_term, + bbox_gt_init / normalize_term, + bbox_weights_init, + avg_factor=num_total_samples_init) + loss_pts_refine = self.loss_bbox_refine( + bbox_pred_refine / normalize_term, + bbox_gt_refine / normalize_term, + bbox_weights_refine, + avg_factor=num_total_samples_refine) + return loss_cls, loss_pts_init, loss_pts_refine + + def loss(self, + cls_scores, + pts_preds_init, + pts_preds_refine, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == len(self.point_generators) + device = cls_scores[0].device + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + + # target for initial stage + center_list, valid_flag_list = self.get_points(featmap_sizes, + img_metas, device) + pts_coordinate_preds_init = self.offset_to_pts(center_list, + pts_preds_init) + if self.train_cfg.init.assigner['type'] == 'PointAssigner': + # Assign target for center list + candidate_list = center_list + else: + # transform center list to bbox list and + # assign 
target for bbox list + bbox_list = self.centers_to_bboxes(center_list) + candidate_list = bbox_list + cls_reg_targets_init = self.get_targets( + candidate_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + stage='init', + label_channels=label_channels) + (*_, bbox_gt_list_init, candidate_list_init, bbox_weights_list_init, + num_total_pos_init, num_total_neg_init) = cls_reg_targets_init + num_total_samples_init = ( + num_total_pos_init + + num_total_neg_init if self.sampling else num_total_pos_init) + + # target for refinement stage + center_list, valid_flag_list = self.get_points(featmap_sizes, + img_metas, device) + pts_coordinate_preds_refine = self.offset_to_pts( + center_list, pts_preds_refine) + bbox_list = [] + for i_img, center in enumerate(center_list): + bbox = [] + for i_lvl in range(len(pts_preds_refine)): + bbox_preds_init = self.points2bbox( + pts_preds_init[i_lvl].detach()) + bbox_shift = bbox_preds_init * self.point_strides[i_lvl] + bbox_center = torch.cat( + [center[i_lvl][:, :2], center[i_lvl][:, :2]], dim=1) + bbox.append(bbox_center + + bbox_shift[i_img].permute(1, 2, 0).reshape(-1, 4)) + bbox_list.append(bbox) + cls_reg_targets_refine = self.get_targets( + bbox_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + stage='refine', + label_channels=label_channels) + (labels_list, label_weights_list, bbox_gt_list_refine, + candidate_list_refine, bbox_weights_list_refine, num_total_pos_refine, + num_total_neg_refine) = cls_reg_targets_refine + num_total_samples_refine = ( + num_total_pos_refine + + num_total_neg_refine if self.sampling else num_total_pos_refine) + + # compute loss + losses_cls, losses_pts_init, losses_pts_refine = multi_apply( + self.loss_single, + cls_scores, + pts_coordinate_preds_init, + pts_coordinate_preds_refine, + labels_list, + label_weights_list, + bbox_gt_list_init, + 
bbox_weights_list_init, + bbox_gt_list_refine, + bbox_weights_list_refine, + self.point_strides, + num_total_samples_init=num_total_samples_init, + num_total_samples_refine=num_total_samples_refine) + loss_dict_all = { + 'loss_cls': losses_cls, + 'loss_pts_init': losses_pts_init, + 'loss_pts_refine': losses_pts_refine + } + return loss_dict_all + + def get_bboxes(self, + cls_scores, + pts_preds_init, + pts_preds_refine, + img_metas, + cfg=None, + rescale=False, + with_nms=True): + assert len(cls_scores) == len(pts_preds_refine) + device = cls_scores[0].device + bbox_preds_refine = [ + self.points2bbox(pts_pred_refine) + for pts_pred_refine in pts_preds_refine + ] + num_levels = len(cls_scores) + mlvl_points = [ + self.point_generators[i].grid_points(cls_scores[i].size()[-2:], + self.point_strides[i], device) + for i in range(num_levels) + ] + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds_refine[i][img_id].detach() + for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list, + mlvl_points, img_shape, + scale_factor, cfg, rescale, + with_nms) + result_list.append(proposals) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_points, + img_shape, + scale_factor, + cfg, + rescale=False, + with_nms=True): + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) + mlvl_bboxes = [] + mlvl_scores = [] + for i_lvl, (cls_score, bbox_pred, points) in enumerate( + zip(cls_scores, bbox_preds, mlvl_points)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = 
cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + points = points[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + bbox_pos_center = torch.cat([points[:, :2], points[:, :2]], dim=1) + bboxes = bbox_pred * self.point_strides[i_lvl] + bbox_pos_center + x1 = bboxes[:, 0].clamp(min=0, max=img_shape[1]) + y1 = bboxes[:, 1].clamp(min=0, max=img_shape[0]) + x2 = bboxes[:, 2].clamp(min=0, max=img_shape[1]) + y2 = bboxes[:, 3].clamp(min=0, max=img_shape[0]) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + if with_nms: + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/retina_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/retina_head.py new file mode 100755 index 0000000000000000000000000000000000000000..b12416fa8332f02b9a04bbfc7926f6d13875e61b --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/retina_head.py @@ -0,0 +1,114 @@ +import 
torch.nn as nn +from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init + +from ..builder import HEADS +from .anchor_head import AnchorHead + + +@HEADS.register_module() +class RetinaHead(AnchorHead): + r"""An anchor-based head used in `RetinaNet + `_. + + The head contains two subnetworks. The first classifies anchor boxes and + the second regresses deltas for the anchors. + + Example: + >>> import torch + >>> self = RetinaHead(11, 7) + >>> x = torch.rand(1, 7, 32, 32) + >>> cls_score, bbox_pred = self.forward_single(x) + >>> # Each anchor predicts a score for each class except background + >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors + >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors + >>> assert cls_per_anchor == (self.num_classes) + >>> assert box_per_anchor == 4 + """ + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + conv_cfg=None, + norm_cfg=None, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + **kwargs): + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super(RetinaHead, self).__init__( + num_classes, + in_channels, + anchor_generator=anchor_generator, + **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.retina_cls = nn.Conv2d( + self.feat_channels, + self.num_anchors * self.cls_out_channels, + 3, + padding=1) + 
self.retina_reg = nn.Conv2d( + self.feat_channels, self.num_anchors * 4, 3, padding=1) + + def init_weights(self): + """Initialize weights of the head.""" + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.retina_cls, std=0.01, bias=bias_cls) + normal_init(self.retina_reg, std=0.01) + + def forward_single(self, x): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level + the channels number is num_anchors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale + level, the channels number is num_anchors * 4. + """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.retina_cls(cls_feat) + bbox_pred = self.retina_reg(reg_feat) + return cls_score, bbox_pred diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/retina_sepbn_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/retina_sepbn_head.py new file mode 100755 index 0000000000000000000000000000000000000000..6b8ce7f0104b90af4b128e0f245473a1c0219fcd --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/retina_sepbn_head.py @@ -0,0 +1,113 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init + +from ..builder import HEADS +from .anchor_head import AnchorHead + + +@HEADS.register_module() +class RetinaSepBNHead(AnchorHead): + """"RetinaHead with separate BN. + + In RetinaHead, conv/norm layers are shared across different FPN levels, + while in RetinaSepBNHead, conv layers are shared across different FPN + levels, but BN layers are separated. 
+ """ + + def __init__(self, + num_classes, + num_ins, + in_channels, + stacked_convs=4, + conv_cfg=None, + norm_cfg=None, + **kwargs): + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.num_ins = num_ins + super(RetinaSepBNHead, self).__init__(num_classes, in_channels, + **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.num_ins): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + for i in range(self.stacked_convs): + for j in range(1, self.num_ins): + self.cls_convs[j][i].conv = self.cls_convs[0][i].conv + self.reg_convs[j][i].conv = self.reg_convs[0][i].conv + self.retina_cls = nn.Conv2d( + self.feat_channels, + self.num_anchors * self.cls_out_channels, + 3, + padding=1) + self.retina_reg = nn.Conv2d( + self.feat_channels, self.num_anchors * 4, 3, padding=1) + + def init_weights(self): + """Initialize weights of the head.""" + for m in self.cls_convs[0]: + normal_init(m.conv, std=0.01) + for m in self.reg_convs[0]: + normal_init(m.conv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.retina_cls, std=0.01, bias=bias_cls) + normal_init(self.retina_reg, std=0.01) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
+ + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + cls_scores = [] + bbox_preds = [] + for i, x in enumerate(feats): + cls_feat = feats[i] + reg_feat = feats[i] + for cls_conv in self.cls_convs[i]: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs[i]: + reg_feat = reg_conv(reg_feat) + cls_score = self.retina_cls(cls_feat) + bbox_pred = self.retina_reg(reg_feat) + cls_scores.append(cls_score) + bbox_preds.append(bbox_pred) + return cls_scores, bbox_preds diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/rpn_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/rpn_head.py new file mode 100755 index 0000000000000000000000000000000000000000..f565d1a41b06c92eaeae2e5418fad54dd27ae656 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/rpn_head.py @@ -0,0 +1,168 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import normal_init +from mmcv.ops import batched_nms + +from ..builder import HEADS +from .anchor_head import AnchorHead +from .rpn_test_mixin import RPNTestMixin + + +@HEADS.register_module() +class RPNHead(RPNTestMixin, AnchorHead): + """RPN head. + + Args: + in_channels (int): Number of channels in the input feature map. 
+ """ # noqa: W605 + + def __init__(self, in_channels, **kwargs): + super(RPNHead, self).__init__(1, in_channels, **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.rpn_conv = nn.Conv2d( + self.in_channels, self.feat_channels, 3, padding=1) + self.rpn_cls = nn.Conv2d(self.feat_channels, + self.num_anchors * self.cls_out_channels, 1) + self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1) + + def init_weights(self): + """Initialize weights of the head.""" + normal_init(self.rpn_conv, std=0.01) + normal_init(self.rpn_cls, std=0.01) + normal_init(self.rpn_reg, std=0.01) + + def forward_single(self, x): + """Forward feature map of a single scale level.""" + x = self.rpn_conv(x) + x = F.relu(x, inplace=True) + rpn_cls_score = self.rpn_cls(x) + rpn_bbox_pred = self.rpn_reg(x) + return rpn_cls_score, rpn_bbox_pred + + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + losses = super(RPNHead, self).loss( + cls_scores, + bbox_preds, + gt_bboxes, + None, + img_metas, + gt_bboxes_ignore=gt_bboxes_ignore) + return dict( + loss_rpn_cls=losses['loss_cls'], loss_rpn_bbox=losses['loss_bbox']) + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + img_shape, + scale_factor, + cfg, + rescale=False): + """Transform outputs for a single batch item into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (num_anchors * 4, H, W). + mlvl_anchors (list[Tensor]): Box reference for each scale level + with shape (num_total_anchors, 4). + img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arange as + (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + + Returns: + Tensor: Labeled boxes in shape (n, 5), where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. 
+ """ + cfg = self.test_cfg if cfg is None else cfg + # bboxes from different level should be independent during NMS, + # level_ids are used as labels for batched NMS to separate them + level_ids = [] + mlvl_scores = [] + mlvl_bbox_preds = [] + mlvl_valid_anchors = [] + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + rpn_cls_score = rpn_cls_score.permute(1, 2, 0) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(-1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(-1, 2) + # We set FG labels to [0, num_class-1] and BG label to + # num_class in RPN head since mmdet v2.5, which is unified to + # be consistent with other head since mmdet v2.0. In mmdet v2.0 + # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head. + scores = rpn_cls_score.softmax(dim=1)[:, 0] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4) + anchors = mlvl_anchors[idx] + if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre: + # sort is faster than topk + # _, topk_inds = scores.topk(cfg.nms_pre) + ranked_scores, rank_inds = scores.sort(descending=True) + topk_inds = rank_inds[:cfg.nms_pre] + scores = ranked_scores[:cfg.nms_pre] + rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] + anchors = anchors[topk_inds, :] + mlvl_scores.append(scores) + mlvl_bbox_preds.append(rpn_bbox_pred) + mlvl_valid_anchors.append(anchors) + level_ids.append( + scores.new_full((scores.size(0), ), idx, dtype=torch.long)) + + scores = torch.cat(mlvl_scores) + anchors = torch.cat(mlvl_valid_anchors) + rpn_bbox_pred = torch.cat(mlvl_bbox_preds) + proposals = self.bbox_coder.decode( + anchors, rpn_bbox_pred, max_shape=img_shape) + ids = torch.cat(level_ids) + + if cfg.min_bbox_size > 0: + w = proposals[:, 2] - proposals[:, 0] + h = proposals[:, 3] - proposals[:, 1] + valid_inds = torch.nonzero( + (w >= cfg.min_bbox_size) + & (h >= 
cfg.min_bbox_size), + as_tuple=False).squeeze() + if valid_inds.sum().item() != len(proposals): + proposals = proposals[valid_inds, :] + scores = scores[valid_inds] + ids = ids[valid_inds] + + # TODO: remove the hard coded nms type + nms_cfg = dict(type='nms', iou_threshold=cfg.nms_thr) + dets, keep = batched_nms(proposals, scores, ids, nms_cfg) + return dets[:cfg.nms_post] diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/rpn_test_mixin.py b/insightface/detection/scrfd/mmdet/models/dense_heads/rpn_test_mixin.py new file mode 100755 index 0000000000000000000000000000000000000000..4ce5c66f82595f496e6e55719c1caee75150d568 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/rpn_test_mixin.py @@ -0,0 +1,59 @@ +import sys + +from mmdet.core import merge_aug_proposals + +if sys.version_info >= (3, 7): + from mmdet.utils.contextmanagers import completed + + +class RPNTestMixin(object): + """Test methods of RPN.""" + + if sys.version_info >= (3, 7): + + async def async_simple_test_rpn(self, x, img_metas): + sleep_interval = self.test_cfg.pop('async_sleep_interval', 0.025) + async with completed( + __name__, 'rpn_head_forward', + sleep_interval=sleep_interval): + rpn_outs = self(x) + + proposal_list = self.get_bboxes(*rpn_outs, img_metas) + return proposal_list + + def simple_test_rpn(self, x, img_metas): + """Test without augmentation. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): Meta info of each image. + + Returns: + list[Tensor]: Proposals of each image. 
+ """ + rpn_outs = self(x) + proposal_list = self.get_bboxes(*rpn_outs, img_metas) + return proposal_list + + def aug_test_rpn(self, feats, img_metas): + samples_per_gpu = len(img_metas[0]) + aug_proposals = [[] for _ in range(samples_per_gpu)] + for x, img_meta in zip(feats, img_metas): + proposal_list = self.simple_test_rpn(x, img_meta) + for i, proposals in enumerate(proposal_list): + aug_proposals[i].append(proposals) + # reorganize the order of 'img_metas' to match the dimensions + # of 'aug_proposals' + aug_img_metas = [] + for i in range(samples_per_gpu): + aug_img_meta = [] + for j in range(len(img_metas)): + aug_img_meta.append(img_metas[j][i]) + aug_img_metas.append(aug_img_meta) + # after merging, proposals will be rescaled to the original image size + merged_proposals = [ + merge_aug_proposals(proposals, aug_img_meta, self.test_cfg) + for proposals, aug_img_meta in zip(aug_proposals, aug_img_metas) + ] + return merged_proposals diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/sabl_retina_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/sabl_retina_head.py new file mode 100755 index 0000000000000000000000000000000000000000..73143da4a5a1545c24fcbf60d42b12615da14efd --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/sabl_retina_head.py @@ -0,0 +1,618 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init +from mmcv.runner import force_fp32 + +from mmdet.core import (build_anchor_generator, build_assigner, + build_bbox_coder, build_sampler, images_to_levels, + multi_apply, multiclass_nms, unmap) +from ..builder import HEADS, build_loss +from .base_dense_head import BaseDenseHead +from .guided_anchor_head import GuidedAnchorHead + + +@HEADS.register_module() +class SABLRetinaHead(BaseDenseHead): + """Side-Aware Boundary Localization (SABL) for RetinaNet. 
+ + The anchor generation, assigning and sampling in SABLRetinaHead + are the same as GuidedAnchorHead for guided anchoring. + + Please refer to https://arxiv.org/abs/1912.04260 for more details. + + Args: + num_classes (int): Number of classes. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): Number of Convs for classification \ + and regression branches. Defaults to 4. + feat_channels (int): Number of hidden channels. \ + Defaults to 256. + approx_anchor_generator (dict): Config dict for approx generator. + square_anchor_generator (dict): Config dict for square generator. + conv_cfg (dict): Config dict for ConvModule. Defaults to None. + norm_cfg (dict): Config dict for Norm Layer. Defaults to None. + bbox_coder (dict): Config dict for bbox coder. + reg_decoded_bbox (bool): Whether to regress decoded bbox. \ + Defaults to False. + train_cfg (dict): Training config of SABLRetinaHead. + test_cfg (dict): Testing config of SABLRetinaHead. + loss_cls (dict): Config of classification loss. + loss_bbox_cls (dict): Config of classification loss for bbox branch. + loss_bbox_reg (dict): Config of regression loss for bbox branch. 
+ """ + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + conv_cfg=None, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', + num_buckets=14, + scale_factor=3.0), + reg_decoded_bbox=False, + train_cfg=None, + test_cfg=None, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)): + super(SABLRetinaHead, self).__init__() + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.num_buckets = bbox_coder['num_buckets'] + self.side_num = int(np.ceil(self.num_buckets / 2)) + + assert (approx_anchor_generator['octave_base_scale'] == + square_anchor_generator['scales'][0]) + assert (approx_anchor_generator['strides'] == + square_anchor_generator['strides']) + + self.approx_anchor_generator = build_anchor_generator( + approx_anchor_generator) + self.square_anchor_generator = build_anchor_generator( + square_anchor_generator) + self.approxs_per_octave = ( + self.approx_anchor_generator.num_base_anchors[0]) + + # one anchor per location + self.num_anchors = 1 + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.reg_decoded_bbox = reg_decoded_bbox + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + self.sampling = loss_cls['type'] not in [ + 'FocalLoss', 'GHMC', 'QualityFocalLoss' + ] + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + self.bbox_coder = 
build_bbox_coder(bbox_coder) + self.loss_cls = build_loss(loss_cls) + self.loss_bbox_cls = build_loss(loss_bbox_cls) + self.loss_bbox_reg = build_loss(loss_bbox_reg) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # use PseudoSampler when sampling is False + if self.sampling and hasattr(self.train_cfg, 'sampler'): + sampler_cfg = self.train_cfg.sampler + else: + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + + self.fp16_enabled = False + self._init_layers() + + def _init_layers(self): + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.retina_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.retina_bbox_reg = nn.Conv2d( + self.feat_channels, self.side_num * 4, 3, padding=1) + self.retina_bbox_cls = nn.Conv2d( + self.feat_channels, self.side_num * 4, 3, padding=1) + + def init_weights(self): + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.retina_cls, std=0.01, bias=bias_cls) + normal_init(self.retina_bbox_reg, std=0.01) + normal_init(self.retina_bbox_cls, std=0.01) + + def forward_single(self, x): + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.retina_cls(cls_feat) + bbox_cls_pred = 
self.retina_bbox_cls(reg_feat) + bbox_reg_pred = self.retina_bbox_reg(reg_feat) + bbox_pred = (bbox_cls_pred, bbox_reg_pred) + return cls_score, bbox_pred + + def forward(self, feats): + return multi_apply(self.forward_single, feats) + + def get_anchors(self, featmap_sizes, img_metas, device='cuda'): + """Get squares according to feature map sizes and guided anchors. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + img_metas (list[dict]): Image meta info. + device (torch.device | str): device for returned tensors + + Returns: + tuple: square approxs of each image + """ + num_imgs = len(img_metas) + + # since feature map sizes of all images are the same, we only compute + # squares for one time + multi_level_squares = self.square_anchor_generator.grid_anchors( + featmap_sizes, device=device) + squares_list = [multi_level_squares for _ in range(num_imgs)] + + return squares_list + + def get_target(self, + approx_list, + inside_flag_list, + square_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=None, + sampling=True, + unmap_outputs=True): + """Compute bucketing targets. + Args: + approx_list (list[list]): Multi level approxs of each image. + inside_flag_list (list[list]): Multi level inside flags of each + image. + square_list (list[list]): Multi level squares of each image. + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + gt_bboxes_ignore_list (list[Tensor]): ignore list of gt bboxes. + gt_bboxes_list (list[Tensor]): Gt bboxes of each image. + label_channels (int): Channel of label. + sampling (bool): Sample Anchors or not. + unmap_outputs (bool): unmap outputs or not. + + Returns: + tuple: Returns a tuple containing learning targets. + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each \ + level. 
+ - bbox_cls_targets_list (list[Tensor]): BBox cls targets of \ + each level. + - bbox_cls_weights_list (list[Tensor]): BBox cls weights of \ + each level. + - bbox_reg_targets_list (list[Tensor]): BBox reg targets of \ + each level. + - bbox_reg_weights_list (list[Tensor]): BBox reg weights of \ + each level. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + num_imgs = len(img_metas) + assert len(approx_list) == len(inside_flag_list) == len( + square_list) == num_imgs + # anchor number of multi levels + num_level_squares = [squares.size(0) for squares in square_list[0]] + # concat all level anchors and flags to a single tensor + inside_flag_flat_list = [] + approx_flat_list = [] + square_flat_list = [] + for i in range(num_imgs): + assert len(square_list[i]) == len(inside_flag_list[i]) + inside_flag_flat_list.append(torch.cat(inside_flag_list[i])) + approx_flat_list.append(torch.cat(approx_list[i])) + square_flat_list.append(torch.cat(square_list[i])) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + (all_labels, all_label_weights, all_bbox_cls_targets, + all_bbox_cls_weights, all_bbox_reg_targets, all_bbox_reg_weights, + pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, + approx_flat_list, + inside_flag_flat_list, + square_flat_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + img_metas, + label_channels=label_channels, + sampling=sampling, + unmap_outputs=unmap_outputs) + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list 
w.r.t. multiple levels + labels_list = images_to_levels(all_labels, num_level_squares) + label_weights_list = images_to_levels(all_label_weights, + num_level_squares) + bbox_cls_targets_list = images_to_levels(all_bbox_cls_targets, + num_level_squares) + bbox_cls_weights_list = images_to_levels(all_bbox_cls_weights, + num_level_squares) + bbox_reg_targets_list = images_to_levels(all_bbox_reg_targets, + num_level_squares) + bbox_reg_weights_list = images_to_levels(all_bbox_reg_weights, + num_level_squares) + return (labels_list, label_weights_list, bbox_cls_targets_list, + bbox_cls_weights_list, bbox_reg_targets_list, + bbox_reg_weights_list, num_total_pos, num_total_neg) + + def _get_target_single(self, + flat_approxs, + inside_flags, + flat_squares, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + img_meta, + label_channels=None, + sampling=True, + unmap_outputs=True): + """Compute regression and classification targets for anchors in a + single image. + + Args: + flat_approxs (Tensor): flat approxs of a single image, + shape (n, 4) + inside_flags (Tensor): inside flags of a single image, + shape (n, ). + flat_squares (Tensor): flat squares of a single image, + shape (approxs_per_octave * n, 4) + gt_bboxes (Tensor): Ground truth bboxes of a single image, \ + shape (num_gts, 4). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + img_meta (dict): Meta info of the image. + label_channels (int): Channel of label. + sampling (bool): Sample Anchors or not. + unmap_outputs (bool): unmap outputs or not. 
+ + Returns: + tuple: + + - labels_list (Tensor): Labels in a single image + - label_weights (Tensor): Label weights in a single image + - bbox_cls_targets (Tensor): BBox cls targets in a single image + - bbox_cls_weights (Tensor): BBox cls weights in a single image + - bbox_reg_targets (Tensor): BBox reg targets in a single image + - bbox_reg_weights (Tensor): BBox reg weights in a single image + - num_total_pos (int): Number of positive samples \ + in a single image + - num_total_neg (int): Number of negative samples \ + in a single image + """ + if not inside_flags.any(): + return (None, ) * 8 + # assign gt and sample anchors + expand_inside_flags = inside_flags[:, None].expand( + -1, self.approxs_per_octave).reshape(-1) + approxs = flat_approxs[expand_inside_flags, :] + squares = flat_squares[inside_flags, :] + + assign_result = self.assigner.assign(approxs, squares, + self.approxs_per_octave, + gt_bboxes, gt_bboxes_ignore) + sampling_result = self.sampler.sample(assign_result, squares, + gt_bboxes) + + num_valid_squares = squares.shape[0] + bbox_cls_targets = squares.new_zeros( + (num_valid_squares, self.side_num * 4)) + bbox_cls_weights = squares.new_zeros( + (num_valid_squares, self.side_num * 4)) + bbox_reg_targets = squares.new_zeros( + (num_valid_squares, self.side_num * 4)) + bbox_reg_weights = squares.new_zeros( + (num_valid_squares, self.side_num * 4)) + labels = squares.new_full((num_valid_squares, ), + self.num_classes, + dtype=torch.long) + label_weights = squares.new_zeros(num_valid_squares, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + (pos_bbox_reg_targets, pos_bbox_reg_weights, pos_bbox_cls_targets, + pos_bbox_cls_weights) = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + + bbox_cls_targets[pos_inds, :] = pos_bbox_cls_targets + bbox_reg_targets[pos_inds, :] = pos_bbox_reg_targets + bbox_cls_weights[pos_inds, :] = 
pos_bbox_cls_weights + bbox_reg_weights[pos_inds, :] = pos_bbox_reg_weights + if gt_labels is None: + # Only rpn gives gt_labels as None + # Foreground is the first class + labels[pos_inds] = 0 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_squares.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_cls_targets = unmap(bbox_cls_targets, num_total_anchors, + inside_flags) + bbox_cls_weights = unmap(bbox_cls_weights, num_total_anchors, + inside_flags) + bbox_reg_targets = unmap(bbox_reg_targets, num_total_anchors, + inside_flags) + bbox_reg_weights = unmap(bbox_reg_weights, num_total_anchors, + inside_flags) + return (labels, label_weights, bbox_cls_targets, bbox_cls_weights, + bbox_reg_targets, bbox_reg_weights, pos_inds, neg_inds) + + def loss_single(self, cls_score, bbox_pred, labels, label_weights, + bbox_cls_targets, bbox_cls_weights, bbox_reg_targets, + bbox_reg_weights, num_total_samples): + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=num_total_samples) + # regression loss + bbox_cls_targets = bbox_cls_targets.reshape(-1, self.side_num * 4) + bbox_cls_weights = bbox_cls_weights.reshape(-1, self.side_num * 4) + bbox_reg_targets = bbox_reg_targets.reshape(-1, self.side_num * 4) + bbox_reg_weights = bbox_reg_weights.reshape(-1, self.side_num * 4) + (bbox_cls_pred, bbox_reg_pred) = bbox_pred + bbox_cls_pred = bbox_cls_pred.permute(0, 2, 3, 
1).reshape( + -1, self.side_num * 4) + bbox_reg_pred = bbox_reg_pred.permute(0, 2, 3, 1).reshape( + -1, self.side_num * 4) + loss_bbox_cls = self.loss_bbox_cls( + bbox_cls_pred, + bbox_cls_targets.long(), + bbox_cls_weights, + avg_factor=num_total_samples * 4 * self.side_num) + loss_bbox_reg = self.loss_bbox_reg( + bbox_reg_pred, + bbox_reg_targets, + bbox_reg_weights, + avg_factor=num_total_samples * 4 * self.bbox_coder.offset_topk) + return loss_cls, loss_bbox_cls, loss_bbox_reg + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.approx_anchor_generator.num_levels + + device = cls_scores[0].device + + # get sampled approxes + approxs_list, inside_flag_list = GuidedAnchorHead.get_sampled_approxs( + self, featmap_sizes, img_metas, device=device) + + square_list = self.get_anchors(featmap_sizes, img_metas, device=device) + + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + + cls_reg_targets = self.get_target( + approxs_list, + inside_flag_list, + square_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels, + sampling=self.sampling) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_cls_targets_list, + bbox_cls_weights_list, bbox_reg_targets_list, bbox_reg_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + losses_cls, losses_bbox_cls, losses_bbox_reg = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_cls_targets_list, + bbox_cls_weights_list, + bbox_reg_targets_list, + bbox_reg_weights_list, + num_total_samples=num_total_samples) + return dict( + 
loss_cls=losses_cls,
+            loss_bbox_cls=losses_bbox_cls,
+            loss_bbox_reg=losses_bbox_reg)
+
+    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
+    def get_bboxes(self,
+                   cls_scores,
+                   bbox_preds,
+                   img_metas,
+                   cfg=None,
+                   rescale=False):
+        """Decode multi-level head outputs into per-image detections.
+
+        Each element of ``bbox_preds`` is a (bbox_cls_pred, bbox_reg_pred)
+        pair -- see the ``[0]``/``[1]`` indexing below -- as produced by this
+        side-aware head.
+        """
+        assert len(cls_scores) == len(bbox_preds)
+        num_levels = len(cls_scores)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+
+        device = cls_scores[0].device
+        mlvl_anchors = self.get_anchors(
+            featmap_sizes, img_metas, device=device)
+        result_list = []
+        for img_id in range(len(img_metas)):
+            # Detach per-image slices so decoding never builds a graph.
+            cls_score_list = [
+                cls_scores[i][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_cls_pred_list = [
+                bbox_preds[i][0][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_reg_pred_list = [
+                bbox_preds[i][1][img_id].detach() for i in range(num_levels)
+            ]
+            img_shape = img_metas[img_id]['img_shape']
+            scale_factor = img_metas[img_id]['scale_factor']
+            proposals = self.get_bboxes_single(cls_score_list,
+                                               bbox_cls_pred_list,
+                                               bbox_reg_pred_list,
+                                               mlvl_anchors[img_id], img_shape,
+                                               scale_factor, cfg, rescale)
+            result_list.append(proposals)
+        return result_list
+
+    def get_bboxes_single(self,
+                          cls_scores,
+                          bbox_cls_preds,
+                          bbox_reg_preds,
+                          mlvl_anchors,
+                          img_shape,
+                          scale_factor,
+                          cfg,
+                          rescale=False):
+        """Decode the outputs for a single image; ``self.test_cfg`` is used
+        when no ``cfg`` override is supplied."""
+        cfg = self.test_cfg if cfg is None else cfg
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_confids = []
+        assert len(cls_scores) == len(bbox_cls_preds) == len(
+            bbox_reg_preds) == len(mlvl_anchors)
+        for cls_score, bbox_cls_pred, bbox_reg_pred, anchors in zip(
+                cls_scores, bbox_cls_preds, bbox_reg_preds, mlvl_anchors):
+            # NOTE(review): `[-2::]` is equivalent to `[-2:]` here.
+            assert cls_score.size()[-2:] == bbox_cls_pred.size(
+            )[-2:] == bbox_reg_pred.size()[-2::]
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)
+            bbox_cls_pred = bbox_cls_pred.permute(1, 2, 0).reshape(
+                -1, self.side_num * 4)
+            bbox_reg_pred = bbox_reg_pred.permute(1, 
2, 0).reshape( + -1, self.side_num * 4) + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_cls_pred = bbox_cls_pred[topk_inds, :] + bbox_reg_pred = bbox_reg_pred[topk_inds, :] + scores = scores[topk_inds, :] + bbox_preds = [ + bbox_cls_pred.contiguous(), + bbox_reg_pred.contiguous() + ] + bboxes, confids = self.bbox_coder.decode( + anchors.contiguous(), bbox_preds, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_confids.append(confids) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_confids = torch.cat(mlvl_confids) + if self.use_sigmoid_cls: + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + det_bboxes, det_labels = multiclass_nms( + mlvl_bboxes, + mlvl_scores, + cfg.score_thr, + cfg.nms, + cfg.max_per_img, + score_factors=mlvl_confids) + return det_bboxes, det_labels diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py new file mode 100755 index 0000000000000000000000000000000000000000..44d64730a93bfe8ecfa7ca2029f21eab8fc9dd38 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py @@ -0,0 +1,1063 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, Scale, bias_init_with_prob, normal_init, DepthwiseSeparableConvModule +from mmcv.cnn import constant_init, kaiming_init +from mmcv.runner import force_fp32 + +from mmdet.core import (anchor_inside_flags, bbox2distance, bbox_overlaps, + build_assigner, build_sampler, distance2bbox, 
distance2kps, kps2distance,
+                        images_to_levels, multi_apply, multiclass_nms,
+                        reduce_mean, unmap)
+from ..builder import HEADS, build_loss
+from .anchor_head import AnchorHead
+from .base_dense_head import BaseDenseHead
+from .dense_test_mixins import BBoxTestMixin
+
+
+# Zero-initialise a module's weights: for an nn.Sequential only its final
+# sub-module is zeroed, otherwise the module itself.
+# NOTE(review): not referenced anywhere in this file chunk -- confirm a
+# caller exists before removing.
+def last_zero_init(m):
+    if isinstance(m, nn.Sequential):
+        constant_init(m[-1], val=0)
+    else:
+        constant_init(m, val=0)
+
+
+class Integral(nn.Module):
+    """A fixed layer for calculating integral result from distribution.
+
+    This layer calculates the target location by :math: `sum{P(y_i) * y_i}`,
+    P(y_i) denotes the softmax vector that represents the discrete distribution
+    y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max}
+
+    Args:
+        reg_max (int): The maximal value of the discrete set. Default: 16. You
+            may want to reset it according to your new dataset or related
+            settings.
+    """
+
+    def __init__(self, reg_max=16):
+        super(Integral, self).__init__()
+        self.reg_max = reg_max
+        # Registered as a buffer (not a Parameter): `project` = [0..reg_max]
+        # follows .to(device)/state_dict but is never trained.
+        self.register_buffer('project',
+                             torch.linspace(0, self.reg_max, self.reg_max + 1))
+
+    def forward(self, x):
+        """Forward feature from the regression head to get integral result of
+        bounding box location.
+
+        Args:
+            x (Tensor): Features of the regression head, shape (N, 4*(n+1)),
+                n is self.reg_max.
+
+        Returns:
+            x (Tensor): Integral result of box locations, i.e., distance
+                offsets from the box center in four directions, shape (N, 4).
+        """
+        # Softmax over each side's (reg_max + 1) bins, then the expectation
+        # (dot product with `project`) yields one scalar offset per side.
+        x = F.softmax(x.reshape(-1, self.reg_max + 1), dim=1)
+        x = F.linear(x, self.project.type_as(x)).reshape(-1, 4)
+        return x
+
+
+@HEADS.register_module()
+class SCRFDHead(AnchorHead):
+    """Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection.
+ + GFL head structure is similar with ATSS, however GFL uses + 1) joint representation for classification and localization quality, and + 2) flexible General distribution for bounding box locations, + which are supervised by + Quality Focal Loss (QFL) and Distribution Focal Loss (DFL), respectively + + https://arxiv.org/abs/2006.04388 + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Default: 4. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='GN', num_groups=32, requires_grad=True). + loss_qfl (dict): Config of Quality Focal Loss (QFL). + reg_max (int): Max value of integral set :math: `{0, ..., reg_max}` + in QFL setting. Default: 16. + Example: + >>> self = GFLHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_quality_score, bbox_pred = self.forward(feats) + >>> assert len(cls_quality_score) == len(self.scales) + """ + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + feat_mults=None, + conv_cfg=None, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + loss_dfl=None, + reg_max=8, + cls_reg_share=False, + strides_share=True, + scale_mode = 1, + dw_conv = False, + use_kps = False, + loss_kps=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), + #loss_kps=dict(type='SmoothL1Loss', beta=1.0, loss_weight=0.3), + **kwargs): + self.stacked_convs = stacked_convs + self.feat_mults = feat_mults + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.reg_max = reg_max + self.cls_reg_share = cls_reg_share + self.strides_share = strides_share + self.scale_mode = scale_mode + self.use_dfl = True + self.dw_conv = dw_conv + self.NK = 5 + self.extra_flops = 0.0 + if loss_dfl is None or not loss_dfl: + 
self.use_dfl = False + self.use_scale = False + self.use_kps = use_kps + if self.scale_mode>0 and (self.strides_share or self.scale_mode==2): + self.use_scale = True + #print('USE-SCALE:', self.use_scale) + super(SCRFDHead, self).__init__(num_classes, in_channels, **kwargs) + + self.sampling = False + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # SSD sampling=False so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + + self.integral = Integral(self.reg_max) + if self.use_dfl: + self.loss_dfl = build_loss(loss_dfl) + #print('USE_DFL:', self.use_dfl) + self.loss_kps = build_loss(loss_kps) + self.loss_kps_std = 1.0 + #print(self.bbox_coder.__class__) + self.train_step = 0 + self.pos_count = {} + self.gtgroup_count = {} + for stride in self.anchor_generator.strides: + self.pos_count[stride[0]] = 0 + + def _get_conv_module(self, in_channel, out_channel): + if not self.dw_conv: + conv = ConvModule( + in_channel, + out_channel, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + else: + conv = DepthwiseSeparableConvModule( + in_channel, + out_channel, + 3, + stride=1, + padding=1, + pw_norm_cfg=self.norm_cfg, + dw_norm_cfg=self.norm_cfg) + return conv + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + conv_strides = [0] if self.strides_share else self.anchor_generator.strides + self.cls_stride_convs = nn.ModuleDict() + self.reg_stride_convs = nn.ModuleDict() + self.stride_cls = nn.ModuleDict() + self.stride_reg = nn.ModuleDict() + if self.use_kps: + self.stride_kps = nn.ModuleDict() + for stride_idx, conv_stride in enumerate(conv_strides): + #print('create convs for stride:', conv_stride) + key = str(conv_stride) + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + stacked_convs = self.stacked_convs[stride_idx] if isinstance(self.stacked_convs, (list, tuple)) else 
self.stacked_convs + feat_mult = self.feat_mults[stride_idx] if self.feat_mults is not None else 1 + feat_ch = int(self.feat_channels*feat_mult) + for i in range(stacked_convs): + chn = self.in_channels if i == 0 else last_feat_ch + cls_convs.append( self._get_conv_module(chn, feat_ch) ) + if not self.cls_reg_share: + reg_convs.append( self._get_conv_module(chn, feat_ch) ) + last_feat_ch = feat_ch + self.cls_stride_convs[key] = cls_convs + self.reg_stride_convs[key] = reg_convs + self.stride_cls[key] = nn.Conv2d( + feat_ch, self.cls_out_channels * self.num_anchors, 3, padding=1) + if not self.use_dfl: + self.stride_reg[key] = nn.Conv2d( + feat_ch, 4 * self.num_anchors, 3, padding=1) + else: + self.stride_reg[key] = nn.Conv2d( + feat_ch, 4 * (self.reg_max + 1) * self.num_anchors, 3, padding=1) + if self.use_kps: + self.stride_kps[key] = nn.Conv2d( + feat_ch, self.NK*2*self.num_anchors, 3, padding=1) + #assert self.num_anchors == 1, 'anchor free version' + #extra_gflops /= 1e9 + #print('extra_gflops: %.6fG'%extra_gflops) + if self.use_scale: + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.anchor_generator.strides]) + else: + self.scales = [None for _ in self.anchor_generator.strides] + + def init_weights(self): + """Initialize weights of the head.""" + for stride, cls_convs in self.cls_stride_convs.items(): + #print('init cls for stride:', stride) + for m in cls_convs: + if not self.dw_conv: + try: + normal_init(m.conv, std=0.01) + except: + pass + else: + normal_init(m.depthwise_conv.conv, std=0.01) + normal_init(m.pointwise_conv.conv, std=0.01) + for stride, reg_convs in self.reg_stride_convs.items(): + for m in reg_convs: + if not self.dw_conv: + normal_init(m.conv, std=0.01) + else: + normal_init(m.depthwise_conv.conv, std=0.01) + normal_init(m.pointwise_conv.conv, std=0.01) + #bias_cls = bias_init_with_prob(0.01) + bias_cls = -4.595 + #bias_cls = -1.595 + for stride, conv in self.stride_cls.items(): + normal_init(conv, std=0.01, bias=bias_cls) + for 
stride, conv in self.stride_reg.items(): + normal_init(conv, std=0.01) + if self.use_kps: + for stride, conv in self.stride_kps.items(): + normal_init(conv, std=0.01) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + cls_scores (list[Tensor]): Classification and quality (IoU) + joint scores for all scale levels, each is a 4D-tensor, + the channel number is num_classes. + bbox_preds (list[Tensor]): Box distribution logits for all + scale levels, each is a 4D-tensor, the channel number is + 4*(n+1), n is max value of integral set. + """ + return multi_apply(self.forward_single, feats, self.scales, self.anchor_generator.strides) + + def forward_single(self, x, scale, stride): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + + Returns: + tuple: + cls_score (Tensor): Cls and quality joint scores for a single + scale level the channel number is num_classes. + bbox_pred (Tensor): Box distribution logits for a single scale + level, the channel number is 4*(n+1), n is max value of + integral set. 
+ """ + cls_feat = x + reg_feat = x + #print('forward_single in stride:', stride) + cls_convs = self.cls_stride_convs['0'] if self.strides_share else self.cls_stride_convs[str(stride)] + for cls_conv in cls_convs: + cls_feat = cls_conv(cls_feat) + if not self.cls_reg_share: + reg_convs = self.reg_stride_convs['0'] if self.strides_share else self.reg_stride_convs[str(stride)] + for reg_conv in reg_convs: + reg_feat = reg_conv(reg_feat) + else: + reg_feat = cls_feat + cls_pred_module = self.stride_cls['0'] if self.strides_share else self.stride_cls[str(stride)] + cls_score = cls_pred_module(cls_feat) + reg_pred_module = self.stride_reg['0'] if self.strides_share else self.stride_reg[str(stride)] + _bbox_pred = reg_pred_module(reg_feat) + if self.use_scale: + bbox_pred = scale(_bbox_pred) + else: + bbox_pred = _bbox_pred + if self.use_kps: + kps_pred_module = self.stride_kps['0'] if self.strides_share else self.stride_kps[str(stride)] + kps_pred = kps_pred_module(reg_feat) + else: + kps_pred = bbox_pred.new_zeros( (bbox_pred.shape[0], self.NK*2, bbox_pred.shape[2], bbox_pred.shape[3]) ) + if torch.onnx.is_in_onnx_export(): + assert not self.use_dfl + print('in-onnx-export', cls_score.shape, bbox_pred.shape) + #print(scale.parameters()) + #for p in scale.parameters(): + #print(p.name, p.data) + #scale_val = p.data.item() + #print(scale_val) + #print('EEE1', cls_score.shape) + #cls_score = torch.sigmoid(cls_score).reshape(1, self.cls_out_channels, -1).permute(0, 2, 1) + #print('EEE2', cls_score.shape) + #if self.use_dfl: + # bbox_pred = self.integral(bbox_pred) * stride[0] + #else: + # bbox_pred = bbox_pred.reshape( (-1,4) ) * stride[0] + #if self.use_dfl: + # bbox_pred = bbox_pred.reshape(1, (self.reg_max+1)*4, -1).permute(0, 2, 1) + # bbox_pred = bbox_pred.reshape( (1, -1, 4, self.reg_max+1) ) + # bbox_pred = F.softmax(bbox_pred, dim=3) + #else: + # bbox_pred = bbox_pred.reshape(1, 4, -1).permute(0, 2, 1) + #kps_pred = kps_pred.reshape(1, 10, -1).permute(0, 2, 1) + + 
# Add output batch dim, based on pull request #1593 + batch_size = cls_score.shape[0] + cls_score = cls_score.permute(0, 2, 3, 1).reshape(batch_size, -1, self.cls_out_channels).sigmoid() + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 4) + kps_pred = kps_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 10) + + return cls_score, bbox_pred, kps_pred + + def forward_train(self, + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_keypointss=None, + gt_bboxes_ignore=None, + proposal_cfg=None, + **kwargs): + """ + Args: + x (list[Tensor]): Features from FPN. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + + Returns: + tuple: + losses: (dict[str, Tensor]): A dictionary of loss components. + proposal_list (list[Tensor]): Proposals of each image. + """ + outs = self(x) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, gt_keypointss, img_metas) + #print('AAA', gt_bboxes[0].shape, gt_keypointss[0].shape) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + if proposal_cfg is None: + return losses + else: + proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg) + return losses, proposal_list + + def get_anchors(self, featmap_sizes, img_metas, device='cuda'): + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + img_metas (list[dict]): Image meta info. + device (torch.device | str): Device for returned tensors + + Returns: + tuple: + anchor_list (list[Tensor]): Anchors of each image. 
+                valid_flag_list (list[Tensor]): Valid flags of each image.
+        """
+        num_imgs = len(img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # anchors for one time
+        multi_level_anchors = self.anchor_generator.grid_anchors(
+            featmap_sizes, device)
+        # Every image shares the same anchor tensors (no per-image copy).
+        anchor_list = [multi_level_anchors for _ in range(num_imgs)]
+
+        # for each image, we compute valid flags of multi level anchors
+        valid_flag_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            multi_level_flags = self.anchor_generator.valid_flags(
+                featmap_sizes, img_meta['pad_shape'], device)
+            valid_flag_list.append(multi_level_flags)
+
+        return anchor_list, valid_flag_list
+
+    def anchor_center(self, anchors):
+        """Get anchor centers from anchors.
+
+        Args:
+            anchors (Tensor): Anchor list with shape (N, 4), "xyxy" format.
+
+        Returns:
+            Tensor: Anchor centers with shape (N, 2), "xy" format.
+        """
+        anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2
+        anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2
+        return torch.stack([anchors_cx, anchors_cy], dim=-1)
+
+    def loss_single(self, anchors, cls_score, bbox_pred, kps_pred, labels, label_weights,
+                    bbox_targets, kps_targets, kps_weights, stride, num_total_samples):
+        """Compute loss of a single scale level.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            cls_score (Tensor): Cls and quality joint scores for each scale
+                level has shape (N, num_classes, H, W).
+            bbox_pred (Tensor): Box distribution logits for each scale
+                level with shape (N, 4*(n+1), H, W), n is max value of integral
+                set.
+            kps_pred (Tensor): Keypoint predictions for this level; flattened
+                to (-1, NK*2) below (NK = 5 keypoints, x/y pairs).
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors)
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (N, num_total_anchors, 4).
+            kps_targets (Tensor): Keypoint regression targets, flattened to
+                (-1, NK*2) below.
+            kps_weights (Tensor): Per-keypoint target weights, same layout
+                as kps_targets.
+            stride (tuple): Stride in this scale level.
+ num_total_samples (int): Number of positive samples that is + reduced over all GPUs. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert stride[0] == stride[1], 'h stride is not equal to w stride!' + use_qscore = True + anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + if not self.use_dfl: + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, 4) + else: + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, 4 * (self.reg_max + 1)) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + if self.use_kps: + kps_pred = kps_pred.permute(0, 2, 3, + 1).reshape(-1, self.NK*2) + kps_targets = kps_targets.reshape( (-1, self.NK*2) ) + kps_weights = kps_weights.reshape( (-1, self.NK*2) ) + #print('AAA000', kps_targets.shape, kps_weights.shape) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + score = label_weights.new_zeros(labels.shape) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0] + + weight_targets = cls_score.detach().sigmoid() + weight_targets = weight_targets.max(dim=1)[0][pos_inds] + pos_decode_bbox_targets = pos_bbox_targets / stride[0] + + + if self.use_dfl: + pos_bbox_pred_corners = self.integral(pos_bbox_pred) + pos_decode_bbox_pred = distance2bbox(pos_anchor_centers, + pos_bbox_pred_corners) + else: + pos_decode_bbox_pred = distance2bbox(pos_anchor_centers, + pos_bbox_pred) + if self.use_kps: + pos_kps_targets = kps_targets[pos_inds] + pos_kps_pred = kps_pred[pos_inds] + #print('CCC000', kps_weights.shape) + pos_kps_weights = kps_weights.max(dim=1)[0][pos_inds] * weight_targets + #pos_kps_weights = 
kps_weights.max(dim=1)[0][pos_inds] + pos_kps_weights = pos_kps_weights.reshape( (-1, 1) ) + #pos_kps_weights = kps_weights.max(dim=1, keepdims=True)[0][pos_inds] + #print('SSS', pos_kps_weights.sum()) + + #pos_decode_kps_targets = pos_kps_targets / stride[0] + #pos_decode_kps_pred = distance2kps(pos_anchor_centers, pos_kps_pred) + + pos_decode_kps_targets = kps2distance(pos_anchor_centers, pos_kps_targets / stride[0]) + pos_decode_kps_pred = pos_kps_pred + #print('ZZZ', pos_decode_kps_targets.shape, pos_decode_kps_pred.shape) + #print(pos_kps_weights[0,:].detach().cpu().numpy()) + #print(pos_decode_kps_targets[0,:].detach().cpu().numpy()) + #print(pos_decode_kps_pred[0,:].detach().cpu().numpy()) + + + #print('CCC111', weight_targets.shape, pos_bbox_pred.shape, pos_decode_bbox_pred.shape, pos_kps_pred.shape, pos_decode_kps_pred.shape, pos_kps_weights.shape) + + if use_qscore: + score[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + else: + score[pos_inds] = 1.0 + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=weight_targets, + avg_factor=1.0) + + if self.use_kps: + loss_kps = self.loss_kps( + pos_decode_kps_pred * self.loss_kps_std, + pos_decode_kps_targets * self.loss_kps_std, + weight=pos_kps_weights, + avg_factor=1.0) + else: + loss_kps = kps_pred.sum() * 0 + + # dfl loss + if self.use_dfl: + pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1) + target_corners = bbox2distance(pos_anchor_centers, + pos_decode_bbox_targets, + self.reg_max).reshape(-1) + loss_dfl = self.loss_dfl( + pred_corners, + target_corners, + weight=weight_targets[:, None].expand(-1, 4).reshape(-1), + avg_factor=4.0) + else: + loss_dfl = bbox_pred.sum() * 0 + else: + loss_bbox = bbox_pred.sum() * 0 + loss_dfl = bbox_pred.sum() * 0 + loss_kps = kps_pred.sum() * 0 + weight_targets = torch.tensor(0).cuda() + + loss_cls = self.loss_cls( + cls_score, (labels, score), + 
weight=label_weights, + avg_factor=num_total_samples) + + + return loss_cls, loss_bbox, loss_dfl, loss_kps, weight_targets.sum() + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + kps_preds, + gt_bboxes, + gt_labels, + gt_keypointss, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Cls and quality scores for each scale + level has shape (N, num_classes, H, W). + bbox_preds (list[Tensor]): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor] | None): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + gt_keypointss, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels) + if cls_reg_targets is None: + return None + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, keypoints_targets_list, keypoints_weights_list, num_total_pos, num_total_neg) = cls_reg_targets + + num_total_samples = reduce_mean( + torch.tensor(num_total_pos, dtype=torch.float, + device=device)).item() + num_total_samples = max(num_total_samples, 1.0) + + losses_cls, losses_bbox, losses_dfl, losses_kps,\ + avg_factor = multi_apply( + self.loss_single, + anchor_list, + cls_scores, + bbox_preds, + kps_preds, + labels_list, + label_weights_list, + bbox_targets_list, + keypoints_targets_list, + keypoints_weights_list, + self.anchor_generator.strides, + num_total_samples=num_total_samples) + + #if self.train_step%100==0: + # print('loss_cls:', losses_cls) + # print('avg_factor:', avg_factor) + + + avg_factor = sum(avg_factor) + avg_factor = reduce_mean(avg_factor).item() + losses_bbox = list(map(lambda x: x / avg_factor, losses_bbox)) + losses = dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + if self.use_kps: + losses_kps = list(map(lambda x: x / avg_factor, losses_kps)) + losses['loss_kps'] = losses_kps + if self.use_dfl: + losses_dfl = list(map(lambda x: x / avg_factor, losses_dfl)) + losses['loss_dfl'] = losses_dfl + return losses + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'kps_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + kps_preds, + img_metas, + 
cfg=None,
+                   rescale=False,
+                   with_nms=True):
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            kps_preds (list[Tensor]): Keypoint predictions for each scale
+                level. NOTE(review): not consumed by this method's
+                post-processing below -- boxes/labels only are returned.
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            cfg (mmcv.Config | None): Test / postprocessing configuration,
+                if None, test_cfg would be used
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is an (n, 5) tensor, where the first 4 columns
+                are bounding box positions (tl_x, tl_y, br_x, br_y) and the
+                5-th column is a score between 0 and 1. The second item is a
+                (n,) tensor where each item is the predicted class label of the
+                corresponding box.
+ + Example: + >>> import mmcv + >>> self = AnchorHead( + >>> num_classes=9, + >>> in_channels=1, + >>> anchor_generator=dict( + >>> type='AnchorGenerator', + >>> scales=[8], + >>> ratios=[0.5, 1.0, 2.0], + >>> strides=[4,])) + >>> img_metas = [{'img_shape': (32, 32, 3), 'scale_factor': 1}] + >>> cfg = mmcv.Config(dict( + >>> score_thr=0.00, + >>> nms=dict(type='nms', iou_thr=1.0), + >>> max_per_img=10)) + >>> feat = torch.rand(1, 1, 3, 3) + >>> cls_score, bbox_pred = self.forward_single(feat) + >>> # note the input lists are over different levels, not images + >>> cls_scores, bbox_preds = [cls_score], [bbox_pred] + >>> result_list = self.get_bboxes(cls_scores, bbox_preds, + >>> img_metas, cfg) + >>> det_bboxes, det_labels = result_list[0] + >>> assert len(result_list) == 1 + >>> assert det_bboxes.shape[1] == 5 + >>> assert len(det_bboxes) == len(det_labels) == cfg.max_per_img + """ + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + + device = cls_scores[0].device + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + if with_nms: + # some heads don't support with_nms argument + proposals = self._get_bboxes_single(cls_score_list, + bbox_pred_list, + mlvl_anchors, img_shape, + scale_factor, cfg, rescale) + else: + proposals = self._get_bboxes_single(cls_score_list, + bbox_pred_list, + mlvl_anchors, img_shape, + scale_factor, cfg, rescale, + with_nms) + result_list.append(proposals) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + img_shape, + scale_factor, + cfg, + 
rescale=False,
+                           with_nms=True):
+        """Transform outputs for a single batch item into labeled boxes.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for a single scale level
+                has shape (num_classes, H, W).
+            bbox_preds (list[Tensor]): Box distribution logits for a single
+                scale level with shape (4*(n+1), H, W), n is max value of
+                integral set.
+            mlvl_anchors (list[Tensor]): Box reference for a single scale level
+                with shape (num_total_anchors, 4).
+            img_shape (tuple[int]): Shape of the input image,
+                (height, width, 3).
+            scale_factor (ndarray): Scale factor of the image arranged as
+                (w_scale, h_scale, w_scale, h_scale).
+            cfg (mmcv.Config | None): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            tuple(Tensor):
+                det_bboxes (Tensor): Bbox predictions in shape (N, 5), where
+                    the first 4 columns are bounding box positions
+                    (tl_x, tl_y, br_x, br_y) and the 5-th column is a score
+                    between 0 and 1.
+                det_labels (Tensor): A (N,) tensor where each item is the
+                    predicted class label of the corresponding box.
+ """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + for cls_score, bbox_pred, stride, anchors in zip( + cls_scores, bbox_preds, self.anchor_generator.strides, + mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + assert stride[0] == stride[1] + + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + bbox_pred = bbox_pred.permute(1, 2, 0) + if self.use_dfl: + bbox_pred = self.integral(bbox_pred) * stride[0] + else: + bbox_pred = bbox_pred.reshape( (-1,4) ) * stride[0] + + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + max_scores, _ = scores.max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + + bboxes = distance2bbox( + self.anchor_center(anchors), bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + + mlvl_scores = torch.cat(mlvl_scores) + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + + if with_nms: + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores + + def get_targets(self, + anchor_list, + valid_flag_list, + gt_bboxes_list, + gt_keypointss_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + unmap_outputs=True): + """Get targets for GFL head. + + This method is almost the same as `AnchorHead.get_targets()`. 
Besides + returning the targets as the parent method does, it also returns the + anchors as the first element of the returned tuple. + """ + num_imgs = len(img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + if gt_keypointss_list is None: + gt_keypointss_list = [None for _ in range(num_imgs)] + #print('QQQ:', num_imgs, gt_bboxes_list[0].shape) + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, all_keypoints_targets, all_keypoints_weights, + pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + gt_keypointss_list, + img_metas, + label_channels=label_channels, + unmap_outputs=unmap_outputs) + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. 
multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + keypoints_targets_list = images_to_levels(all_keypoints_targets, + num_level_anchors) + keypoints_weights_list = images_to_levels(all_keypoints_weights, + num_level_anchors) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, keypoints_targets_list, keypoints_weights_list, + num_total_pos, + num_total_neg) + + def _get_target_single(self, + flat_anchors, + valid_flags, + num_level_anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + gt_keypointss, + img_meta, + label_channels=1, + unmap_outputs=True): + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors, 4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + num_level_anchors Tensor): Number of anchors of each scale level. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + img_meta (dict): Meta info of the image. + label_channels (int): Channel of label. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: N is the number of total anchors in the image. + anchors (Tensor): All anchors in the image with shape (N, 4). + labels (Tensor): Labels of all anchors in the image with shape + (N,). 
+ label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). + bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4). + pos_inds (Tensor): Indices of postive anchor with shape + (num_pos,). + neg_inds (Tensor): Indices of negative anchor with shape + (num_neg,). + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg.allowed_border) + if not inside_flags.any(): + return (None, ) * 7 + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + num_level_anchors_inside = self.get_num_level_anchors_inside( + num_level_anchors, inside_flags) + if self.assigner.__class__.__name__=='ATSSAssigner': + assign_result = self.assigner.assign(anchors, num_level_anchors_inside, + gt_bboxes, gt_bboxes_ignore, + gt_labels) + else: + assign_result = self.assigner.assign(anchors, + gt_bboxes, gt_bboxes_ignore, + gt_labels) + + sampling_result = self.sampler.sample(assign_result, anchors, + gt_bboxes) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + kps_targets = anchors.new_zeros(size=(anchors.shape[0], self.NK*2)) + kps_weights = anchors.new_zeros(size=(anchors.shape[0], self.NK*2)) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + if self.use_kps: + pos_assigned_gt_inds = sampling_result.pos_assigned_gt_inds + #print('BBB', anchors.shape, gt_bboxes.shape, gt_keypointss.shape, pos_inds.shape, bbox_targets.shape, pos_bbox_targets.shape) + 
kps_targets[pos_inds, :] = gt_keypointss[pos_assigned_gt_inds,:,:2].reshape( (-1, self.NK*2) ) + kps_weights[pos_inds, :] = torch.mean(gt_keypointss[pos_assigned_gt_inds,:,2], dim=1, keepdims=True) + #kps_weights[pos_inds, :] = 1.0 + if gt_labels is None: + # Only rpn gives gt_labels as None + # Foreground is the first class + labels[pos_inds] = 0 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + if self.use_kps: + kps_targets = unmap(kps_targets, num_total_anchors, inside_flags) + kps_weights = unmap(kps_weights, num_total_anchors, inside_flags) + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + kps_targets, kps_weights, + pos_inds, neg_inds) + + def get_num_level_anchors_inside(self, num_level_anchors, inside_flags): + split_inside_flags = torch.split(inside_flags, num_level_anchors) + num_level_anchors_inside = [ + int(flags.sum()) for flags in split_inside_flags + ] + return num_level_anchors_inside + + def aug_test(self, feats, img_metas, rescale=False): + """Test function with test time augmentation. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. 
+ img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[ndarray]: bbox results of each class + """ + return self.aug_test_bboxes(feats, img_metas, rescale=rescale) diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/ssd_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/ssd_head.py new file mode 100755 index 0000000000000000000000000000000000000000..42554c12c6f19ce48af1b49bc19f029b849250d6 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/ssd_head.py @@ -0,0 +1,259 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init +from mmcv.runner import force_fp32 + +from mmdet.core import (build_anchor_generator, build_assigner, + build_bbox_coder, build_sampler, multi_apply) +from ..builder import HEADS +from ..losses import smooth_l1_loss +from .anchor_head import AnchorHead + + +# TODO: add loss evaluator for SSD +@HEADS.register_module() +class SSDHead(AnchorHead): + """SSD head used in https://arxiv.org/abs/1512.02325. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + anchor_generator (dict): Config dict for anchor generator + bbox_coder (dict): Config of bounding box coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied on decoded bounding boxes. Default: False + train_cfg (dict): Training config of anchor head. + test_cfg (dict): Testing config of anchor head. 
+ """ # noqa: W605 + + def __init__(self, + num_classes=80, + in_channels=(512, 1024, 512, 256, 256, 256), + anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + input_size=300, + strides=[8, 16, 32, 64, 100, 300], + ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]), + basesize_ratio_range=(0.1, 0.9)), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + clip_border=True, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + ), + reg_decoded_bbox=False, + train_cfg=None, + test_cfg=None): + super(AnchorHead, self).__init__() + self.num_classes = num_classes + self.in_channels = in_channels + self.cls_out_channels = num_classes + 1 # add background class + self.anchor_generator = build_anchor_generator(anchor_generator) + num_anchors = self.anchor_generator.num_base_anchors + + reg_convs = [] + cls_convs = [] + for i in range(len(in_channels)): + reg_convs.append( + nn.Conv2d( + in_channels[i], + num_anchors[i] * 4, + kernel_size=3, + padding=1)) + cls_convs.append( + nn.Conv2d( + in_channels[i], + num_anchors[i] * (num_classes + 1), + kernel_size=3, + padding=1)) + self.reg_convs = nn.ModuleList(reg_convs) + self.cls_convs = nn.ModuleList(cls_convs) + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.reg_decoded_bbox = reg_decoded_bbox + self.use_sigmoid_cls = False + self.cls_focal_loss = False + self.train_cfg = train_cfg + self.test_cfg = test_cfg + # set sampling=False for archor_target + self.sampling = False + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # SSD sampling=False so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + self.fp16_enabled = False + + def init_weights(self): + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform', bias=0) + + def forward(self, feats): + """Forward features from the upstream network. 
+ + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + cls_scores = [] + bbox_preds = [] + for feat, reg_conv, cls_conv in zip(feats, self.reg_convs, + self.cls_convs): + cls_scores.append(cls_conv(feat)) + bbox_preds.append(reg_conv(feat)) + return cls_scores, bbox_preds + + def loss_single(self, cls_score, bbox_pred, anchor, labels, label_weights, + bbox_targets, bbox_weights, num_total_samples): + """Compute loss of a single image. + + Args: + cls_score (Tensor): Box scores for eachimage + Has shape (num_total_anchors, num_classes). + bbox_pred (Tensor): Box energies / deltas for each image + level with shape (num_total_anchors, 4). + anchors (Tensor): Box reference for each scale level with shape + (num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (num_total_anchors,). + label_weights (Tensor): Label weights of each anchor with shape + (num_total_anchors,) + bbox_targets (Tensor): BBox regression targets of each anchor wight + shape (num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (num_total_anchors, 4). + num_total_samples (int): If sampling, num total samples equal to + the number of total anchors; Otherwise, it is the number of + positive anchors. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + + loss_cls_all = F.cross_entropy( + cls_score, labels, reduction='none') * label_weights + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((labels >= 0) & + (labels < self.num_classes)).nonzero().reshape(-1) + neg_inds = (labels == self.num_classes).nonzero().view(-1) + + num_pos_samples = pos_inds.size(0) + num_neg_samples = self.train_cfg.neg_pos_ratio * num_pos_samples + if num_neg_samples > neg_inds.size(0): + num_neg_samples = neg_inds.size(0) + topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) + loss_cls_pos = loss_cls_all[pos_inds].sum() + loss_cls_neg = topk_loss_cls_neg.sum() + loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples + + if self.reg_decoded_bbox: + bbox_pred = self.bbox_coder.decode(anchor, bbox_pred) + + loss_bbox = smooth_l1_loss( + bbox_pred, + bbox_targets, + bbox_weights, + beta=self.train_cfg.smoothl1_beta, + avg_factor=num_total_samples) + return loss_cls[None], loss_bbox + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + gt_bboxes (list[Tensor]): each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=1, + unmap_outputs=False) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + + num_images = len(img_metas) + all_cls_scores = torch.cat([ + s.permute(0, 2, 3, 1).reshape( + num_images, -1, self.cls_out_channels) for s in cls_scores + ], 1) + all_labels = torch.cat(labels_list, -1).view(num_images, -1) + all_label_weights = torch.cat(label_weights_list, + -1).view(num_images, -1) + all_bbox_preds = torch.cat([ + b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) + for b in bbox_preds + ], -2) + all_bbox_targets = torch.cat(bbox_targets_list, + -2).view(num_images, -1, 4) + all_bbox_weights = torch.cat(bbox_weights_list, + -2).view(num_images, -1, 4) + + # concat all level anchors to a single tensor + all_anchors = [] + for i in range(num_images): + all_anchors.append(torch.cat(anchor_list[i])) + + # check NaN and Inf + assert torch.isfinite(all_cls_scores).all().item(), \ + 'classification scores become infinite or NaN!' + assert torch.isfinite(all_bbox_preds).all().item(), \ + 'bbox predications become infinite or NaN!' 
+ + losses_cls, losses_bbox = multi_apply( + self.loss_single, + all_cls_scores, + all_bbox_preds, + all_anchors, + all_labels, + all_label_weights, + all_bbox_targets, + all_bbox_weights, + num_total_samples=num_total_pos) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/transformer_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/transformer_head.py new file mode 100755 index 0000000000000000000000000000000000000000..df3fc9411ce34dec88a68846015bc0fbf7a47bb0 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/transformer_head.py @@ -0,0 +1,655 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, Linear, build_activation_layer +from mmcv.runner import force_fp32 + +from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, + build_assigner, build_sampler, multi_apply, + reduce_mean) +from mmdet.models.utils import (FFN, build_positional_encoding, + build_transformer) +from ..builder import HEADS, build_loss +from .anchor_free_head import AnchorFreeHead + + +@HEADS.register_module() +class TransformerHead(AnchorFreeHead): + """Implements the DETR transformer head. + + See `paper: End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_classes (int): Number of categories excluding the background. + in_channels (int): Number of channels in the input feature map. + num_fcs (int, optional): Number of fully-connected layers used in + `FFN`, which is then used for the regression head. Default 2. + transformer (dict, optional): Config for transformer. + positional_encoding (dict, optional): Config for position encoding. + loss_cls (dict, optional): Config of the classification loss. + Default `CrossEntropyLoss`. + loss_bbox (dict, optional): Config of the regression loss. + Default `L1Loss`. + loss_iou (dict, optional): Config of the regression iou loss. + Default `GIoULoss`. 
    def __init__(self,
                 num_classes,
                 in_channels,
                 num_fcs=2,
                 transformer=dict(
                     type='Transformer',
                     embed_dims=256,
                     num_heads=8,
                     num_encoder_layers=6,
                     num_decoder_layers=6,
                     feedforward_channels=2048,
                     dropout=0.1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN'),
                     num_fcs=2,
                     pre_norm=False,
                     return_intermediate_dec=True),
                 positional_encoding=dict(
                     type='SinePositionalEncoding',
                     num_feats=128,
                     normalize=True),
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     bg_cls_weight=0.1,
                     use_sigmoid=False,
                     loss_weight=1.0,
                     class_weight=1.0),
                 loss_bbox=dict(type='L1Loss', loss_weight=5.0),
                 loss_iou=dict(type='GIoULoss', loss_weight=2.0),
                 train_cfg=dict(
                     assigner=dict(
                         type='HungarianAssigner',
                         cls_weight=1.,
                         bbox_weight=5.,
                         iou_weight=2.,
                         iou_calculator=dict(type='BboxOverlaps2D'),
                         iou_mode='giou')),
                 test_cfg=dict(max_per_img=100),
                 **kwargs):
        """Validate the configs, build losses/assigner/sampler and layers.

        Note that ``loss_cls`` is mutated in place: its ``class_weight``
        scalar is expanded to a per-class tensor with the background weight
        at index ``num_classes`` and ``bg_cls_weight`` is popped.
        """
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since it brings inconvenience when the initialization of
        # `AnchorFreeHead` is called.
        super(AnchorFreeHead, self).__init__()
        use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        assert not use_sigmoid_cls, 'setting use_sigmoid_cls as True is ' \
            'not supported in DETR, since background is needed for the ' \
            'matching process.'
        assert 'embed_dims' in transformer \
            and 'num_feats' in positional_encoding
        num_feats = positional_encoding['num_feats']
        embed_dims = transformer['embed_dims']
        # Sine positional encoding concatenates x/y halves, hence 2x.
        assert num_feats * 2 == embed_dims, 'embed_dims should' \
            f' be exactly 2 times of num_feats. Found {embed_dims}' \
            f' and {num_feats}.'
        assert test_cfg is not None and 'max_per_img' in test_cfg

        class_weight = loss_cls.get('class_weight', None)
        if class_weight is not None:
            assert isinstance(class_weight, float), 'Expected ' \
                'class_weight to have type float. Found ' \
                f'{type(class_weight)}.'
            # NOTE following the official DETR rep0, bg_cls_weight means
            # relative classification weight of the no-object class.
            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
            assert isinstance(bg_cls_weight, float), 'Expected ' \
                'bg_cls_weight to have type float. Found ' \
                f'{type(bg_cls_weight)}.'
            class_weight = torch.ones(num_classes + 1) * class_weight
            # set background class as the last indice
            class_weight[num_classes] = bg_cls_weight
            loss_cls.update({'class_weight': class_weight})
            if 'bg_cls_weight' in loss_cls:
                loss_cls.pop('bg_cls_weight')
            self.bg_cls_weight = bg_cls_weight

        if train_cfg:
            assert 'assigner' in train_cfg, 'assigner should be provided '\
                'when train_cfg is set.'
            assigner = train_cfg['assigner']
            # Loss weights must mirror the matcher weights so training and
            # matching optimize the same objective.
            assert loss_cls['loss_weight'] == assigner['cls_weight'], \
                'The classification weight for loss and matcher should be' \
                'exactly the same.'
            assert loss_bbox['loss_weight'] == assigner['bbox_weight'], \
                'The regression L1 weight for loss and matcher should be' \
                'exactly the same.'
            assert loss_iou['loss_weight'] == assigner['iou_weight'], \
                'The regression iou weight for loss and matcher should be' \
                'exactly the same.'
            self.assigner = build_assigner(assigner)
            # DETR sampling=False, so use PseudoSampler
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.num_classes = num_classes
        self.cls_out_channels = num_classes + 1
        self.in_channels = in_channels
        self.num_fcs = num_fcs
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.use_sigmoid_cls = use_sigmoid_cls
        self.embed_dims = embed_dims
        # One object query per allowed detection.
        self.num_query = test_cfg['max_per_img']
        self.fp16_enabled = False
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_iou = build_loss(loss_iou)
        self.act_cfg = transformer.get('act_cfg',
                                       dict(type='ReLU', inplace=True))
        self.activate = build_activation_layer(self.act_cfg)
        self.positional_encoding = build_positional_encoding(
            positional_encoding)
        self.transformer = build_transformer(transformer)
        self._init_layers()

    def _init_layers(self):
        """Initialize layers of the transformer head."""
        # 1x1 conv projecting backbone channels down to embed_dims.
        self.input_proj = Conv2d(
            self.in_channels, self.embed_dims, kernel_size=1)
        self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        self.reg_ffn = FFN(
            self.embed_dims,
            self.embed_dims,
            self.num_fcs,
            self.act_cfg,
            dropout=0.0,
            add_residual=False)
        self.fc_reg = Linear(self.embed_dims, 4)
        self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)

    def init_weights(self, distribution='uniform'):
        """Initialize weights of the transformer head."""
        # The initialization for transformer is important
        self.transformer.init_weights()
    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """load checkpoints."""
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since `AnchorFreeHead._load_from_state_dict` should not be
        # called here. Invoking the default `Module._load_from_state_dict`
        # is enough.
        super(AnchorFreeHead,
              self)._load_from_state_dict(state_dict, prefix, local_metadata,
                                          strict, missing_keys,
                                          unexpected_keys, error_msgs)

    def forward(self, feats, img_metas):
        """Forward function: run ``forward_single`` on every feature level.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each
                a 4D tensor.
            img_metas (list[dict]): List of image information.

        Returns:
            tuple[list[Tensor], list[Tensor]]: per-level classification
                scores [nb_dec, bs, num_query, cls_out_channels] (includes
                background) and sigmoid bbox predictions in normalized
                (cx, cy, w, h) format [nb_dec, bs, num_query, 4].
        """
        num_levels = len(feats)
        # Same metas are broadcast to every level for multi_apply.
        img_metas_list = [img_metas for _ in range(num_levels)]
        return multi_apply(self.forward_single, feats, img_metas_list)

    def forward_single(self, x, img_metas):
        """Forward function for a single feature level.

        Args:
            x (Tensor): Input feature from backbone's single stage, shape
                [bs, c, h, w].
            img_metas (list[dict]): List of image information.

        Returns:
            tuple[Tensor, Tensor]: classification output
                [nb_dec, bs, num_query, cls_out_channels] and sigmoid bbox
                output [nb_dec, bs, num_query, 4] in (cx, cy, w, h).
        """
        # construct binary masks which are used by the transformer.
        # NOTE following the official DETR repo, non-zero values represent
        # ignored (padded) positions, while zero values mean valid positions.
        batch_size = x.size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        masks = x.new_ones((batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            masks[img_id, :img_h, :img_w] = 0

        x = self.input_proj(x)
        # interpolate masks to have the same spatial shape with x
        masks = F.interpolate(
            masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
        # position encoding
        pos_embed = self.positional_encoding(masks)  # [bs, embed_dim, h, w]
        # outs_dec: [nb_dec, bs, num_query, embed_dim]
        outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
                                       pos_embed)

        all_cls_scores = self.fc_cls(outs_dec)
        all_bbox_preds = self.fc_reg(self.activate(
            self.reg_ffn(outs_dec))).sigmoid()
        return all_cls_scores, all_bbox_preds

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def loss(self,
             all_cls_scores_list,
             all_bbox_preds_list,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
        """Loss over all decoder layers; only the LAST feature level is used.

        Args:
            all_cls_scores_list (list[Tensor]): per-level classification
                outputs [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds_list (list[Tensor]): per-level sigmoid bbox
                outputs [nb_dec, bs, num_query, 4] in (cx, cy, w, h).
            gt_bboxes_list (list[Tensor]): GT boxes per image, (num_gts, 4)
                in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): GT class indices per image.
            img_metas (list[dict]): image meta information.
            gt_bboxes_ignore (list[Tensor], optional): must be None.

        Returns:
            dict[str, Tensor]: losses of the last decoder layer plus
                ``d{i}.*`` entries for every earlier decoder layer.
        """
        # NOTE defaultly only the outputs from the last feature scale is used.
        all_cls_scores = all_cls_scores_list[-1]
        all_bbox_preds = all_bbox_preds_list[-1]
        assert gt_bboxes_ignore is None, \
            'Only supports for gt_bboxes_ignore setting to None.'

        # Replicate the GT for every decoder layer (deep supervision).
        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
        return loss_dict

    def loss_single(self,
                    cls_scores,
                    bbox_preds,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
        """Loss for outputs of ONE decoder layer of one feature level.

        Args:
            cls_scores (Tensor): logits, [bs, num_query, cls_out_channels].
            bbox_preds (Tensor): sigmoid boxes, [bs, num_query, 4] in
                normalized (cx, cy, w, h).
            gt_bboxes_list (list[Tensor]): GT boxes per image, (num_gts, 4)
                in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): GT class indices per image.
            img_metas (list[dict]): image meta information.
            gt_bboxes_ignore_list (list[Tensor], optional): must be None.

        Returns:
            tuple[Tensor, Tensor, Tensor]: cls, L1-bbox and IoU losses.
        """
        num_imgs = cls_scores.size(0)
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           gt_bboxes_list, gt_labels_list,
                                           img_metas, gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes accross all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale bboxes
        factors = []
        for img_meta, bbox_pred in zip(img_metas, bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regress the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating IoU loss
        bbox_preds = bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, defaultly GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss (computed in the normalized cxcywh space)
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou
+ - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_bboxes, + gt_labels, + img_meta, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + img_meta (dict): Meta information for one image. + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. 
+ - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, img_meta, + gt_bboxes_ignore) + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred) + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + img_h, img_w, _ = img_meta['img_shape'] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + # over-write because img_metas are needed as inputs for bbox_head. + def forward_train(self, + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_ignore=None, + proposal_cfg=None, + **kwargs): + """Forward function for training mode. + + Args: + x (list[Tensor]): Features from backbone. 
+ img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert proposal_cfg is None, '"proposal_cfg" must be None' + outs = self(x, img_metas) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list')) + def get_bboxes(self, + all_cls_scores_list, + all_bbox_preds_list, + img_metas, + rescale=False): + """Transform network outputs for a batch into bbox predictions. + + Args: + all_cls_scores_list (list[Tensor]): Classification outputs + for each feature level. Each is a 4D-tensor with shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds_list (list[Tensor]): Sigmoid regression + outputs for each feature level. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + img_metas (list[dict]): Meta information of each image. + rescale (bool, optional): If True, return boxes in original + image space. Default False. + + Returns: + list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \ + The first item is an (n, 5) tensor, where the first 4 columns \ + are bounding box positions (tl_x, tl_y, br_x, br_y) and the \ + 5-th column is a score between 0 and 1. The second item is a \ + (n,) tensor where each item is the predicted class label of \ + the corresponding box. 
+ """ + # NOTE defaultly only using outputs from the last feature level, + # and only the ouputs from the last decoder layer is used. + cls_scores = all_cls_scores_list[-1][-1] + bbox_preds = all_bbox_preds_list[-1][-1] + + result_list = [] + for img_id in range(len(img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self._get_bboxes_single(cls_score, bbox_pred, + img_shape, scale_factor, + rescale) + result_list.append(proposals) + return result_list + + def _get_bboxes_single(self, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=False): + """Transform outputs from the last decoder layer into bbox predictions + for each image. + + Args: + cls_score (Tensor): Box score logits from the last decoder layer + for each image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from the last decoder layer + for each image, with coordinate format (cx, cy, w, h) and + shape [num_query, 4]. + img_shape (tuple[int]): Shape of input image, (height, width, 3). + scale_factor (ndarray, optional): Scale factor of the image arange + as (w_scale, h_scale, w_scale, h_scale). + rescale (bool, optional): If True, return boxes in original image + space. Default False. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. + + - det_bboxes: Predicted bboxes with shape [num_query, 5], \ + where the first 4 columns are bounding box positions \ + (tl_x, tl_y, br_x, br_y) and the 5-th column are scores \ + between 0 and 1. + - det_labels: Predicted labels of the corresponding box with \ + shape [num_query]. 
+ """ + assert len(cls_score) == len(bbox_pred) + # exclude background + scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) + det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) + det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) + if rescale: + det_bboxes /= det_bboxes.new_tensor(scale_factor) + det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1) + return det_bboxes, det_labels diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/vfnet_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/vfnet_head.py new file mode 100755 index 0000000000000000000000000000000000000000..7243bb62893839568ec51928d88a5ad40b02a66c --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/vfnet_head.py @@ -0,0 +1,794 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, Scale, bias_init_with_prob, normal_init +from mmcv.ops import DeformConv2d +from mmcv.runner import force_fp32 + +from mmdet.core import (bbox2distance, bbox_overlaps, build_anchor_generator, + build_assigner, build_sampler, distance2bbox, + multi_apply, multiclass_nms, reduce_mean) +from ..builder import HEADS, build_loss +from .atss_head import ATSSHead +from .fcos_head import FCOSHead + +INF = 1e8 + + +@HEADS.register_module() +class VFNetHead(ATSSHead, FCOSHead): + """Head of `VarifocalNet (VFNet): An IoU-aware Dense Object + Detector.`_. + + The VFNet predicts IoU-aware classification scores which mix the + object presence confidence and object localization accuracy as the + detection score. It is built on the FCOS architecture and uses ATSS + for defining positive/negative training examples. The VFNet is trained + with Varifocal Loss and empolys star-shaped deformable convolution to + extract features for a bbox. 
+ + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + regress_ranges (tuple[tuple[int, int]]): Regress range of multiple + level points. + center_sampling (bool): If true, use center sampling. Default: False. + center_sample_radius (float): Radius of center sampling. Default: 1.5. + sync_num_pos (bool): If true, synchronize the number of positive + examples across GPUs. Default: True. + gradient_mul (float): The multiplier to gradients from bbox refinement + and recognition. Default: 0.1. + bbox_norm_type (str): The bbox normalization type, 'reg_denom' or + 'stride'. Default: 'reg_denom'. + loss_cls_fl (dict): Config of focal loss. + use_vfl (bool): If true, use varifocal loss for training. + Default: True. + loss_cls (dict): Config of varifocal loss. + loss_bbox (dict): Config of localization loss, GIoU Loss. + loss_bbox_refine (dict): Config of localization refinement loss, GIoU Loss. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: norm_cfg=dict(type='GN', num_groups=32, + requires_grad=True). + use_atss (bool): If true, use ATSS to define positive/negative + examples. Default: True. + anchor_generator (dict): Config of anchor generator for ATSS. 
+ + Example: + >>> self = VFNetHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_score, bbox_pred, bbox_pred_refine= self.forward(feats) + >>> assert len(cls_score) == len(self.scales) + """ # noqa: E501 + + def __init__(self, + num_classes, + in_channels, + regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, INF)), + center_sampling=False, + center_sample_radius=1.5, + sync_num_pos=True, + gradient_mul=0.1, + bbox_norm_type='reg_denom', + loss_cls_fl=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + use_vfl=True, + loss_cls=dict( + type='VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.5), + loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + use_atss=True, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + center_offset=0.0, + strides=[8, 16, 32, 64, 128]), + **kwargs): + # dcn base offsets, adapted from reppoints_head.py + self.num_dconv_points = 9 + self.dcn_kernel = int(np.sqrt(self.num_dconv_points)) + self.dcn_pad = int((self.dcn_kernel - 1) / 2) + dcn_base = np.arange(-self.dcn_pad, + self.dcn_pad + 1).astype(np.float64) + dcn_base_y = np.repeat(dcn_base, self.dcn_kernel) + dcn_base_x = np.tile(dcn_base, self.dcn_kernel) + dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape( + (-1)) + self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1) + + super(FCOSHead, self).__init__( + num_classes, in_channels, norm_cfg=norm_cfg, **kwargs) + self.regress_ranges = regress_ranges + self.reg_denoms = [ + regress_range[-1] for regress_range in regress_ranges + ] + self.reg_denoms[-1] = self.reg_denoms[-2] * 2 + self.center_sampling = center_sampling + self.center_sample_radius = center_sample_radius + 
self.sync_num_pos = sync_num_pos + self.bbox_norm_type = bbox_norm_type + self.gradient_mul = gradient_mul + self.use_vfl = use_vfl + if self.use_vfl: + self.loss_cls = build_loss(loss_cls) + else: + self.loss_cls = build_loss(loss_cls_fl) + self.loss_bbox = build_loss(loss_bbox) + self.loss_bbox_refine = build_loss(loss_bbox_refine) + + # for getting ATSS targets + self.use_atss = use_atss + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + self.anchor_generator = build_anchor_generator(anchor_generator) + self.anchor_center_offset = anchor_generator['center_offset'] + self.num_anchors = self.anchor_generator.num_base_anchors[0] + self.sampling = False + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + + def _init_layers(self): + """Initialize layers of the head.""" + super(FCOSHead, self)._init_cls_convs() + super(FCOSHead, self)._init_reg_convs() + self.relu = nn.ReLU(inplace=True) + self.vfnet_reg_conv = ConvModule( + self.feat_channels, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias) + self.vfnet_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + self.vfnet_reg_refine_dconv = DeformConv2d( + self.feat_channels, + self.feat_channels, + self.dcn_kernel, + 1, + padding=self.dcn_pad) + self.vfnet_reg_refine = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.scales_refine = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + self.vfnet_cls_dconv = DeformConv2d( + self.feat_channels, + self.feat_channels, + self.dcn_kernel, + 1, + padding=self.dcn_pad) + self.vfnet_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + def init_weights(self): + """Initialize weights of the head.""" + for m in self.cls_convs: + if isinstance(m.conv, nn.Conv2d): + 
normal_init(m.conv, std=0.01) + for m in self.reg_convs: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + normal_init(self.vfnet_reg_conv.conv, std=0.01) + normal_init(self.vfnet_reg, std=0.01) + normal_init(self.vfnet_reg_refine_dconv, std=0.01) + normal_init(self.vfnet_reg_refine, std=0.01) + normal_init(self.vfnet_cls_dconv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.vfnet_cls, std=0.01, bias=bias_cls) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box offsets for each + scale level, each is a 4D-tensor, the channel number is + num_points * 4. + bbox_preds_refine (list[Tensor]): Refined Box offsets for + each scale level, each is a 4D-tensor, the channel + number is num_points * 4. + """ + return multi_apply(self.forward_single, feats, self.scales, + self.scales_refine, self.strides, self.reg_denoms) + + def forward_single(self, x, scale, scale_refine, stride, reg_denom): + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + scale_refine (:obj: `mmcv.cnn.Scale`): Learnable scale module to + resize the refined bbox prediction. + stride (int): The corresponding stride for feature maps, + used to normalize the bbox prediction when + bbox_norm_type = 'stride'. + reg_denom (int): The corresponding regression range for feature + maps, only used to normalize the bbox prediction when + bbox_norm_type = 'reg_denom'. + + Returns: + tuple: iou-aware cls scores for each box, bbox predictions and + refined bbox predictions of input feature maps. 
+ """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + + # predict the bbox_pred of different level + reg_feat_init = self.vfnet_reg_conv(reg_feat) + if self.bbox_norm_type == 'reg_denom': + bbox_pred = scale( + self.vfnet_reg(reg_feat_init)).float().exp() * reg_denom + elif self.bbox_norm_type == 'stride': + bbox_pred = scale( + self.vfnet_reg(reg_feat_init)).float().exp() * stride + else: + raise NotImplementedError + + # compute star deformable convolution offsets + # converting dcn_offset to reg_feat.dtype thus VFNet can be + # trained with FP16 + dcn_offset = self.star_dcn_offset(bbox_pred, self.gradient_mul, + stride).to(reg_feat.dtype) + + # refine the bbox_pred + reg_feat = self.relu(self.vfnet_reg_refine_dconv(reg_feat, dcn_offset)) + bbox_pred_refine = scale_refine( + self.vfnet_reg_refine(reg_feat)).float().exp() + bbox_pred_refine = bbox_pred_refine * bbox_pred.detach() + + # predict the iou-aware cls score + cls_feat = self.relu(self.vfnet_cls_dconv(cls_feat, dcn_offset)) + cls_score = self.vfnet_cls(cls_feat) + + return cls_score, bbox_pred, bbox_pred_refine + + def star_dcn_offset(self, bbox_pred, gradient_mul, stride): + """Compute the star deformable conv offsets. + + Args: + bbox_pred (Tensor): Predicted bbox distance offsets (l, r, t, b). + gradient_mul (float): Gradient multiplier. + stride (int): The corresponding stride for feature maps, + used to project the bbox onto the feature map. + + Returns: + dcn_offsets (Tensor): The offsets for deformable convolution. 
+ """ + dcn_base_offset = self.dcn_base_offset.type_as(bbox_pred) + bbox_pred_grad_mul = (1 - gradient_mul) * bbox_pred.detach() + \ + gradient_mul * bbox_pred + # map to the feature map scale + bbox_pred_grad_mul = bbox_pred_grad_mul / stride + N, C, H, W = bbox_pred.size() + + x1 = bbox_pred_grad_mul[:, 0, :, :] + y1 = bbox_pred_grad_mul[:, 1, :, :] + x2 = bbox_pred_grad_mul[:, 2, :, :] + y2 = bbox_pred_grad_mul[:, 3, :, :] + bbox_pred_grad_mul_offset = bbox_pred.new_zeros( + N, 2 * self.num_dconv_points, H, W) + bbox_pred_grad_mul_offset[:, 0, :, :] = -1.0 * y1 # -y1 + bbox_pred_grad_mul_offset[:, 1, :, :] = -1.0 * x1 # -x1 + bbox_pred_grad_mul_offset[:, 2, :, :] = -1.0 * y1 # -y1 + bbox_pred_grad_mul_offset[:, 4, :, :] = -1.0 * y1 # -y1 + bbox_pred_grad_mul_offset[:, 5, :, :] = x2 # x2 + bbox_pred_grad_mul_offset[:, 7, :, :] = -1.0 * x1 # -x1 + bbox_pred_grad_mul_offset[:, 11, :, :] = x2 # x2 + bbox_pred_grad_mul_offset[:, 12, :, :] = y2 # y2 + bbox_pred_grad_mul_offset[:, 13, :, :] = -1.0 * x1 # -x1 + bbox_pred_grad_mul_offset[:, 14, :, :] = y2 # y2 + bbox_pred_grad_mul_offset[:, 16, :, :] = y2 # y2 + bbox_pred_grad_mul_offset[:, 17, :, :] = x2 # x2 + dcn_offset = bbox_pred_grad_mul_offset - dcn_base_offset + + return dcn_offset + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'bbox_preds_refine')) + def loss(self, + cls_scores, + bbox_preds, + bbox_preds_refine, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box offsets for each + scale level, each is a 4D-tensor, the channel number is + num_points * 4. + bbox_preds_refine (list[Tensor]): Refined Box offsets for + each scale level, each is a 4D-tensor, the channel + number is num_points * 4. 
+ gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + Default: None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + labels, label_weights, bbox_targets, bbox_weights = self.get_targets( + cls_scores, all_level_points, gt_bboxes, gt_labels, img_metas, + gt_bboxes_ignore) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and bbox_preds_refine + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, + 1).reshape(-1, + self.cls_out_channels).contiguous() + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4).contiguous() + for bbox_pred in bbox_preds + ] + flatten_bbox_preds_refine = [ + bbox_pred_refine.permute(0, 2, 3, 1).reshape(-1, 4).contiguous() + for bbox_pred_refine in bbox_preds_refine + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_bbox_preds_refine = torch.cat(flatten_bbox_preds_refine) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = torch.where( + ((flatten_labels >= 0) & (flatten_labels < bg_class_ind)) > 0)[0] + num_pos = len(pos_inds) 
+ + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_bbox_preds_refine = flatten_bbox_preds_refine[pos_inds] + pos_labels = flatten_labels[pos_inds] + + # sync num_pos across all gpus + if self.sync_num_pos: + num_pos_avg_per_gpu = reduce_mean( + pos_inds.new_tensor(num_pos).float()).item() + num_pos_avg_per_gpu = max(num_pos_avg_per_gpu, 1.0) + else: + num_pos_avg_per_gpu = num_pos + + if num_pos > 0: + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_points = flatten_points[pos_inds] + + pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds) + pos_decoded_target_preds = distance2bbox(pos_points, + pos_bbox_targets) + iou_targets_ini = bbox_overlaps( + pos_decoded_bbox_preds, + pos_decoded_target_preds.detach(), + is_aligned=True).clamp(min=1e-6) + bbox_weights_ini = iou_targets_ini.clone().detach() + iou_targets_ini_avg_per_gpu = reduce_mean( + bbox_weights_ini.sum()).item() + bbox_avg_factor_ini = max(iou_targets_ini_avg_per_gpu, 1.0) + loss_bbox = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds.detach(), + weight=bbox_weights_ini, + avg_factor=bbox_avg_factor_ini) + + pos_decoded_bbox_preds_refine = \ + distance2bbox(pos_points, pos_bbox_preds_refine) + iou_targets_rf = bbox_overlaps( + pos_decoded_bbox_preds_refine, + pos_decoded_target_preds.detach(), + is_aligned=True).clamp(min=1e-6) + bbox_weights_rf = iou_targets_rf.clone().detach() + iou_targets_rf_avg_per_gpu = reduce_mean( + bbox_weights_rf.sum()).item() + bbox_avg_factor_rf = max(iou_targets_rf_avg_per_gpu, 1.0) + loss_bbox_refine = self.loss_bbox_refine( + pos_decoded_bbox_preds_refine, + pos_decoded_target_preds.detach(), + weight=bbox_weights_rf, + avg_factor=bbox_avg_factor_rf) + + # build IoU-aware cls_score targets + if self.use_vfl: + pos_ious = iou_targets_rf.clone().detach() + cls_iou_targets = torch.zeros_like(flatten_cls_scores) + cls_iou_targets[pos_inds, pos_labels] = pos_ious + else: + loss_bbox = pos_bbox_preds.sum() * 0 + loss_bbox_refine = 
pos_bbox_preds_refine.sum() * 0 + if self.use_vfl: + cls_iou_targets = torch.zeros_like(flatten_cls_scores) + + if self.use_vfl: + loss_cls = self.loss_cls( + flatten_cls_scores, + cls_iou_targets, + avg_factor=num_pos_avg_per_gpu) + else: + loss_cls = self.loss_cls( + flatten_cls_scores, + flatten_labels, + weight=label_weights, + avg_factor=num_pos_avg_per_gpu) + + return dict( + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_bbox_rf=loss_bbox_refine) + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'bbox_preds_refine')) + def get_bboxes(self, + cls_scores, + bbox_preds, + bbox_preds_refine, + img_metas, + cfg=None, + rescale=None, + with_nms=True): + """Transform network outputs for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level with shape (N, num_points * num_classes, H, W). + bbox_preds (list[Tensor]): Box offsets for each scale + level with shape (N, num_points * 4, H, W). + bbox_preds_refine (list[Tensor]): Refined Box offsets for + each scale level with shape (N, num_points * 4, H, W). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. Default: None. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before returning boxes. + Default: True. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class label of + the corresponding box. 
+ """ + assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine) + num_levels = len(cls_scores) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds_refine[i][img_id].detach() + for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + det_bboxes = self._get_bboxes_single(cls_score_list, + bbox_pred_list, mlvl_points, + img_shape, scale_factor, cfg, + rescale, with_nms) + result_list.append(det_bboxes) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_points, + img_shape, + scale_factor, + cfg, + rescale=False, + with_nms=True): + """Transform outputs for a single batch item into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for a single scale + level with shape (num_points * num_classes, H, W). + bbox_preds (list[Tensor]): Box offsets for a single scale + level with shape (num_points * 4, H, W). + mlvl_points (list[Tensor]): Box reference for a single scale level + with shape (num_total_points, 4). + img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arrange as + (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before returning boxes. + Default: True. 
+ + Returns: + tuple(Tensor): + det_bboxes (Tensor): BBox predictions in shape (n, 5), where + the first 4 columns are bounding box positions + (tl_x, tl_y, br_x, br_y) and the 5-th column is a score + between 0 and 1. + det_labels (Tensor): A (n,) tensor where each item is the + predicted class label of the corresponding box. + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) + mlvl_bboxes = [] + mlvl_scores = [] + for cls_score, bbox_pred, points in zip(cls_scores, bbox_preds, + mlvl_points): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).contiguous().sigmoid() + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4).contiguous() + + nms_pre = cfg.get('nms_pre', -1) + if 0 < nms_pre < scores.shape[0]: + max_scores, _ = scores.max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + points = points[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + if with_nms: + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores + + def _get_points_single(self, + featmap_size, + stride, + dtype, + device, + flatten=False): + """Get points according to feature map sizes.""" + h, w = featmap_size + x_range = torch.arange( + 0, w * stride, stride, dtype=dtype, device=device) + y_range = torch.arange( + 
0, h * stride, stride, dtype=dtype, device=device) + y, x = torch.meshgrid(y_range, x_range) + # to be compatible with anchor points in ATSS + if self.use_atss: + points = torch.stack( + (x.reshape(-1), y.reshape(-1)), dim=-1) + \ + stride * self.anchor_center_offset + else: + points = torch.stack( + (x.reshape(-1), y.reshape(-1)), dim=-1) + stride // 2 + return points + + def get_targets(self, cls_scores, mlvl_points, gt_bboxes, gt_labels, + img_metas, gt_bboxes_ignore): + """A wrapper for computing ATSS and FCOS targets for points in multiple + images. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level with shape (N, num_points * num_classes, H, W). + mlvl_points (list[Tensor]): Points of each fpn level, each has + shape (num_points, 2). + gt_bboxes (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + + Returns: + tuple: + labels_list (list[Tensor]): Labels of each level. + label_weights (Tensor/None): Label weights of all levels. + bbox_targets_list (list[Tensor]): Regression targets of each + level, (l, t, r, b). + bbox_weights (Tensor/None): Bbox weights of all levels. 
+ """ + if self.use_atss: + return self.get_atss_targets(cls_scores, mlvl_points, gt_bboxes, + gt_labels, img_metas, + gt_bboxes_ignore) + else: + self.norm_on_bbox = False + return self.get_fcos_targets(mlvl_points, gt_bboxes, gt_labels) + + def _get_target_single(self, *args, **kwargs): + """Avoid ambiguity in multiple inheritance.""" + if self.use_atss: + return ATSSHead._get_target_single(self, *args, **kwargs) + else: + return FCOSHead._get_target_single(self, *args, **kwargs) + + def get_fcos_targets(self, points, gt_bboxes_list, gt_labels_list): + """Compute FCOS regression and classification targets for points in + multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + + Returns: + tuple: + labels (list[Tensor]): Labels of each level. + label_weights: None, to be compatible with ATSS targets. + bbox_targets (list[Tensor]): BBox targets of each level. + bbox_weights: None, to be compatible with ATSS targets. + """ + labels, bbox_targets = FCOSHead.get_targets(self, points, + gt_bboxes_list, + gt_labels_list) + label_weights = None + bbox_weights = None + return labels, label_weights, bbox_targets, bbox_weights + + def get_atss_targets(self, + cls_scores, + mlvl_points, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """A wrapper for computing ATSS targets for points in multiple images. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level with shape (N, num_points * num_classes, H, W). + mlvl_points (list[Tensor]): Points of each fpn level, each has + shape (num_points, 2). + gt_bboxes (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). 
+ img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). Default: None. + + Returns: + tuple: + labels_list (list[Tensor]): Labels of each level. + label_weights (Tensor): Label weights of all levels. + bbox_targets_list (list[Tensor]): Regression targets of each + level, (l, t, r, b). + bbox_weights (Tensor): Bbox weights of all levels. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + + cls_reg_targets = ATSSHead.get_targets( + self, + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels, + unmap_outputs=True) + if cls_reg_targets is None: + return None + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets + + bbox_targets_list = [ + bbox_targets.reshape(-1, 4) for bbox_targets in bbox_targets_list + ] + + num_imgs = len(img_metas) + # transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format + bbox_targets_list = self.transform_bbox_targets( + bbox_targets_list, mlvl_points, num_imgs) + + labels_list = [labels.reshape(-1) for labels in labels_list] + label_weights_list = [ + label_weights.reshape(-1) for label_weights in label_weights_list + ] + bbox_weights_list = [ + bbox_weights.reshape(-1) for bbox_weights in bbox_weights_list + ] + label_weights = torch.cat(label_weights_list) + bbox_weights = torch.cat(bbox_weights_list) + return labels_list, label_weights, bbox_targets_list, bbox_weights + + def transform_bbox_targets(self, 
decoded_bboxes, mlvl_points, num_imgs): + """Transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format. + + Args: + decoded_bboxes (list[Tensor]): Regression targets of each level, + in the form of (x1, y1, x2, y2). + mlvl_points (list[Tensor]): Points of each fpn level, each has + shape (num_points, 2). + num_imgs (int): the number of images in a batch. + + Returns: + bbox_targets (list[Tensor]): Regression targets of each level in + the form of (l, t, r, b). + """ + # TODO: Re-implemented in Class PointCoder + assert len(decoded_bboxes) == len(mlvl_points) + num_levels = len(decoded_bboxes) + mlvl_points = [points.repeat(num_imgs, 1) for points in mlvl_points] + bbox_targets = [] + for i in range(num_levels): + bbox_target = bbox2distance(mlvl_points[i], decoded_bboxes[i]) + bbox_targets.append(bbox_target) + + return bbox_targets + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """Override the method in the parent class to avoid changing para's + name.""" + pass diff --git a/insightface/detection/scrfd/mmdet/models/dense_heads/yolact_head.py b/insightface/detection/scrfd/mmdet/models/dense_heads/yolact_head.py new file mode 100755 index 0000000000000000000000000000000000000000..824246ce7a888adaa2dd545fc4553b82c035e099 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/dense_heads/yolact_head.py @@ -0,0 +1,939 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, xavier_init +from mmcv.runner import force_fp32 + +from mmdet.core import build_sampler, fast_nms, images_to_levels, multi_apply +from ..builder import HEADS, build_loss +from .anchor_head import AnchorHead + + +@HEADS.register_module() +class YOLACTHead(AnchorHead): + """YOLACT box head used in https://arxiv.org/abs/1904.02689. + + Note that YOLACT head is a light version of RetinaNet head. 
+ Four differences are described as follows: + + 1. YOLACT box head has three-times fewer anchors. + 2. YOLACT box head shares the convs for box and cls branches. + 3. YOLACT box head uses OHEM instead of Focal loss. + 4. YOLACT box head predicts a set of mask coefficients for each box. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + anchor_generator (dict): Config dict for anchor generator + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + num_head_convs (int): Number of the conv layers shared by + box and cls branches. + num_protos (int): Number of the mask coefficients. + use_ohem (bool): If true, ``loss_single_OHEM`` will be used for + cls loss calculation. If false, ``loss_single`` will be used. + conv_cfg (dict): Dictionary to construct and config conv layer. + norm_cfg (dict): Dictionary to construct and config norm layer. + """ + + def __init__(self, + num_classes, + in_channels, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=3, + scales_per_octave=1, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + reduction='none', + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1.5), + num_head_convs=1, + num_protos=32, + use_ohem=True, + conv_cfg=None, + norm_cfg=None, + **kwargs): + self.num_head_convs = num_head_convs + self.num_protos = num_protos + self.use_ohem = use_ohem + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super(YOLACTHead, self).__init__( + num_classes, + in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + anchor_generator=anchor_generator, + **kwargs) + if self.use_ohem: + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + self.sampling = False + + def _init_layers(self): + """Initialize layers of 
the head.""" + self.relu = nn.ReLU(inplace=True) + self.head_convs = nn.ModuleList() + for i in range(self.num_head_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.head_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.conv_cls = nn.Conv2d( + self.feat_channels, + self.num_anchors * self.cls_out_channels, + 3, + padding=1) + self.conv_reg = nn.Conv2d( + self.feat_channels, self.num_anchors * 4, 3, padding=1) + self.conv_coeff = nn.Conv2d( + self.feat_channels, + self.num_anchors * self.num_protos, + 3, + padding=1) + + def init_weights(self): + """Initialize weights of the head.""" + for m in self.head_convs: + xavier_init(m.conv, distribution='uniform', bias=0) + xavier_init(self.conv_cls, distribution='uniform', bias=0) + xavier_init(self.conv_reg, distribution='uniform', bias=0) + xavier_init(self.conv_coeff, distribution='uniform', bias=0) + + def forward_single(self, x): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level \ + the channels number is num_anchors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale \ + level, the channels number is num_anchors * 4. + coeff_pred (Tensor): Mask coefficients for a single scale \ + level, the channels number is num_anchors * num_protos. + """ + for head_conv in self.head_convs: + x = head_conv(x) + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + coeff_pred = self.conv_coeff(x).tanh() + return cls_score, bbox_pred, coeff_pred + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """A combination of the func:``AnchorHead.loss`` and + func:``SSDHead.loss``. 
+ + When ``self.use_ohem == True``, it functions like ``SSDHead.loss``, + otherwise, it follows ``AnchorHead.loss``. Besides, it additionally + returns ``sampling_results``. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): Class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): Specify which bounding + boxes can be ignored when computing the loss. Default: None + + Returns: + tuple: + dict[str, Tensor]: A dictionary of loss components. + List[:obj:``SamplingResult``]: Sampler results for each image. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels, + unmap_outputs=not self.use_ohem, + return_sampling_results=True) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg, sampling_results) = cls_reg_targets + + if self.use_ohem: + num_images = len(img_metas) + all_cls_scores = torch.cat([ + s.permute(0, 2, 3, 1).reshape( + num_images, -1, self.cls_out_channels) for s in cls_scores + ], 1) + all_labels = torch.cat(labels_list, 
-1).view(num_images, -1) + all_label_weights = torch.cat(label_weights_list, + -1).view(num_images, -1) + all_bbox_preds = torch.cat([ + b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) + for b in bbox_preds + ], -2) + all_bbox_targets = torch.cat(bbox_targets_list, + -2).view(num_images, -1, 4) + all_bbox_weights = torch.cat(bbox_weights_list, + -2).view(num_images, -1, 4) + + # concat all level anchors to a single tensor + all_anchors = [] + for i in range(num_images): + all_anchors.append(torch.cat(anchor_list[i])) + + # check NaN and Inf + assert torch.isfinite(all_cls_scores).all().item(), \ + 'classification scores become infinite or NaN!' + assert torch.isfinite(all_bbox_preds).all().item(), \ + 'bbox predications become infinite or NaN!' + + losses_cls, losses_bbox = multi_apply( + self.loss_single_OHEM, + all_cls_scores, + all_bbox_preds, + all_anchors, + all_labels, + all_label_weights, + all_bbox_targets, + all_bbox_weights, + num_total_samples=num_total_pos) + else: + num_total_samples = ( + num_total_pos + + num_total_neg if self.sampling else num_total_pos) + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + losses_cls, losses_bbox = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_samples=num_total_samples) + + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox), sampling_results + + def loss_single_OHEM(self, cls_score, bbox_pred, anchors, labels, + label_weights, bbox_targets, bbox_weights, + num_total_samples): + """"See func:``SSDHead.loss``.""" + loss_cls_all = self.loss_cls(cls_score, labels, label_weights) + + # 
FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((labels >= 0) & + (labels < self.num_classes)).nonzero().reshape(-1) + neg_inds = (labels == self.num_classes).nonzero().view(-1) + + num_pos_samples = pos_inds.size(0) + if num_pos_samples == 0: + num_neg_samples = neg_inds.size(0) + else: + num_neg_samples = self.train_cfg.neg_pos_ratio * num_pos_samples + if num_neg_samples > neg_inds.size(0): + num_neg_samples = neg_inds.size(0) + topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) + loss_cls_pos = loss_cls_all[pos_inds].sum() + loss_cls_neg = topk_loss_cls_neg.sum() + loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples + if self.reg_decoded_bbox: + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + loss_bbox = self.loss_bbox( + bbox_pred, + bbox_targets, + bbox_weights, + avg_factor=num_total_samples) + return loss_cls[None], loss_bbox + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'coeff_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + coeff_preds, + img_metas, + cfg=None, + rescale=False): + """"Similiar to func:``AnchorHead.get_bboxes``, but additionally + processes coeff_preds. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + with shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + coeff_preds (list[Tensor]): Mask coefficients for each scale + level with shape (N, num_anchors * num_protos, H, W) + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used + rescale (bool): If True, return boxes in original image space. + Default: False. + + Returns: + list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is + a 3-tuple. 
The first item is an (n, 5) tensor, where the + first 4 columns are bounding box positions + (tl_x, tl_y, br_x, br_y) and the 5-th column is a score + between 0 and 1. The second item is an (n,) tensor where each + item is the predicted class label of the corresponding box. + The third item is an (n, num_protos) tensor where each item + is the predicted mask coefficients of instance inside the + corresponding box. + """ + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + + device = cls_scores[0].device + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + + det_bboxes = [] + det_labels = [] + det_coeffs = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + coeff_pred_list = [ + coeff_preds[i][img_id].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + bbox_res = self._get_bboxes_single(cls_score_list, bbox_pred_list, + coeff_pred_list, mlvl_anchors, + img_shape, scale_factor, cfg, + rescale) + det_bboxes.append(bbox_res[0]) + det_labels.append(bbox_res[1]) + det_coeffs.append(bbox_res[2]) + return det_bboxes, det_labels, det_coeffs + + def _get_bboxes_single(self, + cls_score_list, + bbox_pred_list, + coeff_preds_list, + mlvl_anchors, + img_shape, + scale_factor, + cfg, + rescale=False): + """"Similiar to func:``AnchorHead._get_bboxes_single``, but + additionally processes coeff_preds_list and uses fast NMS instead of + traditional NMS. + + Args: + cls_score_list (list[Tensor]): Box scores for a single scale level + Has shape (num_anchors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas for a single + scale level with shape (num_anchors * 4, H, W). 
+ coeff_preds_list (list[Tensor]): Mask coefficients for a single + scale level with shape (num_anchors * num_protos, H, W). + mlvl_anchors (list[Tensor]): Box reference for a single scale level + with shape (num_total_anchors, 4). + img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arange as + (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + + Returns: + tuple[Tensor, Tensor, Tensor]: The first item is an (n, 5) tensor, + where the first 4 columns are bounding box positions + (tl_x, tl_y, br_x, br_y) and the 5-th column is a score between + 0 and 1. The second item is an (n,) tensor where each item is + the predicted class label of the corresponding box. The third + item is an (n, num_protos) tensor where each item is the + predicted mask coefficients of instance inside the + corresponding box. + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_coeffs = [] + for cls_score, bbox_pred, coeff_pred, anchors in \ + zip(cls_score_list, bbox_pred_list, + coeff_preds_list, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + coeff_pred = coeff_pred.permute(1, 2, + 0).reshape(-1, self.num_protos) + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + # Get maximum scores for foreground classes. 
+ if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + coeff_pred = coeff_pred[topk_inds, :] + bboxes = self.bbox_coder.decode( + anchors, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_coeffs.append(coeff_pred) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_coeffs = torch.cat(mlvl_coeffs) + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + det_bboxes, det_labels, det_coeffs = fast_nms(mlvl_bboxes, mlvl_scores, + mlvl_coeffs, + cfg.score_thr, + cfg.iou_thr, cfg.top_k, + cfg.max_per_img) + return det_bboxes, det_labels, det_coeffs + + +@HEADS.register_module() +class YOLACTSegmHead(nn.Module): + """YOLACT segmentation head used in https://arxiv.org/abs/1904.02689. + + Apply a semantic segmentation loss on feature space using layers that are + only evaluated during training to increase performance with no speed + penalty. + + Args: + in_channels (int): Number of channels in the input feature map. + num_classes (int): Number of categories excluding the background + category. + loss_segm (dict): Config of semantic segmentation loss. 
+ """ + + def __init__(self, + num_classes, + in_channels=256, + loss_segm=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0)): + super(YOLACTSegmHead, self).__init__() + self.in_channels = in_channels + self.num_classes = num_classes + self.loss_segm = build_loss(loss_segm) + self._init_layers() + self.fp16_enabled = False + + def _init_layers(self): + """Initialize layers of the head.""" + self.segm_conv = nn.Conv2d( + self.in_channels, self.num_classes, kernel_size=1) + + def init_weights(self): + """Initialize weights of the head.""" + xavier_init(self.segm_conv, distribution='uniform') + + def forward(self, x): + """Forward feature from the upstream network. + + Args: + x (Tensor): Feature from the upstream network, which is + a 4D-tensor. + + Returns: + Tensor: Predicted semantic segmentation map with shape + (N, num_classes, H, W). + """ + return self.segm_conv(x) + + @force_fp32(apply_to=('segm_pred', )) + def loss(self, segm_pred, gt_masks, gt_labels): + """Compute loss of the head. + + Args: + segm_pred (list[Tensor]): Predicted semantic segmentation map + with shape (N, num_classes, H, W). + gt_masks (list[Tensor]): Ground truth masks for each image with + the same shape of the input image. + gt_labels (list[Tensor]): Class indices corresponding to each box. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + loss_segm = [] + num_imgs, num_classes, mask_h, mask_w = segm_pred.size() + for idx in range(num_imgs): + cur_segm_pred = segm_pred[idx] + cur_gt_masks = gt_masks[idx].float() + cur_gt_labels = gt_labels[idx] + segm_targets = self.get_targets(cur_segm_pred, cur_gt_masks, + cur_gt_labels) + if segm_targets is None: + loss = self.loss_segm(cur_segm_pred, + torch.zeros_like(cur_segm_pred), + torch.zeros_like(cur_segm_pred)) + else: + loss = self.loss_segm( + cur_segm_pred, + segm_targets, + avg_factor=num_imgs * mask_h * mask_w) + loss_segm.append(loss) + return dict(loss_segm=loss_segm) + + def get_targets(self, segm_pred, gt_masks, gt_labels): + """Compute semantic segmentation targets for each image. + + Args: + segm_pred (Tensor): Predicted semantic segmentation map + with shape (num_classes, H, W). + gt_masks (Tensor): Ground truth masks for each image with + the same shape of the input image. + gt_labels (Tensor): Class indices corresponding to each box. + + Returns: + Tensor: Semantic segmentation targets with shape + (num_classes, H, W). + """ + if gt_masks.size(0) == 0: + return None + num_classes, mask_h, mask_w = segm_pred.size() + with torch.no_grad(): + downsampled_masks = F.interpolate( + gt_masks.unsqueeze(0), (mask_h, mask_w), + mode='bilinear', + align_corners=False).squeeze(0) + downsampled_masks = downsampled_masks.gt(0.5).float() + segm_targets = torch.zeros_like(segm_pred, requires_grad=False) + for obj_idx in range(downsampled_masks.size(0)): + segm_targets[gt_labels[obj_idx] - 1] = torch.max( + segm_targets[gt_labels[obj_idx] - 1], + downsampled_masks[obj_idx]) + return segm_targets + + +@HEADS.register_module() +class YOLACTProtonet(nn.Module): + """YOLACT mask head used in https://arxiv.org/abs/1904.02689. + + This head outputs the mask prototypes for YOLACT. + + Args: + in_channels (int): Number of channels in the input feature map. + proto_channels (tuple[int]): Output channels of protonet convs. 
+ proto_kernel_sizes (tuple[int]): Kernel sizes of protonet convs. + include_last_relu (Bool): If keep the last relu of protonet. + num_protos (int): Number of prototypes. + num_classes (int): Number of categories excluding the background + category. + loss_mask_weight (float): Reweight the mask loss by this factor. + max_masks_to_train (int): Maximum number of masks to train for + each image. + """ + + def __init__(self, + num_classes, + in_channels=256, + proto_channels=(256, 256, 256, None, 256, 32), + proto_kernel_sizes=(3, 3, 3, -2, 3, 1), + include_last_relu=True, + num_protos=32, + loss_mask_weight=1.0, + max_masks_to_train=100): + super(YOLACTProtonet, self).__init__() + self.in_channels = in_channels + self.proto_channels = proto_channels + self.proto_kernel_sizes = proto_kernel_sizes + self.include_last_relu = include_last_relu + self.protonet = self._init_layers() + + self.loss_mask_weight = loss_mask_weight + self.num_protos = num_protos + self.num_classes = num_classes + self.max_masks_to_train = max_masks_to_train + self.fp16_enabled = False + + def _init_layers(self): + """A helper function to take a config setting and turn it into a + network.""" + # Possible patterns: + # ( 256, 3) -> conv + # ( 256,-2) -> deconv + # (None,-2) -> bilinear interpolate + in_channels = self.in_channels + protonets = nn.ModuleList() + for num_channels, kernel_size in zip(self.proto_channels, + self.proto_kernel_sizes): + if kernel_size > 0: + layer = nn.Conv2d( + in_channels, + num_channels, + kernel_size, + padding=kernel_size // 2) + else: + if num_channels is None: + layer = InterpolateModule( + scale_factor=-kernel_size, + mode='bilinear', + align_corners=False) + else: + layer = nn.ConvTranspose2d( + in_channels, + num_channels, + -kernel_size, + padding=kernel_size // 2) + protonets.append(layer) + protonets.append(nn.ReLU(inplace=True)) + in_channels = num_channels if num_channels is not None \ + else in_channels + if not self.include_last_relu: + protonets = 
protonets[:-1] + return nn.Sequential(*protonets) + + def init_weights(self): + """Initialize weights of the head.""" + for m in self.protonet: + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + def forward(self, x, coeff_pred, bboxes, img_meta, sampling_results=None): + """Forward feature from the upstream network to get prototypes and + linearly combine the prototypes, using masks coefficients, into + instance masks. Finally, crop the instance masks with given bboxes. + + Args: + x (Tensor): Feature from the upstream network, which is + a 4D-tensor. + coeff_pred (list[Tensor]): Mask coefficients for each scale + level with shape (N, num_anchors * num_protos, H, W). + bboxes (list[Tensor]): Box used for cropping with shape + (N, num_anchors * 4, H, W). During training, they are + ground truth boxes. During testing, they are predicted + boxes. + img_meta (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + sampling_results (List[:obj:``SamplingResult``]): Sampler results + for each image. + + Returns: + list[Tensor]: Predicted instance segmentation masks. 
+ """ + prototypes = self.protonet(x) + prototypes = prototypes.permute(0, 2, 3, 1).contiguous() + + num_imgs = x.size(0) + # Training state + if self.training: + coeff_pred_list = [] + for coeff_pred_per_level in coeff_pred: + coeff_pred_per_level = \ + coeff_pred_per_level.permute(0, 2, 3, 1)\ + .reshape(num_imgs, -1, self.num_protos) + coeff_pred_list.append(coeff_pred_per_level) + coeff_pred = torch.cat(coeff_pred_list, dim=1) + + mask_pred_list = [] + for idx in range(num_imgs): + cur_prototypes = prototypes[idx] + cur_coeff_pred = coeff_pred[idx] + cur_bboxes = bboxes[idx] + cur_img_meta = img_meta[idx] + + # Testing state + if not self.training: + bboxes_for_cropping = cur_bboxes + else: + cur_sampling_results = sampling_results[idx] + pos_assigned_gt_inds = \ + cur_sampling_results.pos_assigned_gt_inds + bboxes_for_cropping = cur_bboxes[pos_assigned_gt_inds].clone() + pos_inds = cur_sampling_results.pos_inds + cur_coeff_pred = cur_coeff_pred[pos_inds] + + # Linearly combine the prototypes with the mask coefficients + mask_pred = cur_prototypes @ cur_coeff_pred.t() + mask_pred = torch.sigmoid(mask_pred) + + h, w = cur_img_meta['img_shape'][:2] + bboxes_for_cropping[:, 0] /= w + bboxes_for_cropping[:, 1] /= h + bboxes_for_cropping[:, 2] /= w + bboxes_for_cropping[:, 3] /= h + + mask_pred = self.crop(mask_pred, bboxes_for_cropping) + mask_pred = mask_pred.permute(2, 0, 1).contiguous() + mask_pred_list.append(mask_pred) + return mask_pred_list + + @force_fp32(apply_to=('mask_pred', )) + def loss(self, mask_pred, gt_masks, gt_bboxes, img_meta, sampling_results): + """Compute loss of the head. + + Args: + mask_pred (list[Tensor]): Predicted prototypes with shape + (num_classes, H, W). + gt_masks (list[Tensor]): Ground truth masks for each image with + the same shape of the input image. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. 
+ img_meta (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + sampling_results (List[:obj:``SamplingResult``]): Sampler results + for each image. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + loss_mask = [] + num_imgs = len(mask_pred) + total_pos = 0 + for idx in range(num_imgs): + cur_mask_pred = mask_pred[idx] + cur_gt_masks = gt_masks[idx].float() + cur_gt_bboxes = gt_bboxes[idx] + cur_img_meta = img_meta[idx] + cur_sampling_results = sampling_results[idx] + + pos_assigned_gt_inds = cur_sampling_results.pos_assigned_gt_inds + num_pos = pos_assigned_gt_inds.size(0) + # Since we're producing (near) full image masks, + # it'd take too much vram to backprop on every single mask. + # Thus we select only a subset. + if num_pos > self.max_masks_to_train: + perm = torch.randperm(num_pos) + select = perm[:self.max_masks_to_train] + cur_mask_pred = cur_mask_pred[select] + pos_assigned_gt_inds = pos_assigned_gt_inds[select] + num_pos = self.max_masks_to_train + total_pos += num_pos + + gt_bboxes_for_reweight = cur_gt_bboxes[pos_assigned_gt_inds] + + mask_targets = self.get_targets(cur_mask_pred, cur_gt_masks, + pos_assigned_gt_inds) + if num_pos == 0: + loss = cur_mask_pred.sum() * 0. 
+ elif mask_targets is None: + loss = F.binary_cross_entropy(cur_mask_pred, + torch.zeros_like(cur_mask_pred), + torch.zeros_like(cur_mask_pred)) + else: + cur_mask_pred = torch.clamp(cur_mask_pred, 0, 1) + loss = F.binary_cross_entropy( + cur_mask_pred, mask_targets, + reduction='none') * self.loss_mask_weight + + h, w = cur_img_meta['img_shape'][:2] + gt_bboxes_width = (gt_bboxes_for_reweight[:, 2] - + gt_bboxes_for_reweight[:, 0]) / w + gt_bboxes_height = (gt_bboxes_for_reweight[:, 3] - + gt_bboxes_for_reweight[:, 1]) / h + loss = loss.mean(dim=(1, + 2)) / gt_bboxes_width / gt_bboxes_height + loss = torch.sum(loss) + loss_mask.append(loss) + + if total_pos == 0: + total_pos += 1 # avoid nan + loss_mask = [x / total_pos for x in loss_mask] + + return dict(loss_mask=loss_mask) + + def get_targets(self, mask_pred, gt_masks, pos_assigned_gt_inds): + """Compute instance segmentation targets for each image. + + Args: + mask_pred (Tensor): Predicted prototypes with shape + (num_classes, H, W). + gt_masks (Tensor): Ground truth masks for each image with + the same shape of the input image. + pos_assigned_gt_inds (Tensor): GT indices of the corresponding + positive samples. + Returns: + Tensor: Instance segmentation targets with shape + (num_instances, H, W). + """ + if gt_masks.size(0) == 0: + return None + mask_h, mask_w = mask_pred.shape[-2:] + gt_masks = F.interpolate( + gt_masks.unsqueeze(0), (mask_h, mask_w), + mode='bilinear', + align_corners=False).squeeze(0) + gt_masks = gt_masks.gt(0.5).float() + mask_targets = gt_masks[pos_assigned_gt_inds] + return mask_targets + + def get_seg_masks(self, mask_pred, label_pred, img_meta, rescale): + """Resize, binarize, and format the instance mask predictions. + + Args: + mask_pred (Tensor): shape (N, H, W). + label_pred (Tensor): shape (N, ). + img_meta (dict): Meta information of each image, e.g., + image size, scaling factor, etc. + rescale (bool): If rescale is False, then returned masks will + fit the scale of imgs[0]. 
def sanitize_coordinates(self, x1, x2, img_size, padding=0, cast=True):
    """Sanitize coordinates so that 0 <= x1 <= x2 <= img_size.

    Converts the relative coordinates to absolute pixel coordinates,
    swaps any pair where x1 > x2, applies ``padding``, and clamps the
    result to the image bounds. The inputs themselves are not mutated.

    Args:
        x1 (Tensor): shape (N, ). Relative left/top coordinates.
        x2 (Tensor): shape (N, ). Relative right/bottom coordinates.
        img_size (int): Size of the input image along this axis.
        padding (int): x1 >= padding, x2 <= image_size - padding.
        cast (bool): If cast is False, the result won't be cast to longs.

    Returns:
        tuple:
            x1 (Tensor): Sanitized x1.
            x2 (Tensor): Sanitized x2.
    """
    x1 = x1 * img_size
    x2 = x2 * img_size
    if cast:
        x1 = x1.long()
        x2 = x2.long()
    # BUGFIX: compute both min and max from the ORIGINAL pair. The old
    # in-place sequence (`x1 = torch.min(x1, x2)` then
    # `x2 = torch.max(x1, x2)`) fed the already-updated x1 into the max,
    # so a swapped box (x1 > x2) collapsed to zero width instead of being
    # swapped as the docstring promises.
    lo = torch.min(x1, x2)
    hi = torch.max(x1, x2)
    x1 = torch.clamp(lo - padding, min=0)
    x2 = torch.clamp(hi + padding, max=img_size)
    return x1, x2
+ +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, normal_init +from mmcv.runner import force_fp32 + +from mmdet.core import (build_anchor_generator, build_assigner, + build_bbox_coder, build_sampler, images_to_levels, + multi_apply, multiclass_nms) +from ..builder import HEADS, build_loss +from .base_dense_head import BaseDenseHead +from .dense_test_mixins import BBoxTestMixin + + +@HEADS.register_module() +class YOLOV3Head(BaseDenseHead, BBoxTestMixin): + """YOLOV3Head Paper link: https://arxiv.org/abs/1804.02767. + + Args: + num_classes (int): The number of object classes (w/o background) + in_channels (List[int]): Number of input channels per scale. + out_channels (List[int]): The number of output channels per scale + before the final 1x1 layer. Default: (1024, 512, 256). + anchor_generator (dict): Config dict for anchor generator + bbox_coder (dict): Config of bounding box coder. + featmap_strides (List[int]): The stride of each scale. + Should be in descending order. Default: (32, 16, 8). + one_hot_smoother (float): Set a non-zero value to enable label-smooth + Default: 0. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + loss_cls (dict): Config of classification loss. + loss_conf (dict): Config of confidence loss. + loss_xy (dict): Config of xy coordinate loss. + loss_wh (dict): Config of wh coordinate loss. + train_cfg (dict): Training config of YOLOV3 head. Default: None. + test_cfg (dict): Testing config of YOLOV3 head. Default: None. 
+ """ + + def __init__(self, + num_classes, + in_channels, + out_channels=(1024, 512, 256), + anchor_generator=dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + one_hot_smoother=0., + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_conf=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_xy=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_wh=dict(type='MSELoss', loss_weight=1.0), + train_cfg=None, + test_cfg=None): + super(YOLOV3Head, self).__init__() + # Check params + assert (len(in_channels) == len(out_channels) == len(featmap_strides)) + + self.num_classes = num_classes + self.in_channels = in_channels + self.out_channels = out_channels + self.featmap_strides = featmap_strides + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + if hasattr(self.train_cfg, 'sampler'): + sampler_cfg = self.train_cfg.sampler + else: + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + + self.one_hot_smoother = one_hot_smoother + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.anchor_generator = build_anchor_generator(anchor_generator) + + self.loss_cls = build_loss(loss_cls) + self.loss_conf = build_loss(loss_conf) + self.loss_xy = build_loss(loss_xy) + self.loss_wh = build_loss(loss_wh) + # usually the numbers of anchors for each level are the same + # except SSD detectors + self.num_anchors = self.anchor_generator.num_base_anchors[0] + assert 
@force_fp32(apply_to=('pred_maps', ))
def get_bboxes(self,
               pred_maps,
               img_metas,
               cfg=None,
               rescale=False,
               with_nms=True):
    """Transform network output for a batch into bbox predictions.

    Args:
        pred_maps (list[Tensor]): Raw predictions for a batch of images.
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        cfg (mmcv.Config | None): Test / postprocessing configuration,
            if None, test_cfg would be used. Default: None.
        rescale (bool): If True, return boxes in original image space.
            Default: False.
        with_nms (bool): If True, do nms before return boxes.
            Default: True.

    Returns:
        list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
            The first item is an (n, 5) tensor, where the first 4 columns
            are bounding box positions (tl_x, tl_y, br_x, br_y) and the
            5-th column is a score between 0 and 1. The second item is a
            (n,) tensor where each item is the predicted class label of
            the corresponding box.
    """
    # Decode each image independently: slice its per-level maps out of
    # the batch and delegate to the single-image path.
    result_list = []
    for img_id, img_meta in enumerate(img_metas):
        single_maps = [level[img_id].detach() for level in pred_maps]
        result_list.append(
            self._get_bboxes_single(single_maps, img_meta['scale_factor'],
                                    cfg, rescale, with_nms))
    return result_list
+ """ + cfg = self.test_cfg if cfg is None else cfg + assert len(pred_maps_list) == self.num_levels + multi_lvl_bboxes = [] + multi_lvl_cls_scores = [] + multi_lvl_conf_scores = [] + num_levels = len(pred_maps_list) + featmap_sizes = [ + pred_maps_list[i].shape[-2:] for i in range(num_levels) + ] + multi_lvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, pred_maps_list[0][0].device) + for i in range(self.num_levels): + # get some key info for current scale + pred_map = pred_maps_list[i] + stride = self.featmap_strides[i] + + # (h, w, num_anchors*num_attrib) -> (h*w*num_anchors, num_attrib) + pred_map = pred_map.permute(1, 2, 0).reshape(-1, self.num_attrib) + + pred_map[..., :2] = torch.sigmoid(pred_map[..., :2]) + bbox_pred = self.bbox_coder.decode(multi_lvl_anchors[i], + pred_map[..., :4], stride) + # conf and cls + conf_pred = torch.sigmoid(pred_map[..., 4]).view(-1) + cls_pred = torch.sigmoid(pred_map[..., 5:]).view( + -1, self.num_classes) # Cls pred one-hot. + + # Filtering out all predictions with conf < conf_thr + conf_thr = cfg.get('conf_thr', -1) + if conf_thr > 0: + # add as_tuple=False for compatibility in Pytorch 1.6 + # flatten would create a Reshape op with constant values, + # and raise RuntimeError when doing inference in ONNX Runtime + # with a different input image (#4221). 
+ conf_inds = conf_pred.ge(conf_thr).nonzero( + as_tuple=False).squeeze(1) + bbox_pred = bbox_pred[conf_inds, :] + cls_pred = cls_pred[conf_inds, :] + conf_pred = conf_pred[conf_inds] + + # Get top-k prediction + nms_pre = cfg.get('nms_pre', -1) + if 0 < nms_pre < conf_pred.size(0) and ( + not torch.onnx.is_in_onnx_export()): + _, topk_inds = conf_pred.topk(nms_pre) + bbox_pred = bbox_pred[topk_inds, :] + cls_pred = cls_pred[topk_inds, :] + conf_pred = conf_pred[topk_inds] + + # Save the result of current scale + multi_lvl_bboxes.append(bbox_pred) + multi_lvl_cls_scores.append(cls_pred) + multi_lvl_conf_scores.append(conf_pred) + + # Merge the results of different scales together + multi_lvl_bboxes = torch.cat(multi_lvl_bboxes) + multi_lvl_cls_scores = torch.cat(multi_lvl_cls_scores) + multi_lvl_conf_scores = torch.cat(multi_lvl_conf_scores) + + if with_nms and (multi_lvl_conf_scores.size(0) == 0): + return torch.zeros((0, 5)), torch.zeros((0, )) + + if rescale: + multi_lvl_bboxes /= multi_lvl_bboxes.new_tensor(scale_factor) + + # In mmdet 2.x, the class_id for background is num_classes. + # i.e., the last column. + padding = multi_lvl_cls_scores.new_zeros(multi_lvl_cls_scores.shape[0], + 1) + multi_lvl_cls_scores = torch.cat([multi_lvl_cls_scores, padding], + dim=1) + + # Support exporting to onnx without nms + if with_nms and cfg.get('nms', None) is not None: + det_bboxes, det_labels = multiclass_nms( + multi_lvl_bboxes, + multi_lvl_cls_scores, + cfg.score_thr, + cfg.nms, + cfg.max_per_img, + score_factors=multi_lvl_conf_scores) + return det_bboxes, det_labels + else: + return (multi_lvl_bboxes, multi_lvl_cls_scores, + multi_lvl_conf_scores) + + @force_fp32(apply_to=('pred_maps', )) + def loss(self, + pred_maps, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. 
def loss_single(self, pred_map, target_map, neg_map):
    """Compute loss of a single scale level.

    Args:
        pred_map (Tensor): Raw predictions for a single level,
            shape (N, num_anchors * num_attrib, H, W).
        target_map (Tensor): The Ground-Truth target for a single level,
            shape (N, num_total_anchors, num_attrib).
        neg_map (Tensor): The negative masks for a single level,
            shape (N, num_total_anchors).

    Returns:
        tuple:
            loss_cls (Tensor): Classification loss.
            loss_conf (Tensor): Confidence loss.
            loss_xy (Tensor): Regression loss of x, y coordinate.
            loss_wh (Tensor): Regression loss of w, h coordinate.
    """
    batch = len(pred_map)
    # (N, A*attrib, H, W) -> (N, H*W*A, attrib) so the predictions line
    # up with the flattened target map.
    flat_pred = pred_map.permute(0, 2, 3, 1).reshape(batch, -1,
                                                     self.num_attrib)

    pos_mask = target_map[..., 4]
    pos_and_neg_mask = neg_map.float() + pos_mask
    # A value > 1 means an anchor was marked both positive and negative;
    # clamp so the confidence loss does not double-count it.
    if torch.max(pos_and_neg_mask) > 1.:
        warnings.warn('There is overlap between pos and neg sample.')
        pos_and_neg_mask = pos_and_neg_mask.clamp(min=0., max=1.)
    pos_mask = pos_mask.unsqueeze(dim=-1)

    loss_cls = self.loss_cls(flat_pred[..., 5:], target_map[..., 5:],
                             weight=pos_mask)
    loss_conf = self.loss_conf(flat_pred[..., 4], target_map[..., 4],
                               weight=pos_and_neg_mask)
    loss_xy = self.loss_xy(flat_pred[..., :2], target_map[..., :2],
                           weight=pos_mask)
    loss_wh = self.loss_wh(flat_pred[..., 2:4], target_map[..., 2:4],
                           weight=pos_mask)

    return loss_cls, loss_conf, loss_xy, loss_wh
+ """ + num_imgs = len(anchor_list) + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + + results = multi_apply(self._get_targets_single, anchor_list, + responsible_flag_list, gt_bboxes_list, + gt_labels_list) + + all_target_maps, all_neg_maps = results + assert num_imgs == len(all_target_maps) == len(all_neg_maps) + target_maps_list = images_to_levels(all_target_maps, num_level_anchors) + neg_maps_list = images_to_levels(all_neg_maps, num_level_anchors) + + return target_maps_list, neg_maps_list + + def _get_targets_single(self, anchors, responsible_flags, gt_bboxes, + gt_labels): + """Generate matching bounding box prior and converted GT. + + Args: + anchors (list[Tensor]): Multi-level anchors of the image. + responsible_flags (list[Tensor]): Multi-level responsible flags of + anchors + gt_bboxes (Tensor): Ground truth bboxes of single image. + gt_labels (Tensor): Ground truth labels of single image. + + Returns: + tuple: + target_map (Tensor): Predication target map of each + scale level, shape (num_total_anchors, + 5+num_classes) + neg_map (Tensor): Negative map of each scale level, + shape (num_total_anchors,) + """ + + anchor_strides = [] + for i in range(len(anchors)): + anchor_strides.append( + torch.tensor(self.featmap_strides[i], + device=gt_bboxes.device).repeat(len(anchors[i]))) + concat_anchors = torch.cat(anchors) + concat_responsible_flags = torch.cat(responsible_flags) + + anchor_strides = torch.cat(anchor_strides) + assert len(anchor_strides) == len(concat_anchors) == \ + len(concat_responsible_flags) + assign_result = self.assigner.assign(concat_anchors, + concat_responsible_flags, + gt_bboxes) + sampling_result = self.sampler.sample(assign_result, concat_anchors, + gt_bboxes) + + target_map = concat_anchors.new_zeros( + concat_anchors.size(0), self.num_attrib) + + target_map[sampling_result.pos_inds, :4] = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes, + 
anchor_strides[sampling_result.pos_inds]) + + target_map[sampling_result.pos_inds, 4] = 1 + + gt_labels_one_hot = F.one_hot( + gt_labels, num_classes=self.num_classes).float() + if self.one_hot_smoother != 0: # label smooth + gt_labels_one_hot = gt_labels_one_hot * ( + 1 - self.one_hot_smoother + ) + self.one_hot_smoother / self.num_classes + target_map[sampling_result.pos_inds, 5:] = gt_labels_one_hot[ + sampling_result.pos_assigned_gt_inds] + + neg_map = concat_anchors.new_zeros( + concat_anchors.size(0), dtype=torch.uint8) + neg_map[sampling_result.neg_inds] = 1 + + return target_map, neg_map + + def aug_test(self, feats, img_metas, rescale=False): + """Test function with test time augmentation. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. 
+ + Returns: + list[ndarray]: bbox results of each class + """ + return self.aug_test_bboxes(feats, img_metas, rescale=rescale) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/__init__.py b/insightface/detection/scrfd/mmdet/models/detectors/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e54a219c27a37be9e983537b7710b1ef4fa98d8b --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/__init__.py @@ -0,0 +1,36 @@ +from .atss import ATSS +from .base import BaseDetector +from .cascade_rcnn import CascadeRCNN +from .cornernet import CornerNet +from .detr import DETR +from .fast_rcnn import FastRCNN +from .faster_rcnn import FasterRCNN +from .fcos import FCOS +from .fovea import FOVEA +from .fsaf import FSAF +from .gfl import GFL +from .grid_rcnn import GridRCNN +from .htc import HybridTaskCascade +from .mask_rcnn import MaskRCNN +from .mask_scoring_rcnn import MaskScoringRCNN +from .nasfcos import NASFCOS +from .paa import PAA +from .point_rend import PointRend +from .reppoints_detector import RepPointsDetector +from .retinanet import RetinaNet +from .rpn import RPN +from .single_stage import SingleStageDetector +from .trident_faster_rcnn import TridentFasterRCNN +from .two_stage import TwoStageDetector +from .vfnet import VFNet +from .yolact import YOLACT +from .yolo import YOLOV3 +from .scrfd import SCRFD + +__all__ = [ + 'ATSS', 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN', + 'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade', + 'RetinaNet', 'FCOS', 'GridRCNN', 'MaskScoringRCNN', 'RepPointsDetector', + 'FOVEA', 'FSAF', 'NASFCOS', 'PointRend', 'GFL', 'CornerNet', 'PAA', + 'YOLOV3', 'YOLACT', 'VFNet', 'DETR', 'TridentFasterRCNN', 'SCRFD' +] diff --git a/insightface/detection/scrfd/mmdet/models/detectors/atss.py b/insightface/detection/scrfd/mmdet/models/detectors/atss.py new file mode 100755 index 
@DETECTORS.register_module()
class ATSS(SingleStageDetector):
    """Implementation of `ATSS <https://arxiv.org/abs/1912.02424>`_."""

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        # Pure configuration wrapper: all components are forwarded
        # unchanged to the single-stage base class.
        super().__init__(backbone, neck, bbox_head, train_cfg, test_cfg,
                         pretrained)
def forward_train(self, imgs, img_metas, **kwargs):
    """
    Args:
        img (list[Tensor]): List of tensors of shape (1, C, H, W).
            Typically these should be mean centered and std scaled.
        img_metas (list[dict]): List of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
            For details on the values of these keys, see
            :class:`mmdet.datasets.pipelines.Collect`.
        kwargs (keyword arguments): Specific to concrete implementation.
    """
    # Record the padded (batched) input resolution on every image meta;
    # heads such as DETR's transformer head need it to build masks.
    input_h_w = tuple(imgs[0].size()[-2:])
    for meta in img_metas:
        meta['batch_input_shape'] = input_h_w
+ + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if pretrained is not None: + logger = get_root_logger() + print_log(f'load model from: {pretrained}', logger=logger) + + async def aforward_test(self, *, img, img_metas, **kwargs): + for var, name in [(img, 'img'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got {type(var)}') + + num_augs = len(img) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(img)}) ' + f'!= num of image metas ({len(img_metas)})') + # TODO: remove the restriction of samples_per_gpu == 1 when prepared + samples_per_gpu = img[0].size(0) + assert samples_per_gpu == 1 + + if num_augs == 1: + return await self.async_simple_test(img[0], img_metas[0], **kwargs) + else: + raise NotImplementedError + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + """ + for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got {type(var)}') + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(imgs)}) ' + f'!= num of image meta ({len(img_metas)})') + + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. 
+ for img, img_meta in zip(imgs, img_metas): + batch_size = len(img_meta) + for img_id in range(batch_size): + img_meta[img_id]['batch_input_shape'] = tuple(img.size()[-2:]) + + if num_augs == 1: + # proposals (List[List[Tensor]]): the outer list indicates + # test-time augs (multiscale, flip, etc.) and the inner list + # indicates images in a batch. + # The Tensor should have a shape Px4, where P is the number of + # proposals. + if 'proposals' in kwargs: + kwargs['proposals'] = kwargs['proposals'][0] + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + assert imgs[0].size(0) == 1, 'aug test does not support ' \ + 'inference with batch size ' \ + f'{imgs[0].size(0)}' + # TODO: support test augmentation for predefined proposals + assert 'proposals' not in kwargs + return self.aug_test(imgs, img_metas, **kwargs) + + @auto_fp16(apply_to=('img', )) + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Calls either :func:`forward_train` or :func:`forward_test` depending + on whether ``return_loss`` is ``True``. + + Note this setting will change the expected inputs. When + ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor + and List[dict]), and when ``resturn_loss=False``, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + else: + return self.forward_test(img, img_metas, **kwargs) + + def _parse_losses(self, losses): + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary infomation. + + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \ + which may be a weighted sum of all losses, log_vars contains \ + all the variables to be sent to the logger. 
+ """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(_value for _key, _value in log_vars.items() + if 'loss' in _key) + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars + + def train_step(self, data, optimizer): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. + + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, \ + ``num_samples``. + + - ``loss`` is a tensor for back propagation, which can be a \ + weighted sum of multiple losses. + - ``log_vars`` contains all the variables to be sent to the + logger. + - ``num_samples`` indicates the batch size (when the model is \ + DDP, it means the batch size on each GPU), which is used for \ + averaging the logs. 
+ """ + losses = self(**data) + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(data['img_metas'])) + + return outputs + + def val_step(self, data, optimizer): + """The iteration step during validation. + + This method shares the same signature as :func:`train_step`, but used + during val epochs. Note that the evaluation after training epochs is + not implemented with this method, but an evaluation hook. + """ + losses = self(**data) + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(data['img_metas'])) + + return outputs + + def show_result(self, + img, + result, + score_thr=0.3, + bbox_color='green', + text_color='green', + thickness=1, + font_scale=0.5, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (Tensor or tuple): The results to draw over `img` + bbox_result or (bbox_result, segm_result). + score_thr (float, optional): Minimum score of bboxes to be shown. + Default: 0.3. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + text_color (str or tuple or :obj:`Color`): Color of texts. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + Default: 0. + show (bool): Whether to show the image. + Default: False. + out_file (str or None): The filename to write the image. + Default: None. 
+ + Returns: + img (Tensor): Only if not `show` or `out_file` + """ + img = mmcv.imread(img) + img = img.copy() + if isinstance(result, tuple): + bbox_result, segm_result = result + if isinstance(segm_result, tuple): + segm_result = segm_result[0] # ms rcnn + else: + bbox_result, segm_result = result, None + bboxes = np.vstack(bbox_result) + labels = [ + np.full(bbox.shape[0], i, dtype=np.int32) + for i, bbox in enumerate(bbox_result) + ] + labels = np.concatenate(labels) + # draw segmentation masks + if segm_result is not None and len(labels) > 0: # non empty + segms = mmcv.concat_list(segm_result) + inds = np.where(bboxes[:, -1] > score_thr)[0] + np.random.seed(42) + color_masks = [ + np.random.randint(0, 256, (1, 3), dtype=np.uint8) + for _ in range(max(labels) + 1) + ] + for i in inds: + i = int(i) + color_mask = color_masks[labels[i]] + sg = segms[i] + if isinstance(sg, torch.Tensor): + sg = sg.detach().cpu().numpy() + mask = sg.astype(bool) + img[mask] = img[mask] * 0.5 + color_mask * 0.5 + # if out_file specified, do not show image in window + if out_file is not None: + show = False + # draw bounding boxes + mmcv.imshow_det_bboxes( + img, + bboxes, + labels, + class_names=self.CLASSES, + score_thr=score_thr, + bbox_color=bbox_color, + text_color=text_color, + thickness=thickness, + font_scale=font_scale, + win_name=win_name, + show=show, + wait_time=wait_time, + out_file=out_file) + + if not (show or out_file): + return img diff --git a/insightface/detection/scrfd/mmdet/models/detectors/cascade_rcnn.py b/insightface/detection/scrfd/mmdet/models/detectors/cascade_rcnn.py new file mode 100755 index 0000000000000000000000000000000000000000..47cc7cef984123804c4f99900d496807cde3c0e6 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/cascade_rcnn.py @@ -0,0 +1,37 @@ +from ..builder import DETECTORS +from .two_stage import TwoStageDetector + + +@DETECTORS.register_module() +class CascadeRCNN(TwoStageDetector): + r"""Implementation of `Cascade 
@DETECTORS.register_module()
class CornerNet(SingleStageDetector):
    """CornerNet.

    This detector is the implementation of the paper `CornerNet: Detecting
    Objects as Paired Keypoints <https://arxiv.org/abs/1808.01244>`_ .
    """

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(CornerNet, self).__init__(backbone, neck, bbox_head, train_cfg,
                                        test_cfg, pretrained)

    def merge_aug_results(self, aug_results, img_metas):
        """Merge augmented detection bboxes and score.

        Args:
            aug_results (list[list[Tensor]]): Det_bboxes and det_labels of
                each image.
            img_metas (list[list[dict]]): Meta information of each image,
                e.g., image size, scaling factor, etc.

        Returns:
            tuple: (bboxes, labels)
        """
        recovered_bboxes, aug_labels = [], []
        for bboxes_labels, img_info in zip(aug_results, img_metas):
            img_shape = img_info[0]['img_shape']  # using shape before padding
            scale_factor = img_info[0]['scale_factor']
            flip = img_info[0]['flip']
            bboxes, labels = bboxes_labels
            # last column is the score; map the box coords from the augmented
            # (scaled/flipped) frame back to the original image frame before
            # merging across augmentations
            bboxes, scores = bboxes[:, :4], bboxes[:, -1:]
            bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip)
            recovered_bboxes.append(torch.cat([bboxes, scores], dim=-1))
            aug_labels.append(labels)

        bboxes = torch.cat(recovered_bboxes, dim=0)
        labels = torch.cat(aug_labels)

        if bboxes.shape[0] > 0:
            # NMS over the merged detections; skipped for the empty case
            out_bboxes, out_labels = self.bbox_head._bboxes_nms(
                bboxes, labels, self.bbox_head.test_cfg)
        else:
            out_bboxes, out_labels = bboxes, labels

        return out_bboxes, out_labels

    def aug_test(self, imgs, img_metas, rescale=False):
        """Augment testing of CornerNet.

        Args:
            imgs (list[Tensor]): Augmented images.
            img_metas (list[list[dict]]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            rescale (bool): If True, return boxes in original image space.
                Default: False.

        Note:
            ``imgs`` must including flipped image pairs.

        Returns:
            list[list[np.ndarray]]: BBox results of each image and classes.
                The outer list corresponds to each image. The inner list
                corresponds to each class.
        """
        img_inds = list(range(len(imgs)))

        # NOTE(review): boolean addition — this passes when *either* of the
        # first two metas is flipped; presumably consecutive entries are
        # (original, flipped) pairs. Confirm that both-flipped also being
        # accepted is intended.
        assert img_metas[0][0]['flip'] + img_metas[1][0]['flip'], (
            'aug test must have flipped image pair')
        aug_results = []
        for ind, flip_ind in zip(img_inds[0::2], img_inds[1::2]):
            # run each (image, flipped image) pair through one forward pass
            img_pair = torch.cat([imgs[ind], imgs[flip_ind]])
            x = self.extract_feat(img_pair)
            outs = self.bbox_head(x)
            bbox_list = self.bbox_head.get_bboxes(
                *outs, [img_metas[ind], img_metas[flip_ind]], False, False)
            aug_results.append(bbox_list[0])
            aug_results.append(bbox_list[1])

        bboxes, labels = self.merge_aug_results(aug_results, img_metas)
        bbox_results = bbox2result(bboxes, labels, self.bbox_head.num_classes)

        return [bbox_results]
+ """ + batch_size = len(img_metas) + assert batch_size == 1, 'Currently only batch_size 1 for inference ' \ + f'mode is supported. Found batch_size {batch_size}.' + x = self.extract_feat(img) + outs = self.bbox_head(x, img_metas) + bbox_list = self.bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + + bbox_results = [ + bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) + for det_bboxes, det_labels in bbox_list + ] + return bbox_results diff --git a/insightface/detection/scrfd/mmdet/models/detectors/fast_rcnn.py b/insightface/detection/scrfd/mmdet/models/detectors/fast_rcnn.py new file mode 100755 index 0000000000000000000000000000000000000000..3d6e242767b927ed37198b6bc7862abecef99a33 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/fast_rcnn.py @@ -0,0 +1,52 @@ +from ..builder import DETECTORS +from .two_stage import TwoStageDetector + + +@DETECTORS.register_module() +class FastRCNN(TwoStageDetector): + """Implementation of `Fast R-CNN `_""" + + def __init__(self, + backbone, + roi_head, + train_cfg, + test_cfg, + neck=None, + pretrained=None): + super(FastRCNN, self).__init__( + backbone=backbone, + neck=neck, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) + + def forward_test(self, imgs, img_metas, proposals, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + proposals (List[List[Tensor]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. The Tensor should have a shape Px4, where + P is the number of proposals. 
+ """ + for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got {type(var)}') + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(imgs)}) ' + f'!= num of image meta ({len(img_metas)})') + + if num_augs == 1: + return self.simple_test(imgs[0], img_metas[0], proposals[0], + **kwargs) + else: + # TODO: support test-time augmentation + assert NotImplementedError diff --git a/insightface/detection/scrfd/mmdet/models/detectors/faster_rcnn.py b/insightface/detection/scrfd/mmdet/models/detectors/faster_rcnn.py new file mode 100755 index 0000000000000000000000000000000000000000..81bad0f43a48b1022c4cd996e26d6c90be93d4d0 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/faster_rcnn.py @@ -0,0 +1,24 @@ +from ..builder import DETECTORS +from .two_stage import TwoStageDetector + + +@DETECTORS.register_module() +class FasterRCNN(TwoStageDetector): + """Implementation of `Faster R-CNN `_""" + + def __init__(self, + backbone, + rpn_head, + roi_head, + train_cfg, + test_cfg, + neck=None, + pretrained=None): + super(FasterRCNN, self).__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/fcos.py b/insightface/detection/scrfd/mmdet/models/detectors/fcos.py new file mode 100755 index 0000000000000000000000000000000000000000..58485c1864a11a66168b7597f345ea759ce20551 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/fcos.py @@ -0,0 +1,17 @@ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class FCOS(SingleStageDetector): + """Implementation of `FCOS `_""" + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(FCOS, 
self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/fovea.py b/insightface/detection/scrfd/mmdet/models/detectors/fovea.py new file mode 100755 index 0000000000000000000000000000000000000000..22a578efffbd108db644d907bae95c7c8df31f2e --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/fovea.py @@ -0,0 +1,17 @@ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class FOVEA(SingleStageDetector): + """Implementation of `FoveaBox `_""" + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(FOVEA, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/fsaf.py b/insightface/detection/scrfd/mmdet/models/detectors/fsaf.py new file mode 100755 index 0000000000000000000000000000000000000000..9f10fa1ae10f31e6cb5de65505b14a4fc97dd022 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/fsaf.py @@ -0,0 +1,17 @@ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class FSAF(SingleStageDetector): + """Implementation of `FSAF `_""" + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(FSAF, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/gfl.py b/insightface/detection/scrfd/mmdet/models/detectors/gfl.py new file mode 100755 index 0000000000000000000000000000000000000000..64d65cb2dfb7a56f57e08c3fcad67e1539e1e841 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/gfl.py @@ -0,0 +1,16 @@ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class 
GFL(SingleStageDetector): + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(GFL, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/grid_rcnn.py b/insightface/detection/scrfd/mmdet/models/detectors/grid_rcnn.py new file mode 100755 index 0000000000000000000000000000000000000000..b6145a1464cd940bd4f98eaa15f6f9ecf6a10a20 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/grid_rcnn.py @@ -0,0 +1,29 @@ +from ..builder import DETECTORS +from .two_stage import TwoStageDetector + + +@DETECTORS.register_module() +class GridRCNN(TwoStageDetector): + """Grid R-CNN. + + This detector is the implementation of: + - Grid R-CNN (https://arxiv.org/abs/1811.12030) + - Grid R-CNN Plus: Faster and Better (https://arxiv.org/abs/1906.05688) + """ + + def __init__(self, + backbone, + rpn_head, + roi_head, + train_cfg, + test_cfg, + neck=None, + pretrained=None): + super(GridRCNN, self).__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/htc.py b/insightface/detection/scrfd/mmdet/models/detectors/htc.py new file mode 100755 index 0000000000000000000000000000000000000000..d9efdf420fa7373f7f1d116f8d97836d73b457bf --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/htc.py @@ -0,0 +1,15 @@ +from ..builder import DETECTORS +from .cascade_rcnn import CascadeRCNN + + +@DETECTORS.register_module() +class HybridTaskCascade(CascadeRCNN): + """Implementation of `HTC `_""" + + def __init__(self, **kwargs): + super(HybridTaskCascade, self).__init__(**kwargs) + + @property + def with_semantic(self): + """bool: whether the detector has a semantic head""" + return self.roi_head.with_semantic diff --git 
a/insightface/detection/scrfd/mmdet/models/detectors/mask_rcnn.py b/insightface/detection/scrfd/mmdet/models/detectors/mask_rcnn.py new file mode 100755 index 0000000000000000000000000000000000000000..c15a7733170e059d2825138b3812319915b7cad6 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/mask_rcnn.py @@ -0,0 +1,24 @@ +from ..builder import DETECTORS +from .two_stage import TwoStageDetector + + +@DETECTORS.register_module() +class MaskRCNN(TwoStageDetector): + """Implementation of `Mask R-CNN `_""" + + def __init__(self, + backbone, + rpn_head, + roi_head, + train_cfg, + test_cfg, + neck=None, + pretrained=None): + super(MaskRCNN, self).__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/mask_scoring_rcnn.py b/insightface/detection/scrfd/mmdet/models/detectors/mask_scoring_rcnn.py new file mode 100755 index 0000000000000000000000000000000000000000..b6252b6e1d234a201725342a5780fade7e21957c --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/mask_scoring_rcnn.py @@ -0,0 +1,27 @@ +from ..builder import DETECTORS +from .two_stage import TwoStageDetector + + +@DETECTORS.register_module() +class MaskScoringRCNN(TwoStageDetector): + """Mask Scoring RCNN. 
+ + https://arxiv.org/abs/1903.00241 + """ + + def __init__(self, + backbone, + rpn_head, + roi_head, + train_cfg, + test_cfg, + neck=None, + pretrained=None): + super(MaskScoringRCNN, self).__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/nasfcos.py b/insightface/detection/scrfd/mmdet/models/detectors/nasfcos.py new file mode 100755 index 0000000000000000000000000000000000000000..fb0148351546f45a451ef5f7a2a9ef4024e85b7c --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/nasfcos.py @@ -0,0 +1,20 @@ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class NASFCOS(SingleStageDetector): + """NAS-FCOS: Fast Neural Architecture Search for Object Detection. + + https://arxiv.org/abs/1906.0442 + """ + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(NASFCOS, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/paa.py b/insightface/detection/scrfd/mmdet/models/detectors/paa.py new file mode 100755 index 0000000000000000000000000000000000000000..9b4bb5e0939b824d9fef7fc3bd49a0164c29613a --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/paa.py @@ -0,0 +1,17 @@ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class PAA(SingleStageDetector): + """Implementation of `PAA `_.""" + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(PAA, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/point_rend.py 
b/insightface/detection/scrfd/mmdet/models/detectors/point_rend.py new file mode 100755 index 0000000000000000000000000000000000000000..808ef2258ae88301d349db3aaa2711f223e5c971 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/point_rend.py @@ -0,0 +1,29 @@ +from ..builder import DETECTORS +from .two_stage import TwoStageDetector + + +@DETECTORS.register_module() +class PointRend(TwoStageDetector): + """PointRend: Image Segmentation as Rendering + + This detector is the implementation of + `PointRend `_. + + """ + + def __init__(self, + backbone, + rpn_head, + roi_head, + train_cfg, + test_cfg, + neck=None, + pretrained=None): + super(PointRend, self).__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/reppoints_detector.py b/insightface/detection/scrfd/mmdet/models/detectors/reppoints_detector.py new file mode 100755 index 0000000000000000000000000000000000000000..a5f6be31e14488e4b8a006b7142a82c872388d82 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/reppoints_detector.py @@ -0,0 +1,22 @@ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class RepPointsDetector(SingleStageDetector): + """RepPoints: Point Set Representation for Object Detection. 
+ + This detector is the implementation of: + - RepPoints detector (https://arxiv.org/pdf/1904.11490) + """ + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(RepPointsDetector, + self).__init__(backbone, neck, bbox_head, train_cfg, test_cfg, + pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/retinanet.py b/insightface/detection/scrfd/mmdet/models/detectors/retinanet.py new file mode 100755 index 0000000000000000000000000000000000000000..41378e8bc74bf9d5cbc7e3e6630bb1e6657049f9 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/retinanet.py @@ -0,0 +1,17 @@ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class RetinaNet(SingleStageDetector): + """Implementation of `RetinaNet `_""" + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(RetinaNet, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/rpn.py b/insightface/detection/scrfd/mmdet/models/detectors/rpn.py new file mode 100755 index 0000000000000000000000000000000000000000..36c38afc5aaed8219d80eb1170f444410f4c8135 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/rpn.py @@ -0,0 +1,153 @@ +import mmcv +from mmcv.image import tensor2imgs + +from mmdet.core import bbox_mapping +from ..builder import DETECTORS, build_backbone, build_head, build_neck +from .base import BaseDetector + + +@DETECTORS.register_module() +class RPN(BaseDetector): + """Implementation of Region Proposal Network.""" + + def __init__(self, + backbone, + neck, + rpn_head, + train_cfg, + test_cfg, + pretrained=None): + super(RPN, self).__init__() + self.backbone = build_backbone(backbone) + self.neck = build_neck(neck) if neck is not None else None + rpn_train_cfg = train_cfg.rpn if train_cfg 
is not None else None + rpn_head.update(train_cfg=rpn_train_cfg) + rpn_head.update(test_cfg=test_cfg.rpn) + self.rpn_head = build_head(rpn_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + """Initialize the weights in detector. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + super(RPN, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + if self.with_neck: + self.neck.init_weights() + self.rpn_head.init_weights() + + def extract_feat(self, img): + """Extract features. + + Args: + img (torch.Tensor): Image tensor with shape (n, c, h ,w). + + Returns: + list[torch.Tensor]: Multi-level features that may have + different resolutions. + """ + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def forward_dummy(self, img): + """Dummy forward function.""" + x = self.extract_feat(img) + rpn_outs = self.rpn_head(x) + return rpn_outs + + def forward_train(self, + img, + img_metas, + gt_bboxes=None, + gt_bboxes_ignore=None): + """ + Args: + img (Tensor): Input images of shape (N, C, H, W). + Typically these should be mean centered and std scaled. + img_metas (list[dict]): A List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + :class:`mmdet.datasets.pipelines.Collect`. + gt_bboxes (list[Tensor]): Each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_bboxes_ignore (None | list[Tensor]): Specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + if self.train_cfg.rpn.get('debug', False): + self.rpn_head.debug_imgs = tensor2imgs(img) + + x = self.extract_feat(img) + losses = self.rpn_head.forward_train(x, img_metas, gt_bboxes, None, + gt_bboxes_ignore) + return losses + + def simple_test(self, img, img_metas, rescale=False): + """Test function without test time augmentation. + + Args: + imgs (list[torch.Tensor]): List of multiple images + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[np.ndarray]: proposals + """ + x = self.extract_feat(img) + proposal_list = self.rpn_head.simple_test_rpn(x, img_metas) + if rescale: + for proposals, meta in zip(proposal_list, img_metas): + proposals[:, :4] /= proposals.new_tensor(meta['scale_factor']) + + return [proposal.cpu().numpy() for proposal in proposal_list] + + def aug_test(self, imgs, img_metas, rescale=False): + """Test function with test time augmentation. + + Args: + imgs (list[torch.Tensor]): List of multiple images + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[np.ndarray]: proposals + """ + proposal_list = self.rpn_head.aug_test_rpn( + self.extract_feats(imgs), img_metas) + if not rescale: + for proposals, img_meta in zip(proposal_list, img_metas[0]): + img_shape = img_meta['img_shape'] + scale_factor = img_meta['scale_factor'] + flip = img_meta['flip'] + flip_direction = img_meta['flip_direction'] + proposals[:, :4] = bbox_mapping(proposals[:, :4], img_shape, + scale_factor, flip, + flip_direction) + return [proposal.cpu().numpy() for proposal in proposal_list] + + def show_result(self, data, result, dataset=None, top_k=20): + """Show RPN proposals on the image. + + Although we assume batch size is 1, this method supports arbitrary + batch size. 
+ """ + img_tensor = data['img'][0] + img_metas = data['img_metas'][0].data[0] + imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) + assert len(imgs) == len(img_metas) + for img, img_meta in zip(imgs, img_metas): + h, w, _ = img_meta['img_shape'] + img_show = img[:h, :w, :] + mmcv.imshow_bboxes(img_show, result, top_k=top_k) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/scrfd.py b/insightface/detection/scrfd/mmdet/models/detectors/scrfd.py new file mode 100755 index 0000000000000000000000000000000000000000..332e0a30cf6df56d26606dd7f141b6072b16bc1d --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/scrfd.py @@ -0,0 +1,100 @@ +from mmdet.core import bbox2result +from ..builder import DETECTORS +from .single_stage import SingleStageDetector +import torch + + +@DETECTORS.register_module() +class SCRFD(SingleStageDetector): + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(SCRFD, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) + + def forward_train(self, + img, + img_metas, + gt_bboxes, + gt_labels, + gt_keypointss=None, + gt_bboxes_ignore=None): + """ + Args: + img (Tensor): Input images of shape (N, C, H, W). + Typically these should be mean centered and std scaled. + img_metas (list[dict]): A List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + :class:`mmdet.datasets.pipelines.Collect`. + gt_bboxes (list[Tensor]): Each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): Class indices corresponding to each box + gt_bboxes_ignore (None | list[Tensor]): Specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + super(SingleStageDetector, self).forward_train(img, img_metas) + x = self.extract_feat(img) + losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, + gt_labels, gt_keypointss, gt_bboxes_ignore) + return losses + + def simple_test(self, img, img_metas, rescale=False): + """Test function without test time augmentation. + + Args: + imgs (list[torch.Tensor]): List of multiple images + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[list[np.ndarray]]: BBox results of each image and classes. + The outer list corresponds to each image. The inner list + corresponds to each class. + """ + x = self.extract_feat(img) + outs = self.bbox_head(x) + #print(len(outs)) + if torch.onnx.is_in_onnx_export(): + print('single_stage.py in-onnx-export') + print(outs.__class__) + cls_score, bbox_pred, kps_pred = outs + for c in cls_score: + print(c.shape) + for c in bbox_pred: + print(c.shape) + #print(outs[0].shape, outs[1].shape) + if self.bbox_head.use_kps: + for c in kps_pred: + print(c.shape) + return (cls_score, bbox_pred, kps_pred) + else: + return (cls_score, bbox_pred) + #return outs + bbox_list = self.bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + # skip post-processing when exporting to ONNX + #if torch.onnx.is_in_onnx_export(): + # return bbox_list + + bbox_results = [ + bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) + for det_bboxes, det_labels in bbox_list + ] + return bbox_results + + def feature_test(self, img): + x = self.extract_feat(img) + outs = self.bbox_head(x) + return outs + diff --git a/insightface/detection/scrfd/mmdet/models/detectors/single_stage.py b/insightface/detection/scrfd/mmdet/models/detectors/single_stage.py new file mode 100755 index 0000000000000000000000000000000000000000..6ed6edece9438aa999b9f6ff6a3050fe3443cbbe --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/single_stage.py @@ 
@DETECTORS.register_module()
class SingleStageDetector(BaseDetector):
    """Base class for single-stage detectors.

    Single-stage detectors directly and densely predict bounding boxes on the
    output features of the backbone+neck.
    """

    def __init__(self,
                 backbone,
                 neck=None,
                 bbox_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(SingleStageDetector, self).__init__()
        self.backbone = build_backbone(backbone)
        if neck is not None:
            self.neck = build_neck(neck)
        # the head receives both configs so it can build its own
        # assigner/sampler (train) and NMS settings (test)
        bbox_head.update(train_cfg=train_cfg)
        bbox_head.update(test_cfg=test_cfg)
        self.bbox_head = build_head(bbox_head)
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.init_weights(pretrained=pretrained)

    def init_weights(self, pretrained=None):
        """Initialize the weights in detector.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        super(SingleStageDetector, self).init_weights(pretrained)
        self.backbone.init_weights(pretrained=pretrained)
        if self.with_neck:
            # a neck may be a plain module or an nn.Sequential of modules
            if isinstance(self.neck, nn.Sequential):
                for m in self.neck:
                    m.init_weights()
            else:
                self.neck.init_weights()
        self.bbox_head.init_weights()

    def extract_feat(self, img):
        """Directly extract features from the backbone+neck."""
        x = self.backbone(img)
        if self.with_neck:
            x = self.neck(x)
        return x

    def forward_dummy(self, img):
        """Used for computing network flops.

        See `mmdetection/tools/get_flops.py`
        """
        x = self.extract_feat(img)
        outs = self.bbox_head(x)
        return outs

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None):
        """
        Args:
            img (Tensor): Input images of shape (N, C, H, W).
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): A List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                :class:`mmdet.datasets.pipelines.Collect`.
            gt_bboxes (list[Tensor]): Each item are the truth boxes for each
                image in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each box
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        super(SingleStageDetector, self).forward_train(img, img_metas)
        x = self.extract_feat(img)
        losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
                                              gt_labels, gt_bboxes_ignore)
        return losses

    def simple_test(self, img, img_metas, rescale=False):
        """Test function without test time augmentation.

        Args:
            img (torch.Tensor): Batched input images.
            img_metas (list[dict]): List of image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[list[np.ndarray]]: BBox results of each image and classes.
                The outer list corresponds to each image. The inner list
                corresponds to each class.
        """
        x = self.extract_feat(img)
        outs = self.bbox_head(x)
        #print(len(outs))
        if torch.onnx.is_in_onnx_export():
            # During export, return the raw (cls_score, bbox_pred) head
            # outputs; post-processing is skipped entirely.
            print('single_stage.py in-onnx-export')
            print(outs.__class__)
            cls_score, bbox_pred = outs
            for c in cls_score:
                print(c.shape)
            for c in bbox_pred:
                print(c.shape)
            #print(outs[0].shape, outs[1].shape)
            return outs
        bbox_list = self.bbox_head.get_bboxes(
            *outs, img_metas, rescale=rescale)
        # skip post-processing when exporting to ONNX
        # NOTE(review): unreachable — the export branch above already
        # returned `outs`, so this second check can never fire.
        if torch.onnx.is_in_onnx_export():
            return bbox_list

        bbox_results = [
            bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
            for det_bboxes, det_labels in bbox_list
        ]
        return bbox_results

    def aug_test(self, imgs, img_metas, rescale=False):
        """Test function with test time augmentation.

        Args:
            imgs (list[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains all images in the batch.
            img_metas (list[list[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch. each dict has image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[list[np.ndarray]]: BBox results of each image and classes.
                The outer list corresponds to each image. The inner list
                corresponds to each class.
        """
        assert hasattr(self.bbox_head, 'aug_test'), \
            f'{self.bbox_head.__class__.__name__}' \
            ' does not support test-time augmentation'
        print('aug-test:', len(imgs))
        feats = self.extract_feats(imgs)
        return [self.bbox_head.aug_test(feats, img_metas, rescale=rescale)]
+ x = self.extract_feat(img) + if proposals is None: + num_branch = (self.num_branch if self.test_branch_idx == -1 else 1) + trident_img_metas = img_metas * num_branch + proposal_list = self.rpn_head.simple_test_rpn(x, trident_img_metas) + else: + proposal_list = proposals + + return self.roi_head.simple_test( + x, proposal_list, trident_img_metas, rescale=rescale) + + def aug_test(self, imgs, img_metas, rescale=False): + """Test with augmentations. + + If rescale is False, then returned bboxes and masks will fit the scale + of imgs[0]. + """ + x = self.extract_feats(imgs) + num_branch = (self.num_branch if self.test_branch_idx == -1 else 1) + trident_img_metas = [img_metas * num_branch for img_metas in img_metas] + proposal_list = self.rpn_head.aug_test_rpn(x, trident_img_metas) + return self.roi_head.aug_test( + x, proposal_list, img_metas, rescale=rescale) + + def forward_train(self, img, img_metas, gt_bboxes, gt_labels, **kwargs): + """make copies of img and gts to fit multi-branch.""" + trident_gt_bboxes = tuple(gt_bboxes * self.num_branch) + trident_gt_labels = tuple(gt_labels * self.num_branch) + trident_img_metas = tuple(img_metas * self.num_branch) + + return super(TridentFasterRCNN, + self).forward_train(img, trident_img_metas, + trident_gt_bboxes, trident_gt_labels) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/two_stage.py b/insightface/detection/scrfd/mmdet/models/detectors/two_stage.py new file mode 100755 index 0000000000000000000000000000000000000000..d66923f32605dc58f0d78a1eed92a846577cae0d --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/two_stage.py @@ -0,0 +1,210 @@ +import torch +import torch.nn as nn + +# from mmdet.core import bbox2result, bbox2roi, build_assigner, build_sampler +from ..builder import DETECTORS, build_backbone, build_head, build_neck +from .base import BaseDetector + + +@DETECTORS.register_module() +class TwoStageDetector(BaseDetector): + """Base class for two-stage detectors. 
+ + Two-stage detectors typically consisting of a region proposal network and a + task-specific regression head. + """ + + def __init__(self, + backbone, + neck=None, + rpn_head=None, + roi_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(TwoStageDetector, self).__init__() + self.backbone = build_backbone(backbone) + + if neck is not None: + self.neck = build_neck(neck) + + if rpn_head is not None: + rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None + rpn_head_ = rpn_head.copy() + rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) + self.rpn_head = build_head(rpn_head_) + + if roi_head is not None: + # update train and test cfg here for now + # TODO: refactor assigner & sampler + rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None + roi_head.update(train_cfg=rcnn_train_cfg) + roi_head.update(test_cfg=test_cfg.rcnn) + self.roi_head = build_head(roi_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.init_weights(pretrained=pretrained) + + @property + def with_rpn(self): + """bool: whether the detector has RPN""" + return hasattr(self, 'rpn_head') and self.rpn_head is not None + + @property + def with_roi_head(self): + """bool: whether the detector has a RoI head""" + return hasattr(self, 'roi_head') and self.roi_head is not None + + def init_weights(self, pretrained=None): + """Initialize the weights in detector. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + super(TwoStageDetector, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + if self.with_neck: + if isinstance(self.neck, nn.Sequential): + for m in self.neck: + m.init_weights() + else: + self.neck.init_weights() + if self.with_rpn: + self.rpn_head.init_weights() + if self.with_roi_head: + self.roi_head.init_weights(pretrained) + + def extract_feat(self, img): + """Directly extract features from the backbone+neck.""" + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def forward_dummy(self, img): + """Used for computing network flops. + + See `mmdetection/tools/get_flops.py` + """ + outs = () + # backbone + x = self.extract_feat(img) + # rpn + if self.with_rpn: + rpn_outs = self.rpn_head(x) + outs = outs + (rpn_outs, ) + proposals = torch.randn(1000, 4).to(img.device) + # roi_head + roi_outs = self.roi_head.forward_dummy(x, proposals) + outs = outs + (roi_outs, ) + return outs + + def forward_train(self, + img, + img_metas, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + gt_masks=None, + proposals=None, + **kwargs): + """ + Args: + img (Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + + img_metas (list[dict]): list of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + + gt_labels (list[Tensor]): class indices corresponding to each box + + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + gt_masks (None | Tensor) : true segmentation masks for each box + used if the architecture supports a segmentation task. 
+ + proposals : override rpn proposals with custom proposals. Use when + `with_rpn` is False. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + x = self.extract_feat(img) + + losses = dict() + + # RPN forward and loss + if self.with_rpn: + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + rpn_losses, proposal_list = self.rpn_head.forward_train( + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_ignore=gt_bboxes_ignore, + proposal_cfg=proposal_cfg) + losses.update(rpn_losses) + else: + proposal_list = proposals + + roi_losses = self.roi_head.forward_train(x, img_metas, proposal_list, + gt_bboxes, gt_labels, + gt_bboxes_ignore, gt_masks, + **kwargs) + losses.update(roi_losses) + + return losses + + async def async_simple_test(self, + img, + img_meta, + proposals=None, + rescale=False): + """Async test without augmentation.""" + assert self.with_bbox, 'Bbox head must be implemented.' + x = self.extract_feat(img) + + if proposals is None: + proposal_list = await self.rpn_head.async_simple_test_rpn( + x, img_meta) + else: + proposal_list = proposals + + return await self.roi_head.async_simple_test( + x, proposal_list, img_meta, rescale=rescale) + + def simple_test(self, img, img_metas, proposals=None, rescale=False): + """Test without augmentation.""" + assert self.with_bbox, 'Bbox head must be implemented.' + + x = self.extract_feat(img) + + if proposals is None: + proposal_list = self.rpn_head.simple_test_rpn(x, img_metas) + else: + proposal_list = proposals + + return self.roi_head.simple_test( + x, proposal_list, img_metas, rescale=rescale) + + def aug_test(self, imgs, img_metas, rescale=False): + """Test with augmentations. + + If rescale is False, then returned bboxes and masks will fit the scale + of imgs[0]. 
+ """ + x = self.extract_feats(imgs) + proposal_list = self.rpn_head.aug_test_rpn(x, img_metas) + return self.roi_head.aug_test( + x, proposal_list, img_metas, rescale=rescale) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/vfnet.py b/insightface/detection/scrfd/mmdet/models/detectors/vfnet.py new file mode 100755 index 0000000000000000000000000000000000000000..e23f89674c919921219ffd3486587a2d3c318fbd --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/vfnet.py @@ -0,0 +1,18 @@ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class VFNet(SingleStageDetector): + """Implementation of `VarifocalNet + (VFNet).`_""" + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(VFNet, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/detectors/yolact.py b/insightface/detection/scrfd/mmdet/models/detectors/yolact.py new file mode 100755 index 0000000000000000000000000000000000000000..5f6c7ff09b5adb792a2560cd5b4ee374f42f2a59 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/yolact.py @@ -0,0 +1,146 @@ +import torch + +from mmdet.core import bbox2result +from ..builder import DETECTORS, build_head +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class YOLACT(SingleStageDetector): + """Implementation of `YOLACT `_""" + + def __init__(self, + backbone, + neck, + bbox_head, + segm_head, + mask_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(YOLACT, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) + self.segm_head = build_head(segm_head) + self.mask_head = build_head(mask_head) + self.init_segm_mask_weights() + + def init_segm_mask_weights(self): + """Initialize weights of the YOLACT semg head and YOLACT mask head.""" + 
self.segm_head.init_weights() + self.mask_head.init_weights() + + def forward_dummy(self, img): + """Used for computing network flops. + + See `mmdetection/tools/get_flops.py` + """ + raise NotImplementedError + + def forward_train(self, + img, + img_metas, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + gt_masks=None): + """ + Args: + img (Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + img_metas (list[dict]): list of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + gt_masks (None | Tensor) : true segmentation masks for each box + used if the architecture supports a segmentation task. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + # convert Bitmap mask or Polygon Mask to Tensor here + gt_masks = [ + gt_mask.to_tensor(dtype=torch.uint8, device=img.device) + for gt_mask in gt_masks + ] + + x = self.extract_feat(img) + + cls_score, bbox_pred, coeff_pred = self.bbox_head(x) + bbox_head_loss_inputs = (cls_score, bbox_pred) + (gt_bboxes, gt_labels, + img_metas) + losses, sampling_results = self.bbox_head.loss( + *bbox_head_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + + segm_head_outs = self.segm_head(x[0]) + loss_segm = self.segm_head.loss(segm_head_outs, gt_masks, gt_labels) + losses.update(loss_segm) + + mask_pred = self.mask_head(x[0], coeff_pred, gt_bboxes, img_metas, + sampling_results) + loss_mask = self.mask_head.loss(mask_pred, gt_masks, gt_bboxes, + img_metas, sampling_results) + losses.update(loss_mask) + + # check NaN and Inf + for loss_name in losses.keys(): + assert torch.isfinite(torch.stack(losses[loss_name]))\ + .all().item(), '{} becomes infinite or NaN!'\ + .format(loss_name) + + return losses + + def simple_test(self, img, img_metas, rescale=False): + """Test function without test time augmentation.""" + x = self.extract_feat(img) + + cls_score, bbox_pred, coeff_pred = self.bbox_head(x) + + bbox_inputs = (cls_score, bbox_pred, + coeff_pred) + (img_metas, self.test_cfg, rescale) + det_bboxes, det_labels, det_coeffs = self.bbox_head.get_bboxes( + *bbox_inputs) + bbox_results = [ + bbox2result(det_bbox, det_label, self.bbox_head.num_classes) + for det_bbox, det_label in zip(det_bboxes, det_labels) + ] + + num_imgs = len(img_metas) + scale_factors = tuple(meta['scale_factor'] for meta in img_metas) + if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes): + segm_results = [[[] for _ in range(self.mask_head.num_classes)] + for _ in range(num_imgs)] + else: + # if det_bboxes is rescaled to the original image size, we need to + # rescale it back to the testing scale to obtain RoIs. 
+ if rescale and not isinstance(scale_factors[0], float): + scale_factors = [ + torch.from_numpy(scale_factor).to(det_bboxes[0].device) + for scale_factor in scale_factors + ] + _bboxes = [ + det_bboxes[i][:, :4] * + scale_factors[i] if rescale else det_bboxes[i][:, :4] + for i in range(len(det_bboxes)) + ] + mask_preds = self.mask_head(x[0], det_coeffs, _bboxes, img_metas) + # apply mask post-processing to each image individually + segm_results = [] + for i in range(num_imgs): + if det_bboxes[i].shape[0] == 0: + segm_results.append( + [[] for _ in range(self.mask_head.num_classes)]) + else: + segm_result = self.mask_head.get_seg_masks( + mask_preds[i], det_labels[i], img_metas[i], rescale) + segm_results.append(segm_result) + return list(zip(bbox_results, segm_results)) + + def aug_test(self, imgs, img_metas, rescale=False): + """Test with augmentations.""" + raise NotImplementedError diff --git a/insightface/detection/scrfd/mmdet/models/detectors/yolo.py b/insightface/detection/scrfd/mmdet/models/detectors/yolo.py new file mode 100755 index 0000000000000000000000000000000000000000..240aab20f857befe25e64114300ebb15a66c6a70 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/detectors/yolo.py @@ -0,0 +1,18 @@ +# Copyright (c) 2019 Western Digital Corporation or its affiliates. 
+ +from ..builder import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class YOLOV3(SingleStageDetector): + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(YOLOV3, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/insightface/detection/scrfd/mmdet/models/losses/__init__.py b/insightface/detection/scrfd/mmdet/models/losses/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..bb887d3735df692aa0c7b3496c18add6b9c52391 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/losses/__init__.py @@ -0,0 +1,28 @@ +from .accuracy import Accuracy, accuracy +from .ae_loss import AssociativeEmbeddingLoss +from .balanced_l1_loss import BalancedL1Loss, balanced_l1_loss +from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, + cross_entropy, mask_cross_entropy) +from .focal_loss import FocalLoss, sigmoid_focal_loss +from .gaussian_focal_loss import GaussianFocalLoss +from .gfocal_loss import DistributionFocalLoss, QualityFocalLoss +from .ghm_loss import GHMC, GHMR +from .iou_loss import (BoundedIoULoss, CIoULoss, DIoULoss, GIoULoss, IoULoss, + bounded_iou_loss, iou_loss) +from .mse_loss import MSELoss, mse_loss +from .pisa_loss import carl_loss, isr_p +from .smooth_l1_loss import L1Loss, SmoothL1Loss, l1_loss, smooth_l1_loss +from .utils import reduce_loss, weight_reduce_loss, weighted_loss +from .varifocal_loss import VarifocalLoss + +__all__ = [ + 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', + 'mask_cross_entropy', 'CrossEntropyLoss', 'sigmoid_focal_loss', + 'FocalLoss', 'smooth_l1_loss', 'SmoothL1Loss', 'balanced_l1_loss', + 'BalancedL1Loss', 'mse_loss', 'MSELoss', 'iou_loss', 'bounded_iou_loss', + 'IoULoss', 'BoundedIoULoss', 'GIoULoss', 'DIoULoss', 'CIoULoss', 'GHMC', + 'GHMR', 'reduce_loss', 'weight_reduce_loss', 'weighted_loss', 'L1Loss', + 'l1_loss', 
'isr_p', 'carl_loss', 'AssociativeEmbeddingLoss', + 'GaussianFocalLoss', 'QualityFocalLoss', 'DistributionFocalLoss', + 'VarifocalLoss' +] diff --git a/insightface/detection/scrfd/mmdet/models/losses/accuracy.py b/insightface/detection/scrfd/mmdet/models/losses/accuracy.py new file mode 100755 index 0000000000000000000000000000000000000000..924ebbed7739b47515e27c03cd814a7c5db200d1 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/losses/accuracy.py @@ -0,0 +1,76 @@ +import torch.nn as nn + + +def accuracy(pred, target, topk=1, thresh=None): + """Calculate accuracy according to the prediction and target. + + Args: + pred (torch.Tensor): The model prediction, shape (N, num_class) + target (torch.Tensor): The target of each prediction, shape (N, ) + topk (int | tuple[int], optional): If the predictions in ``topk`` + matches the target, the predictions will be regarded as + correct ones. Defaults to 1. + thresh (float, optional): If not None, predictions with scores under + this threshold are considered incorrect. Default to None. + + Returns: + float | tuple[float]: If the input ``topk`` is a single integer, + the function will return a single float as accuracy. If + ``topk`` is a tuple containing multiple integers, the + function will return a tuple containing accuracies of + each ``topk`` number. + """ + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + return_single = True + else: + return_single = False + + maxk = max(topk) + if pred.size(0) == 0: + accu = [pred.new_tensor(0.) 
for i in range(len(topk))] + return accu[0] if return_single else accu + assert pred.ndim == 2 and target.ndim == 1 + assert pred.size(0) == target.size(0) + assert maxk <= pred.size(1), \ + f'maxk {maxk} exceeds pred dimension {pred.size(1)}' + pred_value, pred_label = pred.topk(maxk, dim=1) + pred_label = pred_label.t() # transpose to shape (maxk, N) + correct = pred_label.eq(target.view(1, -1).expand_as(pred_label)) + if thresh is not None: + # Only prediction values larger than thresh are counted as correct + correct = correct & (pred_value > thresh).t() + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / pred.size(0))) + return res[0] if return_single else res + + +class Accuracy(nn.Module): + + def __init__(self, topk=(1, ), thresh=None): + """Module to calculate the accuracy. + + Args: + topk (tuple, optional): The criterion used to calculate the + accuracy. Defaults to (1,). + thresh (float, optional): If not None, predictions with scores + under this threshold are considered incorrect. Default to None. + """ + super().__init__() + self.topk = topk + self.thresh = thresh + + def forward(self, pred, target): + """Forward function to calculate accuracy. + + Args: + pred (torch.Tensor): Prediction of models. + target (torch.Tensor): Target for each prediction. + + Returns: + tuple[float]: The accuracies under different topk criterions. 
+ """ + return accuracy(pred, target, self.topk, self.thresh) diff --git a/insightface/detection/scrfd/mmdet/models/losses/ae_loss.py b/insightface/detection/scrfd/mmdet/models/losses/ae_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..6077652cff7e3dcd13f81ce8d2a85b52ac8b99f8 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/losses/ae_loss.py @@ -0,0 +1,100 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES + + +def ae_loss_per_image(tl_preds, br_preds, match): + """Associative Embedding Loss in one image. + + Associative Embedding Loss including two parts: pull loss and push loss. + Pull loss makes embedding vectors from same object closer to each other. + Push loss distinguish embedding vector from different objects, and makes + the gap between them is large enough. + + During computing, usually there are 3 cases: + - no object in image: both pull loss and push loss will be 0. + - one object in image: push loss will be 0 and pull loss is computed + by the two corner of the only object. + - more than one objects in image: pull loss is computed by corner pairs + from each object, push loss is computed by each object with all + other objects. We use confusion matrix with 0 in diagonal to + compute the push loss. + + Args: + tl_preds (tensor): Embedding feature map of left-top corner. + br_preds (tensor): Embedding feature map of bottim-right corner. + match (list): Downsampled coordinates pair of each ground truth box. + """ + + tl_list, br_list, me_list = [], [], [] + if len(match) == 0: # no object in image + pull_loss = tl_preds.sum() * 0. + push_loss = tl_preds.sum() * 0. 
+ else: + for m in match: + [tl_y, tl_x], [br_y, br_x] = m + tl_e = tl_preds[:, tl_y, tl_x].view(-1, 1) + br_e = br_preds[:, br_y, br_x].view(-1, 1) + tl_list.append(tl_e) + br_list.append(br_e) + me_list.append((tl_e + br_e) / 2.0) + + tl_list = torch.cat(tl_list) + br_list = torch.cat(br_list) + me_list = torch.cat(me_list) + + assert tl_list.size() == br_list.size() + + # N is object number in image, M is dimension of embedding vector + N, M = tl_list.size() + + pull_loss = (tl_list - me_list).pow(2) + (br_list - me_list).pow(2) + pull_loss = pull_loss.sum() / N + + margin = 1 # exp setting of CornerNet, details in section 3.3 of paper + + # confusion matrix of push loss + conf_mat = me_list.expand((N, N, M)).permute(1, 0, 2) - me_list + conf_weight = 1 - torch.eye(N).type_as(me_list) + conf_mat = conf_weight * (margin - conf_mat.sum(-1).abs()) + + if N > 1: # more than one object in current image + push_loss = F.relu(conf_mat).sum() / (N * (N - 1)) + else: + push_loss = tl_preds.sum() * 0. + + return pull_loss, push_loss + + +@LOSSES.register_module() +class AssociativeEmbeddingLoss(nn.Module): + """Associative Embedding Loss. + + More details can be found in + `Associative Embedding `_ and + `CornerNet `_ . + Code is modified from `kp_utils.py `_ # noqa: E501 + + Args: + pull_weight (float): Loss weight for corners from same object. + push_weight (float): Loss weight for corners from different object. 
+ """ + + def __init__(self, pull_weight=0.25, push_weight=0.25): + super(AssociativeEmbeddingLoss, self).__init__() + self.pull_weight = pull_weight + self.push_weight = push_weight + + def forward(self, pred, target, match): + """Forward function.""" + batch = pred.size(0) + pull_all, push_all = 0.0, 0.0 + for i in range(batch): + pull, push = ae_loss_per_image(pred[i], target[i], match[i]) + + pull_all += self.pull_weight * pull + push_all += self.push_weight * push + + return pull_all, push_all diff --git a/insightface/detection/scrfd/mmdet/models/losses/balanced_l1_loss.py b/insightface/detection/scrfd/mmdet/models/losses/balanced_l1_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..3790a80b8a72e8405c068ba4097ae0046b68e7f5 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/losses/balanced_l1_loss.py @@ -0,0 +1,118 @@ +import numpy as np +import torch +import torch.nn as nn + +from ..builder import LOSSES +from .utils import weighted_loss + + +@weighted_loss +def balanced_l1_loss(pred, + target, + beta=1.0, + alpha=0.5, + gamma=1.5, + reduction='mean'): + """Calculate balanced L1 loss. + + Please see the `Libra R-CNN `_ + + Args: + pred (torch.Tensor): The prediction with shape (N, 4). + target (torch.Tensor): The learning target of the prediction with + shape (N, 4). + beta (float): The loss is a piecewise function of prediction and target + and ``beta`` serves as a threshold for the difference between the + prediction and target. Defaults to 1.0. + alpha (float): The denominator ``alpha`` in the balanced L1 loss. + Defaults to 0.5. + gamma (float): The ``gamma`` in the balanced L1 loss. + Defaults to 1.5. + reduction (str, optional): The method that reduces the loss to a + scalar. Options are "none", "mean" and "sum". 
+ + Returns: + torch.Tensor: The calculated loss + """ + assert beta > 0 + assert pred.size() == target.size() and target.numel() > 0 + + diff = torch.abs(pred - target) + b = np.e**(gamma / alpha) - 1 + loss = torch.where( + diff < beta, alpha / b * + (b * diff + 1) * torch.log(b * diff / beta + 1) - alpha * diff, + gamma * diff + gamma / b - alpha * beta) + + return loss + + +@LOSSES.register_module() +class BalancedL1Loss(nn.Module): + """Balanced L1 Loss. + + arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019) + + Args: + alpha (float): The denominator ``alpha`` in the balanced L1 loss. + Defaults to 0.5. + gamma (float): The ``gamma`` in the balanced L1 loss. Defaults to 1.5. + beta (float, optional): The loss is a piecewise function of prediction + and target. ``beta`` serves as a threshold for the difference + between the prediction and target. Defaults to 1.0. + reduction (str, optional): The method that reduces the loss to a + scalar. Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of the loss. Defaults to 1.0 + """ + + def __init__(self, + alpha=0.5, + gamma=1.5, + beta=1.0, + reduction='mean', + loss_weight=1.0): + super(BalancedL1Loss, self).__init__() + self.alpha = alpha + self.gamma = gamma + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function of loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, 4). + target (torch.Tensor): The learning target of the prediction with + shape (N, 4). + weight (torch.Tensor, optional): Sample-wise loss weight with + shape (N, ). + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". 
+ + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * balanced_l1_loss( + pred, + target, + weight, + alpha=self.alpha, + gamma=self.gamma, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox diff --git a/insightface/detection/scrfd/mmdet/models/losses/cross_entropy_loss.py b/insightface/detection/scrfd/mmdet/models/losses/cross_entropy_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..1f283dd8d0b43182bb5148bd048d52a9a236ce44 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/losses/cross_entropy_loss.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES +from .utils import weight_reduce_loss + + +def cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None, + class_weight=None): + """Calculate the CrossEntropy loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + reduction (str, optional): The method used to reduce the loss. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. 
def _expand_onehot_labels(labels, label_weights, label_channels):
    """Expand index labels to one-hot labels and matching per-class weights.

    Args:
        labels (torch.Tensor): Index labels with shape (N,). Entries outside
            ``[0, label_channels)`` (e.g. ignore labels) produce all-zero rows.
        label_weights (torch.Tensor | None): Per-sample weights with shape
            (N,), or None.
        label_channels (int): Number of classes C.

    Returns:
        tuple: one-hot labels of shape (N, C) and weights of shape (N, C)
        (or None when ``label_weights`` is None).
    """
    bin_labels = labels.new_full((labels.size(0), label_channels), 0)
    valid = torch.nonzero(
        (labels >= 0) & (labels < label_channels), as_tuple=False).squeeze()
    if valid.numel() > 0:
        bin_labels[valid, labels[valid]] = 1

    if label_weights is None:
        bin_label_weights = None
    else:
        # Broadcast the per-sample weight across every class column.
        bin_label_weights = label_weights.view(-1, 1).expand(
            label_weights.size(0), label_channels)

    return bin_labels, bin_label_weights


def binary_cross_entropy(pred,
                         label,
                         weight=None,
                         reduction='mean',
                         avg_factor=None,
                         class_weight=None):
    """Calculate the binary cross-entropy loss (with logits).

    Args:
        pred (torch.Tensor): The prediction with shape (N, 1) or (N, C).
        label (torch.Tensor): The learning label of the prediction. Index
            labels are expanded to one-hot when their dim differs from pred.
        weight (torch.Tensor, optional): Sample-wise loss weight.
        reduction (str, optional): "none", "mean" or "sum".
        avg_factor (int, optional): Average factor used to average the loss.
            Defaults to None.
        class_weight (list[float], optional): Weight for each class, passed
            as ``pos_weight`` to BCE-with-logits.

    Returns:
        torch.Tensor: The calculated loss.
    """
    if pred.dim() != label.dim():
        label, weight = _expand_onehot_labels(label, weight, pred.size(-1))

    if weight is not None:
        weight = weight.float()
    # Element-wise loss first; reduction/weighting is delegated to the helper.
    loss = F.binary_cross_entropy_with_logits(
        pred, label.float(), pos_weight=class_weight, reduction='none')
    loss = weight_reduce_loss(
        loss, weight, reduction=reduction, avg_factor=avg_factor)

    return loss


def mask_cross_entropy(pred,
                       target,
                       label,
                       reduction='mean',
                       avg_factor=None,
                       class_weight=None):
    """Calculate the cross-entropy loss for masks.

    Args:
        pred (torch.Tensor): The prediction with shape (N, C), C is the
            number of classes.
        target (torch.Tensor): The learning label of the prediction.
        label (torch.Tensor): Class label of each mask's corresponding
            object; selects the class slice when the mask prediction is not
            class-agnostic.
        reduction (str, optional): Only "mean" is supported.
        avg_factor (int, optional): Only None is supported.
        class_weight (list[float], optional): The weight for each class.

    Returns:
        torch.Tensor: The calculated loss (shape (1,)).
    """
    # TODO: handle these two reserved arguments
    assert reduction == 'mean' and avg_factor is None
    num_rois = pred.size()[0]
    row_inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
    # Pick, for every ROI, the channel of its ground-truth class.
    pred_slice = pred[row_inds, label].squeeze(1)
    return F.binary_cross_entropy_with_logits(
        pred_slice, target, weight=class_weight, reduction='mean')[None]


@LOSSES.register_module()
class CrossEntropyLoss(nn.Module):
    """Cross-entropy loss supporting softmax, sigmoid and mask variants.

    Args:
        use_sigmoid (bool, optional): Whether the prediction uses sigmoid
            instead of softmax. Defaults to False.
        use_mask (bool, optional): Whether to use mask cross-entropy loss.
            Defaults to False.
        reduction (str, optional): "none", "mean" or "sum". Defaults to
            'mean'.
        class_weight (list[float], optional): Weight of each class.
            Defaults to None.
        loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
    """

    def __init__(self,
                 use_sigmoid=False,
                 use_mask=False,
                 reduction='mean',
                 class_weight=None,
                 loss_weight=1.0):
        super(CrossEntropyLoss, self).__init__()
        # The sigmoid and mask variants are mutually exclusive.
        assert (use_sigmoid is False) or (use_mask is False)
        self.use_sigmoid = use_sigmoid
        self.use_mask = use_mask
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.class_weight = class_weight

        if self.use_sigmoid:
            self.cls_criterion = binary_cross_entropy
        elif self.use_mask:
            self.cls_criterion = mask_cross_entropy
        else:
            self.cls_criterion = cross_entropy

    def forward(self,
                cls_score,
                label,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        """Forward function.

        Args:
            cls_score (torch.Tensor): The prediction.
            label (torch.Tensor): The learning label of the prediction.
            weight (torch.Tensor, optional): Sample-wise loss weight.
            avg_factor (int, optional): Average factor that is used to
                average the loss. Defaults to None.
            reduction_override (str, optional): Overrides the configured
                reduction; one of None, "none", "mean", "sum".

        Returns:
            torch.Tensor: The calculated loss.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if self.class_weight is not None:
            class_weight = cls_score.new_tensor(
                self.class_weight, device=cls_score.device)
        else:
            class_weight = None
        loss_cls = self.loss_weight * self.cls_criterion(
            cls_score,
            label,
            weight,
            class_weight=class_weight,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return loss_cls
# This method is only for debugging
def py_sigmoid_focal_loss(pred,
                          target,
                          weight=None,
                          gamma=2.0,
                          alpha=0.25,
                          reduction='mean',
                          avg_factor=None):
    """Pure-PyTorch sigmoid focal loss (reference implementation).

    Args:
        pred (torch.Tensor): The prediction with shape (N, C), C is the
            number of classes.
        target (torch.Tensor): The learning label of the prediction.
        weight (torch.Tensor, optional): Sample-wise loss weight.
        gamma (float, optional): Exponent of the modulating factor.
            Defaults to 2.0.
        alpha (float, optional): Balancing factor. Defaults to 0.25.
        reduction (str, optional): "none", "mean" or "sum".
        avg_factor (int, optional): Average factor that is used to average
            the loss. Defaults to None.
    """
    prob = pred.sigmoid()
    target = target.type_as(pred)
    # pt = probability of the wrong class; large pt => easy example downweighted.
    pt = (1 - prob) * target + prob * (1 - target)
    focal_weight = (alpha * target + (1 - alpha) *
                    (1 - target)) * pt.pow(gamma)
    loss = F.binary_cross_entropy_with_logits(
        pred, target, reduction='none') * focal_weight
    return weight_reduce_loss(loss, weight, reduction, avg_factor)


def sigmoid_focal_loss(pred,
                       target,
                       weight=None,
                       gamma=2.0,
                       alpha=0.25,
                       reduction='mean',
                       avg_factor=None):
    r"""A wrapper around the CUDA implementation of Focal Loss.

    Args:
        pred (torch.Tensor): The prediction with shape (N, C), C is the
            number of classes.
        target (torch.Tensor): The learning label of the prediction.
        weight (torch.Tensor, optional): Sample-wise loss weight.
        gamma (float, optional): Exponent of the modulating factor.
            Defaults to 2.0.
        alpha (float, optional): Balancing factor. Defaults to 0.25.
        reduction (str, optional): "none", "mean" or "sum". Defaults to
            'mean'.
        avg_factor (int, optional): Average factor that is used to average
            the loss. Defaults to None.
    """
    # Function.apply does not accept keyword arguments, so the decorator
    # "weighted_loss" is not applicable here.
    loss = _sigmoid_focal_loss(pred.contiguous(), target, gamma, alpha, None,
                               'none')
    if weight is not None:
        if weight.shape != loss.shape:
            if weight.size(0) == loss.size(0):
                # Typical case: weight is (num_priors,), missing the class
                # axis; add it so broadcasting works.
                weight = weight.view(-1, 1)
            else:
                # Per-anchor-per-class weight flattened to
                # (num_priors * num_class,) while loss is
                # (num_priors, num_class); reshape to match.
                assert weight.numel() == loss.numel()
                weight = weight.view(loss.size(0), -1)
        assert weight.ndim == loss.ndim
    return weight_reduce_loss(loss, weight, reduction, avg_factor)


@LOSSES.register_module()
class FocalLoss(nn.Module):
    """Focal Loss for dense object detection.

    Args:
        use_sigmoid (bool, optional): Whether the prediction is used with
            sigmoid. Only True is supported. Defaults to True.
        gamma (float, optional): Exponent of the modulating factor.
            Defaults to 2.0.
        alpha (float, optional): Balancing factor. Defaults to 0.25.
        reduction (str, optional): "none", "mean" or "sum". Defaults to
            'mean'.
        loss_weight (float, optional): Weight of loss. Defaults to 1.0.
    """

    def __init__(self,
                 use_sigmoid=True,
                 gamma=2.0,
                 alpha=0.25,
                 reduction='mean',
                 loss_weight=1.0):
        super(FocalLoss, self).__init__()
        assert use_sigmoid is True, 'Only sigmoid focal loss supported now.'
        self.use_sigmoid = use_sigmoid
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): The learning label of the prediction.
            weight (torch.Tensor, optional): Per-prediction loss weight.
                Defaults to None.
            avg_factor (int, optional): Average factor that is used to
                average the loss. Defaults to None.
            reduction_override (str, optional): Overrides the configured
                reduction; one of None, "none", "mean", "sum".

        Returns:
            torch.Tensor: The calculated loss.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if not self.use_sigmoid:
            raise NotImplementedError
        return self.loss_weight * sigmoid_focal_loss(
            pred,
            target,
            weight,
            gamma=self.gamma,
            alpha=self.alpha,
            reduction=reduction,
            avg_factor=avg_factor)
@weighted_loss
def gaussian_focal_loss(pred, gaussian_target, alpha=2.0, gamma=4.0):
    """Focal loss for targets drawn as gaussian heatmaps (CornerNet-style).

    Args:
        pred (torch.Tensor): The prediction.
        gaussian_target (torch.Tensor): Learning target of the prediction in
            gaussian distribution (not a 0/1 binary target).
        alpha (float, optional): Power of the prediction. Defaults to 2.0.
        gamma (float, optional): Power of the target for negative samples.
            Defaults to 4.0.
    """
    eps = 1e-12  # guards log(0)
    pos_mask = gaussian_target.eq(1)
    neg_mask = (1 - gaussian_target).pow(gamma)
    pos_term = -(pred + eps).log() * (1 - pred).pow(alpha) * pos_mask
    neg_term = -(1 - pred + eps).log() * pred.pow(alpha) * neg_mask
    return pos_term + neg_term


@LOSSES.register_module()
class GaussianFocalLoss(nn.Module):
    """A focal-loss variant whose target is a gaussian heatmap.

    Please notice that the target here is a gaussian heatmap, not a 0/1
    binary target.

    Args:
        alpha (float): Power of prediction. Defaults to 2.0.
        gamma (float): Power of target for negative samples.
            Defaults to 4.0.
        reduction (str): "none", "mean" or "sum". Defaults to 'mean'.
        loss_weight (float): Loss weight of current loss. Defaults to 1.0.
    """

    def __init__(self,
                 alpha=2.0,
                 gamma=4.0,
                 reduction='mean',
                 loss_weight=1.0):
        super(GaussianFocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): Learning target of the prediction in
                gaussian distribution.
            weight (torch.Tensor, optional): Per-prediction loss weight.
                Defaults to None.
            avg_factor (int, optional): Average factor used to average the
                loss. Defaults to None.
            reduction_override (str, optional): Overrides the configured
                reduction; one of None, "none", "mean", "sum".
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        return self.loss_weight * gaussian_focal_loss(
            pred,
            target,
            weight,
            alpha=self.alpha,
            gamma=self.gamma,
            reduction=reduction,
            avg_factor=avg_factor)
@weighted_loss
def quality_focal_loss(pred, target, beta=2.0):
    r"""Quality Focal Loss (QFL) from `Generalized Focal Loss
    `_.

    Args:
        pred (torch.Tensor): Joint representation of classification and
            quality (IoU) estimation, shape (N, C).
        target (tuple([torch.Tensor])): Category labels with shape (N,) and
            quality labels with shape (N,).
        beta (float): Exponent of the modulating factor. Defaults to 2.0.

    Returns:
        torch.Tensor: Loss tensor with shape (N,).
    """
    assert len(target) == 2, """target for QFL must be a tuple of two elements,
        including category label and quality label, respectively"""
    # label: category id; score: quality (IoU) score
    label, score = target

    # All entries start as negatives, supervised towards a 0 quality score.
    pred_sigmoid = pred.sigmoid()
    scale_factor = pred_sigmoid
    zerolabel = scale_factor.new_zeros(pred.shape)
    loss = F.binary_cross_entropy_with_logits(
        pred, zerolabel, reduction='none') * scale_factor.pow(beta)

    # FG cat_id: [0, num_classes - 1]; BG cat_id: num_classes
    bg_class_ind = pred.size(1)
    pos = ((label >= 0) & (label < bg_class_ind)).nonzero().squeeze(1)
    pos_label = label[pos].long()
    # Positives are supervised by the bbox quality (IoU) score instead.
    scale_factor = score[pos] - pred_sigmoid[pos, pos_label]
    loss[pos, pos_label] = F.binary_cross_entropy_with_logits(
        pred[pos, pos_label], score[pos],
        reduction='none') * scale_factor.abs().pow(beta)

    return loss.sum(dim=1, keepdim=False)


@weighted_loss
def distribution_focal_loss(pred, label):
    r"""Distribution Focal Loss (DFL) from `Generalized Focal Loss
    `_.

    Args:
        pred (torch.Tensor): General distribution of bounding boxes (before
            softmax), shape (N, n+1) where n is the max value of the
            integral set ``{0, ..., n}`` in the paper.
        label (torch.Tensor): Target distance label for bounding boxes,
            shape (N,).

    Returns:
        torch.Tensor: Loss tensor with shape (N,).
    """
    # Supervise the two integer bins bracketing the continuous label,
    # weighted by the label's distance to each bin.
    left_idx = label.long()
    right_idx = left_idx + 1
    w_left = right_idx.float() - label
    w_right = label - left_idx.float()
    return (F.cross_entropy(pred, left_idx, reduction='none') * w_left +
            F.cross_entropy(pred, right_idx, reduction='none') * w_right)


@LOSSES.register_module()
class QualityFocalLoss(nn.Module):
    r"""Quality Focal Loss (QFL), a Generalized Focal Loss variant.

    Args:
        use_sigmoid (bool): Whether sigmoid is used in QFL. Only True is
            supported. Defaults to True.
        beta (float): Exponent of the modulating factor. Defaults to 2.0.
        reduction (str): "none", "mean" or "sum". Defaults to 'mean'.
        loss_weight (float): Loss weight of current loss. Defaults to 1.0.
    """

    def __init__(self,
                 use_sigmoid=True,
                 beta=2.0,
                 reduction='mean',
                 loss_weight=1.0):
        super(QualityFocalLoss, self).__init__()
        assert use_sigmoid is True, 'Only sigmoid in QFL supported now.'
        self.use_sigmoid = use_sigmoid
        self.beta = beta
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): Joint representation of classification and
                quality (IoU) estimation, shape (N, C).
            target (tuple([torch.Tensor])): Category labels (N,) and
                quality labels (N,).
            weight (torch.Tensor, optional): Per-prediction loss weight.
                Defaults to None.
            avg_factor (int, optional): Average factor used to average the
                loss. Defaults to None.
            reduction_override (str, optional): Overrides the configured
                reduction. Defaults to None.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if not self.use_sigmoid:
            raise NotImplementedError
        return self.loss_weight * quality_focal_loss(
            pred,
            target,
            weight,
            beta=self.beta,
            reduction=reduction,
            avg_factor=avg_factor)


@LOSSES.register_module()
class DistributionFocalLoss(nn.Module):
    r"""Distribution Focal Loss (DFL), a Generalized Focal Loss variant.

    Args:
        reduction (str): "none", "mean" or "sum". Defaults to 'mean'.
        loss_weight (float): Loss weight of current loss. Defaults to 1.0.
    """

    def __init__(self, reduction='mean', loss_weight=1.0):
        super(DistributionFocalLoss, self).__init__()
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): General distribution of bounding boxes
                (before softmax), shape (N, n+1).
            target (torch.Tensor): Target distance label for bounding boxes,
                shape (N,).
            weight (torch.Tensor, optional): Per-prediction loss weight.
                Defaults to None.
            avg_factor (int, optional): Average factor used to average the
                loss. Defaults to None.
            reduction_override (str, optional): Overrides the configured
                reduction. Defaults to None.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        return self.loss_weight * distribution_focal_loss(
            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
def _expand_onehot_labels(labels, label_weights, label_channels):
    """Expand index labels into one-hot form with matching weights."""
    bin_labels = labels.new_full((labels.size(0), label_channels), 0)
    valid = torch.nonzero(
        (labels >= 0) & (labels < label_channels), as_tuple=False).squeeze()
    if valid.numel() > 0:
        bin_labels[valid, labels[valid]] = 1
    bin_label_weights = label_weights.view(-1, 1).expand(
        label_weights.size(0), label_channels)
    return bin_labels, bin_label_weights


# TODO: code refactoring to make it consistent with other losses
@LOSSES.register_module()
class GHMC(nn.Module):
    """GHM Classification Loss (`Gradient Harmonized Single-stage Detector`).

    Args:
        bins (int): Number of unit regions for the gradient-density
            distribution.
        momentum (float): Moving-average momentum for the bin counts.
        use_sigmoid (bool): Can only be True (BCE-based) for now.
        loss_weight (float): Weight of the total GHM-C loss.
    """

    def __init__(self, bins=10, momentum=0, use_sigmoid=True, loss_weight=1.0):
        super(GHMC, self).__init__()
        self.bins = bins
        self.momentum = momentum
        edges = torch.arange(bins + 1).float() / bins
        self.register_buffer('edges', edges)
        # Nudge the last edge so g == 1 falls inside the final bin.
        self.edges[-1] += 1e-6
        if momentum > 0:
            self.register_buffer('acc_sum', torch.zeros(bins))
        self.use_sigmoid = use_sigmoid
        if not self.use_sigmoid:
            raise NotImplementedError
        self.loss_weight = loss_weight

    def forward(self, pred, target, label_weight, *args, **kwargs):
        """Calculate the GHM-C loss.

        Args:
            pred (float tensor of size [batch_num, class_num]):
                Direct prediction of the classification fc layer.
            target (float tensor of size [batch_num, class_num]):
                Binary class target for each sample.
            label_weight (float tensor of size [batch_num, class_num]):
                1 if the sample is valid, 0 if ignored.
        Returns:
            The gradient harmonized loss.
        """
        # The target should be a binary class label; expand if needed.
        if pred.dim() != target.dim():
            target, label_weight = _expand_onehot_labels(
                target, label_weight, pred.size(-1))
        target, label_weight = target.float(), label_weight.float()
        edges = self.edges
        mmt = self.momentum
        weights = torch.zeros_like(pred)

        # Gradient length: |sigmoid(pred) - target|, detached.
        g = torch.abs(pred.sigmoid().detach() - target)

        valid = label_weight > 0
        tot = max(valid.float().sum().item(), 1.0)
        n = 0  # number of non-empty bins
        for i in range(self.bins):
            inds = (g >= edges[i]) & (g < edges[i + 1]) & valid
            num_in_bin = inds.sum().item()
            if num_in_bin > 0:
                if mmt > 0:
                    # Exponential moving average of the bin population.
                    self.acc_sum[i] = mmt * self.acc_sum[i] \
                        + (1 - mmt) * num_in_bin
                    weights[inds] = tot / self.acc_sum[i]
                else:
                    weights[inds] = tot / num_in_bin
                n += 1
        if n > 0:
            weights = weights / n

        loss = F.binary_cross_entropy_with_logits(
            pred, target, weights, reduction='sum') / tot
        return loss * self.loss_weight


# TODO: code refactoring to make it consistent with other losses
@LOSSES.register_module()
class GHMR(nn.Module):
    """GHM Regression Loss (`Gradient Harmonized Single-stage Detector`).

    Args:
        mu (float): Parameter of the Authentic Smooth L1 loss.
        bins (int): Number of unit regions for the gradient-density
            distribution.
        momentum (float): Moving-average momentum for the bin counts.
        loss_weight (float): Weight of the total GHM-R loss.
    """

    def __init__(self, mu=0.02, bins=10, momentum=0, loss_weight=1.0):
        super(GHMR, self).__init__()
        self.mu = mu
        self.bins = bins
        edges = torch.arange(bins + 1).float() / bins
        self.register_buffer('edges', edges)
        # Open-ended last bin for regression gradients.
        self.edges[-1] = 1e3
        self.momentum = momentum
        if momentum > 0:
            self.register_buffer('acc_sum', torch.zeros(bins))
        self.loss_weight = loss_weight

    # TODO: support reduction parameter
    def forward(self, pred, target, label_weight, avg_factor=None):
        """Calculate the GHM-R loss.

        Args:
            pred (float tensor of size [batch_num, 4 (* class_num)]):
                Prediction of the box regression layer. Channel number can
                be 4 or 4 * class_num depending on class-agnosticism.
            target (float tensor of size [batch_num, 4 (* class_num)]):
                Target regression values with the same size as pred.
            label_weight (float tensor of size [batch_num, 4 (* class_num)]):
                Weight of each sample, 0 if ignored.
        Returns:
            The gradient harmonized loss.
        """
        mu = self.mu
        edges = self.edges
        mmt = self.momentum

        # Authentic Smooth L1: sqrt(d^2 + mu^2) - mu.
        diff = pred - target
        loss = torch.sqrt(diff * diff + mu * mu) - mu

        # Gradient length of ASL1, detached from the graph.
        g = torch.abs(diff / torch.sqrt(mu * mu + diff * diff)).detach()
        weights = torch.zeros_like(g)

        valid = label_weight > 0
        tot = max(label_weight.float().sum().item(), 1.0)
        n = 0  # number of non-empty bins
        for i in range(self.bins):
            inds = (g >= edges[i]) & (g < edges[i + 1]) & valid
            num_in_bin = inds.sum().item()
            if num_in_bin > 0:
                n += 1
                if mmt > 0:
                    self.acc_sum[i] = mmt * self.acc_sum[i] \
                        + (1 - mmt) * num_in_bin
                    weights[inds] = tot / self.acc_sum[i]
                else:
                    weights[inds] = tot / num_in_bin
        if n > 0:
            weights /= n

        loss = (loss * weights).sum() / tot
        return loss * self.loss_weight
@weighted_loss
def bounded_iou_loss(pred, target, beta=0.2, eps=1e-3):
    """BIoULoss, from `Improving Object Localization with Fitness NMS and
    Bounded IoU Loss `_.

    Args:
        pred (torch.Tensor): Predicted bboxes (x1, y1, x2, y2).
        target (torch.Tensor): Target bboxes.
        beta (float): beta parameter in smooth-L1.
        eps (float): eps to avoid NaN.
    """
    pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5
    pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5
    pred_w = pred[:, 2] - pred[:, 0]
    pred_h = pred[:, 3] - pred[:, 1]
    with torch.no_grad():
        target_ctrx = (target[:, 0] + target[:, 2]) * 0.5
        target_ctry = (target[:, 1] + target[:, 3]) * 0.5
        target_w = target[:, 2] - target[:, 0]
        target_h = target[:, 3] - target[:, 1]

    dx = target_ctrx - pred_ctrx
    dy = target_ctry - pred_ctry

    # Per-coordinate bounded-IoU terms (centre offsets and size ratios).
    loss_dx = 1 - torch.max(
        (target_w - 2 * dx.abs()) /
        (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx))
    loss_dy = 1 - torch.max(
        (target_h - 2 * dy.abs()) /
        (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy))
    loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w /
                            (target_w + eps))
    loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h /
                            (target_h + eps))
    loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh],
                            dim=-1).view(loss_dx.size(0), -1)

    # Smooth-L1-style transition controlled by beta.
    loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta,
                       loss_comb - 0.5 * beta)
    return loss


@weighted_loss
def giou_loss(pred, target, eps=1e-7):
    r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding
    Box Regression `_.

    Args:
        pred (torch.Tensor): Predicted bboxes (x1, y1, x2, y2), shape (n, 4).
        target (torch.Tensor): Corresponding gt bboxes, shape (n, 4).
        eps (float): Eps to avoid division by zero.

    Return:
        Tensor: Loss tensor.
    """
    gious = bbox_overlaps(pred, target, mode='giou', is_aligned=True, eps=eps)
    loss = 1 - gious
    return loss


@weighted_loss
def diou_loss(pred, target, eps=1e-7):
    r"""Distance-IoU Loss (https://arxiv.org/abs/1911.08287).

    Code is modified from https://github.com/Zzh-tju/DIoU.

    Args:
        pred (Tensor): Predicted bboxes (x1, y1, x2, y2), shape (n, 4).
        target (Tensor): Corresponding gt bboxes, shape (n, 4).
        eps (float): Eps to avoid division by zero.
    Return:
        Tensor: Loss tensor.
    """
    # overlap
    lt = torch.max(pred[:, :2], target[:, :2])
    rb = torch.min(pred[:, 2:], target[:, 2:])
    wh = (rb - lt).clamp(min=0)
    overlap = wh[:, 0] * wh[:, 1]

    # union
    ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
    ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
    union = ap + ag - overlap + eps

    # IoU
    ious = overlap / union

    # smallest enclosing box diagonal squared (c^2)
    enclose_x1y1 = torch.min(pred[:, :2], target[:, :2])
    enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:])
    enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0)
    cw = enclose_wh[:, 0]
    ch = enclose_wh[:, 1]
    c2 = cw**2 + ch**2 + eps

    # centre-distance squared (rho^2)
    b1_x1, b1_y1 = pred[:, 0], pred[:, 1]
    b1_x2, b1_y2 = pred[:, 2], pred[:, 3]
    b2_x1, b2_y1 = target[:, 0], target[:, 1]
    b2_x2, b2_y2 = target[:, 2], target[:, 3]
    left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
    right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
    rho2 = left + right

    # DIoU = IoU - rho^2 / c^2
    dious = ious - rho2 / c2
    loss = 1 - dious
    return loss


@weighted_loss
def ciou_loss(pred, target, eps=1e-7):
    r"""Complete-IoU Loss, from `Enhancing Geometric Factors into Model
    Learning and Inference for Object Detection and Instance Segmentation
    `_.

    Code is modified from https://github.com/Zzh-tju/CIoU.

    Args:
        pred (Tensor): Predicted bboxes (x1, y1, x2, y2), shape (n, 4).
        target (Tensor): Corresponding gt bboxes, shape (n, 4).
        eps (float): Eps to avoid division by zero.
    Return:
        Tensor: Loss tensor.
    """
    # overlap
    lt = torch.max(pred[:, :2], target[:, :2])
    rb = torch.min(pred[:, 2:], target[:, 2:])
    wh = (rb - lt).clamp(min=0)
    overlap = wh[:, 0] * wh[:, 1]

    # union
    ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
    ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
    union = ap + ag - overlap + eps

    # IoU
    ious = overlap / union

    # smallest enclosing box diagonal squared (c^2)
    enclose_x1y1 = torch.min(pred[:, :2], target[:, :2])
    enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:])
    enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0)
    cw = enclose_wh[:, 0]
    ch = enclose_wh[:, 1]
    c2 = cw**2 + ch**2 + eps

    b1_x1, b1_y1 = pred[:, 0], pred[:, 1]
    b1_x2, b1_y2 = pred[:, 2], pred[:, 3]
    b2_x1, b2_y1 = target[:, 0], target[:, 1]
    b2_x2, b2_y2 = target[:, 2], target[:, 3]

    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps

    # centre-distance squared (rho^2)
    left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
    right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
    rho2 = left + right

    # aspect-ratio consistency term v
    factor = 4 / math.pi**2
    v = factor * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)

    # CIoU = IoU - (rho^2 / c^2 + v^2 / (1 - IoU + v))
    cious = ious - (rho2 / c2 + v**2 / (1 - ious + v))
    loss = 1 - cious
    return loss


@LOSSES.register_module()
class IoULoss(nn.Module):
    """IoULoss.

    Computing the IoU loss between a set of predicted bboxes and target
    bboxes.

    Args:
        eps (float): Eps to avoid log(0).
        reduction (str): "none", "mean" or "sum".
        loss_weight (float): Weight of loss.
    """

    def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
        super(IoULoss, self).__init__()
        self.eps = eps
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        """Forward function.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): The learning target of the prediction.
            weight (torch.Tensor, optional): Per-prediction loss weight.
                Defaults to None.
            avg_factor (int, optional): Average factor used to average the
                loss. Defaults to None.
            reduction_override (str, optional): Overrides the configured
                reduction. Defaults to None. One of "none", "mean", "sum".
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if (weight is not None) and (not torch.any(weight > 0)) and (
                reduction != 'none'):
            # All weights are zero: return a graph-connected zero.
            return (pred * weight).sum()  # 0
        if weight is not None and weight.dim() > 1:
            # TODO: remove this in the future
            # reduce the weight of shape (n, 4) to (n,) to match the
            # iou_loss of shape (n,)
            assert weight.shape == pred.shape
            weight = weight.mean(-1)
        loss = self.loss_weight * iou_loss(
            pred,
            target,
            weight,
            eps=self.eps,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return loss


@LOSSES.register_module()
class BoundedIoULoss(nn.Module):
    """Bounded-IoU loss module wrapping :func:`bounded_iou_loss`."""

    def __init__(self, beta=0.2, eps=1e-3, reduction='mean', loss_weight=1.0):
        super(BoundedIoULoss, self).__init__()
        self.beta = beta
        self.eps = eps
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        if weight is not None and not torch.any(weight > 0):
            return (pred * weight).sum()  # 0
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        loss = self.loss_weight * bounded_iou_loss(
            pred,
            target,
            weight,
            beta=self.beta,
            eps=self.eps,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return loss


@LOSSES.register_module()
class GIoULoss(nn.Module):
    """GIoU loss module wrapping :func:`giou_loss`."""

    def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
        super(GIoULoss, self).__init__()
        self.eps = eps
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        if weight is not None and not torch.any(weight > 0):
            # BUGFIX: removed a leftover debug print and dead commented-out
            # code here. With no positive weights, return a zero that still
            # keeps the computation graph connected; broadcast (n,) weights
            # to pred's (n, 4) shape first.
            if pred.dim() == weight.dim() + 1:
                weight = weight.unsqueeze(1)
            return (pred * weight).sum()  # 0
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if weight is not None and weight.dim() > 1:
            # TODO: remove this in the future
            # reduce the weight of shape (n, 4) to (n,) to match the
            # giou_loss of shape (n,)
            assert weight.shape == pred.shape
            weight = weight.mean(-1)
        loss = self.loss_weight * giou_loss(
            pred,
            target,
            weight,
            eps=self.eps,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return loss


@LOSSES.register_module()
class DIoULoss(nn.Module):
    """DIoU loss module wrapping :func:`diou_loss`."""

    def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
        super(DIoULoss, self).__init__()
        self.eps = eps
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        if weight is not None and not torch.any(weight > 0):
            return (pred * weight).sum()  # 0
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if weight is not None and weight.dim() > 1:
            # TODO: remove this in the future
            # reduce the weight of shape (n, 4) to (n,) to match the
            # diou_loss of shape (n,)
            assert weight.shape == pred.shape
            weight = weight.mean(-1)
        loss = self.loss_weight * diou_loss(
            pred,
            target,
            weight,
            eps=self.eps,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return loss


@LOSSES.register_module()
class CIoULoss(nn.Module):
    """CIoU loss module wrapping :func:`ciou_loss`."""

    def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
        super(CIoULoss, self).__init__()
        self.eps = eps
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        if weight is not None and not torch.any(weight > 0):
            return (pred * weight).sum()  # 0
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if weight is not None and weight.dim() > 1:
            # TODO: remove this in the future
            # reduce the weight of shape (n, 4) to (n,) to match the
            # ciou_loss of shape (n,)
            assert weight.shape == pred.shape
            weight = weight.mean(-1)
        loss = self.loss_weight * ciou_loss(
            pred,
            target,
            weight,
            eps=self.eps,
            reduction=reduction,
            avg_factor=avg_factor,
            **kwargs)
        return loss
Defaults to 1.0 + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super().__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None): + """Forward function of loss. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): Weight of the loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + + Returns: + torch.Tensor: The calculated loss + """ + loss = self.loss_weight * mse_loss( + pred, + target, + weight, + reduction=self.reduction, + avg_factor=avg_factor) + return loss diff --git a/insightface/detection/scrfd/mmdet/models/losses/pisa_loss.py b/insightface/detection/scrfd/mmdet/models/losses/pisa_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..fb907a2ea9a0470063d5bb1cfdf5c7c3a054995f --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/losses/pisa_loss.py @@ -0,0 +1,180 @@ +import torch + +from mmdet.core import bbox_overlaps + + +def isr_p(cls_score, + bbox_pred, + bbox_targets, + rois, + sampling_results, + loss_cls, + bbox_coder, + k=2, + bias=0, + num_class=80): + """Importance-based Sample Reweighting (ISR_P), positive part. + + Args: + cls_score (Tensor): Predicted classification scores. + bbox_pred (Tensor): Predicted bbox deltas. + bbox_targets (tuple[Tensor]): A tuple of bbox targets, the are + labels, label_weights, bbox_targets, bbox_weights, respectively. + rois (Tensor): Anchors (single_stage) in shape (n, 4) or RoIs + (two_stage) in shape (n, 5). + sampling_results (obj): Sampling results. + loss_cls (func): Classification loss func of the head. + bbox_coder (obj): BBox coder of the head. + k (float): Power of the non-linear mapping. + bias (float): Shift of the non-linear mapping. + num_class (int): Number of classes, default: 80. 
+ + Return: + tuple([Tensor]): labels, imp_based_label_weights, bbox_targets, + bbox_target_weights + """ + + labels, label_weights, bbox_targets, bbox_weights = bbox_targets + pos_label_inds = ((labels >= 0) & + (labels < num_class)).nonzero().reshape(-1) + pos_labels = labels[pos_label_inds] + + # if no positive samples, return the original targets + num_pos = float(pos_label_inds.size(0)) + if num_pos == 0: + return labels, label_weights, bbox_targets, bbox_weights + + # merge pos_assigned_gt_inds of per image to a single tensor + gts = list() + last_max_gt = 0 + for i in range(len(sampling_results)): + gt_i = sampling_results[i].pos_assigned_gt_inds + gts.append(gt_i + last_max_gt) + if len(gt_i) != 0: + last_max_gt = gt_i.max() + 1 + gts = torch.cat(gts) + assert len(gts) == num_pos + + cls_score = cls_score.detach() + bbox_pred = bbox_pred.detach() + + # For single stage detectors, rois here indicate anchors, in shape (N, 4) + # For two stage detectors, rois are in shape (N, 5) + if rois.size(-1) == 5: + pos_rois = rois[pos_label_inds][:, 1:] + else: + pos_rois = rois[pos_label_inds] + + if bbox_pred.size(-1) > 4: + bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4) + pos_delta_pred = bbox_pred[pos_label_inds, pos_labels].view(-1, 4) + else: + pos_delta_pred = bbox_pred[pos_label_inds].view(-1, 4) + + # compute iou of the predicted bbox and the corresponding GT + pos_delta_target = bbox_targets[pos_label_inds].view(-1, 4) + pos_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_pred) + target_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_target) + ious = bbox_overlaps(pos_bbox_pred, target_bbox_pred, is_aligned=True) + + pos_imp_weights = label_weights[pos_label_inds] + # Two steps to compute IoU-HLR. 
Samples are first sorted by IoU locally, + # then sorted again within the same-rank group + max_l_num = pos_labels.bincount().max() + for label in pos_labels.unique(): + l_inds = (pos_labels == label).nonzero().view(-1) + l_gts = gts[l_inds] + for t in l_gts.unique(): + t_inds = l_inds[l_gts == t] + t_ious = ious[t_inds] + _, t_iou_rank_idx = t_ious.sort(descending=True) + _, t_iou_rank = t_iou_rank_idx.sort() + ious[t_inds] += max_l_num - t_iou_rank.float() + l_ious = ious[l_inds] + _, l_iou_rank_idx = l_ious.sort(descending=True) + _, l_iou_rank = l_iou_rank_idx.sort() # IoU-HLR + # linearly map HLR to label weights + pos_imp_weights[l_inds] *= (max_l_num - l_iou_rank.float()) / max_l_num + + pos_imp_weights = (bias + pos_imp_weights * (1 - bias)).pow(k) + + # normalize to make the new weighted loss value equal to the original loss + pos_loss_cls = loss_cls( + cls_score[pos_label_inds], pos_labels, reduction_override='none') + if pos_loss_cls.dim() > 1: + ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds][:, + None] + new_pos_loss_cls = pos_loss_cls * pos_imp_weights[:, None] + else: + ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds] + new_pos_loss_cls = pos_loss_cls * pos_imp_weights + pos_loss_cls_ratio = ori_pos_loss_cls.sum() / new_pos_loss_cls.sum() + pos_imp_weights = pos_imp_weights * pos_loss_cls_ratio + label_weights[pos_label_inds] = pos_imp_weights + + bbox_targets = labels, label_weights, bbox_targets, bbox_weights + return bbox_targets + + +def carl_loss(cls_score, + labels, + bbox_pred, + bbox_targets, + loss_bbox, + k=1, + bias=0.2, + avg_factor=None, + sigmoid=False, + num_class=80): + """Classification-Aware Regression Loss (CARL). + + Args: + cls_score (Tensor): Predicted classification scores. + labels (Tensor): Targets of classification. + bbox_pred (Tensor): Predicted bbox deltas. + bbox_targets (Tensor): Target of bbox regression. + loss_bbox (func): Regression loss func of the head. 
+ bbox_coder (obj): BBox coder of the head. + k (float): Power of the non-linear mapping. + bias (float): Shift of the non-linear mapping. + avg_factor (int): Average factor used in regression loss. + sigmoid (bool): Activation of the classification score. + num_class (int): Number of classes, default: 80. + + Return: + dict: CARL loss dict. + """ + pos_label_inds = ((labels >= 0) & + (labels < num_class)).nonzero().reshape(-1) + if pos_label_inds.numel() == 0: + return dict(loss_carl=cls_score.sum()[None] * 0.) + pos_labels = labels[pos_label_inds] + + # multiply pos_cls_score with the corresponding bbox weight + # and remain gradient + if sigmoid: + pos_cls_score = cls_score.sigmoid()[pos_label_inds, pos_labels] + else: + pos_cls_score = cls_score.softmax(-1)[pos_label_inds, pos_labels] + carl_loss_weights = (bias + (1 - bias) * pos_cls_score).pow(k) + + # normalize carl_loss_weight to make its sum equal to num positive + num_pos = float(pos_cls_score.size(0)) + weight_ratio = num_pos / carl_loss_weights.sum() + carl_loss_weights *= weight_ratio + + if avg_factor is None: + avg_factor = bbox_targets.size(0) + # if is class agnostic, bbox pred is in shape (N, 4) + # otherwise, bbox pred is in shape (N, #classes, 4) + if bbox_pred.size(-1) > 4: + bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4) + pos_bbox_preds = bbox_pred[pos_label_inds, pos_labels] + else: + pos_bbox_preds = bbox_pred[pos_label_inds] + ori_loss_reg = loss_bbox( + pos_bbox_preds, + bbox_targets[pos_label_inds], + reduction_override='none') / avg_factor + loss_carl = (ori_loss_reg * carl_loss_weights[:, None]).sum() + return dict(loss_carl=loss_carl[None]) diff --git a/insightface/detection/scrfd/mmdet/models/losses/smooth_l1_loss.py b/insightface/detection/scrfd/mmdet/models/losses/smooth_l1_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..ad5e8a4dfbf77697ce8fefdd02731dd8b29592a8 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/losses/smooth_l1_loss.py 
@@ -0,0 +1,136 @@ +import torch +import torch.nn as nn + +from ..builder import LOSSES +from .utils import weighted_loss + + +@weighted_loss +def smooth_l1_loss(pred, target, beta=1.0): + """Smooth L1 loss. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + + Returns: + torch.Tensor: Calculated loss + """ + assert beta > 0 + assert pred.size() == target.size() and target.numel() > 0 + diff = torch.abs(pred - target) + loss = torch.where(diff < beta, 0.5 * diff * diff / beta, + diff - 0.5 * beta) + return loss + + +@weighted_loss +def l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + + Returns: + torch.Tensor: Calculated loss + """ + assert pred.size() == target.size() and target.numel() > 0 + loss = torch.abs(pred - target) + return loss + + +@LOSSES.register_module() +class SmoothL1Loss(nn.Module): + """Smooth L1 loss. + + Args: + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". Defaults to "mean". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, beta=1.0, reduction='mean', loss_weight=1.0): + super(SmoothL1Loss, self).__init__() + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. 
+ reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * smooth_l1_loss( + pred, + target, + weight, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox + + +@LOSSES.register_module() +class L1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(L1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox diff --git a/insightface/detection/scrfd/mmdet/models/losses/utils.py b/insightface/detection/scrfd/mmdet/models/losses/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..ee90f3ac0eb4a3e341c29564f56d74e731ea0866 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/losses/utils.py @@ -0,0 +1,99 @@ +import functools + +import torch.nn.functional as F + + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + + +def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Avarage factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. 
+ """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + loss = loss.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +def weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) 
+ >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + #print('LLL', pred.shape, target.shape, loss.shape, weight.shape) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/insightface/detection/scrfd/mmdet/models/losses/varifocal_loss.py b/insightface/detection/scrfd/mmdet/models/losses/varifocal_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..6a84307fe0649f06a58e2530bde9857252e71db0 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/losses/varifocal_loss.py @@ -0,0 +1,131 @@ +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES +from .utils import weight_reduce_loss + + +def varifocal_loss(pred, + target, + weight=None, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='mean', + avg_factor=None): + """`Varifocal Loss `_ + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the + number of classes + target (torch.Tensor): The learning target of the iou-aware + classification score with shape (N, C), C is the number of classes. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + alpha (float, optional): A balance factor for the negative part of + Varifocal Loss, which is different from the alpha of Focal Loss. + Defaults to 0.75. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + iou_weighted (bool, optional): Whether to weight the loss of the + positive example with the iou target. Defaults to True. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. 
Options are "none", "mean" and + "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + # pred and target should be of the same size + assert pred.size() == target.size() + pred_sigmoid = pred.sigmoid() + target = target.type_as(pred) + if iou_weighted: + focal_weight = target * (target > 0.0).float() + \ + alpha * (pred_sigmoid - target).abs().pow(gamma) * \ + (target <= 0.0).float() + else: + focal_weight = (target > 0.0).float() + \ + alpha * (pred_sigmoid - target).abs().pow(gamma) * \ + (target <= 0.0).float() + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * focal_weight + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@LOSSES.register_module() +class VarifocalLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='mean', + loss_weight=1.0): + """`Varifocal Loss `_ + + Args: + use_sigmoid (bool, optional): Whether the prediction is + used for sigmoid or softmax. Defaults to True. + alpha (float, optional): A balance factor for the negative part of + Varifocal Loss, which is different from the alpha of Focal + Loss. Defaults to 0.75. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + iou_weighted (bool, optional): Whether to weight the loss of the + positive examples with the iou target. Defaults to True. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + """ + super(VarifocalLoss, self).__init__() + assert use_sigmoid is True, \ + 'Only sigmoid varifocal loss supported now.' 
+ assert alpha >= 0.0 + self.use_sigmoid = use_sigmoid + self.alpha = alpha + self.gamma = gamma + self.iou_weighted = iou_weighted + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + loss_cls = self.loss_weight * varifocal_loss( + pred, + target, + weight, + alpha=self.alpha, + gamma=self.gamma, + iou_weighted=self.iou_weighted, + reduction=reduction, + avg_factor=avg_factor) + else: + raise NotImplementedError + return loss_cls diff --git a/insightface/detection/scrfd/mmdet/models/necks/__init__.py b/insightface/detection/scrfd/mmdet/models/necks/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..6a45250fe676823ccccafb38cd5631c939d25e37 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/__init__.py @@ -0,0 +1,17 @@ +from .bfp import BFP +from .channel_mapper import ChannelMapper +from .fpn import FPN +from .fpn_carafe import FPN_CARAFE +from .hrfpn import HRFPN +from .nas_fpn import NASFPN +from .nasfcos_fpn import NASFCOS_FPN +from .pafpn import PAFPN +from .rfp import RFP +from .yolo_neck import YOLOV3Neck +from .lfpn import LFPN + +__all__ = [ + 'FPN', 'BFP', 'ChannelMapper', 'HRFPN', 'NASFPN', 
'FPN_CARAFE', 'PAFPN', + 'NASFCOS_FPN', 'RFP', 'YOLOV3Neck', + 'LFPN' +] diff --git a/insightface/detection/scrfd/mmdet/models/necks/bfp.py b/insightface/detection/scrfd/mmdet/models/necks/bfp.py new file mode 100755 index 0000000000000000000000000000000000000000..65f00fb3066492f69c86c7a4ee97df0187d9b4e9 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/bfp.py @@ -0,0 +1,104 @@ +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, xavier_init +from mmcv.cnn.bricks import NonLocal2d +import numpy as np +from ..builder import NECKS + + +@NECKS.register_module() +class BFP(nn.Module): + """BFP (Balanced Feature Pyrmamids) + + BFP takes multi-level features as inputs and gather them into a single one, + then refine the gathered feature and scatter the refined results to + multi-level features. This module is used in Libra R-CNN (CVPR 2019), see + the paper `Libra R-CNN: Towards Balanced Learning for Object Detection + `_ for details. + + Args: + in_channels (int): Number of input channels (feature maps of all levels + should have the same channels). + num_levels (int): Number of input feature levels. + conv_cfg (dict): The config dict for convolution layers. + norm_cfg (dict): The config dict for normalization layers. + refine_level (int): Index of integration and refine level of BSF in + multi-level features from bottom to top. + refine_type (str): Type of the refine op, currently support + [None, 'conv', 'non_local']. 
+ """ + + def __init__(self, + in_channels, + num_levels, + refine_level=2, + refine_type=None, + conv_cfg=None, + norm_cfg=None): + super(BFP, self).__init__() + assert refine_type in [None, 'conv', 'non_local'] + + self.in_channels = in_channels + self.num_levels = num_levels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.refine_level = refine_level + self.refine_type = refine_type + assert 0 <= self.refine_level < self.num_levels + + if self.refine_type == 'conv': + self.refine = ConvModule( + self.in_channels, + self.in_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + elif self.refine_type == 'non_local': + self.refine = NonLocal2d( + self.in_channels, + reduction=1, + use_scale=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + def init_weights(self): + """Initialize the weights of FPN module.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == self.num_levels + + # step 1: gather multi-level features by resize and average + feats = [] + gather_size = inputs[self.refine_level].size()[2:] + for i in range(self.num_levels): + if i < self.refine_level: + gathered = F.adaptive_max_pool2d( + inputs[i], output_size=gather_size) + else: + gathered = F.interpolate( + inputs[i], size=gather_size, mode='nearest') + feats.append(gathered) + + bsf = np.mean(feats) + + # step 2: refine gathered features + if self.refine_type is not None: + bsf = self.refine(bsf) + + # step 3: scatter refined features to multi-levels by a residual path + outs = [] + for i in range(self.num_levels): + out_size = inputs[i].size()[2:] + if i < self.refine_level: + residual = F.interpolate(bsf, size=out_size, mode='nearest') + else: + residual = F.adaptive_max_pool2d(bsf, output_size=out_size) + outs.append(residual + inputs[i]) + + return tuple(outs) diff --git 
a/insightface/detection/scrfd/mmdet/models/necks/channel_mapper.py b/insightface/detection/scrfd/mmdet/models/necks/channel_mapper.py new file mode 100755 index 0000000000000000000000000000000000000000..a4f5ed44caefb1612df67785b1f4f0d9ec46ee93 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/channel_mapper.py @@ -0,0 +1,74 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule, xavier_init + +from ..builder import NECKS + + +@NECKS.register_module() +class ChannelMapper(nn.Module): + r"""Channel Mapper to reduce/increase channels of backbone features. + + This is used to reduce/increase channels of backbone features. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + kernel_size (int, optional): kernel_size for reducing channels (used + at each scale). Default: 3. + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + act_cfg (dict, optional): Config dict for activation layer in + ConvModule. Default: dict(type='ReLU'). + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = ChannelMapper(in_channels, 11, 3).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU')): + super(ChannelMapper, self).__init__() + assert isinstance(in_channels, list) + + self.convs = nn.ModuleList() + for in_channel in in_channels: + self.convs.append( + ConvModule( + in_channel, + out_channels, + kernel_size, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + """Initialize the weights of ChannelMapper module.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.convs) + outs = [self.convs[i](inputs[i]) for i in range(len(inputs))] + return tuple(outs) diff --git a/insightface/detection/scrfd/mmdet/models/necks/fpn.py b/insightface/detection/scrfd/mmdet/models/necks/fpn.py new file mode 100755 index 0000000000000000000000000000000000000000..5e5dfe685964f06e7a66b63a13e66162e63fcafd --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/fpn.py @@ -0,0 +1,221 @@ +import warnings + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, xavier_init +from mmcv.runner import auto_fp16 + +from ..builder import NECKS + + +@NECKS.register_module() +class FPN(nn.Module): + r"""Feature Pyramid Network. + + This is an implementation of paper `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (List[int]): Number of input channels per scale. 
+ out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs + on the original feature from the backbone. If True, + it is equivalent to `add_extra_convs='on_input'`. If False, it is + equivalent to set `add_extra_convs='on_output'`. Default to True. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (str): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(mode='nearest')` + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + extra_convs_on_inputs=True, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest')): + super(FPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + if extra_convs_on_inputs: + # TODO: deprecate `extra_convs_on_inputs` + warnings.simplefilter('once') + warnings.warn( + '"extra_convs_on_inputs" will be deprecated in v2.9.0,' + 'Please use "add_extra_convs"', DeprecationWarning) + self.add_extra_convs = 'on_input' + else: + self.add_extra_convs = 'on_output' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in 
range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + """Initialize the weights of FPN module.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + @auto_fp16() + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. 
+ if 'scale_factor' in self.upsample_cfg: + laterals[i - 1] += F.interpolate(laterals[i], + **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] += F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/insightface/detection/scrfd/mmdet/models/necks/fpn_carafe.py b/insightface/detection/scrfd/mmdet/models/necks/fpn_carafe.py new file mode 100755 index 0000000000000000000000000000000000000000..b97a6aa73432279e8785bf54a0579c58db46e9d4 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/fpn_carafe.py @@ -0,0 +1,267 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule, build_upsample_layer, xavier_init +from mmcv.ops.carafe import CARAFEPack + +from ..builder import NECKS + + +@NECKS.register_module() +class FPN_CARAFE(nn.Module): + """FPN_CARAFE is a more flexible implementation of FPN. 
It allows more + choice for upsample methods during the top-down pathway. + + It can reproduce the preformance of ICCV 2019 paper + CARAFE: Content-Aware ReAssembly of FEatures + Please refer to https://arxiv.org/abs/1905.02188 for more details. + + Args: + in_channels (list[int]): Number of channels for each input feature map. + out_channels (int): Output channels of feature pyramids. + num_outs (int): Number of output stages. + start_level (int): Start level of feature pyramids. + (Default: 0) + end_level (int): End level of feature pyramids. + (Default: -1 indicates the last level). + norm_cfg (dict): Dictionary to construct and config norm layer. + activate (str): Type of activation function in ConvModule + (Default: None indicates w/o activation). + order (dict): Order of components in ConvModule. + upsample (str): Type of upsample layer. + upsample_cfg (dict): Dictionary to construct and config upsample layer. + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + norm_cfg=None, + act_cfg=None, + order=('conv', 'norm', 'act'), + upsample_cfg=dict( + type='carafe', + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1)): + super(FPN_CARAFE, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.with_bias = norm_cfg is None + self.upsample_cfg = upsample_cfg.copy() + self.upsample = self.upsample_cfg.get('type') + self.relu = nn.ReLU(inplace=False) + + self.order = order + assert order in [('conv', 'norm', 'act'), ('act', 'conv', 'norm')] + + assert self.upsample in [ + 'nearest', 'bilinear', 'deconv', 'pixel_shuffle', 'carafe', None + ] + if self.upsample in ['deconv', 'pixel_shuffle']: + assert hasattr( + self.upsample_cfg, + 'upsample_kernel') and self.upsample_cfg.upsample_kernel > 0 + 
self.upsample_kernel = self.upsample_cfg.pop('upsample_kernel') + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + self.upsample_modules = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + norm_cfg=norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + if i != self.backbone_end_level - 1: + upsample_cfg_ = self.upsample_cfg.copy() + if self.upsample == 'deconv': + upsample_cfg_.update( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=self.upsample_kernel, + stride=2, + padding=(self.upsample_kernel - 1) // 2, + output_padding=(self.upsample_kernel - 1) // 2) + elif self.upsample == 'pixel_shuffle': + upsample_cfg_.update( + in_channels=out_channels, + out_channels=out_channels, + scale_factor=2, + upsample_kernel=self.upsample_kernel) + elif self.upsample == 'carafe': + upsample_cfg_.update(channels=out_channels, scale_factor=2) + else: + # suppress warnings + align_corners = (None + if self.upsample == 'nearest' else False) + upsample_cfg_.update( + scale_factor=2, + mode=self.upsample, + align_corners=align_corners) + upsample_module = build_upsample_layer(upsample_cfg_) + self.upsample_modules.append(upsample_module) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_out_levels = ( + 
num_outs - self.backbone_end_level + self.start_level) + if extra_out_levels >= 1: + for i in range(extra_out_levels): + in_channels = ( + self.in_channels[self.backbone_end_level - + 1] if i == 0 else out_channels) + extra_l_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + if self.upsample == 'deconv': + upsampler_cfg_ = dict( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=self.upsample_kernel, + stride=2, + padding=(self.upsample_kernel - 1) // 2, + output_padding=(self.upsample_kernel - 1) // 2) + elif self.upsample == 'pixel_shuffle': + upsampler_cfg_ = dict( + in_channels=out_channels, + out_channels=out_channels, + scale_factor=2, + upsample_kernel=self.upsample_kernel) + elif self.upsample == 'carafe': + upsampler_cfg_ = dict( + channels=out_channels, + scale_factor=2, + **self.upsample_cfg) + else: + # suppress warnings + align_corners = (None + if self.upsample == 'nearest' else False) + upsampler_cfg_ = dict( + scale_factor=2, + mode=self.upsample, + align_corners=align_corners) + upsampler_cfg_['type'] = self.upsample + upsample_module = build_upsample_layer(upsampler_cfg_) + extra_fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + self.upsample_modules.append(upsample_module) + self.fpn_convs.append(extra_fpn_conv) + self.lateral_convs.append(extra_l_conv) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + """Initialize the weights of module.""" + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)): + xavier_init(m, distribution='uniform') + for m in self.modules(): + if isinstance(m, CARAFEPack): + m.init_weights() + + def slice_as(self, src, dst): + """Slice ``src`` as ``dst`` + + Note: + ``src`` should 
have the same or larger size than ``dst``. + + Args: + src (torch.Tensor): Tensors to be sliced. + dst (torch.Tensor): ``src`` will be sliced to have the same + size as ``dst``. + + Returns: + torch.Tensor: Sliced tensor. + """ + assert (src.size(2) >= dst.size(2)) and (src.size(3) >= dst.size(3)) + if src.size(2) == dst.size(2) and src.size(3) == dst.size(3): + return src + else: + return src[:, :, :dst.size(2), :dst.size(3)] + + def tensor_add(self, a, b): + """Add tensors ``a`` and ``b`` that might have different sizes.""" + if a.size() == b.size(): + c = a + b + else: + c = a + self.slice_as(b, a) + return c + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [] + for i, lateral_conv in enumerate(self.lateral_convs): + if i <= self.backbone_end_level - self.start_level: + input = inputs[min(i + self.start_level, len(inputs) - 1)] + else: + input = laterals[-1] + lateral = lateral_conv(input) + laterals.append(lateral) + + # build top-down path + for i in range(len(laterals) - 1, 0, -1): + if self.upsample is not None: + upsample_feat = self.upsample_modules[i - 1](laterals[i]) + else: + upsample_feat = laterals[i] + laterals[i - 1] = self.tensor_add(laterals[i - 1], upsample_feat) + + # build outputs + num_conv_outs = len(self.fpn_convs) + outs = [] + for i in range(num_conv_outs): + out = self.fpn_convs[i](laterals[i]) + outs.append(out) + return tuple(outs) diff --git a/insightface/detection/scrfd/mmdet/models/necks/hrfpn.py b/insightface/detection/scrfd/mmdet/models/necks/hrfpn.py new file mode 100755 index 0000000000000000000000000000000000000000..cf87cfa7918c3755e22290b8a1dfdc068db6d729 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/hrfpn.py @@ -0,0 +1,102 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, caffe2_xavier_init +from torch.utils.checkpoint import checkpoint + +from ..builder import 
NECKS + + +@NECKS.register_module() +class HRFPN(nn.Module): + """HRFPN (High Resolution Feature Pyrmamids) + + paper: `High-Resolution Representations for Labeling Pixels and Regions + `_. + + Args: + in_channels (list): number of channels for each branch. + out_channels (int): output channels of feature pyramids. + num_outs (int): number of output stages. + pooling_type (str): pooling for generating feature pyramids + from {MAX, AVG}. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + stride (int): stride of 3x3 convolutional layers + """ + + def __init__(self, + in_channels, + out_channels, + num_outs=5, + pooling_type='AVG', + conv_cfg=None, + norm_cfg=None, + with_cp=False, + stride=1): + super(HRFPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.reduction_conv = ConvModule( + sum(in_channels), + out_channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + act_cfg=None) + + self.fpn_convs = nn.ModuleList() + for i in range(self.num_outs): + self.fpn_convs.append( + ConvModule( + out_channels, + out_channels, + kernel_size=3, + padding=1, + stride=stride, + conv_cfg=self.conv_cfg, + act_cfg=None)) + + if pooling_type == 'MAX': + self.pooling = F.max_pool2d + else: + self.pooling = F.avg_pool2d + + def init_weights(self): + """Initialize the weights of module.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + caffe2_xavier_init(m) + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == self.num_ins + outs = [inputs[0]] + for i in range(1, self.num_ins): + outs.append( + F.interpolate(inputs[i], 
scale_factor=2**i, mode='bilinear')) + out = torch.cat(outs, dim=1) + if out.requires_grad and self.with_cp: + out = checkpoint(self.reduction_conv, out) + else: + out = self.reduction_conv(out) + outs = [out] + for i in range(1, self.num_outs): + outs.append(self.pooling(out, kernel_size=2**i, stride=2**i)) + outputs = [] + + for i in range(self.num_outs): + if outs[i].requires_grad and self.with_cp: + tmp_out = checkpoint(self.fpn_convs[i], outs[i]) + else: + tmp_out = self.fpn_convs[i](outs[i]) + outputs.append(tmp_out) + return tuple(outputs) diff --git a/insightface/detection/scrfd/mmdet/models/necks/lfpn.py b/insightface/detection/scrfd/mmdet/models/necks/lfpn.py new file mode 100755 index 0000000000000000000000000000000000000000..a82cf06585afced056df4e11ec40e40229acaa25 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/lfpn.py @@ -0,0 +1,176 @@ +import warnings + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, xavier_init +from mmcv.runner import auto_fp16 + +from ..builder import NECKS + + +@NECKS.register_module() +class LFPN(nn.Module): + r"""Feature Pyramid Network. + + This is an implementation of paper `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. + If str, it specifies the source feature map of the extra convs. 
+ Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs + on the original feature from the backbone. If True, + it is equivalent to `add_extra_convs='on_input'`. If False, it is + equivalent to set `add_extra_convs='on_output'`. Default to True. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (str): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(mode='nearest')` + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + extra_convs_on_inputs=True, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest')): + super(LFPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + if extra_convs_on_inputs: + # TODO: deprecate `extra_convs_on_inputs` + warnings.simplefilter('once') + warnings.warn( + '"extra_convs_on_inputs" will be deprecated in v2.9.0,' + 'Please use "add_extra_convs"', DeprecationWarning) + self.add_extra_convs = 'on_input' + else: + self.add_extra_convs = 'on_output' + + self.lateral_convs = nn.ModuleList() + + for i in range(self.start_level, 
self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + #self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + """Initialize the weights of FPN module.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + @auto_fp16() + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. 
+ if 'scale_factor' in self.upsample_cfg: + laterals[i - 1] += F.interpolate(laterals[i], + **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] += F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + laterals[i] for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 2, stride=2)) + return tuple(outs) diff --git a/insightface/detection/scrfd/mmdet/models/necks/nas_fpn.py b/insightface/detection/scrfd/mmdet/models/necks/nas_fpn.py new file mode 100755 index 0000000000000000000000000000000000000000..8e333ce65d4d06c47c29af489526ba3142736ad7 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/nas_fpn.py @@ -0,0 +1,160 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule, caffe2_xavier_init +from mmcv.ops.merge_cells import GlobalPoolingCell, SumCell + +from ..builder import NECKS + + +@NECKS.register_module() +class NASFPN(nn.Module): + """NAS-FPN. + + Implementation of `NAS-FPN: Learning Scalable Feature Pyramid Architecture + for Object Detection `_ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + stack_times (int): The number of times the pyramid architecture will + be stacked. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool): It decides whether to add conv + layers on top of the original feature maps. Default to False. 
+ If True, its actual mode is specified by `extra_convs_on_inputs`. + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + stack_times, + start_level=0, + end_level=-1, + add_extra_convs=False, + norm_cfg=None): + super(NASFPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) # num of input feature levels + self.num_outs = num_outs # num of output feature levels + self.stack_times = stack_times + self.norm_cfg = norm_cfg + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + + # add lateral connections + self.lateral_convs = nn.ModuleList() + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None) + self.lateral_convs.append(l_conv) + + # add extra downsample layers (stride-2 pooling or conv) + extra_levels = num_outs - self.backbone_end_level + self.start_level + self.extra_downsamples = nn.ModuleList() + for i in range(extra_levels): + extra_conv = ConvModule( + out_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + self.extra_downsamples.append( + nn.Sequential(extra_conv, nn.MaxPool2d(2, 2))) + + # add NAS FPN connections + self.fpn_stages = nn.ModuleList() + for _ in range(self.stack_times): + stage = nn.ModuleDict() + # gp(p6, p4) -> p4_1 + stage['gp_64_4'] = GlobalPoolingCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p4_1, p4) -> p4_2 + stage['sum_44_4'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + 
out_norm_cfg=norm_cfg) + # sum(p4_2, p3) -> p3_out + stage['sum_43_3'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p3_out, p4_2) -> p4_out + stage['sum_34_4'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p5, gp(p4_out, p3_out)) -> p5_out + stage['gp_43_5'] = GlobalPoolingCell(with_out_conv=False) + stage['sum_55_5'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p7, gp(p5_out, p4_2)) -> p7_out + stage['gp_54_7'] = GlobalPoolingCell(with_out_conv=False) + stage['sum_77_7'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # gp(p7_out, p5_out) -> p6_out + stage['gp_75_6'] = GlobalPoolingCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + self.fpn_stages.append(stage) + + def init_weights(self): + """Initialize the weights of module.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + caffe2_xavier_init(m) + + def forward(self, inputs): + """Forward function.""" + # build P3-P5 + feats = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + # build P6-P7 on top of P5 + for downsample in self.extra_downsamples: + feats.append(downsample(feats[-1])) + + p3, p4, p5, p6, p7 = feats + + for stage in self.fpn_stages: + # gp(p6, p4) -> p4_1 + p4_1 = stage['gp_64_4'](p6, p4, out_size=p4.shape[-2:]) + # sum(p4_1, p4) -> p4_2 + p4_2 = stage['sum_44_4'](p4_1, p4, out_size=p4.shape[-2:]) + # sum(p4_2, p3) -> p3_out + p3 = stage['sum_43_3'](p4_2, p3, out_size=p3.shape[-2:]) + # sum(p3_out, p4_2) -> p4_out + p4 = stage['sum_34_4'](p3, p4_2, out_size=p4.shape[-2:]) + # sum(p5, gp(p4_out, p3_out)) -> p5_out + p5_tmp = stage['gp_43_5'](p4, p3, out_size=p5.shape[-2:]) + p5 = stage['sum_55_5'](p5, p5_tmp, out_size=p5.shape[-2:]) + # sum(p7, gp(p5_out, p4_2)) -> p7_out + p7_tmp 
= stage['gp_54_7'](p5, p4_2, out_size=p7.shape[-2:]) + p7 = stage['sum_77_7'](p7, p7_tmp, out_size=p7.shape[-2:]) + # gp(p7_out, p5_out) -> p6_out + p6 = stage['gp_75_6'](p7, p5, out_size=p6.shape[-2:]) + + return p3, p4, p5, p6, p7 diff --git a/insightface/detection/scrfd/mmdet/models/necks/nasfcos_fpn.py b/insightface/detection/scrfd/mmdet/models/necks/nasfcos_fpn.py new file mode 100755 index 0000000000000000000000000000000000000000..2daf79ef591373499184c624ccd27fb7456dec06 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/nasfcos_fpn.py @@ -0,0 +1,161 @@ +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, caffe2_xavier_init +from mmcv.ops.merge_cells import ConcatCell + +from ..builder import NECKS + + +@NECKS.register_module() +class NASFCOS_FPN(nn.Module): + """FPN structure in NASFPN. + + Implementation of paper `NAS-FCOS: Fast Neural Architecture Search for + Object Detection `_ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool): It decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. 
+ """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=1, + end_level=-1, + add_extra_convs=False, + conv_cfg=None, + norm_cfg=None): + super(NASFCOS_FPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + + self.adapt_convs = nn.ModuleList() + for i in range(self.start_level, self.backbone_end_level): + adapt_conv = ConvModule( + in_channels[i], + out_channels, + 1, + stride=1, + padding=0, + bias=False, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU', inplace=False)) + self.adapt_convs.append(adapt_conv) + + # C2 is omitted according to the paper + extra_levels = num_outs - self.backbone_end_level + self.start_level + + def build_concat_cell(with_input1_conv, with_input2_conv): + cell_conv_cfg = dict( + kernel_size=1, padding=0, bias=False, groups=out_channels) + return ConcatCell( + in_channels=out_channels, + out_channels=out_channels, + with_out_conv=True, + out_conv_cfg=cell_conv_cfg, + out_norm_cfg=dict(type='BN'), + out_conv_order=('norm', 'act', 'conv'), + with_input1_conv=with_input1_conv, + with_input2_conv=with_input2_conv, + input_conv_cfg=conv_cfg, + input_norm_cfg=norm_cfg, + upsample_mode='nearest') + + # Denote c3=f0, c4=f1, c5=f2 for convince + self.fpn = nn.ModuleDict() + self.fpn['c22_1'] = build_concat_cell(True, True) + self.fpn['c22_2'] = build_concat_cell(True, True) + self.fpn['c32'] = build_concat_cell(True, False) + self.fpn['c02'] = build_concat_cell(True, False) + 
self.fpn['c42'] = build_concat_cell(True, True) + self.fpn['c36'] = build_concat_cell(True, True) + self.fpn['c61'] = build_concat_cell(True, True) # f9 + self.extra_downsamples = nn.ModuleList() + for i in range(extra_levels): + extra_act_cfg = None if i == 0 \ + else dict(type='ReLU', inplace=False) + self.extra_downsamples.append( + ConvModule( + out_channels, + out_channels, + 3, + stride=2, + padding=1, + act_cfg=extra_act_cfg, + order=('act', 'norm', 'conv'))) + + def forward(self, inputs): + """Forward function.""" + feats = [ + adapt_conv(inputs[i + self.start_level]) + for i, adapt_conv in enumerate(self.adapt_convs) + ] + + for (i, module_name) in enumerate(self.fpn): + idx_1, idx_2 = int(module_name[1]), int(module_name[2]) + res = self.fpn[module_name](feats[idx_1], feats[idx_2]) + feats.append(res) + + ret = [] + for (idx, input_idx) in zip([9, 8, 7], [1, 2, 3]): # add P3, P4, P5 + feats1, feats2 = feats[idx], feats[5] + feats2_resize = F.interpolate( + feats2, + size=feats1.size()[2:], + mode='bilinear', + align_corners=False) + + feats_sum = feats1 + feats2_resize + ret.append( + F.interpolate( + feats_sum, + size=inputs[input_idx].size()[2:], + mode='bilinear', + align_corners=False)) + + for submodule in self.extra_downsamples: + ret.append(submodule(ret[-1])) + + return tuple(ret) + + def init_weights(self): + """Initialize the weights of module.""" + for module in self.fpn.values(): + if hasattr(module, 'conv_out'): + caffe2_xavier_init(module.out_conv.conv) + + for modules in [ + self.adapt_convs.modules(), + self.extra_downsamples.modules() + ]: + for module in modules: + if isinstance(module, nn.Conv2d): + caffe2_xavier_init(module) diff --git a/insightface/detection/scrfd/mmdet/models/necks/pafpn.py b/insightface/detection/scrfd/mmdet/models/necks/pafpn.py new file mode 100755 index 0000000000000000000000000000000000000000..d7c0b50f29e882aacb5158b33ead3d4566d0ce0b --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/necks/pafpn.py @@ 
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmcv.runner import auto_fp16

from ..builder import NECKS
from .fpn import FPN


@NECKS.register_module()
class PAFPN(FPN):
    """Path Aggregation Network for Instance Segmentation.

    This is an implementation of the `PAFPN in Path Aggregation Network
    <https://arxiv.org/abs/1803.01534>`_.  It extends :class:`FPN` with an
    additional bottom-up path after the usual top-down path.

    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale).
        num_outs (int): Number of output scales.
        start_level (int): Index of the start input backbone level used to
            build the feature pyramid. Default: 0.
        end_level (int): Index of the end input backbone level (exclusive) to
            build the feature pyramid. Default: -1, which means the last level.
        add_extra_convs (bool): Whether to add conv layers on top of the
            original feature maps. Default: False.
        extra_convs_on_inputs (bool): Whether to apply extra conv on
            the original feature from the backbone. Default: False.
        relu_before_extra_convs (bool): Whether to apply relu before the extra
            conv. Default: False.
        no_norm_on_lateral (bool): Whether to apply norm on lateral.
            Default: False.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (str): Config dict for activation layer in ConvModule.
            Default: None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_outs,
                 start_level=0,
                 end_level=-1,
                 add_extra_convs=False,
                 extra_convs_on_inputs=True,
                 relu_before_extra_convs=False,
                 no_norm_on_lateral=False,
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=None):
        super(PAFPN,
              self).__init__(in_channels, out_channels, num_outs, start_level,
                             end_level, add_extra_convs, extra_convs_on_inputs,
                             relu_before_extra_convs, no_norm_on_lateral,
                             conv_cfg, norm_cfg, act_cfg)
        # One (stride-2 conv, 3x3 conv) pair per pyramid level above the
        # lowest one: the extra bottom-up aggregation path.
        self.downsample_convs = nn.ModuleList()
        self.pafpn_convs = nn.ModuleList()
        for _ in range(self.start_level + 1, self.backbone_end_level):
            self.downsample_convs.append(
                ConvModule(
                    out_channels,
                    out_channels,
                    3,
                    stride=2,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False))
            self.pafpn_convs.append(
                ConvModule(
                    out_channels,
                    out_channels,
                    3,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False))

    @auto_fp16()
    def forward(self, inputs):
        """Forward function."""
        assert len(inputs) == len(self.in_channels)

        # Step 1: lateral 1x1 convs on the selected backbone levels.
        laterals = [
            lateral_conv(inputs[i + self.start_level])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]
        num_levels = len(laterals)

        # Step 2: top-down pathway (upsample and accumulate in place).
        for i in range(num_levels - 1, 0, -1):
            target_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] += F.interpolate(
                laterals[i], size=target_shape, mode='nearest')

        # Step 3: per-level 3x3 FPN convs on the fused laterals.
        mids = [self.fpn_convs[i](laterals[i]) for i in range(num_levels)]

        # Step 4: extra bottom-up pathway (the "PA" part of PAFPN).
        for i in range(num_levels - 1):
            mids[i + 1] += self.downsample_convs[i](mids[i])

        outs = [mids[0]]
        outs.extend(
            self.pafpn_convs[i - 1](mids[i]) for i in range(1, num_levels))

        # Step 5: extra output levels on top, if requested.
        if self.num_outs > len(outs):
            if not self.add_extra_convs:
                # Use max pool to get more levels on top of outputs
                # (e.g., Faster R-CNN, Mask R-CNN).
                for _ in range(self.num_outs - num_levels):
                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
            else:
                # Add conv layers on top of original feature maps (RetinaNet).
                if self.add_extra_convs == 'on_input':
                    extra_source = inputs[self.backbone_end_level - 1]
                elif self.add_extra_convs == 'on_lateral':
                    extra_source = laterals[-1]
                elif self.add_extra_convs == 'on_output':
                    extra_source = outs[-1]
                else:
                    raise NotImplementedError
                outs.append(self.fpn_convs[num_levels](extra_source))
                for i in range(num_levels + 1, self.num_outs):
                    if self.relu_before_extra_convs:
                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
                    else:
                        outs.append(self.fpn_convs[i](outs[-1]))
        return tuple(outs)
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import constant_init, kaiming_init, xavier_init

from ..builder import NECKS, build_backbone
from .fpn import FPN


class ASPP(nn.Module):
    """ASPP (Atrous Spatial Pyramid Pooling).

    This is an implementation of the ASPP module used in DetectoRS
    (https://arxiv.org/pdf/2006.02334.pdf).

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of channels produced by this module.
        dilations (tuple[int]): Dilations of the four branches.
            Default: (1, 3, 6, 1)
    """

    def __init__(self, in_channels, out_channels, dilations=(1, 3, 6, 1)):
        super().__init__()
        # The last branch runs on globally pooled features, so it must be a
        # plain 1x1 conv (dilation 1).
        assert dilations[-1] == 1
        self.aspp = nn.ModuleList()
        for dilation in dilations:
            kernel_size = 3 if dilation > 1 else 1
            padding = dilation if dilation > 1 else 0
            self.aspp.append(
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    stride=1,
                    dilation=dilation,
                    padding=padding,
                    bias=True))
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.init_weights()

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                kaiming_init(m)

    def forward(self, x):
        avg_x = self.gap(x)
        last = len(self.aspp) - 1
        branch_outs = []
        for idx, conv in enumerate(self.aspp):
            # Final branch sees the pooled feature; the others see x.
            inp = avg_x if idx == last else x
            branch_outs.append(F.relu_(conv(inp)))
        # Broadcast the 1x1 pooled branch back to the spatial size of the
        # other branches before concatenation.
        branch_outs[-1] = branch_outs[-1].expand_as(branch_outs[-2])
        return torch.cat(branch_outs, dim=1)


@NECKS.register_module()
class RFP(FPN):
    """RFP (Recursive Feature Pyramid).

    This is an implementation of RFP in `DetectoRS
    <https://arxiv.org/pdf/2006.02334.pdf>`_. Different from standard FPN,
    the input of RFP should be multi level features along with the origin
    input image of the backbone.

    Args:
        rfp_steps (int): Number of unrolled steps of RFP.
        rfp_backbone (dict): Configuration of the backbone for RFP.
        aspp_out_channels (int): Number of output channels of ASPP module.
        aspp_dilations (tuple[int]): Dilation rates of four branches.
            Default: (1, 3, 6, 1)
    """

    def __init__(self,
                 rfp_steps,
                 rfp_backbone,
                 aspp_out_channels,
                 aspp_dilations=(1, 3, 6, 1),
                 **kwargs):
        super().__init__(**kwargs)
        self.rfp_steps = rfp_steps
        # One recursive backbone per unrolled step beyond the first.
        self.rfp_modules = nn.ModuleList(
            build_backbone(rfp_backbone) for _ in range(1, rfp_steps))
        self.rfp_aspp = ASPP(self.out_channels, aspp_out_channels,
                             aspp_dilations)
        # Per-pixel fusion gate between the previous and current step.
        self.rfp_weight = nn.Conv2d(
            self.out_channels,
            1,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

    def init_weights(self):
        # Avoid using super().init_weights(), which may alter the default
        # initialization of the modules in self.rfp_modules that have missing
        # keys in the pretrained checkpoint.
        for convs in [self.lateral_convs, self.fpn_convs]:
            for m in convs.modules():
                if isinstance(m, nn.Conv2d):
                    xavier_init(m, distribution='uniform')
        for rfp_idx in range(self.rfp_steps - 1):
            self.rfp_modules[rfp_idx].init_weights(
                self.rfp_modules[rfp_idx].pretrained)
        # Start with a zero gate: the first iterations behave like plain FPN.
        constant_init(self.rfp_weight, 0)

    def forward(self, inputs):
        inputs = list(inputs)
        # The extra first element is the raw image, needed by rfp_forward.
        assert len(inputs) == len(self.in_channels) + 1
        img = inputs.pop(0)
        # Initial FPN forward.
        x = super().forward(tuple(inputs))
        for rfp_idx in range(self.rfp_steps - 1):
            # Feed FPN outputs (ASPP-refined except the first) back into the
            # recursive backbone, then through FPN again.
            rfp_feats = [x[0]] + [
                self.rfp_aspp(x[i]) for i in range(1, len(x))
            ]
            x_idx = self.rfp_modules[rfp_idx].rfp_forward(img, rfp_feats)
            x_idx = super().forward(x_idx)
            # Gate between the new and previous step's features.
            fused = []
            for ft_idx in range(len(x_idx)):
                gate = torch.sigmoid(self.rfp_weight(x_idx[ft_idx]))
                fused.append(gate * x_idx[ft_idx] +
                             (1 - gate) * x[ft_idx])
            x = fused
        return x
# Copyright (c) 2019 Western Digital Corporation or its affiliates.

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule

from ..builder import NECKS


class DetectionBlock(nn.Module):
    """Detection block in YOLO neck.

    Let out_channels = n; this block stacks five ConvModules alternating
    1x1xn and 3x3x2n kernels (1x1xn, 3x3x2n, 1x1xn, 3x3x2n, 1x1xn).
    The input channel count is arbitrary (in_channels).

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Default: dict(type='BN', requires_grad=True)
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='LeakyReLU', negative_slope=0.1).
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1)):
        super(DetectionBlock, self).__init__()
        double_out_channels = out_channels * 2

        # Shared conv/norm/act configuration for all five layers.
        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv1 = ConvModule(in_channels, out_channels, 1, **cfg)
        self.conv2 = ConvModule(
            out_channels, double_out_channels, 3, padding=1, **cfg)
        self.conv3 = ConvModule(double_out_channels, out_channels, 1, **cfg)
        self.conv4 = ConvModule(
            out_channels, double_out_channels, 3, padding=1, **cfg)
        self.conv5 = ConvModule(double_out_channels, out_channels, 1, **cfg)

    def forward(self, x):
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)
        out = self.conv4(out)
        out = self.conv5(out)
        return out


@NECKS.register_module()
class YOLOV3Neck(nn.Module):
    """The neck of YOLOV3.

    It can be treated as a simplified version of FPN. It takes features from
    the Darknet backbone, upsamples and concatenates them, and emits one
    output per scale.

    Note:
        The input feats should be from top to bottom,
        i.e. from high-lvl to low-lvl.
        But YOLOV3Neck will process them in reversed order,
        i.e. from bottom (high-lvl) to top (low-lvl).

    Args:
        num_scales (int): The number of scales / stages.
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Default: dict(type='BN', requires_grad=True)
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='LeakyReLU', negative_slope=0.1).
    """

    def __init__(self,
                 num_scales,
                 in_channels,
                 out_channels,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1)):
        super(YOLOV3Neck, self).__init__()
        assert (num_scales == len(in_channels) == len(out_channels))
        self.num_scales = num_scales
        self.in_channels = in_channels
        self.out_channels = out_channels

        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)

        # To support arbitrary scales, submodules are registered by name
        # (conv1..convN-1, detect1..detectN) and fetched via getattr.
        self.detect1 = DetectionBlock(in_channels[0], out_channels[0], **cfg)
        for i in range(1, self.num_scales):
            in_c, out_c = self.in_channels[i], self.out_channels[i]
            self.add_module(f'conv{i}', ConvModule(in_c, out_c, 1, **cfg))
            # in_c + out_c: high-lvl feats will be cat with low-lvl feats.
            self.add_module(f'detect{i+1}',
                            DetectionBlock(in_c + out_c, out_c, **cfg))

    def forward(self, feats):
        assert len(feats) == self.num_scales

        # Processed from bottom (high-lvl) to top (low-lvl).
        outs = []
        out = self.detect1(feats[-1])
        outs.append(out)

        for i, x in enumerate(reversed(feats[:-1])):
            squeeze = getattr(self, f'conv{i+1}')
            tmp = squeeze(out)

            # Upsample, then concatenate with the lower-level feature.
            tmp = F.interpolate(tmp, scale_factor=2)
            tmp = torch.cat((tmp, x), 1)

            detect = getattr(self, f'detect{i+2}')
            out = detect(tmp)
            outs.append(out)

        return tuple(outs)

    def init_weights(self):
        """Initialize the weights of module."""
        # Init is done in ConvModule.
        pass
from abc import ABCMeta, abstractmethod

import torch.nn as nn

from ..builder import build_shared_head


class BaseRoIHead(nn.Module, metaclass=ABCMeta):
    """Base class for RoIHeads.

    Concrete subclasses wire up the bbox/mask heads, their RoI extractors and
    the assigner/sampler via the ``init_*`` hooks called here.
    """

    def __init__(self,
                 bbox_roi_extractor=None,
                 bbox_head=None,
                 mask_roi_extractor=None,
                 mask_head=None,
                 shared_head=None,
                 train_cfg=None,
                 test_cfg=None):
        super(BaseRoIHead, self).__init__()
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        if shared_head is not None:
            self.shared_head = build_shared_head(shared_head)

        if bbox_head is not None:
            self.init_bbox_head(bbox_roi_extractor, bbox_head)

        if mask_head is not None:
            self.init_mask_head(mask_roi_extractor, mask_head)

        self.init_assigner_sampler()

    @property
    def with_bbox(self):
        """bool: whether the RoI head contains a `bbox_head`"""
        return hasattr(self, 'bbox_head') and self.bbox_head is not None

    @property
    def with_mask(self):
        """bool: whether the RoI head contains a `mask_head`"""
        return hasattr(self, 'mask_head') and self.mask_head is not None

    @property
    def with_shared_head(self):
        """bool: whether the RoI head contains a `shared_head`"""
        return hasattr(self, 'shared_head') and self.shared_head is not None

    @abstractmethod
    def init_weights(self, pretrained):
        """Initialize the weights in head.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        pass

    @abstractmethod
    def init_bbox_head(self):
        """Initialize ``bbox_head``"""
        pass

    @abstractmethod
    def init_mask_head(self):
        """Initialize ``mask_head``"""
        pass

    @abstractmethod
    def init_assigner_sampler(self):
        """Initialize assigner and sampler."""
        pass

    @abstractmethod
    def forward_train(self,
                      x,
                      img_meta,
                      proposal_list,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      **kwargs):
        """Forward function during training."""
        pass

    async def async_simple_test(self, x, img_meta, **kwargs):
        """Asynchronized test function."""
        raise NotImplementedError

    def simple_test(self,
                    x,
                    proposal_list,
                    img_meta,
                    proposals=None,
                    rescale=False,
                    **kwargs):
        """Test without augmentation."""
        pass

    def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
        """Test with augmentations.

        If rescale is False, then returned bboxes and masks will fit the scale
        of imgs[0].
        """
        pass
@HEADS.register_module()
class BBoxHead(nn.Module):
    """Simplest RoI head, with only two fc layers for classification and
    regression respectively.

    Class ids follow the mmdet 2.x convention: foreground classes are
    ``0..num_classes-1`` and the background class id is ``num_classes``.
    """

    def __init__(self,
                 with_avg_pool=False,
                 with_cls=True,
                 with_reg=True,
                 roi_feat_size=7,
                 in_channels=256,
                 num_classes=80,
                 bbox_coder=dict(
                     type='DeltaXYWHBBoxCoder',
                     clip_border=True,
                     target_means=[0., 0., 0., 0.],
                     target_stds=[0.1, 0.1, 0.2, 0.2]),
                 reg_class_agnostic=False,
                 reg_decoded_bbox=False,
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=False,
                     loss_weight=1.0),
                 loss_bbox=dict(
                     type='SmoothL1Loss', beta=1.0, loss_weight=1.0)):
        super(BBoxHead, self).__init__()
        assert with_cls or with_reg
        self.with_avg_pool = with_avg_pool
        self.with_cls = with_cls
        self.with_reg = with_reg
        self.roi_feat_size = _pair(roi_feat_size)
        self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1]
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.reg_class_agnostic = reg_class_agnostic
        self.reg_decoded_bbox = reg_decoded_bbox
        self.fp16_enabled = False

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)

        feat_channels = self.in_channels
        if self.with_avg_pool:
            self.avg_pool = nn.AvgPool2d(self.roi_feat_size)
        else:
            # Without pooling the RoI feature map is flattened.
            feat_channels *= self.roi_feat_area
        if self.with_cls:
            # +1 output for the background class.
            self.fc_cls = nn.Linear(feat_channels, num_classes + 1)
        if self.with_reg:
            out_dim_reg = 4 if reg_class_agnostic else 4 * num_classes
            self.fc_reg = nn.Linear(feat_channels, out_dim_reg)
        self.debug_imgs = None

    def init_weights(self):
        # Conv layers are already initialized by ConvModule.
        if self.with_cls:
            nn.init.normal_(self.fc_cls.weight, 0, 0.01)
            nn.init.constant_(self.fc_cls.bias, 0)
        if self.with_reg:
            nn.init.normal_(self.fc_reg.weight, 0, 0.001)
            nn.init.constant_(self.fc_reg.bias, 0)

    @auto_fp16()
    def forward(self, x):
        if self.with_avg_pool:
            x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        cls_score = self.fc_cls(x) if self.with_cls else None
        bbox_pred = self.fc_reg(x) if self.with_reg else None
        return cls_score, bbox_pred

    def _get_target_single(self, pos_bboxes, neg_bboxes, pos_gt_bboxes,
                           pos_gt_labels, cfg):
        """Compute classification/regression targets for one image's samples."""
        num_pos = pos_bboxes.size(0)
        num_neg = neg_bboxes.size(0)
        num_samples = num_pos + num_neg

        # Original implementation uses new_zeros since BG are set to be 0;
        # now use empty & fill because BG cat_id = num_classes,
        # FG cat_id = [0, num_classes-1].
        labels = pos_bboxes.new_full((num_samples, ),
                                     self.num_classes,
                                     dtype=torch.long)
        label_weights = pos_bboxes.new_zeros(num_samples)
        bbox_targets = pos_bboxes.new_zeros(num_samples, 4)
        bbox_weights = pos_bboxes.new_zeros(num_samples, 4)
        if num_pos > 0:
            labels[:num_pos] = pos_gt_labels
            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
            label_weights[:num_pos] = pos_weight
            if not self.reg_decoded_bbox:
                pos_bbox_targets = self.bbox_coder.encode(
                    pos_bboxes, pos_gt_bboxes)
            else:
                # Regress decoded boxes directly (e.g. for IoU-based losses).
                pos_bbox_targets = pos_gt_bboxes
            bbox_targets[:num_pos, :] = pos_bbox_targets
            bbox_weights[:num_pos, :] = 1
        if num_neg > 0:
            label_weights[-num_neg:] = 1.0

        return labels, label_weights, bbox_targets, bbox_weights

    def get_targets(self,
                    sampling_results,
                    gt_bboxes,
                    gt_labels,
                    rcnn_train_cfg,
                    concat=True):
        """Build targets for all images, optionally concatenated into one batch."""
        pos_bboxes_list = [res.pos_bboxes for res in sampling_results]
        neg_bboxes_list = [res.neg_bboxes for res in sampling_results]
        pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
        labels, label_weights, bbox_targets, bbox_weights = multi_apply(
            self._get_target_single,
            pos_bboxes_list,
            neg_bboxes_list,
            pos_gt_bboxes_list,
            pos_gt_labels_list,
            cfg=rcnn_train_cfg)

        if concat:
            labels = torch.cat(labels, 0)
            label_weights = torch.cat(label_weights, 0)
            bbox_targets = torch.cat(bbox_targets, 0)
            bbox_weights = torch.cat(bbox_weights, 0)
        return labels, label_weights, bbox_targets, bbox_weights

    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
    def loss(self,
             cls_score,
             bbox_pred,
             rois,
             labels,
             label_weights,
             bbox_targets,
             bbox_weights,
             reduction_override=None):
        losses = dict()
        if cls_score is not None:
            avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.)
            if cls_score.numel() > 0:
                losses['loss_cls'] = self.loss_cls(
                    cls_score,
                    labels,
                    label_weights,
                    avg_factor=avg_factor,
                    reduction_override=reduction_override)
                losses['acc'] = accuracy(cls_score, labels)
        if bbox_pred is not None:
            bg_class_ind = self.num_classes
            # 0~self.num_classes-1 are FG, self.num_classes is BG.
            pos_inds = (labels >= 0) & (labels < bg_class_ind)
            # Do not perform bounding box regression for BG anymore.
            if pos_inds.any():
                pos_mask = pos_inds.type(torch.bool)
                if self.reg_decoded_bbox:
                    bbox_pred = self.bbox_coder.decode(rois[:, 1:], bbox_pred)
                if self.reg_class_agnostic:
                    pos_bbox_pred = bbox_pred.view(
                        bbox_pred.size(0), 4)[pos_mask]
                else:
                    pos_bbox_pred = bbox_pred.view(
                        bbox_pred.size(0), -1, 4)[pos_mask, labels[pos_mask]]
                losses['loss_bbox'] = self.loss_bbox(
                    pos_bbox_pred,
                    bbox_targets[pos_mask],
                    bbox_weights[pos_mask],
                    avg_factor=bbox_targets.size(0),
                    reduction_override=reduction_override)
            else:
                # Keep the graph connected even without positives.
                losses['loss_bbox'] = bbox_pred[pos_inds].sum()
        return losses

    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
    def get_bboxes(self,
                   rois,
                   cls_score,
                   bbox_pred,
                   img_shape,
                   scale_factor,
                   rescale=False,
                   cfg=None):
        if isinstance(cls_score, list):
            # Average scores over TTA / ensemble members.
            cls_score = sum(cls_score) / float(len(cls_score))
        scores = F.softmax(cls_score, dim=1) if cls_score is not None else None

        if bbox_pred is not None:
            bboxes = self.bbox_coder.decode(
                rois[:, 1:], bbox_pred, max_shape=img_shape)
        else:
            bboxes = rois[:, 1:].clone()
            if img_shape is not None:
                bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1])
                bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0])

        if rescale and bboxes.size(0) > 0:
            # Map boxes back to the original image scale.
            if isinstance(scale_factor, float):
                bboxes /= scale_factor
            else:
                scale_factor = bboxes.new_tensor(scale_factor)
                bboxes = (bboxes.view(bboxes.size(0), -1, 4) /
                          scale_factor).view(bboxes.size()[0], -1)

        if cfg is None:
            return bboxes, scores
        else:
            det_bboxes, det_labels = multiclass_nms(bboxes, scores,
                                                    cfg.score_thr, cfg.nms,
                                                    cfg.max_per_img)

            return det_bboxes, det_labels

    @force_fp32(apply_to=('bbox_preds', ))
    def refine_bboxes(self, rois, labels, bbox_preds, pos_is_gts, img_metas):
        """Refine bboxes during training.

        Args:
            rois (Tensor): Shape (n*bs, 5), where n is image number per GPU,
                and bs is the sampled RoIs per image. The first column is
                the image id and the next 4 columns are x1, y1, x2, y2.
            labels (Tensor): Shape (n*bs, ).
            bbox_preds (Tensor): Shape (n*bs, 4) or (n*bs, 4*#class).
            pos_is_gts (list[Tensor]): Flags indicating if each positive bbox
                is a gt bbox.
            img_metas (list[dict]): Meta info of each image.

        Returns:
            list[Tensor]: Refined bboxes of each image in a mini-batch.
        """
        img_ids = rois[:, 0].long().unique(sorted=True)
        assert img_ids.numel() <= len(img_metas)

        bboxes_list = []
        for i in range(len(img_metas)):
            inds = torch.nonzero(
                rois[:, 0] == i, as_tuple=False).squeeze(dim=1)
            num_rois = inds.numel()

            refined = self.regress_by_class(rois[inds, 1:], labels[inds],
                                            bbox_preds[inds], img_metas[i])

            # Filter out proposals that were ground-truth boxes.
            pos_is_gts_ = pos_is_gts[i]
            keep_inds = pos_is_gts_.new_ones(num_rois)
            keep_inds[:len(pos_is_gts_)] = 1 - pos_is_gts_
            bboxes_list.append(refined[keep_inds.type(torch.bool)])

        return bboxes_list

    @force_fp32(apply_to=('bbox_pred', ))
    def regress_by_class(self, rois, label, bbox_pred, img_meta):
        """Regress the bbox for the predicted class. Used in Cascade R-CNN.

        Args:
            rois (Tensor): shape (n, 4) or (n, 5)
            label (Tensor): shape (n, )
            bbox_pred (Tensor): shape (n, 4*(#class)) or (n, 4)
            img_meta (dict): Image meta info.

        Returns:
            Tensor: Regressed bboxes, the same shape as input rois.
        """
        assert rois.size(1) == 4 or rois.size(1) == 5, repr(rois.shape)

        if not self.reg_class_agnostic:
            # Gather the 4 deltas belonging to each RoI's predicted class.
            label = label * 4
            inds = torch.stack((label, label + 1, label + 2, label + 3), 1)
            bbox_pred = torch.gather(bbox_pred, 1, inds)
        assert bbox_pred.size(1) == 4

        if rois.size(1) == 4:
            new_rois = self.bbox_coder.decode(
                rois, bbox_pred, max_shape=img_meta['img_shape'])
        else:
            # Preserve the leading image-index column.
            bboxes = self.bbox_coder.decode(
                rois[:, 1:], bbox_pred, max_shape=img_meta['img_shape'])
            new_rois = torch.cat((rois[:, [0]], bboxes), dim=1)

        return new_rois
+ """ + assert rois.size(1) == 4 or rois.size(1) == 5, repr(rois.shape) + + if not self.reg_class_agnostic: + label = label * 4 + inds = torch.stack((label, label + 1, label + 2, label + 3), 1) + bbox_pred = torch.gather(bbox_pred, 1, inds) + assert bbox_pred.size(1) == 4 + + if rois.size(1) == 4: + new_rois = self.bbox_coder.decode( + rois, bbox_pred, max_shape=img_meta['img_shape']) + else: + bboxes = self.bbox_coder.decode( + rois[:, 1:], bbox_pred, max_shape=img_meta['img_shape']) + new_rois = torch.cat((rois[:, [0]], bboxes), dim=1) + + return new_rois diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py new file mode 100755 index 0000000000000000000000000000000000000000..0e86d2ea67e154fae18dbf9d2bfde6d0a70e582c --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py @@ -0,0 +1,205 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmdet.models.builder import HEADS +from .bbox_head import BBoxHead + + +@HEADS.register_module() +class ConvFCBBoxHead(BBoxHead): + r"""More general bbox head, with shared conv and fc layers and two optional + separated branches. + + .. 
code-block:: none + + /-> cls convs -> cls fcs -> cls + shared convs -> shared fcs + \-> reg convs -> reg fcs -> reg + """ # noqa: W605 + + def __init__(self, + num_shared_convs=0, + num_shared_fcs=0, + num_cls_convs=0, + num_cls_fcs=0, + num_reg_convs=0, + num_reg_fcs=0, + conv_out_channels=256, + fc_out_channels=1024, + conv_cfg=None, + norm_cfg=None, + *args, + **kwargs): + super(ConvFCBBoxHead, self).__init__(*args, **kwargs) + assert (num_shared_convs + num_shared_fcs + num_cls_convs + + num_cls_fcs + num_reg_convs + num_reg_fcs > 0) + if num_cls_convs > 0 or num_reg_convs > 0: + assert num_shared_fcs == 0 + if not self.with_cls: + assert num_cls_convs == 0 and num_cls_fcs == 0 + if not self.with_reg: + assert num_reg_convs == 0 and num_reg_fcs == 0 + self.num_shared_convs = num_shared_convs + self.num_shared_fcs = num_shared_fcs + self.num_cls_convs = num_cls_convs + self.num_cls_fcs = num_cls_fcs + self.num_reg_convs = num_reg_convs + self.num_reg_fcs = num_reg_fcs + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + # add shared convs and fcs + self.shared_convs, self.shared_fcs, last_layer_dim = \ + self._add_conv_fc_branch( + self.num_shared_convs, self.num_shared_fcs, self.in_channels, + True) + self.shared_out_channels = last_layer_dim + + # add cls specific branch + self.cls_convs, self.cls_fcs, self.cls_last_dim = \ + self._add_conv_fc_branch( + self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels) + + # add reg specific branch + self.reg_convs, self.reg_fcs, self.reg_last_dim = \ + self._add_conv_fc_branch( + self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels) + + if self.num_shared_fcs == 0 and not self.with_avg_pool: + if self.num_cls_fcs == 0: + self.cls_last_dim *= self.roi_feat_area + if self.num_reg_fcs == 0: + self.reg_last_dim *= self.roi_feat_area + + self.relu = nn.ReLU(inplace=True) + # reconstruct fc_cls and fc_reg since input 
channels are changed + if self.with_cls: + self.fc_cls = nn.Linear(self.cls_last_dim, self.num_classes + 1) + if self.with_reg: + out_dim_reg = (4 if self.reg_class_agnostic else 4 * + self.num_classes) + self.fc_reg = nn.Linear(self.reg_last_dim, out_dim_reg) + + def _add_conv_fc_branch(self, + num_branch_convs, + num_branch_fcs, + in_channels, + is_shared=False): + """Add shared or separable branch. + + convs -> avg pool (optional) -> fcs + """ + last_layer_dim = in_channels + # add branch specific conv layers + branch_convs = nn.ModuleList() + if num_branch_convs > 0: + for i in range(num_branch_convs): + conv_in_channels = ( + last_layer_dim if i == 0 else self.conv_out_channels) + branch_convs.append( + ConvModule( + conv_in_channels, + self.conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + last_layer_dim = self.conv_out_channels + # add branch specific fc layers + branch_fcs = nn.ModuleList() + if num_branch_fcs > 0: + # for shared branch, only consider self.with_avg_pool + # for separated branches, also consider self.num_shared_fcs + if (is_shared + or self.num_shared_fcs == 0) and not self.with_avg_pool: + last_layer_dim *= self.roi_feat_area + for i in range(num_branch_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + branch_fcs.append( + nn.Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + return branch_convs, branch_fcs, last_layer_dim + + def init_weights(self): + super(ConvFCBBoxHead, self).init_weights() + # conv layers are already initialized by ConvModule + for module_list in [self.shared_fcs, self.cls_fcs, self.reg_fcs]: + for m in module_list.modules(): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + # shared part + if self.num_shared_convs > 0: + for conv in self.shared_convs: + x = conv(x) + + if self.num_shared_fcs > 0: + if self.with_avg_pool: + x = 
self.avg_pool(x) + + x = x.flatten(1) + + for fc in self.shared_fcs: + x = self.relu(fc(x)) + # separate branches + x_cls = x + x_reg = x + + for conv in self.cls_convs: + x_cls = conv(x_cls) + if x_cls.dim() > 2: + if self.with_avg_pool: + x_cls = self.avg_pool(x_cls) + x_cls = x_cls.flatten(1) + for fc in self.cls_fcs: + x_cls = self.relu(fc(x_cls)) + + for conv in self.reg_convs: + x_reg = conv(x_reg) + if x_reg.dim() > 2: + if self.with_avg_pool: + x_reg = self.avg_pool(x_reg) + x_reg = x_reg.flatten(1) + for fc in self.reg_fcs: + x_reg = self.relu(fc(x_reg)) + + cls_score = self.fc_cls(x_cls) if self.with_cls else None + bbox_pred = self.fc_reg(x_reg) if self.with_reg else None + return cls_score, bbox_pred + + +@HEADS.register_module() +class Shared2FCBBoxHead(ConvFCBBoxHead): + + def __init__(self, fc_out_channels=1024, *args, **kwargs): + super(Shared2FCBBoxHead, self).__init__( + num_shared_convs=0, + num_shared_fcs=2, + num_cls_convs=0, + num_cls_fcs=0, + num_reg_convs=0, + num_reg_fcs=0, + fc_out_channels=fc_out_channels, + *args, + **kwargs) + + +@HEADS.register_module() +class Shared4Conv1FCBBoxHead(ConvFCBBoxHead): + + def __init__(self, fc_out_channels=1024, *args, **kwargs): + super(Shared4Conv1FCBBoxHead, self).__init__( + num_shared_convs=4, + num_shared_fcs=1, + num_cls_convs=0, + num_cls_fcs=0, + num_reg_convs=0, + num_reg_fcs=0, + fc_out_channels=fc_out_channels, + *args, + **kwargs) diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py new file mode 100755 index 0000000000000000000000000000000000000000..6c154cb3c0d9d7639c3d4a2a1272406d3fab8acd --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py @@ -0,0 +1,172 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule, normal_init, xavier_init + +from mmdet.models.backbones.resnet import Bottleneck +from 
from mmdet.models.builder import HEADS
from .bbox_head import BBoxHead


class BasicResBlock(nn.Module):
    """Basic residual block.

    This block differs slightly from the block in the ResNet backbone:
    conv1 here is 3x3 followed by a 1x1 conv2, and a 1x1 projection is
    always applied on the identity path so channel counts match.

    Args:
        in_channels (int): Channels of the input feature map.
        out_channels (int): Channels of the output feature map.
        conv_cfg (dict): The config dict for convolution layers.
        norm_cfg (dict): The config dict for normalization layers.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN')):
        super(BasicResBlock, self).__init__()

        # Main path: 3x3 conv (with activation) then 1x1 conv (no activation;
        # the ReLU is applied after the residual addition).
        self.conv1 = ConvModule(
            in_channels,
            in_channels,
            kernel_size=3,
            padding=1,
            bias=False,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg)
        self.conv2 = ConvModule(
            in_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)

        # Identity path: 1x1 projection to match the output channels.
        self.conv_identity = ConvModule(
            in_channels,
            out_channels,
            kernel_size=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Residual forward: main path + projected identity, then ReLU."""
        identity = x

        x = self.conv1(x)
        x = self.conv2(x)

        identity = self.conv_identity(identity)
        out = x + identity

        out = self.relu(out)
        return out


@HEADS.register_module()
class DoubleConvFCBBoxHead(BBoxHead):
    r"""Bbox head used in Double-Head R-CNN

    .. code-block:: none

                                          /-> cls
                      /-> shared convs ->
                                          \-> reg
        roi features
                                          /-> cls
                      \-> shared fc    ->
                                          \-> reg
    """  # noqa: W605

    def __init__(self,
                 num_convs=0,
                 num_fcs=0,
                 conv_out_channels=1024,
                 fc_out_channels=1024,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 **kwargs):
        # Double-Head requires average pooling on the conv branch.
        kwargs.setdefault('with_avg_pool', True)
        super(DoubleConvFCBBoxHead, self).__init__(**kwargs)
        assert self.with_avg_pool
        assert num_convs > 0
        assert num_fcs > 0
        self.num_convs = num_convs
        self.num_fcs = num_fcs
        self.conv_out_channels = conv_out_channels
        self.fc_out_channels = fc_out_channels
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg

        # Increase the channel count of the input features before the
        # bottleneck stack.
        self.res_block = BasicResBlock(self.in_channels,
                                       self.conv_out_channels)

        # Conv head (drives the regression output).
        self.conv_branch = self._add_conv_branch()
        # Fc head (drives the classification output).
        self.fc_branch = self._add_fc_branch()

        out_dim_reg = 4 if self.reg_class_agnostic else 4 * self.num_classes
        self.fc_reg = nn.Linear(self.conv_out_channels, out_dim_reg)

        self.fc_cls = nn.Linear(self.fc_out_channels, self.num_classes + 1)
        self.relu = nn.ReLU(inplace=True)

    def _add_conv_branch(self):
        """Add the conv branch which consists of a sequence of bottleneck
        conv blocks."""
        branch_convs = nn.ModuleList()
        for i in range(self.num_convs):
            branch_convs.append(
                Bottleneck(
                    inplanes=self.conv_out_channels,
                    planes=self.conv_out_channels // 4,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg))
        return branch_convs

    def _add_fc_branch(self):
        """Add the fc branch which consists of a sequence of fc layers."""
        branch_fcs = nn.ModuleList()
        for i in range(self.num_fcs):
            fc_in_channels = (
                self.in_channels *
                self.roi_feat_area if i == 0 else self.fc_out_channels)
            branch_fcs.append(nn.Linear(fc_in_channels, self.fc_out_channels))
        return branch_fcs

    def init_weights(self):
        # Conv layers are already initialized by ConvModule.
        normal_init(self.fc_cls, std=0.01)
        normal_init(self.fc_reg, std=0.001)

        for m in self.fc_branch.modules():
            if isinstance(m, nn.Linear):
                xavier_init(m, distribution='uniform')

    def forward(self, x_cls, x_reg):
        """Forward the two heads.

        Args:
            x_cls (Tensor): RoI features for the fc (classification) head.
            x_reg (Tensor): RoI features for the conv (regression) head.

        Returns:
            tuple: (cls_score, bbox_pred)
        """
        # Conv head -> bbox regression.
        x_conv = self.res_block(x_reg)

        for conv in self.conv_branch:
            x_conv = conv(x_conv)

        if self.with_avg_pool:
            x_conv = self.avg_pool(x_conv)

        x_conv = x_conv.view(x_conv.size(0), -1)
        bbox_pred = self.fc_reg(x_conv)

        # Fc head -> classification score.
        x_fc = x_cls.view(x_cls.size(0), -1)
        for fc in self.fc_branch:
            x_fc = self.relu(fc(x_fc))

        cls_score = self.fc_cls(x_fc)

        return cls_score, bbox_pred


# NOTE(review): the original chunk is a flattened git diff; a new-file header
# for mmdet/models/roi_heads/bbox_heads/sabl_head.py and its import block
# followed here.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule, kaiming_init, normal_init, xavier_init
from mmcv.runner import force_fp32

from mmdet.core import build_bbox_coder, multi_apply, multiclass_nms
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.losses import accuracy


@HEADS.register_module()
class SABLHead(nn.Module):
    """Side-Aware Boundary Localization (SABL) for RoI-Head.

    Side-aware features are extracted by conv layers with an attention
    mechanism. Boundary Localization with Bucketing and Bucketing Guided
    Rescoring are implemented in ``BucketingBBoxCoder``.

    Please refer to https://arxiv.org/abs/1912.04260 for more details.

    Args:
        num_classes (int): Number of classes in the dataset.
        cls_in_channels (int): Input channels of the cls RoI feature.
            Defaults to 256.
        reg_in_channels (int): Input channels of the reg RoI feature.
            Defaults to 256.
        roi_feat_size (int): Size of RoI features. Defaults to 7.
        reg_feat_up_ratio (int): Upsample ratio of reg features.
            Defaults to 2.
        reg_pre_kernel (int): Kernel of 2D convs before attention pooling.
            Defaults to 3.
        reg_post_kernel (int): Kernel of 1D convs after attention pooling.
            Defaults to 3.
        reg_pre_num (int): Number of pre convs. Defaults to 2.
        reg_post_num (int): Number of post convs. Defaults to 1.
        cls_out_channels (int): Hidden channels in cls fcs. Defaults to 1024.
        reg_offset_out_channels (int): Hidden and output channels of the
            reg offset branch. Defaults to 256.
        reg_cls_out_channels (int): Hidden and output channels of the
            reg cls branch. Defaults to 256.
        num_cls_fcs (int): Number of fcs for the cls branch. Defaults to 1.
        num_reg_fcs (int): Number of fcs for the reg branch. Defaults to 0.
        reg_class_agnostic (bool): Class-agnostic regression or not; must
            be True for this head. Defaults to True.
        norm_cfg (dict): Config of norm layers. Defaults to None.
        bbox_coder (dict): Config of the bbox coder. Defaults to
            'BucketingBBoxCoder'.
        loss_cls (dict): Config of the classification loss.
        loss_bbox_cls (dict): Config of the bucketing classification loss.
        loss_bbox_reg (dict): Config of the bucket-offset regression loss.
    """
+ reg_feat_up_ratio (int): Upsample ratio of reg features. \ + Defaults to 2. + reg_pre_kernel (int): Kernel of 2D conv layers before \ + attention pooling. Defaults to 3. + reg_post_kernel (int): Kernel of 1D conv layers after \ + attention pooling. Defaults to 3. + reg_pre_num (int): Number of pre convs. Defaults to 2. + reg_post_num (int): Number of post convs. Defaults to 1. + num_classes (int): Number of classes in dataset. Defaults to 80. + cls_out_channels (int): Hidden channels in cls fcs. Defaults to 1024. + reg_offset_out_channels (int): Hidden and output channel \ + of reg offset branch. Defaults to 256. + reg_cls_out_channels (int): Hidden and output channel \ + of reg cls branch. Defaults to 256. + num_cls_fcs (int): Number of fcs for cls branch. Defaults to 1. + num_reg_fcs (int): Number of fcs for reg branch.. Defaults to 0. + reg_class_agnostic (bool): Class agnostic regresion or not. \ + Defaults to True. + norm_cfg (dict): Config of norm layers. Defaults to None. + bbox_coder (dict): Config of bbox coder. Defaults 'BucketingBBoxCoder'. + loss_cls (dict): Config of classification loss. + loss_bbox_cls (dict): Config of classification loss for bbox branch. + loss_bbox_reg (dict): Config of regression loss for bbox branch. 
+ """ + + def __init__(self, + num_classes, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', + num_buckets=14, + scale_factor=1.7), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=0.1, loss_weight=1.0)): + super(SABLHead, self).__init__() + self.cls_in_channels = cls_in_channels + self.reg_in_channels = reg_in_channels + self.roi_feat_size = roi_feat_size + self.reg_feat_up_ratio = int(reg_feat_up_ratio) + self.num_buckets = bbox_coder['num_buckets'] + assert self.reg_feat_up_ratio // 2 >= 1 + self.up_reg_feat_size = roi_feat_size * self.reg_feat_up_ratio + assert self.up_reg_feat_size == bbox_coder['num_buckets'] + self.reg_pre_kernel = reg_pre_kernel + self.reg_post_kernel = reg_post_kernel + self.reg_pre_num = reg_pre_num + self.reg_post_num = reg_post_num + self.num_classes = num_classes + self.cls_out_channels = cls_out_channels + self.reg_offset_out_channels = reg_offset_out_channels + self.reg_cls_out_channels = reg_cls_out_channels + self.num_cls_fcs = num_cls_fcs + self.num_reg_fcs = num_reg_fcs + self.reg_class_agnostic = reg_class_agnostic + assert self.reg_class_agnostic + self.norm_cfg = norm_cfg + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.loss_cls = build_loss(loss_cls) + self.loss_bbox_cls = build_loss(loss_bbox_cls) + self.loss_bbox_reg = build_loss(loss_bbox_reg) + + self.cls_fcs = self._add_fc_branch(self.num_cls_fcs, + self.cls_in_channels, + self.roi_feat_size, + self.cls_out_channels) + + self.side_num = int(np.ceil(self.num_buckets / 2)) 
+ + if self.reg_feat_up_ratio > 1: + self.upsample_x = nn.ConvTranspose1d( + reg_in_channels, + reg_in_channels, + self.reg_feat_up_ratio, + stride=self.reg_feat_up_ratio) + self.upsample_y = nn.ConvTranspose1d( + reg_in_channels, + reg_in_channels, + self.reg_feat_up_ratio, + stride=self.reg_feat_up_ratio) + + self.reg_pre_convs = nn.ModuleList() + for i in range(self.reg_pre_num): + reg_pre_conv = ConvModule( + reg_in_channels, + reg_in_channels, + kernel_size=reg_pre_kernel, + padding=reg_pre_kernel // 2, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + self.reg_pre_convs.append(reg_pre_conv) + + self.reg_post_conv_xs = nn.ModuleList() + for i in range(self.reg_post_num): + reg_post_conv_x = ConvModule( + reg_in_channels, + reg_in_channels, + kernel_size=(1, reg_post_kernel), + padding=(0, reg_post_kernel // 2), + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + self.reg_post_conv_xs.append(reg_post_conv_x) + self.reg_post_conv_ys = nn.ModuleList() + for i in range(self.reg_post_num): + reg_post_conv_y = ConvModule( + reg_in_channels, + reg_in_channels, + kernel_size=(reg_post_kernel, 1), + padding=(reg_post_kernel // 2, 0), + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + self.reg_post_conv_ys.append(reg_post_conv_y) + + self.reg_conv_att_x = nn.Conv2d(reg_in_channels, 1, 1) + self.reg_conv_att_y = nn.Conv2d(reg_in_channels, 1, 1) + + self.fc_cls = nn.Linear(self.cls_out_channels, self.num_classes + 1) + self.relu = nn.ReLU(inplace=True) + + self.reg_cls_fcs = self._add_fc_branch(self.num_reg_fcs, + self.reg_in_channels, 1, + self.reg_cls_out_channels) + self.reg_offset_fcs = self._add_fc_branch(self.num_reg_fcs, + self.reg_in_channels, 1, + self.reg_offset_out_channels) + self.fc_reg_cls = nn.Linear(self.reg_cls_out_channels, 1) + self.fc_reg_offset = nn.Linear(self.reg_offset_out_channels, 1) + + def _add_fc_branch(self, num_branch_fcs, in_channels, roi_feat_size, + fc_out_channels): + in_channels = in_channels * roi_feat_size * roi_feat_size + 
branch_fcs = nn.ModuleList() + for i in range(num_branch_fcs): + fc_in_channels = (in_channels if i == 0 else fc_out_channels) + branch_fcs.append(nn.Linear(fc_in_channels, fc_out_channels)) + return branch_fcs + + def init_weights(self): + for module_list in [ + self.reg_cls_fcs, self.reg_offset_fcs, self.cls_fcs + ]: + for m in module_list.modules(): + if isinstance(m, nn.Linear): + xavier_init(m, distribution='uniform') + if self.reg_feat_up_ratio > 1: + kaiming_init(self.upsample_x, distribution='normal') + kaiming_init(self.upsample_y, distribution='normal') + + normal_init(self.reg_conv_att_x, 0, 0.01) + normal_init(self.reg_conv_att_y, 0, 0.01) + normal_init(self.fc_reg_offset, 0, 0.001) + normal_init(self.fc_reg_cls, 0, 0.01) + normal_init(self.fc_cls, 0, 0.01) + + def cls_forward(self, cls_x): + cls_x = cls_x.view(cls_x.size(0), -1) + for fc in self.cls_fcs: + cls_x = self.relu(fc(cls_x)) + cls_score = self.fc_cls(cls_x) + return cls_score + + def attention_pool(self, reg_x): + """Extract direction-specific features fx and fy with attention + methanism.""" + reg_fx = reg_x + reg_fy = reg_x + reg_fx_att = self.reg_conv_att_x(reg_fx).sigmoid() + reg_fy_att = self.reg_conv_att_y(reg_fy).sigmoid() + reg_fx_att = reg_fx_att / reg_fx_att.sum(dim=2).unsqueeze(2) + reg_fy_att = reg_fy_att / reg_fy_att.sum(dim=3).unsqueeze(3) + reg_fx = (reg_fx * reg_fx_att).sum(dim=2) + reg_fy = (reg_fy * reg_fy_att).sum(dim=3) + return reg_fx, reg_fy + + def side_aware_feature_extractor(self, reg_x): + """Refine and extract side-aware features without split them.""" + for reg_pre_conv in self.reg_pre_convs: + reg_x = reg_pre_conv(reg_x) + reg_fx, reg_fy = self.attention_pool(reg_x) + + if self.reg_post_num > 0: + reg_fx = reg_fx.unsqueeze(2) + reg_fy = reg_fy.unsqueeze(3) + for i in range(self.reg_post_num): + reg_fx = self.reg_post_conv_xs[i](reg_fx) + reg_fy = self.reg_post_conv_ys[i](reg_fy) + reg_fx = reg_fx.squeeze(2) + reg_fy = reg_fy.squeeze(3) + if self.reg_feat_up_ratio 
> 1: + reg_fx = self.relu(self.upsample_x(reg_fx)) + reg_fy = self.relu(self.upsample_y(reg_fy)) + reg_fx = torch.transpose(reg_fx, 1, 2) + reg_fy = torch.transpose(reg_fy, 1, 2) + return reg_fx.contiguous(), reg_fy.contiguous() + + def reg_pred(self, x, offfset_fcs, cls_fcs): + """Predict bucketing esimation (cls_pred) and fine regression (offset + pred) with side-aware features.""" + x_offset = x.view(-1, self.reg_in_channels) + x_cls = x.view(-1, self.reg_in_channels) + + for fc in offfset_fcs: + x_offset = self.relu(fc(x_offset)) + for fc in cls_fcs: + x_cls = self.relu(fc(x_cls)) + offset_pred = self.fc_reg_offset(x_offset) + cls_pred = self.fc_reg_cls(x_cls) + + offset_pred = offset_pred.view(x.size(0), -1) + cls_pred = cls_pred.view(x.size(0), -1) + + return offset_pred, cls_pred + + def side_aware_split(self, feat): + """Split side-aware features aligned with orders of bucketing + targets.""" + l_end = int(np.ceil(self.up_reg_feat_size / 2)) + r_start = int(np.floor(self.up_reg_feat_size / 2)) + feat_fl = feat[:, :l_end] + feat_fr = feat[:, r_start:].flip(dims=(1, )) + feat_fl = feat_fl.contiguous() + feat_fr = feat_fr.contiguous() + feat = torch.cat([feat_fl, feat_fr], dim=-1) + return feat + + def bbox_pred_split(self, bbox_pred, num_proposals_per_img): + """Split batch bbox prediction back to each image.""" + bucket_cls_preds, bucket_offset_preds = bbox_pred + bucket_cls_preds = bucket_cls_preds.split(num_proposals_per_img, 0) + bucket_offset_preds = bucket_offset_preds.split( + num_proposals_per_img, 0) + bbox_pred = tuple(zip(bucket_cls_preds, bucket_offset_preds)) + return bbox_pred + + def reg_forward(self, reg_x): + outs = self.side_aware_feature_extractor(reg_x) + edge_offset_preds = [] + edge_cls_preds = [] + reg_fx = outs[0] + reg_fy = outs[1] + offset_pred_x, cls_pred_x = self.reg_pred(reg_fx, self.reg_offset_fcs, + self.reg_cls_fcs) + offset_pred_y, cls_pred_y = self.reg_pred(reg_fy, self.reg_offset_fcs, + self.reg_cls_fcs) + offset_pred_x = 
self.side_aware_split(offset_pred_x) + offset_pred_y = self.side_aware_split(offset_pred_y) + cls_pred_x = self.side_aware_split(cls_pred_x) + cls_pred_y = self.side_aware_split(cls_pred_y) + edge_offset_preds = torch.cat([offset_pred_x, offset_pred_y], dim=-1) + edge_cls_preds = torch.cat([cls_pred_x, cls_pred_y], dim=-1) + + return (edge_cls_preds, edge_offset_preds) + + def forward(self, x): + + bbox_pred = self.reg_forward(x) + cls_score = self.cls_forward(x) + + return cls_score, bbox_pred + + def get_targets(self, sampling_results, gt_bboxes, gt_labels, + rcnn_train_cfg): + pos_proposals = [res.pos_bboxes for res in sampling_results] + neg_proposals = [res.neg_bboxes for res in sampling_results] + pos_gt_bboxes = [res.pos_gt_bboxes for res in sampling_results] + pos_gt_labels = [res.pos_gt_labels for res in sampling_results] + cls_reg_targets = self.bucket_target(pos_proposals, neg_proposals, + pos_gt_bboxes, pos_gt_labels, + rcnn_train_cfg) + (labels, label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) = cls_reg_targets + return (labels, label_weights, (bucket_cls_targets, + bucket_offset_targets), + (bucket_cls_weights, bucket_offset_weights)) + + def bucket_target(self, + pos_proposals_list, + neg_proposals_list, + pos_gt_bboxes_list, + pos_gt_labels_list, + rcnn_train_cfg, + concat=True): + (labels, label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) = multi_apply( + self._bucket_target_single, + pos_proposals_list, + neg_proposals_list, + pos_gt_bboxes_list, + pos_gt_labels_list, + cfg=rcnn_train_cfg) + + if concat: + labels = torch.cat(labels, 0) + label_weights = torch.cat(label_weights, 0) + bucket_cls_targets = torch.cat(bucket_cls_targets, 0) + bucket_cls_weights = torch.cat(bucket_cls_weights, 0) + bucket_offset_targets = torch.cat(bucket_offset_targets, 0) + bucket_offset_weights = torch.cat(bucket_offset_weights, 0) + return (labels, 
label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) + + def _bucket_target_single(self, pos_proposals, neg_proposals, + pos_gt_bboxes, pos_gt_labels, cfg): + """Compute bucketing estimation targets and fine regression targets for + a single image. + + Args: + pos_proposals (Tensor): positive proposals of a single image, + Shape (n_pos, 4) + neg_proposals (Tensor): negative proposals of a single image, + Shape (n_neg, 4). + pos_gt_bboxes (Tensor): gt bboxes assigned to positive proposals + of a single image, Shape (n_pos, 4). + pos_gt_labels (Tensor): gt labels assigned to positive proposals + of a single image, Shape (n_pos, ). + cfg (dict): Config of calculating targets + + Returns: + tuple: + + - labels (Tensor): Labels in a single image. \ + Shape (n,). + - label_weights (Tensor): Label weights in a single image.\ + Shape (n,) + - bucket_cls_targets (Tensor): Bucket cls targets in \ + a single image. Shape (n, num_buckets*2). + - bucket_cls_weights (Tensor): Bucket cls weights in \ + a single image. Shape (n, num_buckets*2). + - bucket_offset_targets (Tensor): Bucket offset targets \ + in a single image. Shape (n, num_buckets*2). + - bucket_offset_targets (Tensor): Bucket offset weights \ + in a single image. Shape (n, num_buckets*2). 
+ """ + num_pos = pos_proposals.size(0) + num_neg = neg_proposals.size(0) + num_samples = num_pos + num_neg + labels = pos_gt_bboxes.new_full((num_samples, ), + self.num_classes, + dtype=torch.long) + label_weights = pos_proposals.new_zeros(num_samples) + bucket_cls_targets = pos_proposals.new_zeros(num_samples, + 4 * self.side_num) + bucket_cls_weights = pos_proposals.new_zeros(num_samples, + 4 * self.side_num) + bucket_offset_targets = pos_proposals.new_zeros( + num_samples, 4 * self.side_num) + bucket_offset_weights = pos_proposals.new_zeros( + num_samples, 4 * self.side_num) + if num_pos > 0: + labels[:num_pos] = pos_gt_labels + label_weights[:num_pos] = 1.0 + (pos_bucket_offset_targets, pos_bucket_offset_weights, + pos_bucket_cls_targets, + pos_bucket_cls_weights) = self.bbox_coder.encode( + pos_proposals, pos_gt_bboxes) + bucket_cls_targets[:num_pos, :] = pos_bucket_cls_targets + bucket_cls_weights[:num_pos, :] = pos_bucket_cls_weights + bucket_offset_targets[:num_pos, :] = pos_bucket_offset_targets + bucket_offset_weights[:num_pos, :] = pos_bucket_offset_weights + if num_neg > 0: + label_weights[-num_neg:] = 1.0 + return (labels, label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) + + def loss(self, + cls_score, + bbox_pred, + rois, + labels, + label_weights, + bbox_targets, + bbox_weights, + reduction_override=None): + losses = dict() + if cls_score is not None: + avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.) 
+ losses['loss_cls'] = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=avg_factor, + reduction_override=reduction_override) + losses['acc'] = accuracy(cls_score, labels) + + if bbox_pred is not None: + bucket_cls_preds, bucket_offset_preds = bbox_pred + bucket_cls_targets, bucket_offset_targets = bbox_targets + bucket_cls_weights, bucket_offset_weights = bbox_weights + # edge cls + bucket_cls_preds = bucket_cls_preds.view(-1, self.side_num) + bucket_cls_targets = bucket_cls_targets.view(-1, self.side_num) + bucket_cls_weights = bucket_cls_weights.view(-1, self.side_num) + losses['loss_bbox_cls'] = self.loss_bbox_cls( + bucket_cls_preds, + bucket_cls_targets, + bucket_cls_weights, + avg_factor=bucket_cls_targets.size(0), + reduction_override=reduction_override) + + losses['loss_bbox_reg'] = self.loss_bbox_reg( + bucket_offset_preds, + bucket_offset_targets, + bucket_offset_weights, + avg_factor=bucket_offset_targets.size(0), + reduction_override=reduction_override) + + return losses + + @force_fp32(apply_to=('cls_score', 'bbox_pred')) + def get_bboxes(self, + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=False, + cfg=None): + if isinstance(cls_score, list): + cls_score = sum(cls_score) / float(len(cls_score)) + scores = F.softmax(cls_score, dim=1) if cls_score is not None else None + + if bbox_pred is not None: + bboxes, confids = self.bbox_coder.decode(rois[:, 1:], bbox_pred, + img_shape) + else: + bboxes = rois[:, 1:].clone() + confids = None + if img_shape is not None: + bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1] - 1) + bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0] - 1) + + if rescale and bboxes.size(0) > 0: + if isinstance(scale_factor, float): + bboxes /= scale_factor + else: + bboxes /= torch.from_numpy(scale_factor).to(bboxes.device) + + if cfg is None: + return bboxes, scores + else: + det_bboxes, det_labels = multiclass_nms( + bboxes, + scores, + cfg.score_thr, + cfg.nms, + cfg.max_per_img, + 
score_factors=confids) + + return det_bboxes, det_labels + + @force_fp32(apply_to=('bbox_preds', )) + def refine_bboxes(self, rois, labels, bbox_preds, pos_is_gts, img_metas): + """Refine bboxes during training. + + Args: + rois (Tensor): Shape (n*bs, 5), where n is image number per GPU, + and bs is the sampled RoIs per image. + labels (Tensor): Shape (n*bs, ). + bbox_preds (list[Tensor]): Shape [(n*bs, num_buckets*2), \ + (n*bs, num_buckets*2)]. + pos_is_gts (list[Tensor]): Flags indicating if each positive bbox + is a gt bbox. + img_metas (list[dict]): Meta info of each image. + + Returns: + list[Tensor]: Refined bboxes of each image in a mini-batch. + """ + img_ids = rois[:, 0].long().unique(sorted=True) + assert img_ids.numel() == len(img_metas) + + bboxes_list = [] + for i in range(len(img_metas)): + inds = torch.nonzero( + rois[:, 0] == i, as_tuple=False).squeeze(dim=1) + num_rois = inds.numel() + + bboxes_ = rois[inds, 1:] + label_ = labels[inds] + edge_cls_preds, edge_offset_preds = bbox_preds + edge_cls_preds_ = edge_cls_preds[inds] + edge_offset_preds_ = edge_offset_preds[inds] + bbox_pred_ = [edge_cls_preds_, edge_offset_preds_] + img_meta_ = img_metas[i] + pos_is_gts_ = pos_is_gts[i] + + bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_, + img_meta_) + # filter gt bboxes + pos_keep = 1 - pos_is_gts_ + keep_inds = pos_is_gts_.new_ones(num_rois) + keep_inds[:len(pos_is_gts_)] = pos_keep + + bboxes_list.append(bboxes[keep_inds.type(torch.bool)]) + + return bboxes_list + + @force_fp32(apply_to=('bbox_pred', )) + def regress_by_class(self, rois, label, bbox_pred, img_meta): + """Regress the bbox for the predicted class. Used in Cascade R-CNN. + + Args: + rois (Tensor): shape (n, 4) or (n, 5) + label (Tensor): shape (n, ) + bbox_pred (list[Tensor]): shape [(n, num_buckets *2), \ + (n, num_buckets *2)] + img_meta (dict): Image meta info. + + Returns: + Tensor: Regressed bboxes, the same shape as input rois. 
+ """ + assert rois.size(1) == 4 or rois.size(1) == 5 + + if rois.size(1) == 4: + new_rois, _ = self.bbox_coder.decode(rois, bbox_pred, + img_meta['img_shape']) + else: + bboxes, _ = self.bbox_coder.decode(rois[:, 1:], bbox_pred, + img_meta['img_shape']) + new_rois = torch.cat((rois[:, [0]], bboxes), dim=1) + + return new_rois diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/cascade_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/cascade_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..45b6f36a386cd37c50cc43666fcc516f2e14d868 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/cascade_roi_head.py @@ -0,0 +1,507 @@ +import torch +import torch.nn as nn + +from mmdet.core import (bbox2result, bbox2roi, bbox_mapping, build_assigner, + build_sampler, merge_aug_bboxes, merge_aug_masks, + multiclass_nms) +from ..builder import HEADS, build_head, build_roi_extractor +from .base_roi_head import BaseRoIHead +from .test_mixins import BBoxTestMixin, MaskTestMixin + + +@HEADS.register_module() +class CascadeRoIHead(BaseRoIHead, BBoxTestMixin, MaskTestMixin): + """Cascade roi head including one bbox head and one mask head. 
+ + https://arxiv.org/abs/1712.00726 + """ + + def __init__(self, + num_stages, + stage_loss_weights, + bbox_roi_extractor=None, + bbox_head=None, + mask_roi_extractor=None, + mask_head=None, + shared_head=None, + train_cfg=None, + test_cfg=None): + assert bbox_roi_extractor is not None + assert bbox_head is not None + assert shared_head is None, \ + 'Shared head is not supported in Cascade RCNN anymore' + self.num_stages = num_stages + self.stage_loss_weights = stage_loss_weights + super(CascadeRoIHead, self).__init__( + bbox_roi_extractor=bbox_roi_extractor, + bbox_head=bbox_head, + mask_roi_extractor=mask_roi_extractor, + mask_head=mask_head, + shared_head=shared_head, + train_cfg=train_cfg, + test_cfg=test_cfg) + + def init_bbox_head(self, bbox_roi_extractor, bbox_head): + """Initialize box head and box roi extractor. + + Args: + bbox_roi_extractor (dict): Config of box roi extractor. + bbox_head (dict): Config of box in box head. + """ + self.bbox_roi_extractor = nn.ModuleList() + self.bbox_head = nn.ModuleList() + if not isinstance(bbox_roi_extractor, list): + bbox_roi_extractor = [ + bbox_roi_extractor for _ in range(self.num_stages) + ] + if not isinstance(bbox_head, list): + bbox_head = [bbox_head for _ in range(self.num_stages)] + assert len(bbox_roi_extractor) == len(bbox_head) == self.num_stages + for roi_extractor, head in zip(bbox_roi_extractor, bbox_head): + self.bbox_roi_extractor.append(build_roi_extractor(roi_extractor)) + self.bbox_head.append(build_head(head)) + + def init_mask_head(self, mask_roi_extractor, mask_head): + """Initialize mask head and mask roi extractor. + + Args: + mask_roi_extractor (dict): Config of mask roi extractor. + mask_head (dict): Config of mask in mask head. 
+ """ + self.mask_head = nn.ModuleList() + if not isinstance(mask_head, list): + mask_head = [mask_head for _ in range(self.num_stages)] + assert len(mask_head) == self.num_stages + for head in mask_head: + self.mask_head.append(build_head(head)) + if mask_roi_extractor is not None: + self.share_roi_extractor = False + self.mask_roi_extractor = nn.ModuleList() + if not isinstance(mask_roi_extractor, list): + mask_roi_extractor = [ + mask_roi_extractor for _ in range(self.num_stages) + ] + assert len(mask_roi_extractor) == self.num_stages + for roi_extractor in mask_roi_extractor: + self.mask_roi_extractor.append( + build_roi_extractor(roi_extractor)) + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + + def init_assigner_sampler(self): + """Initialize assigner and sampler for each stage.""" + self.bbox_assigner = [] + self.bbox_sampler = [] + if self.train_cfg is not None: + for idx, rcnn_train_cfg in enumerate(self.train_cfg): + self.bbox_assigner.append( + build_assigner(rcnn_train_cfg.assigner)) + self.current_stage = idx + self.bbox_sampler.append( + build_sampler(rcnn_train_cfg.sampler, context=self)) + + def init_weights(self, pretrained): + """Initialize the weights in head. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + if self.with_shared_head: + self.shared_head.init_weights(pretrained=pretrained) + for i in range(self.num_stages): + if self.with_bbox: + self.bbox_roi_extractor[i].init_weights() + self.bbox_head[i].init_weights() + if self.with_mask: + if not self.share_roi_extractor: + self.mask_roi_extractor[i].init_weights() + self.mask_head[i].init_weights() + + def forward_dummy(self, x, proposals): + """Dummy forward function.""" + # bbox head + outs = () + rois = bbox2roi([proposals]) + if self.with_bbox: + for i in range(self.num_stages): + bbox_results = self._bbox_forward(i, x, rois) + outs = outs + (bbox_results['cls_score'], + bbox_results['bbox_pred']) + # mask heads + if self.with_mask: + mask_rois = rois[:100] + for i in range(self.num_stages): + mask_results = self._mask_forward(i, x, mask_rois) + outs = outs + (mask_results['mask_pred'], ) + return outs + + def _bbox_forward(self, stage, x, rois): + """Box head forward function used in both training and testing.""" + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + # do not support caffe_c4 model anymore + cls_score, bbox_pred = bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return bbox_results + + def _bbox_forward_train(self, stage, x, sampling_results, gt_bboxes, + gt_labels, rcnn_train_cfg): + """Run forward function and calculate loss for box head in training.""" + rois = bbox2roi([res.bboxes for res in sampling_results]) + bbox_results = self._bbox_forward(stage, x, rois) + bbox_targets = self.bbox_head[stage].get_targets( + sampling_results, gt_bboxes, gt_labels, rcnn_train_cfg) + loss_bbox = self.bbox_head[stage].loss(bbox_results['cls_score'], + bbox_results['bbox_pred'], rois, + *bbox_targets) + + bbox_results.update( + loss_bbox=loss_bbox, rois=rois, bbox_targets=bbox_targets) + return bbox_results + + def 
_mask_forward(self, stage, x, rois): + """Mask head forward function used in both training and testing.""" + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], + rois) + # do not support caffe_c4 model anymore + mask_pred = mask_head(mask_feats) + + mask_results = dict(mask_pred=mask_pred) + return mask_results + + def _mask_forward_train(self, + stage, + x, + sampling_results, + gt_masks, + rcnn_train_cfg, + bbox_feats=None): + """Run forward function and calculate loss for mask head in + training.""" + pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + mask_results = self._mask_forward(stage, x, pos_rois) + + mask_targets = self.mask_head[stage].get_targets( + sampling_results, gt_masks, rcnn_train_cfg) + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + loss_mask = self.mask_head[stage].loss(mask_results['mask_pred'], + mask_targets, pos_labels) + + mask_results.update(loss_mask=loss_mask) + return mask_results + + def forward_train(self, + x, + img_metas, + proposal_list, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + gt_masks=None): + """ + Args: + x (list[Tensor]): list of multi-level img features. + img_metas (list[dict]): list of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + proposals (list[Tensors]): list of region proposals. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. 
+ gt_masks (None | Tensor) : true segmentation masks for each box + used if the architecture supports a segmentation task. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + losses = dict() + for i in range(self.num_stages): + self.current_stage = i + rcnn_train_cfg = self.train_cfg[i] + lw = self.stage_loss_weights[i] + + # assign gts and sample proposals + sampling_results = [] + if self.with_bbox or self.with_mask: + bbox_assigner = self.bbox_assigner[i] + bbox_sampler = self.bbox_sampler[i] + num_imgs = len(img_metas) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + + for j in range(num_imgs): + assign_result = bbox_assigner.assign( + proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], + gt_labels[j]) + sampling_result = bbox_sampler.sample( + assign_result, + proposal_list[j], + gt_bboxes[j], + gt_labels[j], + feats=[lvl_feat[j][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + bbox_results = self._bbox_forward_train(i, x, sampling_results, + gt_bboxes, gt_labels, + rcnn_train_cfg) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{i}.{name}'] = ( + value * lw if 'loss' in name else value) + + # mask head forward and loss + if self.with_mask: + mask_results = self._mask_forward_train( + i, x, sampling_results, gt_masks, rcnn_train_cfg, + bbox_results['bbox_feats']) + for name, value in mask_results['loss_mask'].items(): + losses[f's{i}.{name}'] = ( + value * lw if 'loss' in name else value) + + # refine bboxes + if i < self.num_stages - 1: + pos_is_gts = [res.pos_is_gt for res in sampling_results] + # bbox_targets is a tuple + roi_labels = bbox_results['bbox_targets'][0] + with torch.no_grad(): + roi_labels = torch.where( + roi_labels == self.bbox_head[i].num_classes, + bbox_results['cls_score'][:, :-1].argmax(1), + roi_labels) + proposal_list = self.bbox_head[i].refine_bboxes( + bbox_results['rois'], roi_labels, + 
bbox_results['bbox_pred'], pos_is_gts, img_metas) + + return losses + + def simple_test(self, x, proposal_list, img_metas, rescale=False): + """Test without augmentation.""" + assert self.with_bbox, 'Bbox head must be implemented.' + num_imgs = len(proposal_list) + img_shapes = tuple(meta['img_shape'] for meta in img_metas) + ori_shapes = tuple(meta['ori_shape'] for meta in img_metas) + scale_factors = tuple(meta['scale_factor'] for meta in img_metas) + + # "ms" in variable names means multi-stage + ms_bbox_result = {} + ms_segm_result = {} + ms_scores = [] + rcnn_test_cfg = self.test_cfg + + rois = bbox2roi(proposal_list) + for i in range(self.num_stages): + bbox_results = self._bbox_forward(i, x, rois) + + # split batch bbox prediction back to each image + cls_score = bbox_results['cls_score'] + bbox_pred = bbox_results['bbox_pred'] + num_proposals_per_img = tuple( + len(proposals) for proposals in proposal_list) + rois = rois.split(num_proposals_per_img, 0) + cls_score = cls_score.split(num_proposals_per_img, 0) + if isinstance(bbox_pred, torch.Tensor): + bbox_pred = bbox_pred.split(num_proposals_per_img, 0) + else: + bbox_pred = self.bbox_head[i].bbox_pred_split( + bbox_pred, num_proposals_per_img) + ms_scores.append(cls_score) + + if i < self.num_stages - 1: + bbox_label = [s[:, :-1].argmax(dim=1) for s in cls_score] + rois = torch.cat([ + self.bbox_head[i].regress_by_class(rois[j], bbox_label[j], + bbox_pred[j], + img_metas[j]) + for j in range(num_imgs) + ]) + + # average scores of each image by stages + cls_score = [ + sum([score[i] for score in ms_scores]) / float(len(ms_scores)) + for i in range(num_imgs) + ] + + # apply bbox post-processing to each image individually + det_bboxes = [] + det_labels = [] + for i in range(num_imgs): + det_bbox, det_label = self.bbox_head[-1].get_bboxes( + rois[i], + cls_score[i], + bbox_pred[i], + img_shapes[i], + scale_factors[i], + rescale=rescale, + cfg=rcnn_test_cfg) + det_bboxes.append(det_bbox) + 
det_labels.append(det_label) + + if torch.onnx.is_in_onnx_export(): + return det_bboxes, det_labels + bbox_results = [ + bbox2result(det_bboxes[i], det_labels[i], + self.bbox_head[-1].num_classes) + for i in range(num_imgs) + ] + ms_bbox_result['ensemble'] = bbox_results + + if self.with_mask: + if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes): + mask_classes = self.mask_head[-1].num_classes + segm_results = [[[] for _ in range(mask_classes)] + for _ in range(num_imgs)] + else: + if rescale and not isinstance(scale_factors[0], float): + scale_factors = [ + torch.from_numpy(scale_factor).to(det_bboxes[0].device) + for scale_factor in scale_factors + ] + _bboxes = [ + det_bboxes[i][:, :4] * + scale_factors[i] if rescale else det_bboxes[i][:, :4] + for i in range(len(det_bboxes)) + ] + mask_rois = bbox2roi(_bboxes) + num_mask_rois_per_img = tuple( + _bbox.size(0) for _bbox in _bboxes) + aug_masks = [] + for i in range(self.num_stages): + mask_results = self._mask_forward(i, x, mask_rois) + mask_pred = mask_results['mask_pred'] + # split batch mask prediction back to each image + mask_pred = mask_pred.split(num_mask_rois_per_img, 0) + aug_masks.append( + [m.sigmoid().cpu().numpy() for m in mask_pred]) + + # apply mask post-processing to each image individually + segm_results = [] + for i in range(num_imgs): + if det_bboxes[i].shape[0] == 0: + segm_results.append( + [[] + for _ in range(self.mask_head[-1].num_classes)]) + else: + aug_mask = [mask[i] for mask in aug_masks] + merged_masks = merge_aug_masks( + aug_mask, [[img_metas[i]]] * self.num_stages, + rcnn_test_cfg) + segm_result = self.mask_head[-1].get_seg_masks( + merged_masks, _bboxes[i], det_labels[i], + rcnn_test_cfg, ori_shapes[i], scale_factors[i], + rescale) + segm_results.append(segm_result) + ms_segm_result['ensemble'] = segm_results + + if self.with_mask: + results = list( + zip(ms_bbox_result['ensemble'], ms_segm_result['ensemble'])) + else: + results = ms_bbox_result['ensemble'] + + return 
results + + def aug_test(self, features, proposal_list, img_metas, rescale=False): + """Test with augmentations. + + If rescale is False, then returned bboxes and masks will fit the scale + of imgs[0]. + """ + rcnn_test_cfg = self.test_cfg + aug_bboxes = [] + aug_scores = [] + for x, img_meta in zip(features, img_metas): + # only one image in the batch + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + flip_direction = img_meta[0]['flip_direction'] + + proposals = bbox_mapping(proposal_list[0][:, :4], img_shape, + scale_factor, flip, flip_direction) + # "ms" in variable names means multi-stage + ms_scores = [] + + rois = bbox2roi([proposals]) + for i in range(self.num_stages): + bbox_results = self._bbox_forward(i, x, rois) + ms_scores.append(bbox_results['cls_score']) + + if i < self.num_stages - 1: + bbox_label = bbox_results['cls_score'][:, :-1].argmax( + dim=1) + rois = self.bbox_head[i].regress_by_class( + rois, bbox_label, bbox_results['bbox_pred'], + img_meta[0]) + + cls_score = sum(ms_scores) / float(len(ms_scores)) + bboxes, scores = self.bbox_head[-1].get_bboxes( + rois, + cls_score, + bbox_results['bbox_pred'], + img_shape, + scale_factor, + rescale=False, + cfg=None) + aug_bboxes.append(bboxes) + aug_scores.append(scores) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) + det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img) + + bbox_result = bbox2result(det_bboxes, det_labels, + self.bbox_head[-1].num_classes) + + if self.with_mask: + if det_bboxes.shape[0] == 0: + segm_result = [[[] + for _ in range(self.mask_head[-1].num_classes)] + ] + else: + aug_masks = [] + aug_img_metas = [] + for x, img_meta in zip(features, img_metas): + img_shape = img_meta[0]['img_shape'] + 
scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + flip_direction = img_meta[0]['flip_direction'] + _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, + scale_factor, flip, flip_direction) + mask_rois = bbox2roi([_bboxes]) + for i in range(self.num_stages): + mask_results = self._mask_forward(i, x, mask_rois) + aug_masks.append( + mask_results['mask_pred'].sigmoid().cpu().numpy()) + aug_img_metas.append(img_meta) + merged_masks = merge_aug_masks(aug_masks, aug_img_metas, + self.test_cfg) + + ori_shape = img_metas[0][0]['ori_shape'] + segm_result = self.mask_head[-1].get_seg_masks( + merged_masks, + det_bboxes, + det_labels, + rcnn_test_cfg, + ori_shape, + scale_factor=1.0, + rescale=False) + return [(bbox_result, segm_result)] + else: + return [bbox_result] diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/double_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/double_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..a1aa6c8244a889fbbed312a89574c3e11be294f0 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/double_roi_head.py @@ -0,0 +1,33 @@ +from ..builder import HEADS +from .standard_roi_head import StandardRoIHead + + +@HEADS.register_module() +class DoubleHeadRoIHead(StandardRoIHead): + """RoI head for Double Head RCNN. 
+ + https://arxiv.org/abs/1904.06493 + """ + + def __init__(self, reg_roi_scale_factor, **kwargs): + super(DoubleHeadRoIHead, self).__init__(**kwargs) + self.reg_roi_scale_factor = reg_roi_scale_factor + + def _bbox_forward(self, x, rois): + """Box head forward function used in both training and testing time.""" + bbox_cls_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + bbox_reg_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], + rois, + roi_scale_factor=self.reg_roi_scale_factor) + if self.with_shared_head: + bbox_cls_feats = self.shared_head(bbox_cls_feats) + bbox_reg_feats = self.shared_head(bbox_reg_feats) + cls_score, bbox_pred = self.bbox_head(bbox_cls_feats, bbox_reg_feats) + + bbox_results = dict( + cls_score=cls_score, + bbox_pred=bbox_pred, + bbox_feats=bbox_cls_feats) + return bbox_results diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/dynamic_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/dynamic_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..89427a931f45f5a920c0e66fd88058bf9fa05f5c --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/dynamic_roi_head.py @@ -0,0 +1,154 @@ +import numpy as np +import torch + +from mmdet.core import bbox2roi +from mmdet.models.losses import SmoothL1Loss +from ..builder import HEADS +from .standard_roi_head import StandardRoIHead + +EPS = 1e-15 + + +@HEADS.register_module() +class DynamicRoIHead(StandardRoIHead): + """RoI head for `Dynamic R-CNN `_.""" + + def __init__(self, **kwargs): + super(DynamicRoIHead, self).__init__(**kwargs) + assert isinstance(self.bbox_head.loss_bbox, SmoothL1Loss) + # the IoU history of the past `update_iter_interval` iterations + self.iou_history = [] + # the beta history of the past `update_iter_interval` iterations + self.beta_history = [] + + def forward_train(self, + x, + img_metas, + proposal_list, + gt_bboxes, + gt_labels, + 
gt_bboxes_ignore=None, + gt_masks=None): + """Forward function for training. + + Args: + x (list[Tensor]): list of multi-level img features. + + img_metas (list[dict]): list of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + proposals (list[Tensors]): list of region proposals. + + gt_bboxes (list[Tensor]): each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + + gt_labels (list[Tensor]): class indices corresponding to each box + + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + gt_masks (None | Tensor) : true segmentation masks for each box + used if the architecture supports a segmentation task. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + # assign gts and sample proposals + if self.with_bbox or self.with_mask: + num_imgs = len(img_metas) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + sampling_results = [] + cur_iou = [] + for i in range(num_imgs): + assign_result = self.bbox_assigner.assign( + proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], + gt_labels[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + # record the `iou_topk`-th largest IoU in an image + iou_topk = min(self.train_cfg.dynamic_rcnn.iou_topk, + len(assign_result.max_overlaps)) + ious, _ = torch.topk(assign_result.max_overlaps, iou_topk) + cur_iou.append(ious[-1].item()) + sampling_results.append(sampling_result) + # average the current IoUs over images + cur_iou = np.mean(cur_iou) + self.iou_history.append(cur_iou) + + losses = dict() + # bbox head forward and loss + if self.with_bbox: + bbox_results = 
self._bbox_forward_train(x, sampling_results, + gt_bboxes, gt_labels, + img_metas) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self._mask_forward_train(x, sampling_results, + bbox_results['bbox_feats'], + gt_masks, img_metas) + losses.update(mask_results['loss_mask']) + + # update IoU threshold and SmoothL1 beta + update_iter_interval = self.train_cfg.dynamic_rcnn.update_iter_interval + if len(self.iou_history) % update_iter_interval == 0: + new_iou_thr, new_beta = self.update_hyperparameters() + + return losses + + def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, + img_metas): + num_imgs = len(img_metas) + rois = bbox2roi([res.bboxes for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes, + gt_labels, self.train_cfg) + # record the `beta_topk`-th smallest target + # `bbox_targets[2]` and `bbox_targets[3]` stand for bbox_targets + # and bbox_weights, respectively + pos_inds = bbox_targets[3][:, 0].nonzero().squeeze(1) + num_pos = len(pos_inds) + cur_target = bbox_targets[2][pos_inds, :2].abs().mean(dim=1) + beta_topk = min(self.train_cfg.dynamic_rcnn.beta_topk * num_imgs, + num_pos) + cur_target = torch.kthvalue(cur_target, beta_topk)[0].item() + self.beta_history.append(cur_target) + loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], + bbox_results['bbox_pred'], rois, + *bbox_targets) + + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results + + def update_hyperparameters(self): + """Update hyperparameters like IoU thresholds for assigner and beta for + SmoothL1 loss based on the training statistics. + + Returns: + tuple[float]: the updated ``iou_thr`` and ``beta``. 
+ """ + new_iou_thr = max(self.train_cfg.dynamic_rcnn.initial_iou, + np.mean(self.iou_history)) + self.iou_history = [] + self.bbox_assigner.pos_iou_thr = new_iou_thr + self.bbox_assigner.neg_iou_thr = new_iou_thr + self.bbox_assigner.min_pos_iou = new_iou_thr + if (np.median(self.beta_history) < EPS): + # avoid 0 or too small value for new_beta + new_beta = self.bbox_head.loss_bbox.beta + else: + new_beta = min(self.train_cfg.dynamic_rcnn.initial_beta, + np.median(self.beta_history)) + self.beta_history = [] + self.bbox_head.loss_bbox.beta = new_beta + return new_iou_thr, new_beta diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/grid_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/grid_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..4c52c79863ebaf17bd023382c7e5d4c237b4da77 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/grid_roi_head.py @@ -0,0 +1,176 @@ +import torch + +from mmdet.core import bbox2result, bbox2roi +from ..builder import HEADS, build_head, build_roi_extractor +from .standard_roi_head import StandardRoIHead + + +@HEADS.register_module() +class GridRoIHead(StandardRoIHead): + """Grid roi head for Grid R-CNN. + + https://arxiv.org/abs/1811.12030 + """ + + def __init__(self, grid_roi_extractor, grid_head, **kwargs): + assert grid_head is not None + super(GridRoIHead, self).__init__(**kwargs) + if grid_roi_extractor is not None: + self.grid_roi_extractor = build_roi_extractor(grid_roi_extractor) + self.share_roi_extractor = False + else: + self.share_roi_extractor = True + self.grid_roi_extractor = self.bbox_roi_extractor + self.grid_head = build_head(grid_head) + + def init_weights(self, pretrained): + """Initialize the weights in head. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + super(GridRoIHead, self).init_weights(pretrained) + self.grid_head.init_weights() + if not self.share_roi_extractor: + self.grid_roi_extractor.init_weights() + + def _random_jitter(self, sampling_results, img_metas, amplitude=0.15): + """Ramdom jitter positive proposals for training.""" + for sampling_result, img_meta in zip(sampling_results, img_metas): + bboxes = sampling_result.pos_bboxes + random_offsets = bboxes.new_empty(bboxes.shape[0], 4).uniform_( + -amplitude, amplitude) + # before jittering + cxcy = (bboxes[:, 2:4] + bboxes[:, :2]) / 2 + wh = (bboxes[:, 2:4] - bboxes[:, :2]).abs() + # after jittering + new_cxcy = cxcy + wh * random_offsets[:, :2] + new_wh = wh * (1 + random_offsets[:, 2:]) + # xywh to xyxy + new_x1y1 = (new_cxcy - new_wh / 2) + new_x2y2 = (new_cxcy + new_wh / 2) + new_bboxes = torch.cat([new_x1y1, new_x2y2], dim=1) + # clip bboxes + max_shape = img_meta['img_shape'] + if max_shape is not None: + new_bboxes[:, 0::2].clamp_(min=0, max=max_shape[1] - 1) + new_bboxes[:, 1::2].clamp_(min=0, max=max_shape[0] - 1) + + sampling_result.pos_bboxes = new_bboxes + return sampling_results + + def forward_dummy(self, x, proposals): + """Dummy forward function.""" + # bbox head + outs = () + rois = bbox2roi([proposals]) + if self.with_bbox: + bbox_results = self._bbox_forward(x, rois) + outs = outs + (bbox_results['cls_score'], + bbox_results['bbox_pred']) + + # grid head + grid_rois = rois[:100] + grid_feats = self.grid_roi_extractor( + x[:self.grid_roi_extractor.num_inputs], grid_rois) + if self.with_shared_head: + grid_feats = self.shared_head(grid_feats) + grid_pred = self.grid_head(grid_feats) + outs = outs + (grid_pred, ) + + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_results = self._mask_forward(x, mask_rois) + outs = outs + (mask_results['mask_pred'], ) + return outs + + def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, + img_metas): + """Run forward function and calculate loss for box head 
in training.""" + bbox_results = super(GridRoIHead, + self)._bbox_forward_train(x, sampling_results, + gt_bboxes, gt_labels, + img_metas) + + # Grid head forward and loss + sampling_results = self._random_jitter(sampling_results, img_metas) + pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + + # GN in head does not support zero shape input + if pos_rois.shape[0] == 0: + return bbox_results + + grid_feats = self.grid_roi_extractor( + x[:self.grid_roi_extractor.num_inputs], pos_rois) + if self.with_shared_head: + grid_feats = self.shared_head(grid_feats) + # Accelerate training + max_sample_num_grid = self.train_cfg.get('max_num_grid', 192) + sample_idx = torch.randperm( + grid_feats.shape[0])[:min(grid_feats.shape[0], max_sample_num_grid + )] + grid_feats = grid_feats[sample_idx] + + grid_pred = self.grid_head(grid_feats) + + grid_targets = self.grid_head.get_targets(sampling_results, + self.train_cfg) + grid_targets = grid_targets[sample_idx] + + loss_grid = self.grid_head.loss(grid_pred, grid_targets) + + bbox_results['loss_bbox'].update(loss_grid) + return bbox_results + + def simple_test(self, + x, + proposal_list, + img_metas, + proposals=None, + rescale=False): + """Test without augmentation.""" + assert self.with_bbox, 'Bbox head must be implemented.' 
+ + det_bboxes, det_labels = self.simple_test_bboxes( + x, img_metas, proposal_list, self.test_cfg, rescale=False) + # pack rois into bboxes + grid_rois = bbox2roi([det_bbox[:, :4] for det_bbox in det_bboxes]) + if grid_rois.shape[0] != 0: + grid_feats = self.grid_roi_extractor( + x[:len(self.grid_roi_extractor.featmap_strides)], grid_rois) + self.grid_head.test_mode = True + grid_pred = self.grid_head(grid_feats) + # split batch grid head prediction back to each image + num_roi_per_img = tuple(len(det_bbox) for det_bbox in det_bboxes) + grid_pred = { + k: v.split(num_roi_per_img, 0) + for k, v in grid_pred.items() + } + + # apply bbox post-processing to each image individually + bbox_results = [] + num_imgs = len(det_bboxes) + for i in range(num_imgs): + if det_bboxes[i].shape[0] == 0: + bbox_results.append(grid_rois.new_tensor([])) + else: + det_bbox = self.grid_head.get_bboxes( + det_bboxes[i], grid_pred['fused'][i], [img_metas[i]]) + if rescale: + det_bbox[:, :4] /= img_metas[i]['scale_factor'] + bbox_results.append( + bbox2result(det_bbox, det_labels[i], + self.bbox_head.num_classes)) + else: + bbox_results = [ + grid_rois.new_tensor([]) for _ in range(len(det_bboxes)) + ] + + if not self.with_mask: + return bbox_results + else: + segm_results = self.simple_test_mask( + x, img_metas, det_bboxes, det_labels, rescale=rescale) + return list(zip(bbox_results, segm_results)) diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/htc_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/htc_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..e2d3d7384de585a477096e45d119d2459fb7bacb --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/htc_roi_head.py @@ -0,0 +1,589 @@ +import torch +import torch.nn.functional as F + +from mmdet.core import (bbox2result, bbox2roi, bbox_mapping, merge_aug_bboxes, + merge_aug_masks, multiclass_nms) +from ..builder import HEADS, build_head, build_roi_extractor +from 
.cascade_roi_head import CascadeRoIHead + + +@HEADS.register_module() +class HybridTaskCascadeRoIHead(CascadeRoIHead): + """Hybrid task cascade roi head including one bbox head and one mask head. + + https://arxiv.org/abs/1901.07518 + """ + + def __init__(self, + num_stages, + stage_loss_weights, + semantic_roi_extractor=None, + semantic_head=None, + semantic_fusion=('bbox', 'mask'), + interleaved=True, + mask_info_flow=True, + **kwargs): + super(HybridTaskCascadeRoIHead, + self).__init__(num_stages, stage_loss_weights, **kwargs) + assert self.with_bbox and self.with_mask + assert not self.with_shared_head # shared head is not supported + + if semantic_head is not None: + self.semantic_roi_extractor = build_roi_extractor( + semantic_roi_extractor) + self.semantic_head = build_head(semantic_head) + + self.semantic_fusion = semantic_fusion + self.interleaved = interleaved + self.mask_info_flow = mask_info_flow + + def init_weights(self, pretrained): + """Initialize the weights in head. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + super(HybridTaskCascadeRoIHead, self).init_weights(pretrained) + if self.with_semantic: + self.semantic_head.init_weights() + + @property + def with_semantic(self): + """bool: whether the head has semantic head""" + if hasattr(self, 'semantic_head') and self.semantic_head is not None: + return True + else: + return False + + def forward_dummy(self, x, proposals): + """Dummy forward function.""" + outs = () + # semantic head + if self.with_semantic: + _, semantic_feat = self.semantic_head(x) + else: + semantic_feat = None + # bbox heads + rois = bbox2roi([proposals]) + for i in range(self.num_stages): + bbox_results = self._bbox_forward( + i, x, rois, semantic_feat=semantic_feat) + outs = outs + (bbox_results['cls_score'], + bbox_results['bbox_pred']) + # mask heads + if self.with_mask: + mask_rois = rois[:100] + mask_roi_extractor = self.mask_roi_extractor[-1] + mask_feats = mask_roi_extractor( + x[:len(mask_roi_extractor.featmap_strides)], mask_rois) + if self.with_semantic and 'mask' in self.semantic_fusion: + mask_semantic_feat = self.semantic_roi_extractor( + [semantic_feat], mask_rois) + mask_feats += mask_semantic_feat + last_feat = None + for i in range(self.num_stages): + mask_head = self.mask_head[i] + if self.mask_info_flow: + mask_pred, last_feat = mask_head(mask_feats, last_feat) + else: + mask_pred = mask_head(mask_feats) + outs = outs + (mask_pred, ) + return outs + + def _bbox_forward_train(self, + stage, + x, + sampling_results, + gt_bboxes, + gt_labels, + rcnn_train_cfg, + semantic_feat=None): + """Run forward function and calculate loss for box head in training.""" + bbox_head = self.bbox_head[stage] + rois = bbox2roi([res.bboxes for res in sampling_results]) + bbox_results = self._bbox_forward( + stage, x, rois, semantic_feat=semantic_feat) + + bbox_targets = bbox_head.get_targets(sampling_results, gt_bboxes, + gt_labels, rcnn_train_cfg) + loss_bbox = bbox_head.loss(bbox_results['cls_score'], + bbox_results['bbox_pred'], rois, + 
*bbox_targets) + + bbox_results.update( + loss_bbox=loss_bbox, + rois=rois, + bbox_targets=bbox_targets, + ) + return bbox_results + + def _mask_forward_train(self, + stage, + x, + sampling_results, + gt_masks, + rcnn_train_cfg, + semantic_feat=None): + """Run forward function and calculate loss for mask head in + training.""" + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], + pos_rois) + + # semantic feature fusion + # element-wise sum for original features and pooled semantic features + if self.with_semantic and 'mask' in self.semantic_fusion: + mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], + pos_rois) + if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: + mask_semantic_feat = F.adaptive_avg_pool2d( + mask_semantic_feat, mask_feats.shape[-2:]) + mask_feats += mask_semantic_feat + + # mask information flow + # forward all previous mask heads to obtain last_feat, and fuse it + # with the normal mask feature + if self.mask_info_flow: + last_feat = None + for i in range(stage): + last_feat = self.mask_head[i]( + mask_feats, last_feat, return_logits=False) + mask_pred = mask_head(mask_feats, last_feat, return_feat=False) + else: + mask_pred = mask_head(mask_feats, return_feat=False) + + mask_targets = mask_head.get_targets(sampling_results, gt_masks, + rcnn_train_cfg) + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels) + + mask_results = dict(loss_mask=loss_mask) + return mask_results + + def _bbox_forward(self, stage, x, rois, semantic_feat=None): + """Box head forward function used in both training and testing.""" + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor( + 
x[:len(bbox_roi_extractor.featmap_strides)], rois) + if self.with_semantic and 'bbox' in self.semantic_fusion: + bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]: + bbox_semantic_feat = F.adaptive_avg_pool2d( + bbox_semantic_feat, bbox_feats.shape[-2:]) + bbox_feats += bbox_semantic_feat + cls_score, bbox_pred = bbox_head(bbox_feats) + + bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred) + return bbox_results + + def _mask_forward_test(self, stage, x, bboxes, semantic_feat=None): + """Mask head forward function for testing.""" + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + mask_rois = bbox2roi([bboxes]) + mask_feats = mask_roi_extractor( + x[:len(mask_roi_extractor.featmap_strides)], mask_rois) + if self.with_semantic and 'mask' in self.semantic_fusion: + mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], + mask_rois) + if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: + mask_semantic_feat = F.adaptive_avg_pool2d( + mask_semantic_feat, mask_feats.shape[-2:]) + mask_feats += mask_semantic_feat + if self.mask_info_flow: + last_feat = None + last_pred = None + for i in range(stage): + mask_pred, last_feat = self.mask_head[i](mask_feats, last_feat) + if last_pred is not None: + mask_pred = mask_pred + last_pred + last_pred = mask_pred + mask_pred = mask_head(mask_feats, last_feat, return_feat=False) + if last_pred is not None: + mask_pred = mask_pred + last_pred + else: + mask_pred = mask_head(mask_feats) + return mask_pred + + def forward_train(self, + x, + img_metas, + proposal_list, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + gt_masks=None, + gt_semantic_seg=None): + """ + Args: + x (list[Tensor]): list of multi-level img features. 
+ + img_metas (list[dict]): list of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + proposal_list (list[Tensors]): list of region proposals. + + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + + gt_labels (list[Tensor]): class indices corresponding to each box + + gt_bboxes_ignore (None, list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + gt_masks (None, Tensor) : true segmentation masks for each box + used if the architecture supports a segmentation task. + + gt_semantic_seg (None, list[Tensor]): semantic segmentation masks + used if the architecture supports semantic segmentation task. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + # semantic segmentation part + # 2 outputs: segmentation prediction and embedded features + losses = dict() + if self.with_semantic: + semantic_pred, semantic_feat = self.semantic_head(x) + loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_seg) + losses['loss_semantic_seg'] = loss_seg + else: + semantic_feat = None + + for i in range(self.num_stages): + self.current_stage = i + rcnn_train_cfg = self.train_cfg[i] + lw = self.stage_loss_weights[i] + + # assign gts and sample proposals + sampling_results = [] + bbox_assigner = self.bbox_assigner[i] + bbox_sampler = self.bbox_sampler[i] + num_imgs = len(img_metas) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + + for j in range(num_imgs): + assign_result = bbox_assigner.assign(proposal_list[j], + gt_bboxes[j], + gt_bboxes_ignore[j], + gt_labels[j]) + sampling_result = bbox_sampler.sample( + assign_result, + proposal_list[j], + gt_bboxes[j], + gt_labels[j], + feats=[lvl_feat[j][None] for 
lvl_feat in x])
                    sampling_results.append(sampling_result)

            # bbox head forward and loss
            bbox_results = \
                self._bbox_forward_train(
                    i, x, sampling_results, gt_bboxes, gt_labels,
                    rcnn_train_cfg, semantic_feat)
            roi_labels = bbox_results['bbox_targets'][0]

            # stage losses are prefixed with the stage index and weighted by
            # the per-stage loss weight lw; non-loss entries (e.g. accuracy)
            # are recorded unweighted
            for name, value in bbox_results['loss_bbox'].items():
                losses[f's{i}.{name}'] = (
                    value * lw if 'loss' in name else value)

            # mask head forward and loss
            if self.with_mask:
                # interleaved execution: use regressed bboxes by the box branch
                # to train the mask branch
                if self.interleaved:
                    pos_is_gts = [res.pos_is_gt for res in sampling_results]
                    with torch.no_grad():
                        proposal_list = self.bbox_head[i].refine_bboxes(
                            bbox_results['rois'], roi_labels,
                            bbox_results['bbox_pred'], pos_is_gts, img_metas)
                        # re-assign and sample 512 RoIs from 512 RoIs
                        sampling_results = []
                        for j in range(num_imgs):
                            assign_result = bbox_assigner.assign(
                                proposal_list[j], gt_bboxes[j],
                                gt_bboxes_ignore[j], gt_labels[j])
                            sampling_result = bbox_sampler.sample(
                                assign_result,
                                proposal_list[j],
                                gt_bboxes[j],
                                gt_labels[j],
                                feats=[lvl_feat[j][None] for lvl_feat in x])
                            sampling_results.append(sampling_result)
                mask_results = self._mask_forward_train(
                    i, x, sampling_results, gt_masks, rcnn_train_cfg,
                    semantic_feat)
                for name, value in mask_results['loss_mask'].items():
                    losses[f's{i}.{name}'] = (
                        value * lw if 'loss' in name else value)

            # refine bboxes (same as Cascade R-CNN)
            if i < self.num_stages - 1 and not self.interleaved:
                pos_is_gts = [res.pos_is_gt for res in sampling_results]
                with torch.no_grad():
                    proposal_list = self.bbox_head[i].refine_bboxes(
                        bbox_results['rois'], roi_labels,
                        bbox_results['bbox_pred'], pos_is_gts, img_metas)

        return losses

    def simple_test(self, x, proposal_list, img_metas, rescale=False):
        """Test without augmentation.

        Runs all cascade stages, averages the per-stage classification
        scores, and decodes boxes (and masks, when a mask head is present)
        from the last stage's heads.

        Args:
            x (tuple[Tensor]): Multi-level FPN features.
            proposal_list (list[Tensor]): Per-image proposals.
            img_metas (list[dict]): Per-image meta info; 'img_shape',
                'ori_shape' and 'scale_factor' are read here.
            rescale (bool): Whether to map results back to original
                image scale.

        Returns:
            list: Per-image bbox results, or (bbox, segm) tuples when
            ``self.with_mask`` is True.
        """
        if self.with_semantic:
            _, semantic_feat = self.semantic_head(x)
        else:
            semantic_feat = None

        num_imgs = len(proposal_list)
        img_shapes = tuple(meta['img_shape'] for meta in img_metas)
        ori_shapes = tuple(meta['ori_shape'] for meta in img_metas)
        scale_factors = tuple(meta['scale_factor'] for meta in img_metas)

        # "ms" in variable names means multi-stage
        ms_bbox_result = {}
        ms_segm_result = {}
        ms_scores = []
        rcnn_test_cfg = self.test_cfg

        rois = bbox2roi(proposal_list)
        for i in range(self.num_stages):
            bbox_head = self.bbox_head[i]
            bbox_results = self._bbox_forward(
                i, x, rois, semantic_feat=semantic_feat)
            # split batch bbox prediction back to each image
            cls_score = bbox_results['cls_score']
            bbox_pred = bbox_results['bbox_pred']
            num_proposals_per_img = tuple(len(p) for p in proposal_list)
            rois = rois.split(num_proposals_per_img, 0)
            cls_score = cls_score.split(num_proposals_per_img, 0)
            bbox_pred = bbox_pred.split(num_proposals_per_img, 0)
            ms_scores.append(cls_score)

            if i < self.num_stages - 1:
                # drop the background column before argmax, then refine RoIs
                # for the next stage (comprehension-local `i` shadows the
                # stage index only inside the comprehension scope)
                bbox_label = [s[:, :-1].argmax(dim=1) for s in cls_score]
                rois = torch.cat([
                    bbox_head.regress_by_class(rois[i], bbox_label[i],
                                               bbox_pred[i], img_metas[i])
                    for i in range(num_imgs)
                ])

        # average scores of each image by stages
        cls_score = [
            sum([score[i] for score in ms_scores]) / float(len(ms_scores))
            for i in range(num_imgs)
        ]

        # apply bbox post-processing to each image individually
        det_bboxes = []
        det_labels = []
        for i in range(num_imgs):
            det_bbox, det_label = self.bbox_head[-1].get_bboxes(
                rois[i],
                cls_score[i],
                bbox_pred[i],
                img_shapes[i],
                scale_factors[i],
                rescale=rescale,
                cfg=rcnn_test_cfg)
            det_bboxes.append(det_bbox)
            det_labels.append(det_label)
        bbox_result = [
            bbox2result(det_bboxes[i], det_labels[i],
                        self.bbox_head[-1].num_classes)
            for i in range(num_imgs)
        ]
        ms_bbox_result['ensemble'] = bbox_result

        if self.with_mask:
            if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes):
                # no detections anywhere: emit empty per-class mask lists
                mask_classes = self.mask_head[-1].num_classes
                segm_results = [[[] for _ in range(mask_classes)]
                                for _ in range(num_imgs)]
            else:
                if rescale and not isinstance(scale_factors[0], float):
                    scale_factors = [
                        torch.from_numpy(scale_factor).to(det_bboxes[0].device)
                        for scale_factor in scale_factors
                    ]
                _bboxes = [
                    det_bboxes[i][:, :4] *
                    scale_factors[i] if rescale else det_bboxes[i]
                    for i in range(num_imgs)
                ]
                mask_rois = bbox2roi(_bboxes)
                aug_masks = []
                mask_roi_extractor = self.mask_roi_extractor[-1]
                mask_feats = mask_roi_extractor(
                    x[:len(mask_roi_extractor.featmap_strides)], mask_rois)
                if self.with_semantic and 'mask' in self.semantic_fusion:
                    mask_semantic_feat = self.semantic_roi_extractor(
                        [semantic_feat], mask_rois)
                    mask_feats += mask_semantic_feat
                last_feat = None

                num_bbox_per_img = tuple(len(_bbox) for _bbox in _bboxes)
                for i in range(self.num_stages):
                    mask_head = self.mask_head[i]
                    # mask_info_flow passes the previous stage's feature on
                    # to the next mask head (HTC information flow)
                    if self.mask_info_flow:
                        mask_pred, last_feat = mask_head(mask_feats, last_feat)
                    else:
                        mask_pred = mask_head(mask_feats)

                    # split batch mask prediction back to each image
                    mask_pred = mask_pred.split(num_bbox_per_img, 0)
                    aug_masks.append(
                        [mask.sigmoid().cpu().numpy() for mask in mask_pred])

                # apply mask post-processing to each image individually
                segm_results = []
                for i in range(num_imgs):
                    if det_bboxes[i].shape[0] == 0:
                        segm_results.append(
                            [[]
                             for _ in range(self.mask_head[-1].num_classes)])
                    else:
                        aug_mask = [mask[i] for mask in aug_masks]
                        merged_mask = merge_aug_masks(
                            aug_mask, [[img_metas[i]]] * self.num_stages,
                            rcnn_test_cfg)
                        segm_result = self.mask_head[-1].get_seg_masks(
                            merged_mask, _bboxes[i], det_labels[i],
                            rcnn_test_cfg, ori_shapes[i], scale_factors[i],
                            rescale)
                        segm_results.append(segm_result)
                ms_segm_result['ensemble'] = segm_results

        if self.with_mask:
            results = list(
                zip(ms_bbox_result['ensemble'], ms_segm_result['ensemble']))
        else:
            results = ms_bbox_result['ensemble']

        return results

    def aug_test(self, img_feats,
proposal_list, img_metas, rescale=False):
        """Test with augmentations.

        If rescale is False, then returned bboxes and masks will fit the scale
        of imgs[0].

        Args:
            img_feats (list[tuple[Tensor]]): Features for each augmentation.
            proposal_list (list[Tensor]): Proposals (only index 0 is used;
                one image per augmented batch).
            img_metas (list[list[dict]]): Meta info per augmentation.
            rescale (bool): See above.

        Returns:
            list: One-element list with the merged bbox result, or a
            (bbox, segm) tuple when ``self.with_mask`` is True.
        """
        if self.with_semantic:
            semantic_feats = [
                self.semantic_head(feat)[1] for feat in img_feats
            ]
        else:
            semantic_feats = [None] * len(img_metas)

        rcnn_test_cfg = self.test_cfg
        aug_bboxes = []
        aug_scores = []
        for x, img_meta, semantic in zip(img_feats, img_metas, semantic_feats):
            # only one image in the batch
            img_shape = img_meta[0]['img_shape']
            scale_factor = img_meta[0]['scale_factor']
            flip = img_meta[0]['flip']
            flip_direction = img_meta[0]['flip_direction']

            # map proposals into this augmentation's coordinate frame
            proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
                                     scale_factor, flip, flip_direction)
            # "ms" in variable names means multi-stage
            ms_scores = []

            rois = bbox2roi([proposals])
            for i in range(self.num_stages):
                bbox_head = self.bbox_head[i]
                bbox_results = self._bbox_forward(
                    i, x, rois, semantic_feat=semantic)
                ms_scores.append(bbox_results['cls_score'])

                if i < self.num_stages - 1:
                    bbox_label = bbox_results['cls_score'].argmax(dim=1)
                    rois = bbox_head.regress_by_class(
                        rois, bbox_label, bbox_results['bbox_pred'],
                        img_meta[0])

            cls_score = sum(ms_scores) / float(len(ms_scores))
            bboxes, scores = self.bbox_head[-1].get_bboxes(
                rois,
                cls_score,
                bbox_results['bbox_pred'],
                img_shape,
                scale_factor,
                rescale=False,
                cfg=None)
            aug_bboxes.append(bboxes)
            aug_scores.append(scores)

        # after merging, bboxes will be rescaled to the original image size
        merged_bboxes, merged_scores = merge_aug_bboxes(
            aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
        det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores,
                                                rcnn_test_cfg.score_thr,
                                                rcnn_test_cfg.nms,
                                                rcnn_test_cfg.max_per_img)

        bbox_result = bbox2result(det_bboxes, det_labels,
                                  self.bbox_head[-1].num_classes)

        if self.with_mask:
            if det_bboxes.shape[0] == 0:
                # NOTE(review): this empty result is sized num_classes - 1
                # while simple_test uses num_classes — looks like a legacy
                # (background-inclusive) class count; confirm against the
                # evaluation code before changing.
                segm_result = [[[]
                                for _ in range(self.mask_head[-1].num_classes -
                                               1)]]
            else:
                aug_masks = []
                aug_img_metas = []
                for x, img_meta, semantic in zip(img_feats, img_metas,
                                                 semantic_feats):
                    img_shape = img_meta[0]['img_shape']
                    scale_factor = img_meta[0]['scale_factor']
                    flip = img_meta[0]['flip']
                    flip_direction = img_meta[0]['flip_direction']
                    _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
                                           scale_factor, flip, flip_direction)
                    mask_rois = bbox2roi([_bboxes])
                    mask_feats = self.mask_roi_extractor[-1](
                        x[:len(self.mask_roi_extractor[-1].featmap_strides)],
                        mask_rois)
                    if self.with_semantic:
                        semantic_feat = semantic
                        mask_semantic_feat = self.semantic_roi_extractor(
                            [semantic_feat], mask_rois)
                        # semantic features may come at a different spatial
                        # size; pool them to match before fusing
                        if mask_semantic_feat.shape[-2:] != mask_feats.shape[
                                -2:]:
                            mask_semantic_feat = F.adaptive_avg_pool2d(
                                mask_semantic_feat, mask_feats.shape[-2:])
                        mask_feats += mask_semantic_feat
                    last_feat = None
                    for i in range(self.num_stages):
                        mask_head = self.mask_head[i]
                        if self.mask_info_flow:
                            mask_pred, last_feat = mask_head(
                                mask_feats, last_feat)
                        else:
                            mask_pred = mask_head(mask_feats)
                        aug_masks.append(mask_pred.sigmoid().cpu().numpy())
                        aug_img_metas.append(img_meta)
                merged_masks = merge_aug_masks(aug_masks, aug_img_metas,
                                               self.test_cfg)

                ori_shape = img_metas[0][0]['ori_shape']
                # boxes are already at original scale, so scale_factor=1.0
                segm_result = self.mask_head[-1].get_seg_masks(
                    merged_masks,
                    det_bboxes,
                    det_labels,
                    rcnn_test_cfg,
                    ori_shape,
                    scale_factor=1.0,
                    rescale=False)
            return [(bbox_result, segm_result)]
        else:
            return [bbox_result]
diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/__init__.py b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..26c3e95a635b62e6fedcafd5d071355188b581a6
--- /dev/null
+++ b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/__init__.py
@@ -0,0 +1,12 @@
from .coarse_mask_head import CoarseMaskHead
from .fcn_mask_head
import FCNMaskHead
from .fused_semantic_head import FusedSemanticHead
from .grid_head import GridHead
from .htc_mask_head import HTCMaskHead
from .mask_point_head import MaskPointHead
from .maskiou_head import MaskIoUHead

__all__ = [
    'FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead', 'GridHead',
    'MaskIoUHead', 'CoarseMaskHead', 'MaskPointHead'
]
diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py
new file mode 100755
index 0000000000000000000000000000000000000000..d665dfff83855e6db3866c681559ccdef09f9999
--- /dev/null
+++ b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py
@@ -0,0 +1,91 @@
import torch.nn as nn
from mmcv.cnn import ConvModule, Linear, constant_init, xavier_init
from mmcv.runner import auto_fp16

from mmdet.models.builder import HEADS
from .fcn_mask_head import FCNMaskHead


@HEADS.register_module()
class CoarseMaskHead(FCNMaskHead):
    """Coarse mask head used in PointRend.

    Compared with standard ``FCNMaskHead``, ``CoarseMaskHead`` will downsample
    the input feature map instead of upsample it.

    Args:
        num_convs (int): Number of conv layers in the head. Default: 0.
        num_fcs (int): Number of fc layers in the head. Default: 2.
        fc_out_channels (int): Number of output channels of fc layer.
            Default: 1024.
        downsample_factor (int): The factor that feature map is downsampled by.
            Default: 2.
    """

    def __init__(self,
                 num_convs=0,
                 num_fcs=2,
                 fc_out_channels=1024,
                 downsample_factor=2,
                 *arg,
                 **kwarg):
        # upsample_cfg=dict(type=None) disables the parent's upsampling path
        super(CoarseMaskHead, self).__init__(
            *arg, num_convs=num_convs, upsample_cfg=dict(type=None), **kwarg)
        self.num_fcs = num_fcs
        assert self.num_fcs > 0
        self.fc_out_channels = fc_out_channels
        self.downsample_factor = downsample_factor
        assert self.downsample_factor >= 1
        # remove conv_logit
        delattr(self, 'conv_logits')

        if downsample_factor > 1:
            downsample_in_channels = (
                self.conv_out_channels
                if self.num_convs > 0 else self.in_channels)
            # strided conv halves (or more) the spatial size in one shot
            self.downsample_conv = ConvModule(
                downsample_in_channels,
                self.conv_out_channels,
                kernel_size=downsample_factor,
                stride=downsample_factor,
                padding=0,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg)
        else:
            self.downsample_conv = None

        self.output_size = (self.roi_feat_size[0] // downsample_factor,
                            self.roi_feat_size[1] // downsample_factor)
        self.output_area = self.output_size[0] * self.output_size[1]

        last_layer_dim = self.conv_out_channels * self.output_area

        self.fcs = nn.ModuleList()
        for i in range(num_fcs):
            fc_in_channels = (
                last_layer_dim if i == 0 else self.fc_out_channels)
            self.fcs.append(Linear(fc_in_channels, self.fc_out_channels))
            last_layer_dim = self.fc_out_channels
        output_channels = self.num_classes * self.output_area
        self.fc_logits = Linear(last_layer_dim, output_channels)

    def init_weights(self):
        """Xavier-init the FC stack and near-zero-init the logits layer."""
        for m in self.fcs.modules():
            if isinstance(m, nn.Linear):
                xavier_init(m)
        constant_init(self.fc_logits, 0.001)

    @auto_fp16()
    def forward(self, x):
        """Predict coarse masks: convs -> downsample -> FCs -> reshape."""
        for conv in self.convs:
            x = conv(x)

        if self.downsample_conv is not None:
            x = self.downsample_conv(x)

        x = x.flatten(1)
        for fc in self.fcs:
            x = self.relu(fc(x))
        mask_pred = self.fc_logits(x).view(
            x.size(0), self.num_classes, *self.output_size)
        return mask_pred
diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py
b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py
new file mode 100755
index 0000000000000000000000000000000000000000..0cba3cda06f1ba1622b61c7d15eb823f154ede54
--- /dev/null
+++ b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py
@@ -0,0 +1,328 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, ConvModule, build_upsample_layer
from mmcv.ops.carafe import CARAFEPack
from mmcv.runner import auto_fp16, force_fp32
from torch.nn.modules.utils import _pair

from mmdet.core import mask_target
from mmdet.models.builder import HEADS, build_loss

BYTES_PER_FLOAT = 4
# TODO: This memory limit may be too much or too little. It would be better to
# determine it based on available resources.
GPU_MEM_LIMIT = 1024**3  # 1 GB memory limit


@HEADS.register_module()
class FCNMaskHead(nn.Module):
    """Standard FCN mask head: a conv stack, optional upsampling, and a
    1x1 conv producing per-class (or class-agnostic) mask logits."""

    def __init__(self,
                 num_convs=4,
                 roi_feat_size=14,
                 in_channels=256,
                 conv_kernel_size=3,
                 conv_out_channels=256,
                 num_classes=80,
                 class_agnostic=False,
                 upsample_cfg=dict(type='deconv', scale_factor=2),
                 conv_cfg=None,
                 norm_cfg=None,
                 loss_mask=dict(
                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)):
        super(FCNMaskHead, self).__init__()
        self.upsample_cfg = upsample_cfg.copy()
        if self.upsample_cfg['type'] not in [
                None, 'deconv', 'nearest', 'bilinear', 'carafe'
        ]:
            raise ValueError(
                f'Invalid upsample method {self.upsample_cfg["type"]}, '
                'accepted methods are "deconv", "nearest", "bilinear", '
                '"carafe"')
        self.num_convs = num_convs
        # WARN: roi_feat_size is reserved and not used
        self.roi_feat_size = _pair(roi_feat_size)
        self.in_channels = in_channels
        self.conv_kernel_size = conv_kernel_size
        self.conv_out_channels = conv_out_channels
        self.upsample_method = self.upsample_cfg.get('type')
        self.scale_factor = self.upsample_cfg.pop('scale_factor', None)
        self.num_classes = num_classes
        self.class_agnostic = class_agnostic
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.fp16_enabled = False
        self.loss_mask = build_loss(loss_mask)

        self.convs = nn.ModuleList()
        for i in range(self.num_convs):
            in_channels = (
                self.in_channels if i == 0 else self.conv_out_channels)
            padding = (self.conv_kernel_size - 1) // 2
            self.convs.append(
                ConvModule(
                    in_channels,
                    self.conv_out_channels,
                    self.conv_kernel_size,
                    padding=padding,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg))
        upsample_in_channels = (
            self.conv_out_channels if self.num_convs > 0 else in_channels)
        upsample_cfg_ = self.upsample_cfg.copy()
        if self.upsample_method is None:
            self.upsample = None
        elif self.upsample_method == 'deconv':
            upsample_cfg_.update(
                in_channels=upsample_in_channels,
                out_channels=self.conv_out_channels,
                kernel_size=self.scale_factor,
                stride=self.scale_factor)
            self.upsample = build_upsample_layer(upsample_cfg_)
        elif self.upsample_method == 'carafe':
            upsample_cfg_.update(
                channels=upsample_in_channels, scale_factor=self.scale_factor)
            self.upsample = build_upsample_layer(upsample_cfg_)
        else:
            # suppress warnings
            align_corners = (None
                             if self.upsample_method == 'nearest' else False)
            upsample_cfg_.update(
                scale_factor=self.scale_factor,
                mode=self.upsample_method,
                align_corners=align_corners)
            self.upsample = build_upsample_layer(upsample_cfg_)

        out_channels = 1 if self.class_agnostic else self.num_classes
        logits_in_channel = (
            self.conv_out_channels
            if self.upsample_method == 'deconv' else upsample_in_channels)
        self.conv_logits = Conv2d(logits_in_channel, out_channels, 1)
        self.relu = nn.ReLU(inplace=True)
        self.debug_imgs = None

    def init_weights(self):
        """Kaiming-init the upsample/logits layers (CARAFE inits itself)."""
        for m in [self.upsample, self.conv_logits]:
            if m is None:
                continue
            elif isinstance(m, CARAFEPack):
                m.init_weights()
            else:
                nn.init.kaiming_normal_(
                    m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(m.bias, 0)

    @auto_fp16()
    def forward(self, x):
        """Return mask logits of shape (n, out_channels, h', w')."""
        for conv in self.convs:
            x = conv(x)
        if self.upsample is not None:
            x = self.upsample(x)
            if self.upsample_method == 'deconv':
                x = self.relu(x)
        mask_pred = self.conv_logits(x)
        return mask_pred

    def get_targets(self, sampling_results, gt_masks, rcnn_train_cfg):
        """Crop/resize GT masks to the positive proposals' mask targets."""
        pos_proposals = [res.pos_bboxes for res in sampling_results]
        pos_assigned_gt_inds = [
            res.pos_assigned_gt_inds for res in sampling_results
        ]
        mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds,
                                   gt_masks, rcnn_train_cfg)
        return mask_targets

    @force_fp32(apply_to=('mask_pred', ))
    def loss(self, mask_pred, mask_targets, labels):
        """Compute the mask loss; zero-grad passthrough when no positives."""
        loss = dict()
        if mask_pred.size(0) == 0:
            # keep the graph connected so DDP does not see unused params
            loss_mask = mask_pred.sum()
        else:
            if self.class_agnostic:
                loss_mask = self.loss_mask(mask_pred, mask_targets,
                                           torch.zeros_like(labels))
            else:
                loss_mask = self.loss_mask(mask_pred, mask_targets, labels)
        loss['loss_mask'] = loss_mask
        return loss

    def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,
                      ori_shape, scale_factor, rescale):
        """Get segmentation masks from mask_pred and bboxes.

        Args:
            mask_pred (Tensor or ndarray): shape (n, #class, h, w).
                For single-scale testing, mask_pred is the direct output of
                model, whose type is Tensor, while for multi-scale testing,
                it will be converted to numpy array outside of this method.
            det_bboxes (Tensor): shape (n, 4/5)
            det_labels (Tensor): shape (n, )
            img_shape (Tensor): shape (3, )
            rcnn_test_cfg (dict): rcnn testing config
            ori_shape: original image size

        Returns:
            list[list]: encoded masks
        """
        if isinstance(mask_pred, torch.Tensor):
            mask_pred = mask_pred.sigmoid()
        else:
            mask_pred = det_bboxes.new_tensor(mask_pred)

        device = mask_pred.device
        cls_segms = [[] for _ in range(self.num_classes)
                     ]  # BG is not included in num_classes
        bboxes = det_bboxes[:, :4]
        labels = det_labels

        if rescale:
            img_h, img_w = ori_shape[:2]
        else:
            if isinstance(scale_factor, float):
                img_h = np.round(ori_shape[0] * scale_factor).astype(np.int32)
                img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32)
            else:
                w_scale, h_scale = scale_factor[0], scale_factor[1]
                img_h = np.round(ori_shape[0] * h_scale.item()).astype(
                    np.int32)
                img_w = np.round(ori_shape[1] * w_scale.item()).astype(
                    np.int32)
            scale_factor = 1.0

        if not isinstance(scale_factor, (float, torch.Tensor)):
            scale_factor = bboxes.new_tensor(scale_factor)
        bboxes = bboxes / scale_factor

        if torch.onnx.is_in_onnx_export():
            # TODO: Remove after F.grid_sample is supported.
            from torchvision.models.detection.roi_heads \
                import paste_masks_in_image
            masks = paste_masks_in_image(mask_pred, bboxes, ori_shape[:2])
            thr = rcnn_test_cfg.get('mask_thr_binary', 0)
            if thr > 0:
                masks = masks >= thr
            return masks

        N = len(mask_pred)
        # The actual implementation split the input into chunks,
        # and paste them chunk by chunk.
        if device.type == 'cpu':
            # CPU is most efficient when they are pasted one by one with
            # skip_empty=True, so that it performs minimal number of
            # operations.
            num_chunks = N
        else:
            # GPU benefits from parallelism for larger chunks,
            # but may have memory issue
            num_chunks = int(
                np.ceil(N * img_h * img_w * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
            assert (num_chunks <=
                    N), 'Default GPU_MEM_LIMIT is too small; try increasing it'
        chunks = torch.chunk(torch.arange(N, device=device), num_chunks)

        threshold = rcnn_test_cfg.mask_thr_binary
        im_mask = torch.zeros(
            N,
            img_h,
            img_w,
            device=device,
            dtype=torch.bool if threshold >= 0 else torch.uint8)

        if not self.class_agnostic:
            mask_pred = mask_pred[range(N), labels][:, None]

        for inds in chunks:
            masks_chunk, spatial_inds = _do_paste_mask(
                mask_pred[inds],
                bboxes[inds],
                img_h,
                img_w,
                skip_empty=device.type == 'cpu')

            if threshold >= 0:
                masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
            else:
                # for visualization and debugging
                masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)

            im_mask[(inds, ) + spatial_inds] = masks_chunk

        for i in range(N):
            cls_segms[labels[i]].append(im_mask[i].detach().cpu().numpy())
        return cls_segms


def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True):
    """Paste instance masks acoording to boxes.

    This implementation is modified from
    https://github.com/facebookresearch/detectron2/

    Args:
        masks (Tensor): N, 1, H, W
        boxes (Tensor): N, 4
        img_h (int): Height of the image to be pasted.
        img_w (int): Width of the image to be pasted.
        skip_empty (bool): Only paste masks within the region that
            tightly bound all boxes, and returns the results this region only.
            An important optimization for CPU.

    Returns:
        tuple: (Tensor, tuple). The first item is mask tensor, the second one
            is the slice object.
        If skip_empty == False, the whole image will be pasted. It will
            return a mask of shape (N, img_h, img_w) and an empty tuple.
        If skip_empty == True, only area around the mask will be pasted.
+ A mask of shape (N, h', w') and its start and end coordinates + in the original image will be returned. + """ + # On GPU, paste all masks together (up to chunk size) + # by using the entire image to sample the masks + # Compared to pasting them one by one, + # this has more operations but is faster on COCO-scale dataset. + device = masks.device + if skip_empty: + x0_int, y0_int = torch.clamp( + boxes.min(dim=0).values.floor()[:2] - 1, + min=0).to(dtype=torch.int32) + x1_int = torch.clamp( + boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32) + y1_int = torch.clamp( + boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32) + else: + x0_int, y0_int = 0, 0 + x1_int, y1_int = img_w, img_h + x0, y0, x1, y1 = torch.split(boxes, 1, dim=1) # each is Nx1 + + N = masks.shape[0] + + img_y = torch.arange( + y0_int, y1_int, device=device, dtype=torch.float32) + 0.5 + img_x = torch.arange( + x0_int, x1_int, device=device, dtype=torch.float32) + 0.5 + img_y = (img_y - y0) / (y1 - y0) * 2 - 1 + img_x = (img_x - x0) / (x1 - x0) * 2 - 1 + # img_x, img_y have shapes (N, w), (N, h) + if torch.isinf(img_x).any(): + inds = torch.where(torch.isinf(img_x)) + img_x[inds] = 0 + if torch.isinf(img_y).any(): + inds = torch.where(torch.isinf(img_y)) + img_y[inds] = 0 + + gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1)) + gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1)) + grid = torch.stack([gx, gy], dim=3) + + if torch.onnx.is_in_onnx_export(): + raise RuntimeError( + 'Exporting F.grid_sample from Pytorch to ONNX is not supported.') + img_masks = F.grid_sample( + masks.to(dtype=torch.float32), grid, align_corners=False) + + if skip_empty: + return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int)) + else: + return img_masks[:, 0], () diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py new file mode 
100755
index 0000000000000000000000000000000000000000..2aa6033eec17a30aeb68c0fdd218d8f0d41157e8
--- /dev/null
+++ b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py
@@ -0,0 +1,107 @@
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule, kaiming_init
from mmcv.runner import auto_fp16, force_fp32

from mmdet.models.builder import HEADS


@HEADS.register_module()
class FusedSemanticHead(nn.Module):
    r"""Multi-level fused semantic segmentation head.

    .. code-block:: none

        in_1 -> 1x1 conv ---
                            |
        in_2 -> 1x1 conv -- |
                           ||
        in_3 -> 1x1 conv - ||
                          |||                  /-> 1x1 conv (mask prediction)
        in_4 -> 1x1 conv -----> 3x3 convs (*4)
                            |                  \-> 1x1 conv (feature)
        in_5 -> 1x1 conv ---
    """  # noqa: W605

    def __init__(self,
                 num_ins,
                 fusion_level,
                 num_convs=4,
                 in_channels=256,
                 conv_out_channels=256,
                 num_classes=183,
                 ignore_label=255,
                 loss_weight=0.2,
                 conv_cfg=None,
                 norm_cfg=None):
        super(FusedSemanticHead, self).__init__()
        self.num_ins = num_ins
        self.fusion_level = fusion_level
        self.num_convs = num_convs
        self.in_channels = in_channels
        self.conv_out_channels = conv_out_channels
        self.num_classes = num_classes
        self.ignore_label = ignore_label
        self.loss_weight = loss_weight
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.fp16_enabled = False

        # one 1x1 lateral conv per input level
        self.lateral_convs = nn.ModuleList()
        for i in range(self.num_ins):
            self.lateral_convs.append(
                ConvModule(
                    self.in_channels,
                    self.in_channels,
                    1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    inplace=False))

        self.convs = nn.ModuleList()
        for i in range(self.num_convs):
            in_channels = self.in_channels if i == 0 else conv_out_channels
            self.convs.append(
                ConvModule(
                    in_channels,
                    conv_out_channels,
                    3,
                    padding=1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg))
        self.conv_embedding = ConvModule(
            conv_out_channels,
            conv_out_channels,
            1,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg)
        self.conv_logits = nn.Conv2d(conv_out_channels, self.num_classes, 1)

        self.criterion = nn.CrossEntropyLoss(ignore_index=ignore_label)

    def init_weights(self):
        kaiming_init(self.conv_logits)

    @auto_fp16()
    def forward(self, feats):
        """Fuse all levels at ``fusion_level`` resolution and return
        (mask_pred, embedding feature)."""
        x = self.lateral_convs[self.fusion_level](feats[self.fusion_level])
        fused_size = tuple(x.shape[-2:])
        for i, feat in enumerate(feats):
            if i != self.fusion_level:
                feat = F.interpolate(
                    feat, size=fused_size, mode='bilinear', align_corners=True)
                x += self.lateral_convs[i](feat)

        for i in range(self.num_convs):
            x = self.convs[i](x)

        mask_pred = self.conv_logits(x)
        x = self.conv_embedding(x)
        return mask_pred, x

    @force_fp32(apply_to=('mask_pred', ))
    def loss(self, mask_pred, labels):
        """Weighted cross-entropy loss against the semantic label map."""
        labels = labels.squeeze(1).long()
        loss_semantic_seg = self.criterion(mask_pred, labels)
        loss_semantic_seg *= self.loss_weight
        return loss_semantic_seg
diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/grid_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/grid_head.py
new file mode 100755
index 0000000000000000000000000000000000000000..83058cbdda934ebfc3a76088e1820848ac01b78b
--- /dev/null
+++ b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/grid_head.py
@@ -0,0 +1,359 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule, kaiming_init, normal_init

from mmdet.models.builder import HEADS, build_loss


@HEADS.register_module()
class GridHead(nn.Module):
    """Grid head of Grid R-CNN: predicts grid-point heatmaps for box
    localization instead of direct regression."""

    def __init__(self,
                 grid_points=9,
                 num_convs=8,
                 roi_feat_size=14,
                 in_channels=256,
                 conv_kernel_size=3,
                 point_feat_channels=64,
                 deconv_kernel_size=4,
                 class_agnostic=False,
                 loss_grid=dict(
                     type='CrossEntropyLoss', use_sigmoid=True,
                     loss_weight=15),
                 conv_cfg=None,
                 norm_cfg=dict(type='GN', num_groups=36)):
        super(GridHead, self).__init__()
        self.grid_points = grid_points
        self.num_convs = num_convs
        self.roi_feat_size = roi_feat_size
        self.in_channels = in_channels
        self.conv_kernel_size = conv_kernel_size
        self.point_feat_channels = point_feat_channels
        # each grid point owns point_feat_channels channels
        self.conv_out_channels = self.point_feat_channels * self.grid_points
        self.class_agnostic = class_agnostic
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        if isinstance(norm_cfg, dict) and norm_cfg['type'] == 'GN':
            assert self.conv_out_channels % norm_cfg['num_groups'] == 0

        assert self.grid_points >= 4
        self.grid_size = int(np.sqrt(self.grid_points))
        if self.grid_size * self.grid_size != self.grid_points:
            raise ValueError('grid_points must be a square number')

        # the predicted heatmap is half of whole_map_size
        if not isinstance(self.roi_feat_size, int):
            raise ValueError('Only square RoIs are supporeted in Grid R-CNN')
        self.whole_map_size = self.roi_feat_size * 4

        # compute point-wise sub-regions
        self.sub_regions = self.calc_sub_regions()

        self.convs = []
        for i in range(self.num_convs):
            in_channels = (
                self.in_channels if i == 0 else self.conv_out_channels)
            stride = 2 if i == 0 else 1
            padding = (self.conv_kernel_size - 1) // 2
            self.convs.append(
                ConvModule(
                    in_channels,
                    self.conv_out_channels,
                    self.conv_kernel_size,
                    stride=stride,
                    padding=padding,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    bias=True))
        self.convs = nn.Sequential(*self.convs)

        # grouped deconvs keep each grid point's channels independent
        self.deconv1 = nn.ConvTranspose2d(
            self.conv_out_channels,
            self.conv_out_channels,
            kernel_size=deconv_kernel_size,
            stride=2,
            padding=(deconv_kernel_size - 2) // 2,
            groups=grid_points)
        self.norm1 = nn.GroupNorm(grid_points, self.conv_out_channels)
        self.deconv2 = nn.ConvTranspose2d(
            self.conv_out_channels,
            grid_points,
            kernel_size=deconv_kernel_size,
            stride=2,
            padding=(deconv_kernel_size - 2) // 2,
            groups=grid_points)

        # find the 4-neighbor of each grid point
        self.neighbor_points = []
        grid_size = self.grid_size
        for i in range(grid_size):  # i-th column
            for j in range(grid_size):  # j-th row
                neighbors = []
                if i > 0:  # left: (i - 1, j)
                    neighbors.append((i - 1) * grid_size + j)
                if j > 0:  # up: (i, j - 1)
                    neighbors.append(i * grid_size + j - 1)
                if j < grid_size - 1:  # down: (i, j + 1)
                    neighbors.append(i * grid_size + j + 1)
                if i < grid_size - 1:  # right: (i + 1, j)
                    neighbors.append((i + 1) * grid_size + j)
                self.neighbor_points.append(tuple(neighbors))
        # total edges in the grid
        self.num_edges = sum([len(p) for p in self.neighbor_points])

        self.forder_trans = nn.ModuleList()  # first-order feature transition
        self.sorder_trans = nn.ModuleList()  # second-order feature transition
        for neighbors in self.neighbor_points:
            fo_trans = nn.ModuleList()
            so_trans = nn.ModuleList()
            for _ in range(len(neighbors)):
                # each transition module consists of a 5x5 depth-wise conv and
                # 1x1 conv.
                fo_trans.append(
                    nn.Sequential(
                        nn.Conv2d(
                            self.point_feat_channels,
                            self.point_feat_channels,
                            5,
                            stride=1,
                            padding=2,
                            groups=self.point_feat_channels),
                        nn.Conv2d(self.point_feat_channels,
                                  self.point_feat_channels, 1)))
                so_trans.append(
                    nn.Sequential(
                        nn.Conv2d(
                            self.point_feat_channels,
                            self.point_feat_channels,
                            5,
                            1,
                            2,
                            groups=self.point_feat_channels),
                        nn.Conv2d(self.point_feat_channels,
                                  self.point_feat_channels, 1)))
            self.forder_trans.append(fo_trans)
            self.sorder_trans.append(so_trans)

        self.loss_grid = build_loss(loss_grid)

    def init_weights(self):
        """Kaiming-init convs/linears, small-normal deconvs, and bias the
        final logits toward the background prior."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                # TODO: compare mode = "fan_in" or "fan_out"
                kaiming_init(m)
        for m in self.modules():
            if isinstance(m, nn.ConvTranspose2d):
                normal_init(m, std=0.001)
        nn.init.constant_(self.deconv2.bias, -np.log(0.99 / 0.01))

    def forward(self, x):
        """Return fused and unfused grid heatmaps as a dict."""
        assert x.shape[-1] == x.shape[-2] == self.roi_feat_size
        # RoI feature transformation, downsample 2x
        x = self.convs(x)

        c = self.point_feat_channels
        # first-order fusion
        x_fo = [None for _ in range(self.grid_points)]
        for i, points in enumerate(self.neighbor_points):
            x_fo[i] = x[:, i * c:(i + 1) * c]
            for j, point_idx in enumerate(points):
                x_fo[i] = x_fo[i] + self.forder_trans[i][j](
                    x[:, point_idx * c:(point_idx + 1) * c])

        # second-order fusion
        x_so = [None for _ in range(self.grid_points)]
        for i, points in enumerate(self.neighbor_points):
            x_so[i] = x[:, i * c:(i + 1) * c]
            for j, point_idx in enumerate(points):
                x_so[i] = x_so[i] + self.sorder_trans[i][j](x_fo[point_idx])

        # predicted heatmap with fused features
        x2 = torch.cat(x_so, dim=1)
        x2 = self.deconv1(x2)
        x2 = F.relu(self.norm1(x2), inplace=True)
        heatmap = self.deconv2(x2)

        # predicted heatmap with original features (applicable during training)
        if self.training:
            x1 = x
            x1 = self.deconv1(x1)
            x1 = F.relu(self.norm1(x1), inplace=True)
            heatmap_unfused = self.deconv2(x1)
        else:
            heatmap_unfused = heatmap

        return dict(fused=heatmap, unfused=heatmap_unfused)

    def calc_sub_regions(self):
        """Compute point specific representation regions.

        See Grid R-CNN Plus (https://arxiv.org/abs/1906.05688) for details.
        """
        # to make it consistent with the original implementation, half_size
        # is computed as 2 * quarter_size, which is smaller
        half_size = self.whole_map_size // 4 * 2
        sub_regions = []
        for i in range(self.grid_points):
            x_idx = i // self.grid_size
            y_idx = i % self.grid_size
            if x_idx == 0:
                sub_x1 = 0
            elif x_idx == self.grid_size - 1:
                sub_x1 = half_size
            else:
                ratio = x_idx / (self.grid_size - 1) - 0.25
                sub_x1 = max(int(ratio * self.whole_map_size), 0)

            if y_idx == 0:
                sub_y1 = 0
            elif y_idx == self.grid_size - 1:
                sub_y1 = half_size
            else:
                ratio = y_idx / (self.grid_size - 1) - 0.25
                sub_y1 = max(int(ratio * self.whole_map_size), 0)
            sub_regions.append(
                (sub_x1, sub_y1, sub_x1 + half_size, sub_y1 + half_size))
        return sub_regions

    def get_targets(self, sampling_results, rcnn_train_cfg):
        """Build per-grid-point heatmap targets for positive RoIs (built on
        CPU, moved to the sampling device at the end)."""
        # mix all samples (across images) together.
        pos_bboxes = torch.cat([res.pos_bboxes for res in sampling_results],
                               dim=0).cpu()
        pos_gt_bboxes = torch.cat(
            [res.pos_gt_bboxes for res in sampling_results], dim=0).cpu()
        assert pos_bboxes.shape == pos_gt_bboxes.shape

        # expand pos_bboxes to 2x of original size
        x1 = pos_bboxes[:, 0] - (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2
        y1 = pos_bboxes[:, 1] - (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2
        x2 = pos_bboxes[:, 2] + (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2
        y2 = pos_bboxes[:, 3] + (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2
        pos_bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
        pos_bbox_ws = (pos_bboxes[:, 2] - pos_bboxes[:, 0]).unsqueeze(-1)
        pos_bbox_hs = (pos_bboxes[:, 3] - pos_bboxes[:, 1]).unsqueeze(-1)

        num_rois = pos_bboxes.shape[0]
        map_size = self.whole_map_size
        # this is not the final target shape
        targets = torch.zeros((num_rois, self.grid_points, map_size, map_size),
                              dtype=torch.float)

        # pre-compute interpolation factors for all grid points.
        # the first item is the factor of x-dim, and the second is y-dim.
        # for a 9-point grid, factors are like (1, 0), (0.5, 0.5), (0, 1)
        factors = []
        for j in range(self.grid_points):
            x_idx = j // self.grid_size
            y_idx = j % self.grid_size
            factors.append((1 - x_idx / (self.grid_size - 1),
                            1 - y_idx / (self.grid_size - 1)))

        radius = rcnn_train_cfg.pos_radius
        radius2 = radius**2
        for i in range(num_rois):
            # ignore small bboxes
            if (pos_bbox_ws[i] <= self.grid_size
                    or pos_bbox_hs[i] <= self.grid_size):
                continue
            # for each grid point, mark a small circle as positive
            for j in range(self.grid_points):
                factor_x, factor_y = factors[j]
                gridpoint_x = factor_x * pos_gt_bboxes[i, 0] + (
                    1 - factor_x) * pos_gt_bboxes[i, 2]
                gridpoint_y = factor_y * pos_gt_bboxes[i, 1] + (
                    1 - factor_y) * pos_gt_bboxes[i, 3]

                cx = int((gridpoint_x - pos_bboxes[i, 0]) / pos_bbox_ws[i] *
                         map_size)
                cy = int((gridpoint_y - pos_bboxes[i, 1]) / pos_bbox_hs[i] *
                         map_size)

                for x in range(cx - radius, cx + radius + 1):
                    for y in range(cy - radius, cy + radius + 1):
                        if x >= 0 and x < map_size and y >= 0 and y < map_size:
                            if (x - cx)**2 + (y - cy)**2 <= radius2:
                                targets[i, j, y, x] = 1
        # reduce the target heatmap size by a half
        # proposed in Grid R-CNN Plus (https://arxiv.org/abs/1906.05688).
        sub_targets = []
        for i in range(self.grid_points):
            sub_x1, sub_y1, sub_x2, sub_y2 = self.sub_regions[i]
            sub_targets.append(targets[:, [i], sub_y1:sub_y2, sub_x1:sub_x2])
        sub_targets = torch.cat(sub_targets, dim=1)
        sub_targets = sub_targets.to(sampling_results[0].pos_bboxes.device)
        return sub_targets

    def loss(self, grid_pred, grid_targets):
        """Sum of the fused and unfused heatmap losses."""
        loss_fused = self.loss_grid(grid_pred['fused'], grid_targets)
        loss_unfused = self.loss_grid(grid_pred['unfused'], grid_targets)
        loss_grid = loss_fused + loss_unfused
        return dict(loss_grid=loss_grid)

    def get_bboxes(self, det_bboxes, grid_pred, img_metas):
        """Decode refined boxes from predicted grid-point heatmaps
        (runs on CPU)."""
        # TODO: refactoring
        assert det_bboxes.shape[0] == grid_pred.shape[0]
        det_bboxes = det_bboxes.cpu()
        cls_scores = det_bboxes[:, [4]]
        det_bboxes = det_bboxes[:, :4]
        grid_pred = grid_pred.sigmoid().cpu()

        R, c, h, w = grid_pred.shape
        half_size = self.whole_map_size // 4 * 2
        assert h == w == half_size
        assert c == self.grid_points

        # find the point with max scores in the half-sized heatmap
        grid_pred = grid_pred.view(R * c, h * w)
        pred_scores, pred_position = grid_pred.max(dim=1)
        xs = pred_position % w
        ys = pred_position // w

        # get the position in the whole heatmap instead of half-sized heatmap
        for i in range(self.grid_points):
            xs[i::self.grid_points] += self.sub_regions[i][0]
            ys[i::self.grid_points] += self.sub_regions[i][1]

        # reshape to (num_rois, grid_points)
        pred_scores, xs, ys = tuple(
            map(lambda x: x.view(R, c), [pred_scores, xs, ys]))

        # get expanded pos_bboxes
        widths = (det_bboxes[:, 2] - det_bboxes[:, 0]).unsqueeze(-1)
        heights = (det_bboxes[:, 3] - det_bboxes[:, 1]).unsqueeze(-1)
        x1 = (det_bboxes[:, 0, None] - widths / 2)
        y1 = (det_bboxes[:, 1, None] - heights / 2)
        # map the grid point to the absolute coordinates
        abs_xs = (xs.float() + 0.5) / w * widths + x1
        abs_ys = (ys.float() + 0.5) / h * heights + y1

        # get the grid points indices that fall on the bbox boundaries
        x1_inds =
[i for i in range(self.grid_size)] + y1_inds = [i * self.grid_size for i in range(self.grid_size)] + x2_inds = [ + self.grid_points - self.grid_size + i + for i in range(self.grid_size) + ] + y2_inds = [(i + 1) * self.grid_size - 1 for i in range(self.grid_size)] + + # voting of all grid points on some boundary + bboxes_x1 = (abs_xs[:, x1_inds] * pred_scores[:, x1_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, x1_inds].sum(dim=1, keepdim=True)) + bboxes_y1 = (abs_ys[:, y1_inds] * pred_scores[:, y1_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, y1_inds].sum(dim=1, keepdim=True)) + bboxes_x2 = (abs_xs[:, x2_inds] * pred_scores[:, x2_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, x2_inds].sum(dim=1, keepdim=True)) + bboxes_y2 = (abs_ys[:, y2_inds] * pred_scores[:, y2_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, y2_inds].sum(dim=1, keepdim=True)) + + bbox_res = torch.cat( + [bboxes_x1, bboxes_y1, bboxes_x2, bboxes_y2, cls_scores], dim=1) + bbox_res[:, [0, 2]].clamp_(min=0, max=img_metas[0]['img_shape'][1]) + bbox_res[:, [1, 3]].clamp_(min=0, max=img_metas[0]['img_shape'][0]) + + return bbox_res diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/htc_mask_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/htc_mask_head.py new file mode 100755 index 0000000000000000000000000000000000000000..330b778ebad8d48d55d09ddd42baa70ec10ae463 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/htc_mask_head.py @@ -0,0 +1,43 @@ +from mmcv.cnn import ConvModule + +from mmdet.models.builder import HEADS +from .fcn_mask_head import FCNMaskHead + + +@HEADS.register_module() +class HTCMaskHead(FCNMaskHead): + + def __init__(self, with_conv_res=True, *args, **kwargs): + super(HTCMaskHead, self).__init__(*args, **kwargs) + self.with_conv_res = with_conv_res + if self.with_conv_res: + self.conv_res = ConvModule( + self.conv_out_channels, + self.conv_out_channels, + 1, + 
conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + def init_weights(self): + super(HTCMaskHead, self).init_weights() + if self.with_conv_res: + self.conv_res.init_weights() + + def forward(self, x, res_feat=None, return_logits=True, return_feat=True): + if res_feat is not None: + assert self.with_conv_res + res_feat = self.conv_res(res_feat) + x = x + res_feat + for conv in self.convs: + x = conv(x) + res_feat = x + outs = [] + if return_logits: + x = self.upsample(x) + if self.upsample_method == 'deconv': + x = self.relu(x) + mask_pred = self.conv_logits(x) + outs.append(mask_pred) + if return_feat: + outs.append(res_feat) + return outs if len(outs) > 1 else outs[0] diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/mask_point_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/mask_point_head.py new file mode 100755 index 0000000000000000000000000000000000000000..f38a5c9d7595d441776d6b38070ed75e42911fce --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/mask_point_head.py @@ -0,0 +1,300 @@ +# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py # noqa + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, normal_init +from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point + +from mmdet.models.builder import HEADS, build_loss + + +@HEADS.register_module() +class MaskPointHead(nn.Module): + """A mask point head use in PointRend. + + ``MaskPointHead`` use shared multi-layer perceptron (equivalent to + nn.Conv1d) to predict the logit of input points. The fine-grained feature + and coarse feature will be concatenate together for predication. + + Args: + num_fcs (int): Number of fc layers in the head. Default: 3. + in_channels (int): Number of input channels. Default: 256. + fc_channels (int): Number of fc channels. Default: 256. + num_classes (int): Number of classes for logits. Default: 80. 
+ class_agnostic (bool): Whether use class agnostic classification. + If so, the output channels of logits will be 1. Default: False. + coarse_pred_each_layer (bool): Whether concatenate coarse feature with + the output of each fc layer. Default: True. + conv_cfg (dict | None): Dictionary to construct and config conv layer. + Default: dict(type='Conv1d')) + norm_cfg (dict | None): Dictionary to construct and config norm layer. + Default: None. + loss_point (dict): Dictionary to construct and config loss layer of + point head. Default: dict(type='CrossEntropyLoss', use_mask=True, + loss_weight=1.0). + """ + + def __init__(self, + num_classes, + num_fcs=3, + in_channels=256, + fc_channels=256, + class_agnostic=False, + coarse_pred_each_layer=True, + conv_cfg=dict(type='Conv1d'), + norm_cfg=None, + act_cfg=dict(type='ReLU'), + loss_point=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)): + super().__init__() + self.num_fcs = num_fcs + self.in_channels = in_channels + self.fc_channles = fc_channels + self.num_classes = num_classes + self.class_agnostic = class_agnostic + self.coarse_pred_each_layer = coarse_pred_each_layer + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.loss_point = build_loss(loss_point) + + fc_in_channels = in_channels + num_classes + self.fcs = nn.ModuleList() + for _ in range(num_fcs): + fc = ConvModule( + fc_in_channels, + fc_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.fcs.append(fc) + fc_in_channels = fc_channels + fc_in_channels += num_classes if self.coarse_pred_each_layer else 0 + + out_channels = 1 if self.class_agnostic else self.num_classes + self.fc_logits = nn.Conv1d( + fc_in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def init_weights(self): + """Initialize last classification layer of MaskPointHead, conv layers + are already initialized by ConvModule.""" + normal_init(self.fc_logits, std=0.001) + + def 
forward(self, fine_grained_feats, coarse_feats): + """Classify each point base on fine grained and coarse feats. + + Args: + fine_grained_feats (Tensor): Fine grained feature sampled from FPN, + shape (num_rois, in_channels, num_points). + coarse_feats (Tensor): Coarse feature sampled from CoarseMaskHead, + shape (num_rois, num_classes, num_points). + + Returns: + Tensor: Point classification results, + shape (num_rois, num_class, num_points). + """ + + x = torch.cat([fine_grained_feats, coarse_feats], dim=1) + for fc in self.fcs: + x = fc(x) + if self.coarse_pred_each_layer: + x = torch.cat((x, coarse_feats), dim=1) + return self.fc_logits(x) + + def get_targets(self, rois, rel_roi_points, sampling_results, gt_masks, + cfg): + """Get training targets of MaskPointHead for all images. + + Args: + rois (Tensor): Region of Interest, shape (num_rois, 5). + rel_roi_points: Points coordinates relative to RoI, shape + (num_rois, num_points, 2). + sampling_results (:obj:`SamplingResult`): Sampling result after + sampling and assignment. + gt_masks (Tensor) : Ground truth segmentation masks of + corresponding boxes, shape (num_rois, height, width). + cfg (dict): Training cfg. + + Returns: + Tensor: Point target, shape (num_rois, num_points). 
+ """ + + num_imgs = len(sampling_results) + rois_list = [] + rel_roi_points_list = [] + for batch_ind in range(num_imgs): + inds = (rois[:, 0] == batch_ind) + rois_list.append(rois[inds]) + rel_roi_points_list.append(rel_roi_points[inds]) + pos_assigned_gt_inds_list = [ + res.pos_assigned_gt_inds for res in sampling_results + ] + cfg_list = [cfg for _ in range(num_imgs)] + + point_targets = map(self._get_target_single, rois_list, + rel_roi_points_list, pos_assigned_gt_inds_list, + gt_masks, cfg_list) + point_targets = list(point_targets) + + if len(point_targets) > 0: + point_targets = torch.cat(point_targets) + + return point_targets + + def _get_target_single(self, rois, rel_roi_points, pos_assigned_gt_inds, + gt_masks, cfg): + """Get training target of MaskPointHead for each image.""" + num_pos = rois.size(0) + num_points = cfg.num_points + if num_pos > 0: + gt_masks_th = ( + gt_masks.to_tensor(rois.dtype, rois.device).index_select( + 0, pos_assigned_gt_inds)) + gt_masks_th = gt_masks_th.unsqueeze(1) + rel_img_points = rel_roi_point_to_rel_img_point( + rois, rel_roi_points, gt_masks_th.shape[2:]) + point_targets = point_sample(gt_masks_th, + rel_img_points).squeeze(1) + else: + point_targets = rois.new_zeros((0, num_points)) + return point_targets + + def loss(self, point_pred, point_targets, labels): + """Calculate loss for MaskPointHead. + + Args: + point_pred (Tensor): Point predication result, shape + (num_rois, num_classes, num_points). + point_targets (Tensor): Point targets, shape (num_roi, num_points). 
+ labels (Tensor): Class label of corresponding boxes, + shape (num_rois, ) + + Returns: + dict[str, Tensor]: a dictionary of point loss components + """ + + loss = dict() + if self.class_agnostic: + loss_point = self.loss_point(point_pred, point_targets, + torch.zeros_like(labels)) + else: + loss_point = self.loss_point(point_pred, point_targets, labels) + loss['loss_point'] = loss_point + return loss + + def _get_uncertainty(self, mask_pred, labels): + """Estimate uncertainty based on pred logits. + + We estimate uncertainty as L1 distance between 0.0 and the logits + prediction in 'mask_pred' for the foreground class in `classes`. + + Args: + mask_pred (Tensor): mask predication logits, shape (num_rois, + num_classes, mask_height, mask_width). + + labels (list[Tensor]): Either predicted or ground truth label for + each predicted mask, of length num_rois. + + Returns: + scores (Tensor): Uncertainty scores with the most uncertain + locations having the highest uncertainty score, + shape (num_rois, 1, mask_height, mask_width) + """ + if mask_pred.shape[1] == 1: + gt_class_logits = mask_pred.clone() + else: + inds = torch.arange(mask_pred.shape[0], device=mask_pred.device) + gt_class_logits = mask_pred[inds, labels].unsqueeze(1) + return -torch.abs(gt_class_logits) + + def get_roi_rel_points_train(self, mask_pred, labels, cfg): + """Get ``num_points`` most uncertain points with random points during + train. + + Sample points in [0, 1] x [0, 1] coordinate space based on their + uncertainty. The uncertainties are calculated for each point using + '_get_uncertainty()' function that takes point's logit prediction as + input. + + Args: + mask_pred (Tensor): A tensor of shape (num_rois, num_classes, + mask_height, mask_width) for class-specific or class-agnostic + prediction. + labels (list): The ground truth class for each instance. + cfg (dict): Training config of point head. 
+ + Returns: + point_coords (Tensor): A tensor of shape (num_rois, num_points, 2) + that contains the coordinates sampled points. + """ + num_points = cfg.num_points + oversample_ratio = cfg.oversample_ratio + importance_sample_ratio = cfg.importance_sample_ratio + assert oversample_ratio >= 1 + assert 0 <= importance_sample_ratio <= 1 + batch_size = mask_pred.shape[0] + num_sampled = int(num_points * oversample_ratio) + point_coords = torch.rand( + batch_size, num_sampled, 2, device=mask_pred.device) + point_logits = point_sample(mask_pred, point_coords) + # It is crucial to calculate uncertainty based on the sampled + # prediction value for the points. Calculating uncertainties of the + # coarse predictions first and sampling them for points leads to + # incorrect results. To illustrate this: assume uncertainty func( + # logits)=-abs(logits), a sampled point between two coarse + # predictions with -1 and 1 logits has 0 logits, and therefore 0 + # uncertainty value. However, if we calculate uncertainties for the + # coarse predictions first, both will have -1 uncertainty, + # and sampled point will get -1 uncertainty. + point_uncertainties = self._get_uncertainty(point_logits, labels) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange( + batch_size, dtype=torch.long, device=mask_pred.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( + batch_size, num_uncertain_points, 2) + if num_random_points > 0: + rand_roi_coords = torch.rand( + batch_size, num_random_points, 2, device=mask_pred.device) + point_coords = torch.cat((point_coords, rand_roi_coords), dim=1) + return point_coords + + def get_roi_rel_points_test(self, mask_pred, pred_label, cfg): + """Get ``num_points`` most uncertain points during test. 
+ + Args: + mask_pred (Tensor): A tensor of shape (num_rois, num_classes, + mask_height, mask_width) for class-specific or class-agnostic + prediction. + pred_label (list): The predication class for each instance. + cfg (dict): Testing config of point head. + + Returns: + point_indices (Tensor): A tensor of shape (num_rois, num_points) + that contains indices from [0, mask_height x mask_width) of the + most uncertain points. + point_coords (Tensor): A tensor of shape (num_rois, num_points, 2) + that contains [0, 1] x [0, 1] normalized coordinates of the + most uncertain points from the [mask_height, mask_width] grid . + """ + num_points = cfg.subdivision_num_points + uncertainty_map = self._get_uncertainty(mask_pred, pred_label) + num_rois, _, mask_height, mask_width = uncertainty_map.shape + h_step = 1.0 / mask_height + w_step = 1.0 / mask_width + + uncertainty_map = uncertainty_map.view(num_rois, + mask_height * mask_width) + num_points = min(mask_height * mask_width, num_points) + point_indices = uncertainty_map.topk(num_points, dim=1)[1] + point_coords = uncertainty_map.new_zeros(num_rois, num_points, 2) + point_coords[:, :, 0] = w_step / 2.0 + (point_indices % + mask_width).float() * w_step + point_coords[:, :, 1] = h_step / 2.0 + (point_indices // + mask_width).float() * h_step + return point_indices, point_coords diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/maskiou_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/maskiou_head.py new file mode 100755 index 0000000000000000000000000000000000000000..39bcd6a7dbdb089cd19cef811038e0b6a80ab89a --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_heads/maskiou_head.py @@ -0,0 +1,186 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import Conv2d, Linear, MaxPool2d, kaiming_init, normal_init +from mmcv.runner import force_fp32 +from torch.nn.modules.utils import _pair + +from mmdet.models.builder import HEADS, 
build_loss + + +@HEADS.register_module() +class MaskIoUHead(nn.Module): + """Mask IoU Head. + + This head predicts the IoU of predicted masks and corresponding gt masks. + """ + + def __init__(self, + num_convs=4, + num_fcs=2, + roi_feat_size=14, + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + num_classes=80, + loss_iou=dict(type='MSELoss', loss_weight=0.5)): + super(MaskIoUHead, self).__init__() + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.num_classes = num_classes + self.fp16_enabled = False + + self.convs = nn.ModuleList() + for i in range(num_convs): + if i == 0: + # concatenation of mask feature and mask prediction + in_channels = self.in_channels + 1 + else: + in_channels = self.conv_out_channels + stride = 2 if i == num_convs - 1 else 1 + self.convs.append( + Conv2d( + in_channels, + self.conv_out_channels, + 3, + stride=stride, + padding=1)) + + roi_feat_size = _pair(roi_feat_size) + pooled_area = (roi_feat_size[0] // 2) * (roi_feat_size[1] // 2) + self.fcs = nn.ModuleList() + for i in range(num_fcs): + in_channels = ( + self.conv_out_channels * + pooled_area if i == 0 else self.fc_out_channels) + self.fcs.append(Linear(in_channels, self.fc_out_channels)) + + self.fc_mask_iou = Linear(self.fc_out_channels, self.num_classes) + self.relu = nn.ReLU() + self.max_pool = MaxPool2d(2, 2) + self.loss_iou = build_loss(loss_iou) + + def init_weights(self): + for conv in self.convs: + kaiming_init(conv) + for fc in self.fcs: + kaiming_init( + fc, + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + distribution='uniform') + normal_init(self.fc_mask_iou, std=0.01) + + def forward(self, mask_feat, mask_pred): + mask_pred = mask_pred.sigmoid() + mask_pred_pooled = self.max_pool(mask_pred.unsqueeze(1)) + + x = torch.cat((mask_feat, mask_pred_pooled), 1) + + for conv in self.convs: + x = self.relu(conv(x)) + x = x.flatten(1) + for fc in self.fcs: + x = self.relu(fc(x)) 
+ mask_iou = self.fc_mask_iou(x) + return mask_iou + + @force_fp32(apply_to=('mask_iou_pred', )) + def loss(self, mask_iou_pred, mask_iou_targets): + pos_inds = mask_iou_targets > 0 + if pos_inds.sum() > 0: + loss_mask_iou = self.loss_iou(mask_iou_pred[pos_inds], + mask_iou_targets[pos_inds]) + else: + loss_mask_iou = mask_iou_pred.sum() * 0 + return dict(loss_mask_iou=loss_mask_iou) + + @force_fp32(apply_to=('mask_pred', )) + def get_targets(self, sampling_results, gt_masks, mask_pred, mask_targets, + rcnn_train_cfg): + """Compute target of mask IoU. + + Mask IoU target is the IoU of the predicted mask (inside a bbox) and + the gt mask of corresponding gt mask (the whole instance). + The intersection area is computed inside the bbox, and the gt mask area + is computed with two steps, firstly we compute the gt area inside the + bbox, then divide it by the area ratio of gt area inside the bbox and + the gt area of the whole instance. + + Args: + sampling_results (list[:obj:`SamplingResult`]): sampling results. + gt_masks (BitmapMask | PolygonMask): Gt masks (the whole instance) + of each image, with the same shape of the input image. + mask_pred (Tensor): Predicted masks of each positive proposal, + shape (num_pos, h, w). + mask_targets (Tensor): Gt mask of each positive proposal, + binary map of the shape (num_pos, h, w). + rcnn_train_cfg (dict): Training config for R-CNN part. + + Returns: + Tensor: mask iou target (length == num positive). 
+ """ + pos_proposals = [res.pos_bboxes for res in sampling_results] + pos_assigned_gt_inds = [ + res.pos_assigned_gt_inds for res in sampling_results + ] + + # compute the area ratio of gt areas inside the proposals and + # the whole instance + area_ratios = map(self._get_area_ratio, pos_proposals, + pos_assigned_gt_inds, gt_masks) + area_ratios = torch.cat(list(area_ratios)) + assert mask_targets.size(0) == area_ratios.size(0) + + mask_pred = (mask_pred > rcnn_train_cfg.mask_thr_binary).float() + mask_pred_areas = mask_pred.sum((-1, -2)) + + # mask_pred and mask_targets are binary maps + overlap_areas = (mask_pred * mask_targets).sum((-1, -2)) + + # compute the mask area of the whole instance + gt_full_areas = mask_targets.sum((-1, -2)) / (area_ratios + 1e-7) + + mask_iou_targets = overlap_areas / ( + mask_pred_areas + gt_full_areas - overlap_areas) + return mask_iou_targets + + def _get_area_ratio(self, pos_proposals, pos_assigned_gt_inds, gt_masks): + """Compute area ratio of the gt mask inside the proposal and the gt + mask of the corresponding instance.""" + num_pos = pos_proposals.size(0) + if num_pos > 0: + area_ratios = [] + proposals_np = pos_proposals.cpu().numpy() + pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() + # compute mask areas of gt instances (batch processing for speedup) + gt_instance_mask_area = gt_masks.areas + for i in range(num_pos): + gt_mask = gt_masks[pos_assigned_gt_inds[i]] + + # crop the gt mask inside the proposal + bbox = proposals_np[i, :].astype(np.int32) + gt_mask_in_proposal = gt_mask.crop(bbox) + + ratio = gt_mask_in_proposal.areas[0] / ( + gt_instance_mask_area[pos_assigned_gt_inds[i]] + 1e-7) + area_ratios.append(ratio) + area_ratios = torch.from_numpy(np.stack(area_ratios)).float().to( + pos_proposals.device) + else: + area_ratios = pos_proposals.new_zeros((0, )) + return area_ratios + + @force_fp32(apply_to=('mask_iou_pred', )) + def get_mask_scores(self, mask_iou_pred, det_bboxes, det_labels): + """Get the mask 
scores. + + mask_score = bbox_score * mask_iou + """ + inds = range(det_labels.size(0)) + mask_scores = mask_iou_pred[inds, det_labels] * det_bboxes[inds, -1] + mask_scores = mask_scores.cpu().numpy() + det_labels = det_labels.cpu().numpy() + return [mask_scores[det_labels == i] for i in range(self.num_classes)] diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/mask_scoring_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_scoring_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..c6e55c7752209cb5c15eab689ad9e8ac1fef1b66 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/mask_scoring_roi_head.py @@ -0,0 +1,122 @@ +import torch + +from mmdet.core import bbox2roi +from ..builder import HEADS, build_head +from .standard_roi_head import StandardRoIHead + + +@HEADS.register_module() +class MaskScoringRoIHead(StandardRoIHead): + """Mask Scoring RoIHead for Mask Scoring RCNN. + + https://arxiv.org/abs/1903.00241 + """ + + def __init__(self, mask_iou_head, **kwargs): + assert mask_iou_head is not None + super(MaskScoringRoIHead, self).__init__(**kwargs) + self.mask_iou_head = build_head(mask_iou_head) + + def init_weights(self, pretrained): + """Initialize the weights in head. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + super(MaskScoringRoIHead, self).init_weights(pretrained) + self.mask_iou_head.init_weights() + + def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks, + img_metas): + """Run forward function and calculate loss for Mask head in + training.""" + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + mask_results = super(MaskScoringRoIHead, + self)._mask_forward_train(x, sampling_results, + bbox_feats, gt_masks, + img_metas) + if mask_results['loss_mask'] is None: + return mask_results + + # mask iou head forward and loss + pos_mask_pred = mask_results['mask_pred'][ + range(mask_results['mask_pred'].size(0)), pos_labels] + mask_iou_pred = self.mask_iou_head(mask_results['mask_feats'], + pos_mask_pred) + pos_mask_iou_pred = mask_iou_pred[range(mask_iou_pred.size(0)), + pos_labels] + + mask_iou_targets = self.mask_iou_head.get_targets( + sampling_results, gt_masks, pos_mask_pred, + mask_results['mask_targets'], self.train_cfg) + loss_mask_iou = self.mask_iou_head.loss(pos_mask_iou_pred, + mask_iou_targets) + mask_results['loss_mask'].update(loss_mask_iou) + return mask_results + + def simple_test_mask(self, + x, + img_metas, + det_bboxes, + det_labels, + rescale=False): + """Obtain mask prediction without augmentation.""" + # image shapes of images in the batch + ori_shapes = tuple(meta['ori_shape'] for meta in img_metas) + scale_factors = tuple(meta['scale_factor'] for meta in img_metas) + + num_imgs = len(det_bboxes) + if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes): + num_classes = self.mask_head.num_classes + segm_results = [[[] for _ in range(num_classes)] + for _ in range(num_imgs)] + mask_scores = [[[] for _ in range(num_classes)] + for _ in range(num_imgs)] + else: + # if det_bboxes is rescaled to the original image size, we need to + # rescale it back to the testing scale to obtain RoIs. 
+ if rescale and not isinstance(scale_factors[0], float): + scale_factors = [ + torch.from_numpy(scale_factor).to(det_bboxes[0].device) + for scale_factor in scale_factors + ] + _bboxes = [ + det_bboxes[i][:, :4] * + scale_factors[i] if rescale else det_bboxes[i] + for i in range(num_imgs) + ] + mask_rois = bbox2roi(_bboxes) + mask_results = self._mask_forward(x, mask_rois) + concat_det_labels = torch.cat(det_labels) + # get mask scores with mask iou head + mask_feats = mask_results['mask_feats'] + mask_pred = mask_results['mask_pred'] + mask_iou_pred = self.mask_iou_head( + mask_feats, mask_pred[range(concat_det_labels.size(0)), + concat_det_labels]) + # split batch mask prediction back to each image + num_bboxes_per_img = tuple(len(_bbox) for _bbox in _bboxes) + mask_preds = mask_pred.split(num_bboxes_per_img, 0) + mask_iou_preds = mask_iou_pred.split(num_bboxes_per_img, 0) + + # apply mask post-processing to each image individually + segm_results = [] + mask_scores = [] + for i in range(num_imgs): + if det_bboxes[i].shape[0] == 0: + segm_results.append( + [[] for _ in range(self.mask_head.num_classes)]) + mask_scores.append( + [[] for _ in range(self.mask_head.num_classes)]) + else: + segm_result = self.mask_head.get_seg_masks( + mask_preds[i], _bboxes[i], det_labels[i], + self.test_cfg, ori_shapes[i], scale_factors[i], + rescale) + # get mask scores with mask iou head + mask_score = self.mask_iou_head.get_mask_scores( + mask_iou_preds[i], det_bboxes[i], det_labels[i]) + segm_results.append(segm_result) + mask_scores.append(mask_score) + return list(zip(segm_results, mask_scores)) diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/pisa_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/pisa_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..e01113629837eb9c065ba40cd4025899b7bd0172 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/pisa_roi_head.py @@ -0,0 +1,159 @@ +from mmdet.core 
import bbox2roi +from ..builder import HEADS +from ..losses.pisa_loss import carl_loss, isr_p +from .standard_roi_head import StandardRoIHead + + +@HEADS.register_module() +class PISARoIHead(StandardRoIHead): + r"""The RoI head for `Prime Sample Attention in Object Detection + `_.""" + + def forward_train(self, + x, + img_metas, + proposal_list, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + gt_masks=None): + """Forward function for training. + + Args: + x (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + proposals (list[Tensors]): List of region proposals. + gt_bboxes (list[Tensor]): Each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): Class indices corresponding to each box + gt_bboxes_ignore (list[Tensor], optional): Specify which bounding + boxes can be ignored when computing the loss. + gt_masks (None | Tensor) : True segmentation masks for each box + used if the architecture supports a segmentation task. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + # assign gts and sample proposals + if self.with_bbox or self.with_mask: + num_imgs = len(img_metas) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + sampling_results = [] + neg_label_weights = [] + for i in range(num_imgs): + assign_result = self.bbox_assigner.assign( + proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], + gt_labels[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + # neg label weight is obtained by sampling when using ISR-N + neg_label_weight = None + if isinstance(sampling_result, tuple): + sampling_result, neg_label_weight = sampling_result + sampling_results.append(sampling_result) + neg_label_weights.append(neg_label_weight) + + losses = dict() + # bbox head forward and loss + if self.with_bbox: + bbox_results = self._bbox_forward_train( + x, + sampling_results, + gt_bboxes, + gt_labels, + img_metas, + neg_label_weights=neg_label_weights) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self._mask_forward_train(x, sampling_results, + bbox_results['bbox_feats'], + gt_masks, img_metas) + losses.update(mask_results['loss_mask']) + + return losses + + def _bbox_forward(self, x, rois): + """Box forward function used in both training and testing.""" + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.shared_head(bbox_feats) + cls_score, bbox_pred = self.bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return bbox_results + + def _bbox_forward_train(self, + x, + sampling_results, + gt_bboxes, + gt_labels, + img_metas, + neg_label_weights=None): + 
"""Run forward function and calculate loss for box head in training.""" + rois = bbox2roi([res.bboxes for res in sampling_results]) + + bbox_results = self._bbox_forward(x, rois) + + bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes, + gt_labels, self.train_cfg) + + # neg_label_weights obtained by sampler is image-wise, mapping back to + # the corresponding location in label weights + if neg_label_weights[0] is not None: + label_weights = bbox_targets[1] + cur_num_rois = 0 + for i in range(len(sampling_results)): + num_pos = sampling_results[i].pos_inds.size(0) + num_neg = sampling_results[i].neg_inds.size(0) + label_weights[cur_num_rois + num_pos:cur_num_rois + num_pos + + num_neg] = neg_label_weights[i] + cur_num_rois += num_pos + num_neg + + cls_score = bbox_results['cls_score'] + bbox_pred = bbox_results['bbox_pred'] + + # Apply ISR-P + isr_cfg = self.train_cfg.get('isr', None) + if isr_cfg is not None: + bbox_targets = isr_p( + cls_score, + bbox_pred, + bbox_targets, + rois, + sampling_results, + self.bbox_head.loss_cls, + self.bbox_head.bbox_coder, + **isr_cfg, + num_class=self.bbox_head.num_classes) + loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, rois, + *bbox_targets) + + # Add CARL Loss + carl_cfg = self.train_cfg.get('carl', None) + if carl_cfg is not None: + loss_carl = carl_loss( + cls_score, + bbox_targets[0], + bbox_pred, + bbox_targets[2], + self.bbox_head.loss_bbox, + **carl_cfg, + num_class=self.bbox_head.num_classes) + loss_bbox.update(loss_carl) + + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/point_rend_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/point_rend_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..478cdf5bff6779e9291f94c543205289036ea2c6 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/point_rend_roi_head.py @@ -0,0 +1,218 @@ +# Modified from 
https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend # noqa + +import torch +import torch.nn.functional as F +from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point + +from mmdet.core import bbox2roi, bbox_mapping, merge_aug_masks +from .. import builder +from ..builder import HEADS +from .standard_roi_head import StandardRoIHead + + +@HEADS.register_module() +class PointRendRoIHead(StandardRoIHead): + """`PointRend `_.""" + + def __init__(self, point_head, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.with_bbox and self.with_mask + self.init_point_head(point_head) + + def init_point_head(self, point_head): + """Initialize ``point_head``""" + self.point_head = builder.build_head(point_head) + + def init_weights(self, pretrained): + """Initialize the weights in head. + + Args: + pretrained (str, optional): Path to pre-trained weights. + """ + super().init_weights(pretrained) + self.point_head.init_weights() + + def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks, + img_metas): + """Run forward function and calculate loss for mask head and point head + in training.""" + mask_results = super()._mask_forward_train(x, sampling_results, + bbox_feats, gt_masks, + img_metas) + if mask_results['loss_mask'] is not None: + loss_point = self._mask_point_forward_train( + x, sampling_results, mask_results['mask_pred'], gt_masks, + img_metas) + mask_results['loss_mask'].update(loss_point) + + return mask_results + + def _mask_point_forward_train(self, x, sampling_results, mask_pred, + gt_masks, img_metas): + """Run forward function and calculate loss for point head in + training.""" + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + rel_roi_points = self.point_head.get_roi_rel_points_train( + mask_pred, pos_labels, cfg=self.train_cfg) + rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, rois, 
rel_roi_points, img_metas) + coarse_point_feats = point_sample(mask_pred, rel_roi_points) + mask_point_pred = self.point_head(fine_grained_point_feats, + coarse_point_feats) + mask_point_target = self.point_head.get_targets( + rois, rel_roi_points, sampling_results, gt_masks, self.train_cfg) + loss_mask_point = self.point_head.loss(mask_point_pred, + mask_point_target, pos_labels) + + return loss_mask_point + + def _get_fine_grained_point_feats(self, x, rois, rel_roi_points, + img_metas): + """Sample fine grained feats from each level feature map and + concatenate them together.""" + num_imgs = len(img_metas) + fine_grained_feats = [] + for idx in range(self.mask_roi_extractor.num_inputs): + feats = x[idx] + spatial_scale = 1. / float( + self.mask_roi_extractor.featmap_strides[idx]) + point_feats = [] + for batch_ind in range(num_imgs): + # unravel batch dim + feat = feats[batch_ind].unsqueeze(0) + inds = (rois[:, 0].long() == batch_ind) + if inds.any(): + rel_img_points = rel_roi_point_to_rel_img_point( + rois[inds], rel_roi_points[inds], feat.shape[2:], + spatial_scale).unsqueeze(0) + point_feat = point_sample(feat, rel_img_points) + point_feat = point_feat.squeeze(0).transpose(0, 1) + point_feats.append(point_feat) + fine_grained_feats.append(torch.cat(point_feats, dim=0)) + return torch.cat(fine_grained_feats, dim=1) + + def _mask_point_forward_test(self, x, rois, label_pred, mask_pred, + img_metas): + """Mask refining process with point head in testing.""" + refined_mask_pred = mask_pred.clone() + for subdivision_step in range(self.test_cfg.subdivision_steps): + refined_mask_pred = F.interpolate( + refined_mask_pred, + scale_factor=self.test_cfg.scale_factor, + mode='bilinear', + align_corners=False) + # If `subdivision_num_points` is larger or equal to the + # resolution of the next step, then we can skip this step + num_rois, channels, mask_height, mask_width = \ + refined_mask_pred.shape + if (self.test_cfg.subdivision_num_points >= + 
self.test_cfg.scale_factor**2 * mask_height * mask_width + and + subdivision_step < self.test_cfg.subdivision_steps - 1): + continue + point_indices, rel_roi_points = \ + self.point_head.get_roi_rel_points_test( + refined_mask_pred, label_pred, cfg=self.test_cfg) + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, rois, rel_roi_points, img_metas) + coarse_point_feats = point_sample(mask_pred, rel_roi_points) + mask_point_pred = self.point_head(fine_grained_point_feats, + coarse_point_feats) + + point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1) + refined_mask_pred = refined_mask_pred.reshape( + num_rois, channels, mask_height * mask_width) + refined_mask_pred = refined_mask_pred.scatter_( + 2, point_indices, mask_point_pred) + refined_mask_pred = refined_mask_pred.view(num_rois, channels, + mask_height, mask_width) + + return refined_mask_pred + + def simple_test_mask(self, + x, + img_metas, + det_bboxes, + det_labels, + rescale=False): + """Obtain mask prediction without augmentation.""" + ori_shapes = tuple(meta['ori_shape'] for meta in img_metas) + scale_factors = tuple(meta['scale_factor'] for meta in img_metas) + num_imgs = len(det_bboxes) + if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes): + segm_results = [[[] for _ in range(self.mask_head.num_classes)] + for _ in range(num_imgs)] + else: + # if det_bboxes is rescaled to the original image size, we need to + # rescale it back to the testing scale to obtain RoIs. 
+ if rescale and not isinstance(scale_factors[0], float): + scale_factors = [ + torch.from_numpy(scale_factor).to(det_bboxes[0].device) + for scale_factor in scale_factors + ] + _bboxes = [ + det_bboxes[i][:, :4] * + scale_factors[i] if rescale else det_bboxes[i][:, :4] + for i in range(len(det_bboxes)) + ] + mask_rois = bbox2roi(_bboxes) + mask_results = self._mask_forward(x, mask_rois) + # split batch mask prediction back to each image + mask_pred = mask_results['mask_pred'] + num_mask_roi_per_img = [len(det_bbox) for det_bbox in det_bboxes] + mask_preds = mask_pred.split(num_mask_roi_per_img, 0) + mask_rois = mask_rois.split(num_mask_roi_per_img, 0) + + # apply mask post-processing to each image individually + segm_results = [] + for i in range(num_imgs): + if det_bboxes[i].shape[0] == 0: + segm_results.append( + [[] for _ in range(self.mask_head.num_classes)]) + else: + x_i = [xx[[i]] for xx in x] + mask_rois_i = mask_rois[i] + mask_rois_i[:, 0] = 0 # TODO: remove this hack + mask_pred_i = self._mask_point_forward_test( + x_i, mask_rois_i, det_labels[i], mask_preds[i], + [img_metas]) + segm_result = self.mask_head.get_seg_masks( + mask_pred_i, _bboxes[i], det_labels[i], self.test_cfg, + ori_shapes[i], scale_factors[i], rescale) + segm_results.append(segm_result) + return segm_results + + def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels): + """Test for mask head with test time augmentation.""" + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + aug_masks = [] + for x, img_meta in zip(feats, img_metas): + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, + scale_factor, flip) + mask_rois = bbox2roi([_bboxes]) + mask_results = self._mask_forward(x, mask_rois) + mask_results['mask_pred'] = self._mask_point_forward_test( + x, mask_rois, det_labels, mask_results['mask_pred'], + 
img_metas) + # convert to numpy array to save memory + aug_masks.append( + mask_results['mask_pred'].sigmoid().cpu().numpy()) + merged_masks = merge_aug_masks(aug_masks, img_metas, self.test_cfg) + + ori_shape = img_metas[0][0]['ori_shape'] + segm_result = self.mask_head.get_seg_masks( + merged_masks, + det_bboxes, + det_labels, + self.test_cfg, + ori_shape, + scale_factor=1.0, + rescale=False) + return segm_result diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/__init__.py b/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..a6ec0ecc3063cd23c2463f2f53f1c2a83b04d43b --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/__init__.py @@ -0,0 +1,7 @@ +from .generic_roi_extractor import GenericRoIExtractor +from .single_level_roi_extractor import SingleRoIExtractor + +__all__ = [ + 'SingleRoIExtractor', + 'GenericRoIExtractor', +] diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py b/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py new file mode 100755 index 0000000000000000000000000000000000000000..0e42b52f3615722ba9dd575c8f6293dd64004be8 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py @@ -0,0 +1,83 @@ +from abc import ABCMeta, abstractmethod + +import torch +import torch.nn as nn +from mmcv import ops + + +class BaseRoIExtractor(nn.Module, metaclass=ABCMeta): + """Base class for RoI extractor. + + Args: + roi_layer (dict): Specify RoI layer type and arguments. + out_channels (int): Output channels of RoI layers. + featmap_strides (int): Strides of input feature maps. 
+ """ + + def __init__(self, roi_layer, out_channels, featmap_strides): + super(BaseRoIExtractor, self).__init__() + self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides) + self.out_channels = out_channels + self.featmap_strides = featmap_strides + self.fp16_enabled = False + + @property + def num_inputs(self): + """int: Number of input feature maps.""" + return len(self.featmap_strides) + + def init_weights(self): + pass + + def build_roi_layers(self, layer_cfg, featmap_strides): + """Build RoI operator to extract feature from each level feature map. + + Args: + layer_cfg (dict): Dictionary to construct and config RoI layer + operation. Options are modules under ``mmcv/ops`` such as + ``RoIAlign``. + featmap_strides (int): The stride of input feature map w.r.t to the + original image size, which would be used to scale RoI + coordinate (original image coordinate system) to feature + coordinate system. + + Returns: + nn.ModuleList: The RoI extractor modules for each level feature + map. + """ + + cfg = layer_cfg.copy() + layer_type = cfg.pop('type') + assert hasattr(ops, layer_type) + layer_cls = getattr(ops, layer_type) + roi_layers = nn.ModuleList( + [layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides]) + return roi_layers + + def roi_rescale(self, rois, scale_factor): + """Scale RoI coordinates by scale factor. + + Args: + rois (torch.Tensor): RoI (Region of Interest), shape (n, 5) + scale_factor (float): Scale factor that RoI will be multiplied by. + + Returns: + torch.Tensor: Scaled RoI. 
+ """ + + cx = (rois[:, 1] + rois[:, 3]) * 0.5 + cy = (rois[:, 2] + rois[:, 4]) * 0.5 + w = rois[:, 3] - rois[:, 1] + h = rois[:, 4] - rois[:, 2] + new_w = w * scale_factor + new_h = h * scale_factor + x1 = cx - new_w * 0.5 + x2 = cx + new_w * 0.5 + y1 = cy - new_h * 0.5 + y2 = cy + new_h * 0.5 + new_rois = torch.stack((rois[:, 0], x1, y1, x2, y2), dim=-1) + return new_rois + + @abstractmethod + def forward(self, feats, rois, roi_scale_factor=None): + pass diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py b/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py new file mode 100755 index 0000000000000000000000000000000000000000..80c25bb8fde7844c994bfc1f4ae1a2d960cbf3d6 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py @@ -0,0 +1,83 @@ +from mmcv.cnn.bricks import build_plugin_layer +from mmcv.runner import force_fp32 + +from mmdet.models.builder import ROI_EXTRACTORS +from .base_roi_extractor import BaseRoIExtractor + + +@ROI_EXTRACTORS.register_module() +class GenericRoIExtractor(BaseRoIExtractor): + """Extract RoI features from all level feature maps levels. + + This is the implementation of `A novel Region of Interest Extraction Layer + for Instance Segmentation `_. + + Args: + aggregation (str): The method to aggregate multiple feature maps. + Options are 'sum', 'concat'. Default: 'sum'. + pre_cfg (dict | None): Specify pre-processing modules. Default: None. + post_cfg (dict | None): Specify post-processing modules. Default: None. + kwargs (keyword arguments): Arguments that are the same + as :class:`BaseRoIExtractor`. 
+ """ + + def __init__(self, + aggregation='sum', + pre_cfg=None, + post_cfg=None, + **kwargs): + super(GenericRoIExtractor, self).__init__(**kwargs) + + assert aggregation in ['sum', 'concat'] + + self.aggregation = aggregation + self.with_post = post_cfg is not None + self.with_pre = pre_cfg is not None + # build pre/post processing modules + if self.with_post: + self.post_module = build_plugin_layer(post_cfg, '_post_module')[1] + if self.with_pre: + self.pre_module = build_plugin_layer(pre_cfg, '_pre_module')[1] + + @force_fp32(apply_to=('feats', ), out_fp16=True) + def forward(self, feats, rois, roi_scale_factor=None): + """Forward function.""" + if len(feats) == 1: + return self.roi_layers[0](feats[0], rois) + + out_size = self.roi_layers[0].output_size + num_levels = len(feats) + roi_feats = feats[0].new_zeros( + rois.size(0), self.out_channels, *out_size) + + # some times rois is an empty tensor + if roi_feats.shape[0] == 0: + return roi_feats + + if roi_scale_factor is not None: + rois = self.roi_rescale(rois, roi_scale_factor) + + # mark the starting channels for concat mode + start_channels = 0 + for i in range(num_levels): + roi_feats_t = self.roi_layers[i](feats[i], rois) + end_channels = start_channels + roi_feats_t.size(1) + if self.with_pre: + # apply pre-processing to a RoI extracted from each layer + roi_feats_t = self.pre_module(roi_feats_t) + if self.aggregation == 'sum': + # and sum them all + roi_feats += roi_feats_t + else: + # and concat them along channel dimension + roi_feats[:, start_channels:end_channels] = roi_feats_t + # update channels starting position + start_channels = end_channels + # check if concat channels match at the end + if self.aggregation == 'concat': + assert start_channels == self.out_channels + + if self.with_post: + # apply post-processing before return the result + roi_feats = self.post_module(roi_feats) + return roi_feats diff --git 
a/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py new file mode 100755 index 0000000000000000000000000000000000000000..c0eebc4af57bd283d4faac88a7f2af053dff1201 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py @@ -0,0 +1,99 @@ +import torch +from mmcv.runner import force_fp32 + +from mmdet.models.builder import ROI_EXTRACTORS +from .base_roi_extractor import BaseRoIExtractor + + +@ROI_EXTRACTORS.register_module() +class SingleRoIExtractor(BaseRoIExtractor): + """Extract RoI features from a single level feature map. + + If there are multiple input feature levels, each RoI is mapped to a level + according to its scale. The mapping rule is proposed in + `FPN `_. + + Args: + roi_layer (dict): Specify RoI layer type and arguments. + out_channels (int): Output channels of RoI layers. + featmap_strides (int): Strides of input feature maps. + finest_scale (int): Scale threshold of mapping to level 0. Default: 56. + """ + + def __init__(self, + roi_layer, + out_channels, + featmap_strides, + finest_scale=56): + super(SingleRoIExtractor, self).__init__(roi_layer, out_channels, + featmap_strides) + self.finest_scale = finest_scale + + def map_roi_levels(self, rois, num_levels): + """Map rois to corresponding feature levels by scales. + + - scale < finest_scale * 2: level 0 + - finest_scale * 2 <= scale < finest_scale * 4: level 1 + - finest_scale * 4 <= scale < finest_scale * 8: level 2 + - scale >= finest_scale * 8: level 3 + + Args: + rois (Tensor): Input RoIs, shape (k, 5). + num_levels (int): Total level number. 
+ + Returns: + Tensor: Level index (0-based) of each RoI, shape (k, ) + """ + scale = torch.sqrt( + (rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2])) + target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6)) + target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long() + return target_lvls + + @force_fp32(apply_to=('feats', ), out_fp16=True) + def forward(self, feats, rois, roi_scale_factor=None): + """Forward function.""" + out_size = self.roi_layers[0].output_size + num_levels = len(feats) + if torch.onnx.is_in_onnx_export(): + # Work around to export mask-rcnn to onnx + roi_feats = rois[:, :1].clone().detach() + roi_feats = roi_feats.expand( + -1, self.out_channels * out_size[0] * out_size[1]) + roi_feats = roi_feats.reshape(-1, self.out_channels, *out_size) + roi_feats = roi_feats * 0 + else: + roi_feats = feats[0].new_zeros( + rois.size(0), self.out_channels, *out_size) + # TODO: remove this when parrots supports + if torch.__version__ == 'parrots': + roi_feats.requires_grad = True + + if num_levels == 1: + if len(rois) == 0: + return roi_feats + return self.roi_layers[0](feats[0], rois) + + target_lvls = self.map_roi_levels(rois, num_levels) + if roi_scale_factor is not None: + rois = self.roi_rescale(rois, roi_scale_factor) + + for i in range(num_levels): + mask = target_lvls == i + inds = mask.nonzero(as_tuple=False).squeeze(1) + # TODO: make it nicer when exporting to onnx + if torch.onnx.is_in_onnx_export(): + # To keep all roi_align nodes exported to onnx + rois_ = rois[inds] + roi_feats_t = self.roi_layers[i](feats[i], rois_) + roi_feats[inds] = roi_feats_t + continue + if inds.numel() > 0: + rois_ = rois[inds] + roi_feats_t = self.roi_layers[i](feats[i], rois_) + roi_feats[inds] = roi_feats_t + else: + roi_feats += sum( + x.view(-1)[0] + for x in self.parameters()) * 0. + feats[i].sum() * 0. 
+ return roi_feats diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/shared_heads/__init__.py b/insightface/detection/scrfd/mmdet/models/roi_heads/shared_heads/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..bbe70145b8bf7c304370f725f5afa8db98666679 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/shared_heads/__init__.py @@ -0,0 +1,3 @@ +from .res_layer import ResLayer + +__all__ = ['ResLayer'] diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/shared_heads/res_layer.py b/insightface/detection/scrfd/mmdet/models/roi_heads/shared_heads/res_layer.py new file mode 100755 index 0000000000000000000000000000000000000000..b5c343258b079a0dd832d4f999c18d002b06efac --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/shared_heads/res_layer.py @@ -0,0 +1,77 @@ +import torch.nn as nn +from mmcv.cnn import constant_init, kaiming_init +from mmcv.runner import auto_fp16, load_checkpoint + +from mmdet.models.backbones import ResNet +from mmdet.models.builder import SHARED_HEADS +from mmdet.models.utils import ResLayer as _ResLayer +from mmdet.utils import get_root_logger + + +@SHARED_HEADS.register_module() +class ResLayer(nn.Module): + + def __init__(self, + depth, + stage=3, + stride=2, + dilation=1, + style='pytorch', + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + with_cp=False, + dcn=None): + super(ResLayer, self).__init__() + self.norm_eval = norm_eval + self.norm_cfg = norm_cfg + self.stage = stage + self.fp16_enabled = False + block, stage_blocks = ResNet.arch_settings[depth] + stage_block = stage_blocks[stage] + planes = 64 * 2**stage + inplanes = 64 * 2**(stage - 1) * block.expansion + + res_layer = _ResLayer( + block, + inplanes, + planes, + stage_block, + stride=stride, + dilation=dilation, + style=style, + with_cp=with_cp, + norm_cfg=self.norm_cfg, + dcn=dcn) + self.add_module(f'layer{stage + 1}', res_layer) + + def init_weights(self, 
pretrained=None): + """Initialize the weights in the module. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + @auto_fp16() + def forward(self, x): + res_layer = getattr(self, f'layer{self.stage + 1}') + out = res_layer(x) + return out + + def train(self, mode=True): + super(ResLayer, self).train(mode) + if self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/standard_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/standard_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..c530f2a5ce904439492de12ff7d267cc1e757d3a --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/standard_roi_head.py @@ -0,0 +1,295 @@ +import torch + +from mmdet.core import bbox2result, bbox2roi, build_assigner, build_sampler +from ..builder import HEADS, build_head, build_roi_extractor +from .base_roi_head import BaseRoIHead +from .test_mixins import BBoxTestMixin, MaskTestMixin + + +@HEADS.register_module() +class StandardRoIHead(BaseRoIHead, BBoxTestMixin, MaskTestMixin): + """Simplest base roi head including one bbox head and one mask head.""" + + def init_assigner_sampler(self): + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + self.bbox_assigner = build_assigner(self.train_cfg.assigner) + self.bbox_sampler = build_sampler( + self.train_cfg.sampler, context=self) + + def init_bbox_head(self, bbox_roi_extractor, bbox_head): + """Initialize ``bbox_head``""" + 
self.bbox_roi_extractor = build_roi_extractor(bbox_roi_extractor) + self.bbox_head = build_head(bbox_head) + + def init_mask_head(self, mask_roi_extractor, mask_head): + """Initialize ``mask_head``""" + if mask_roi_extractor is not None: + self.mask_roi_extractor = build_roi_extractor(mask_roi_extractor) + self.share_roi_extractor = False + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + self.mask_head = build_head(mask_head) + + def init_weights(self, pretrained): + """Initialize the weights in head. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if self.with_shared_head: + self.shared_head.init_weights(pretrained=pretrained) + if self.with_bbox: + self.bbox_roi_extractor.init_weights() + self.bbox_head.init_weights() + if self.with_mask: + self.mask_head.init_weights() + if not self.share_roi_extractor: + self.mask_roi_extractor.init_weights() + + def forward_dummy(self, x, proposals): + """Dummy forward function.""" + # bbox head + outs = () + rois = bbox2roi([proposals]) + if self.with_bbox: + bbox_results = self._bbox_forward(x, rois) + outs = outs + (bbox_results['cls_score'], + bbox_results['bbox_pred']) + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_results = self._mask_forward(x, mask_rois) + outs = outs + (mask_results['mask_pred'], ) + return outs + + def forward_train(self, + x, + img_metas, + proposal_list, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + gt_masks=None): + """ + Args: + x (list[Tensor]): list of multi-level img features. + img_metas (list[dict]): list of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + proposals (list[Tensors]): list of region proposals. 
+ gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + gt_masks (None | Tensor) : true segmentation masks for each box + used if the architecture supports a segmentation task. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + # assign gts and sample proposals + if self.with_bbox or self.with_mask: + num_imgs = len(img_metas) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + sampling_results = [] + for i in range(num_imgs): + assign_result = self.bbox_assigner.assign( + proposal_list[i], gt_bboxes[i], gt_bboxes_ignore[i], + gt_labels[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + losses = dict() + # bbox head forward and loss + if self.with_bbox: + bbox_results = self._bbox_forward_train(x, sampling_results, + gt_bboxes, gt_labels, + img_metas) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self._mask_forward_train(x, sampling_results, + bbox_results['bbox_feats'], + gt_masks, img_metas) + losses.update(mask_results['loss_mask']) + + return losses + + def _bbox_forward(self, x, rois): + """Box head forward function used in both training and testing.""" + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.shared_head(bbox_feats) + cls_score, bbox_pred = self.bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return 
bbox_results + + def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, + img_metas): + """Run forward function and calculate loss for box head in training.""" + rois = bbox2roi([res.bboxes for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + bbox_targets = self.bbox_head.get_targets(sampling_results, gt_bboxes, + gt_labels, self.train_cfg) + loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], + bbox_results['bbox_pred'], rois, + *bbox_targets) + + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results + + def _mask_forward_train(self, x, sampling_results, bbox_feats, gt_masks, + img_metas): + """Run forward function and calculate loss for mask head in + training.""" + if not self.share_roi_extractor: + pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + mask_results = self._mask_forward(x, pos_rois) + else: + pos_inds = [] + device = bbox_feats.device + for res in sampling_results: + pos_inds.append( + torch.ones( + res.pos_bboxes.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds.append( + torch.zeros( + res.neg_bboxes.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds = torch.cat(pos_inds) + + mask_results = self._mask_forward( + x, pos_inds=pos_inds, bbox_feats=bbox_feats) + + mask_targets = self.mask_head.get_targets(sampling_results, gt_masks, + self.train_cfg) + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + loss_mask = self.mask_head.loss(mask_results['mask_pred'], + mask_targets, pos_labels) + + mask_results.update(loss_mask=loss_mask, mask_targets=mask_targets) + return mask_results + + def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None): + """Mask head forward function used in both training and testing.""" + assert ((rois is not None) ^ + (pos_inds is not None and bbox_feats is not None)) + if rois is not None: + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], rois) + if 
self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + else: + assert bbox_feats is not None + mask_feats = bbox_feats[pos_inds] + + mask_pred = self.mask_head(mask_feats) + mask_results = dict(mask_pred=mask_pred, mask_feats=mask_feats) + return mask_results + + async def async_simple_test(self, + x, + proposal_list, + img_metas, + proposals=None, + rescale=False): + """Async test without augmentation.""" + assert self.with_bbox, 'Bbox head must be implemented.' + + det_bboxes, det_labels = await self.async_test_bboxes( + x, img_metas, proposal_list, self.test_cfg, rescale=rescale) + bbox_results = bbox2result(det_bboxes, det_labels, + self.bbox_head.num_classes) + if not self.with_mask: + return bbox_results + else: + segm_results = await self.async_test_mask( + x, + img_metas, + det_bboxes, + det_labels, + rescale=rescale, + mask_test_cfg=self.test_cfg.get('mask')) + return bbox_results, segm_results + + def simple_test(self, + x, + proposal_list, + img_metas, + proposals=None, + rescale=False): + """Test without augmentation.""" + assert self.with_bbox, 'Bbox head must be implemented.' + + det_bboxes, det_labels = self.simple_test_bboxes( + x, img_metas, proposal_list, self.test_cfg, rescale=rescale) + if torch.onnx.is_in_onnx_export(): + if self.with_mask: + segm_results = self.simple_test_mask( + x, img_metas, det_bboxes, det_labels, rescale=rescale) + return det_bboxes, det_labels, segm_results + else: + return det_bboxes, det_labels + + bbox_results = [ + bbox2result(det_bboxes[i], det_labels[i], + self.bbox_head.num_classes) + for i in range(len(det_bboxes)) + ] + + if not self.with_mask: + return bbox_results + else: + segm_results = self.simple_test_mask( + x, img_metas, det_bboxes, det_labels, rescale=rescale) + return list(zip(bbox_results, segm_results)) + + def aug_test(self, x, proposal_list, img_metas, rescale=False): + """Test with augmentations. + + If rescale is False, then returned bboxes and masks will fit the scale + of imgs[0]. 
+ """ + det_bboxes, det_labels = self.aug_test_bboxes(x, img_metas, + proposal_list, + self.test_cfg) + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= det_bboxes.new_tensor( + img_metas[0][0]['scale_factor']) + bbox_results = bbox2result(_det_bboxes, det_labels, + self.bbox_head.num_classes) + + # det_bboxes always keep the original scale + if self.with_mask: + segm_results = self.aug_test_mask(x, img_metas, det_bboxes, + det_labels) + return [(bbox_results, segm_results)] + else: + return [bbox_results] diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/test_mixins.py b/insightface/detection/scrfd/mmdet/models/roi_heads/test_mixins.py new file mode 100755 index 0000000000000000000000000000000000000000..12684c52c2bac0493f1b31e5ea91bd66c004c76b --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/test_mixins.py @@ -0,0 +1,268 @@ +import logging +import sys + +import torch + +from mmdet.core import (bbox2roi, bbox_mapping, merge_aug_bboxes, + merge_aug_masks, multiclass_nms) + +logger = logging.getLogger(__name__) + +if sys.version_info >= (3, 7): + from mmdet.utils.contextmanagers import completed + + +class BBoxTestMixin(object): + + if sys.version_info >= (3, 7): + + async def async_test_bboxes(self, + x, + img_metas, + proposals, + rcnn_test_cfg, + rescale=False, + bbox_semaphore=None, + global_lock=None): + """Asynchronized test for box head without augmentation.""" + rois = bbox2roi(proposals) + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + if self.with_shared_head: + roi_feats = self.shared_head(roi_feats) + sleep_interval = rcnn_test_cfg.get('async_sleep_interval', 0.017) + + async with completed( + __name__, 'bbox_head_forward', + sleep_interval=sleep_interval): + cls_score, bbox_pred = self.bbox_head(roi_feats) + + img_shape = img_metas[0]['img_shape'] + scale_factor = img_metas[0]['scale_factor'] + det_bboxes, 
det_labels = self.bbox_head.get_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + return det_bboxes, det_labels + + def simple_test_bboxes(self, + x, + img_metas, + proposals, + rcnn_test_cfg, + rescale=False): + """Test only det bboxes without augmentation.""" + rois = bbox2roi(proposals) + bbox_results = self._bbox_forward(x, rois) + img_shapes = tuple(meta['img_shape'] for meta in img_metas) + scale_factors = tuple(meta['scale_factor'] for meta in img_metas) + + # split batch bbox prediction back to each image + cls_score = bbox_results['cls_score'] + bbox_pred = bbox_results['bbox_pred'] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = rois.split(num_proposals_per_img, 0) + cls_score = cls_score.split(num_proposals_per_img, 0) + + # some detector with_reg is False, bbox_pred will be None + if bbox_pred is not None: + # the bbox prediction of some detectors like SABL is not Tensor + if isinstance(bbox_pred, torch.Tensor): + bbox_pred = bbox_pred.split(num_proposals_per_img, 0) + else: + bbox_pred = self.bbox_head.bbox_pred_split( + bbox_pred, num_proposals_per_img) + else: + bbox_pred = (None, ) * len(proposals) + + # apply bbox post-processing to each image individually + det_bboxes = [] + det_labels = [] + for i in range(len(proposals)): + det_bbox, det_label = self.bbox_head.get_bboxes( + rois[i], + cls_score[i], + bbox_pred[i], + img_shapes[i], + scale_factors[i], + rescale=rescale, + cfg=rcnn_test_cfg) + det_bboxes.append(det_bbox) + det_labels.append(det_label) + return det_bboxes, det_labels + + def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg): + """Test det bboxes with test time augmentation.""" + aug_bboxes = [] + aug_scores = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + flip_direction = 
img_meta[0]['flip_direction'] + # TODO more flexible + proposals = bbox_mapping(proposal_list[0][:, :4], img_shape, + scale_factor, flip, flip_direction) + rois = bbox2roi([proposals]) + bbox_results = self._bbox_forward(x, rois) + bboxes, scores = self.bbox_head.get_bboxes( + rois, + bbox_results['cls_score'], + bbox_results['bbox_pred'], + img_shape, + scale_factor, + rescale=False, + cfg=None) + aug_bboxes.append(bboxes) + aug_scores.append(scores) + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) + det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img) + return det_bboxes, det_labels + + +class MaskTestMixin(object): + + if sys.version_info >= (3, 7): + + async def async_test_mask(self, + x, + img_metas, + det_bboxes, + det_labels, + rescale=False, + mask_test_cfg=None): + """Asynchronized test for mask head without augmentation.""" + # image shape of the first image in the batch (only one) + ori_shape = img_metas[0]['ori_shape'] + scale_factor = img_metas[0]['scale_factor'] + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + _bboxes = ( + det_bboxes[:, :4] * + scale_factor if rescale else det_bboxes) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + x[:len(self.mask_roi_extractor.featmap_strides)], + mask_rois) + + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + if mask_test_cfg and mask_test_cfg.get('async_sleep_interval'): + sleep_interval = mask_test_cfg['async_sleep_interval'] + else: + sleep_interval = 0.035 + async with completed( + __name__, + 'mask_head_forward', + sleep_interval=sleep_interval): + mask_pred = self.mask_head(mask_feats) + segm_result = self.mask_head.get_seg_masks( + mask_pred, _bboxes, det_labels, self.test_cfg, ori_shape, 
+ scale_factor, rescale) + return segm_result + + def simple_test_mask(self, + x, + img_metas, + det_bboxes, + det_labels, + rescale=False): + """Simple test for mask head without augmentation.""" + # image shapes of images in the batch + ori_shapes = tuple(meta['ori_shape'] for meta in img_metas) + scale_factors = tuple(meta['scale_factor'] for meta in img_metas) + num_imgs = len(det_bboxes) + if all(det_bbox.shape[0] == 0 for det_bbox in det_bboxes): + segm_results = [[[] for _ in range(self.mask_head.num_classes)] + for _ in range(num_imgs)] + else: + # if det_bboxes is rescaled to the original image size, we need to + # rescale it back to the testing scale to obtain RoIs. + if rescale and not isinstance(scale_factors[0], float): + scale_factors = [ + torch.from_numpy(scale_factor).to(det_bboxes[0].device) + for scale_factor in scale_factors + ] + if torch.onnx.is_in_onnx_export(): + # avoid mask_pred.split with static number of prediction + mask_preds = [] + _bboxes = [] + for i, boxes in enumerate(det_bboxes): + boxes = boxes[:, :4] + if rescale: + boxes *= scale_factors[i] + _bboxes.append(boxes) + img_inds = boxes[:, :1].clone() * 0 + i + mask_rois = torch.cat([img_inds, boxes], dim=-1) + mask_result = self._mask_forward(x, mask_rois) + mask_preds.append(mask_result['mask_pred']) + else: + _bboxes = [ + det_bboxes[i][:, :4] * + scale_factors[i] if rescale else det_bboxes[i][:, :4] + for i in range(len(det_bboxes)) + ] + mask_rois = bbox2roi(_bboxes) + mask_results = self._mask_forward(x, mask_rois) + mask_pred = mask_results['mask_pred'] + # split batch mask prediction back to each image + num_mask_roi_per_img = [ + det_bbox.shape[0] for det_bbox in det_bboxes + ] + mask_preds = mask_pred.split(num_mask_roi_per_img, 0) + + # apply mask post-processing to each image individually + segm_results = [] + for i in range(num_imgs): + if det_bboxes[i].shape[0] == 0: + segm_results.append( + [[] for _ in range(self.mask_head.num_classes)]) + else: + segm_result = 
self.mask_head.get_seg_masks( + mask_preds[i], _bboxes[i], det_labels[i], + self.test_cfg, ori_shapes[i], scale_factors[i], + rescale) + segm_results.append(segm_result) + return segm_results + + def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels): + """Test for mask head with test time augmentation.""" + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + aug_masks = [] + for x, img_meta in zip(feats, img_metas): + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + flip_direction = img_meta[0]['flip_direction'] + _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, + scale_factor, flip, flip_direction) + mask_rois = bbox2roi([_bboxes]) + mask_results = self._mask_forward(x, mask_rois) + # convert to numpy array to save memory + aug_masks.append( + mask_results['mask_pred'].sigmoid().cpu().numpy()) + merged_masks = merge_aug_masks(aug_masks, img_metas, self.test_cfg) + + ori_shape = img_metas[0][0]['ori_shape'] + segm_result = self.mask_head.get_seg_masks( + merged_masks, + det_bboxes, + det_labels, + self.test_cfg, + ori_shape, + scale_factor=1.0, + rescale=False) + return segm_result diff --git a/insightface/detection/scrfd/mmdet/models/roi_heads/trident_roi_head.py b/insightface/detection/scrfd/mmdet/models/roi_heads/trident_roi_head.py new file mode 100755 index 0000000000000000000000000000000000000000..da3eed8faafd6934ed9a2de074da019f881ccd5a --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/roi_heads/trident_roi_head.py @@ -0,0 +1,111 @@ +import torch +from mmcv.ops import batched_nms + +from mmdet.core import (bbox2result, bbox2roi, bbox_mapping, merge_aug_bboxes, + multiclass_nms) +from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead +from ..builder import HEADS + + +@HEADS.register_module() +class TridentRoIHead(StandardRoIHead): + """Trident roi head. 
+ + Args: + num_branch (int): Number of branches in TridentNet. + test_branch_idx (int): In inference, all 3 branches will be used + if `test_branch_idx==-1`, otherwise only branch with index + `test_branch_idx` will be used. + """ + + def __init__(self, num_branch, test_branch_idx, **kwargs): + self.num_branch = num_branch + self.test_branch_idx = test_branch_idx + super(TridentRoIHead, self).__init__(**kwargs) + + def simple_test(self, + x, + proposal_list, + img_metas, + proposals=None, + rescale=False): + """Test without augmentation as follows: + + 1. Compute prediction bbox and label per branch. + 2. Merge predictions of each branch according to scores of + bboxes, i.e., bboxes with higher score are kept to give + top-k prediction. + """ + assert self.with_bbox, 'Bbox head must be implemented.' + det_bboxes_list, det_labels_list = self.simple_test_bboxes( + x, img_metas, proposal_list, self.test_cfg, rescale=rescale) + + for _ in range(len(det_bboxes_list)): + if det_bboxes_list[_].shape[0] == 0: + det_bboxes_list[_] = det_bboxes_list[_].new_empty((0, 5)) + trident_det_bboxes = torch.cat(det_bboxes_list, 0) + trident_det_labels = torch.cat(det_labels_list, 0) + + if trident_det_bboxes.numel() == 0: + det_bboxes = trident_det_bboxes.new_zeros((0, 5)) + det_labels = trident_det_bboxes.new_zeros((0, ), dtype=torch.long) + else: + nms_bboxes = trident_det_bboxes[:, :4] + nms_scores = trident_det_bboxes[:, 4].contiguous() + nms_inds = trident_det_labels + nms_cfg = self.test_cfg['nms'] + det_bboxes, keep = batched_nms(nms_bboxes, nms_scores, nms_inds, + nms_cfg) + det_labels = trident_det_labels[keep] + if self.test_cfg['max_per_img'] > 0: + det_labels = det_labels[:self.test_cfg['max_per_img']] + det_bboxes = det_bboxes[:self.test_cfg['max_per_img']] + + det_bboxes, det_labels = [det_bboxes], [det_labels] + + bbox_results = [ + bbox2result(det_bboxes[i], det_labels[i], + self.bbox_head.num_classes) + for i in range(len(det_bboxes)) + ] + + return bbox_results + + 
def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg): + """Test det bboxes with test time augmentation.""" + aug_bboxes = [] + aug_scores = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + flip_direction = img_meta[0]['flip_direction'] + + trident_bboxes, trident_scores = [], [] + for branch_idx in range(len(proposal_list)): + proposals = bbox_mapping(proposal_list[0][:, :4], img_shape, + scale_factor, flip, flip_direction) + rois = bbox2roi([proposals]) + bbox_results = self._bbox_forward(x, rois) + bboxes, scores = self.bbox_head.get_bboxes( + rois, + bbox_results['cls_score'], + bbox_results['bbox_pred'], + img_shape, + scale_factor, + rescale=False, + cfg=None) + trident_bboxes.append(bboxes) + trident_scores.append(scores) + + aug_bboxes.append(torch.cat(trident_bboxes, 0)) + aug_scores.append(torch.cat(trident_scores, 0)) + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) + det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img) + return det_bboxes, det_labels diff --git a/insightface/detection/scrfd/mmdet/models/utils/__init__.py b/insightface/detection/scrfd/mmdet/models/utils/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..1fc09b5e8fe34c7493203e3e9b7d9a433ed21d7c --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/utils/__init__.py @@ -0,0 +1,16 @@ +from .builder import build_positional_encoding, build_transformer +from .gaussian_target import gaussian_radius, gen_gaussian_target +from .positional_encoding import (LearnedPositionalEncoding, + SinePositionalEncoding) +from .res_layer import ResLayer +from .transformer import (FFN, 
MultiheadAttention, Transformer, + TransformerDecoder, TransformerDecoderLayer, + TransformerEncoder, TransformerEncoderLayer) + +__all__ = [ + 'ResLayer', 'gaussian_radius', 'gen_gaussian_target', 'MultiheadAttention', + 'FFN', 'TransformerEncoderLayer', 'TransformerEncoder', + 'TransformerDecoderLayer', 'TransformerDecoder', 'Transformer', + 'build_transformer', 'build_positional_encoding', 'SinePositionalEncoding', + 'LearnedPositionalEncoding' +] diff --git a/insightface/detection/scrfd/mmdet/models/utils/builder.py b/insightface/detection/scrfd/mmdet/models/utils/builder.py new file mode 100755 index 0000000000000000000000000000000000000000..f362d1c92ca9d4ed95a2b3d28d3e6baedd14e462 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/utils/builder.py @@ -0,0 +1,14 @@ +from mmcv.utils import Registry, build_from_cfg + +TRANSFORMER = Registry('Transformer') +POSITIONAL_ENCODING = Registry('Position encoding') + + +def build_transformer(cfg, default_args=None): + """Builder for Transformer.""" + return build_from_cfg(cfg, TRANSFORMER, default_args) + + +def build_positional_encoding(cfg, default_args=None): + """Builder for Position Encoding.""" + return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args) diff --git a/insightface/detection/scrfd/mmdet/models/utils/gaussian_target.py b/insightface/detection/scrfd/mmdet/models/utils/gaussian_target.py new file mode 100755 index 0000000000000000000000000000000000000000..7bb7160cb4bf2f47876f6e8373142aa5846920a9 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/utils/gaussian_target.py @@ -0,0 +1,185 @@ +from math import sqrt + +import torch + + +def gaussian2D(radius, sigma=1, dtype=torch.float32, device='cpu'): + """Generate 2D gaussian kernel. + + Args: + radius (int): Radius of gaussian kernel. + sigma (int): Sigma of gaussian function. Default: 1. + dtype (torch.dtype): Dtype of gaussian tensor. Default: torch.float32. + device (str): Device of gaussian tensor. Default: 'cpu'. 
+ + Returns: + h (Tensor): Gaussian kernel with a + ``(2 * radius + 1) * (2 * radius + 1)`` shape. + """ + x = torch.arange( + -radius, radius + 1, dtype=dtype, device=device).view(1, -1) + y = torch.arange( + -radius, radius + 1, dtype=dtype, device=device).view(-1, 1) + + h = (-(x * x + y * y) / (2 * sigma * sigma)).exp() + + h[h < torch.finfo(h.dtype).eps * h.max()] = 0 + return h + + +def gen_gaussian_target(heatmap, center, radius, k=1): + """Generate 2D gaussian heatmap. + + Args: + heatmap (Tensor): Input heatmap, the gaussian kernel will cover on + it and maintain the max value. + center (list[int]): Coord of gaussian kernel's center. + radius (int): Radius of gaussian kernel. + k (int): Coefficient of gaussian kernel. Default: 1. + + Returns: + out_heatmap (Tensor): Updated heatmap covered by gaussian kernel. + """ + diameter = 2 * radius + 1 + gaussian_kernel = gaussian2D( + radius, sigma=diameter / 6, dtype=heatmap.dtype, device=heatmap.device) + + x, y = center + + height, width = heatmap.shape[:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian_kernel[radius - top:radius + bottom, + radius - left:radius + right] + out_heatmap = heatmap + torch.max( + masked_heatmap, + masked_gaussian * k, + out=out_heatmap[y - top:y + bottom, x - left:x + right]) + + return out_heatmap + + +def gaussian_radius(det_size, min_overlap): + r"""Generate 2D gaussian radius. + + This function is modified from the `official github repo + `_. + + Given ``min_overlap``, radius could computed by a quadratic equation + according to Vieta's formulas. + + There are 3 cases for computing gaussian radius, details are following: + + - Explanation of figure: ``lt`` and ``br`` indicates the left-top and + bottom-right corner of ground truth box. 
``x`` indicates the + generated corner at the limited position when ``radius=r``. + + - Case1: one corner is inside the gt box and the other is outside. + + .. code:: text + + |< width >| + + lt-+----------+ - + | | | ^ + +--x----------+--+ + | | | | + | | | | height + | | overlap | | + | | | | + | | | | v + +--+---------br--+ - + | | | + +----------+--x + + To ensure IoU of generated box and gt box is larger than ``min_overlap``: + + .. math:: + \cfrac{(w-r)*(h-r)}{w*h+(w+h)r-r^2} \ge {iou} \quad\Rightarrow\quad + {r^2-(w+h)r+\cfrac{1-iou}{1+iou}*w*h} \ge 0 \\ + {a} = 1,\quad{b} = {-(w+h)},\quad{c} = {\cfrac{1-iou}{1+iou}*w*h} + {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a} + + - Case2: both two corners are inside the gt box. + + .. code:: text + + |< width >| + + lt-+----------+ - + | | | ^ + +--x-------+ | + | | | | + | |overlap| | height + | | | | + | +-------x--+ + | | | v + +----------+-br - + + To ensure IoU of generated box and gt box is larger than ``min_overlap``: + + .. math:: + \cfrac{(w-2*r)*(h-2*r)}{w*h} \ge {iou} \quad\Rightarrow\quad + {4r^2-2(w+h)r+(1-iou)*w*h} \ge 0 \\ + {a} = 4,\quad {b} = {-2(w+h)},\quad {c} = {(1-iou)*w*h} + {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a} + + - Case3: both two corners are outside the gt box. + + .. code:: text + + |< width >| + + x--+----------------+ + | | | + +-lt-------------+ | - + | | | | ^ + | | | | + | | overlap | | height + | | | | + | | | | v + | +------------br--+ - + | | | + +----------------+--x + + To ensure IoU of generated box and gt box is larger than ``min_overlap``: + + .. math:: + \cfrac{w*h}{(w+2*r)*(h+2*r)} \ge {iou} \quad\Rightarrow\quad + {4*iou*r^2+2*iou*(w+h)r+(iou-1)*w*h} \le 0 \\ + {a} = {4*iou},\quad {b} = {2*iou*(w+h)},\quad {c} = {(iou-1)*w*h} \\ + {r} \le \cfrac{-b+\sqrt{b^2-4*a*c}}{2*a} + + Args: + det_size (list[int]): Shape of object. + min_overlap (float): Min IoU with ground truth for boxes generated by + keypoints inside the gaussian kernel. 
+ + Returns: + radius (int): Radius of gaussian kernel. + """ + height, width = det_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = sqrt(b1**2 - 4 * a1 * c1) + r1 = (b1 - sq1) / (2 * a1) + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = sqrt(b2**2 - 4 * a2 * c2) + r2 = (b2 - sq2) / (2 * a2) + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = sqrt(b3**2 - 4 * a3 * c3) + r3 = (b3 + sq3) / (2 * a3) + return min(r1, r2, r3) diff --git a/insightface/detection/scrfd/mmdet/models/utils/positional_encoding.py b/insightface/detection/scrfd/mmdet/models/utils/positional_encoding.py new file mode 100755 index 0000000000000000000000000000000000000000..9bda2bbdbfcc28ba6304b6325ae556fa02554ac1 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/utils/positional_encoding.py @@ -0,0 +1,150 @@ +import math + +import torch +import torch.nn as nn +from mmcv.cnn import uniform_init + +from .builder import POSITIONAL_ENCODING + + +@POSITIONAL_ENCODING.register_module() +class SinePositionalEncoding(nn.Module): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Default 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Default False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Default 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Default 1e-6. 
+ """ + + def __init__(self, + num_feats, + temperature=10000, + normalize=False, + scale=2 * math.pi, + eps=1e-6): + super(SinePositionalEncoding, self).__init__() + if normalize: + assert isinstance(scale, (float, int)), 'when normalize is set,' \ + 'scale should be provided and in float or int type, ' \ + f'found {type(scale)}' + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + + def forward(self, mask): + """Forward function for `SinePositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = y_embed / (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'temperature={self.temperature}, ' + repr_str += f'normalize={self.normalize}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'eps={self.eps})' + return repr_str + + +@POSITIONAL_ENCODING.register_module() +class 
LearnedPositionalEncoding(nn.Module): + """Position embedding with learnable embedding weights. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Default 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Default 50. + """ + + def __init__(self, num_feats, row_num_embed=50, col_num_embed=50): + super(LearnedPositionalEncoding, self).__init__() + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + self.init_weights() + + def init_weights(self): + """Initialize the learnable weights.""" + uniform_init(self.row_embed) + uniform_init(self.col_embed) + + def forward(self, mask): + """Forward function for `LearnedPositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. 
+ """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = torch.cat( + (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat( + 1, w, 1)), + dim=-1).permute(2, 0, + 1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'row_num_embed={self.row_num_embed}, ' + repr_str += f'col_num_embed={self.col_num_embed})' + return repr_str diff --git a/insightface/detection/scrfd/mmdet/models/utils/res_layer.py b/insightface/detection/scrfd/mmdet/models/utils/res_layer.py new file mode 100755 index 0000000000000000000000000000000000000000..27902426469bcdab392db1a38627d852ae8e3029 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/utils/res_layer.py @@ -0,0 +1,102 @@ +from mmcv.cnn import build_conv_layer, build_norm_layer +from torch import nn as nn + + +class ResLayer(nn.Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. 
Default: True + """ + + def __init__(self, + block, + inplanes, + planes, + num_blocks, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + **kwargs): + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + inplanes = planes * block.expansion + for _ in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + + else: # downsample_first=False is for HourglassModule + for _ in range(num_blocks - 1): + layers.append( + block( + inplanes=inplanes, + planes=inplanes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + super(ResLayer, self).__init__(*layers) diff --git a/insightface/detection/scrfd/mmdet/models/utils/transformer.py b/insightface/detection/scrfd/mmdet/models/utils/transformer.py new file mode 100755 index 0000000000000000000000000000000000000000..f94b183b5b8825424e41dbd27c9b73532fac1322 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/models/utils/transformer.py @@ -0,0 +1,744 @@ +import torch +import torch.nn as nn +from mmcv.cnn import (Linear, 
import torch
import torch.nn as nn
from mmcv.cnn import (Linear, build_activation_layer, build_norm_layer,
                      xavier_init)

from .builder import TRANSFORMER


class MultiheadAttention(nn.Module):
    """A wrapper for ``torch.nn.MultiheadAttention``.

    Implements multi-head attention with a residual connection, and accepts
    the positional encodings used in DETR as extra inputs.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads. Same as
            ``nn.MultiheadAttention``.
        dropout (float): Dropout probability applied to the attention output
            before the residual addition. Default 0.0.
    """

    def __init__(self, embed_dims, num_heads, dropout=0.0):
        super(MultiheadAttention, self).__init__()
        assert embed_dims % num_heads == 0, 'embed_dims must be ' \
            f'divisible by num_heads. got {embed_dims} and {num_heads}.'
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        # Keep the configured rate as a plain float so __repr__ reports it.
        # (Previously this attribute was immediately rebound to an
        # nn.Dropout module, so __repr__ printed the module instead.)
        self.dropout = dropout
        self.attn = nn.MultiheadAttention(embed_dims, num_heads, dropout)
        self.proj_drop = nn.Dropout(dropout)

    def forward(self,
                x,
                key=None,
                value=None,
                residual=None,
                query_pos=None,
                key_pos=None,
                attn_mask=None,
                key_padding_mask=None):
        """Forward function for `MultiheadAttention`.

        Args:
            x (Tensor): Input query with shape [num_query, bs, embed_dims].
            key (Tensor): Key tensor with shape [num_key, bs, embed_dims].
                Default None; the query is used when None.
            value (Tensor): Value tensor with the same shape as `key`.
                Default None; the key is used when None.
            residual (Tensor): Tensor used for the residual addition, same
                shape as `x`. Default None; `x` is used when None.
            query_pos (Tensor): Positional encoding for the query, same
                shape as `x`. Added to `x` before attention when not None.
            key_pos (Tensor): Positional encoding for `key`. Default None.
                When None and `query_pos` has the same shape as `key`
                (the self-attention case), `query_pos` is reused.
            attn_mask (Tensor): ByteTensor mask with shape
                [num_query, num_key]. Same as `nn.MultiheadAttention`.
            key_padding_mask (Tensor): ByteTensor with shape [bs, num_key].
                Same as `nn.MultiheadAttention`.

        Returns:
            Tensor: Forwarded results, shape [num_query, bs, embed_dims].
        """
        query = x
        if key is None:
            key = query
        if value is None:
            value = key
        if residual is None:
            residual = x
        # `key` can never be None here (it defaults to the query above), so
        # the original `key is not None` test was redundant.
        if key_pos is None and query_pos is not None \
                and query_pos.shape == key.shape:
            key_pos = query_pos
        if query_pos is not None:
            query = query + query_pos
        if key_pos is not None:
            key = key + key_pos
        out = self.attn(
            query,
            key,
            value=value,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask)[0]

        return residual + self.proj_drop(out)

    def __repr__(self):
        """str: a string that describes the module"""
        repr_str = self.__class__.__name__
        repr_str += f'(embed_dims={self.embed_dims}, '
        repr_str += f'num_heads={self.num_heads}, '
        repr_str += f'dropout={self.dropout})'
        return repr_str


class FFN(nn.Module):
    """Feed-forward network (FFN) with a residual connection.

    Args:
        embed_dims (int): The feature dimension. Same as
            `MultiheadAttention`.
        feedforward_channels (int): The hidden dimension of the FFN.
        num_fcs (int): The number of fully-connected layers. Must be >= 2.
        act_cfg (dict): The activation config for the FFN.
        dropout (float): Probability of an element to be zeroed. Default 0.0.
        add_residual (bool): Whether to add the residual connection.
            Default True.
    """

    def __init__(self,
                 embed_dims,
                 feedforward_channels,
                 num_fcs=2,
                 act_cfg=dict(type='ReLU', inplace=True),
                 dropout=0.0,
                 add_residual=True):
        super(FFN, self).__init__()
        assert num_fcs >= 2, 'num_fcs should be no less ' \
            f'than 2. got {num_fcs}.'
        self.embed_dims = embed_dims
        self.feedforward_channels = feedforward_channels
        self.num_fcs = num_fcs
        self.act_cfg = act_cfg
        # Plain float kept for __repr__ (was clobbered by an nn.Dropout
        # module in the original, making __repr__ print the module).
        self.dropout = dropout
        self.activate = build_activation_layer(act_cfg)

        layers = []
        in_channels = embed_dims
        for _ in range(num_fcs - 1):
            layers.append(
                nn.Sequential(
                    Linear(in_channels, feedforward_channels), self.activate,
                    nn.Dropout(dropout)))
            in_channels = feedforward_channels
        layers.append(Linear(feedforward_channels, embed_dims))
        self.layers = nn.Sequential(*layers)
        # Applied to the FFN output before the residual addition.
        self.dropout_layer = nn.Dropout(dropout)
        self.add_residual = add_residual

    def forward(self, x, residual=None):
        """Forward function for `FFN`."""
        out = self.layers(x)
        if not self.add_residual:
            return out
        if residual is None:
            residual = x
        return residual + self.dropout_layer(out)

    def __repr__(self):
        """str: a string that describes the module"""
        repr_str = self.__class__.__name__
        repr_str += f'(embed_dims={self.embed_dims}, '
        repr_str += f'feedforward_channels={self.feedforward_channels}, '
        repr_str += f'num_fcs={self.num_fcs}, '
        repr_str += f'act_cfg={self.act_cfg}, '
        repr_str += f'dropout={self.dropout}, '
        repr_str += f'add_residual={self.add_residual})'
        return repr_str


class TransformerEncoderLayer(nn.Module):
    """Implements one encoder layer in the DETR transformer.

    Args:
        embed_dims (int): The feature dimension. Same as `FFN`.
        num_heads (int): Parallel attention heads.
        feedforward_channels (int): The hidden dimension for the FFN.
        dropout (float): Probability of an element to be zeroed. Default 0.0.
        order (tuple[str]): The order for the encoder layer. Valid examples
            are ('selfattn', 'norm', 'ffn', 'norm') and
            ('norm', 'selfattn', 'norm', 'ffn').
            Default ('selfattn', 'norm', 'ffn', 'norm').
        act_cfg (dict): The activation config for the FFN. Default ReLU.
        norm_cfg (dict): Config dict for the normalization layer. Default
            layer normalization.
        num_fcs (int): The number of fully-connected layers for the FFN.
            Default 2.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 dropout=0.0,
                 order=('selfattn', 'norm', 'ffn', 'norm'),
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'),
                 num_fcs=2):
        super(TransformerEncoderLayer, self).__init__()
        assert isinstance(order, tuple) and len(order) == 4
        assert set(order) == set(['selfattn', 'norm', 'ffn'])
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.feedforward_channels = feedforward_channels
        self.dropout = dropout
        self.order = order
        self.act_cfg = act_cfg
        self.norm_cfg = norm_cfg
        self.num_fcs = num_fcs
        # A leading 'norm' means the layer uses pre-norm ordering.
        self.pre_norm = order[0] == 'norm'
        self.self_attn = MultiheadAttention(embed_dims, num_heads, dropout)
        self.ffn = FFN(embed_dims, feedforward_channels, num_fcs, act_cfg,
                       dropout)
        # Two norms: one after self-attention, one after the FFN.
        self.norms = nn.ModuleList()
        for _ in range(2):
            self.norms.append(build_norm_layer(norm_cfg, embed_dims)[1])

    def forward(self, x, pos=None, attn_mask=None, key_padding_mask=None):
        """Forward function for `TransformerEncoderLayer`.

        Args:
            x (Tensor): The input query with shape [num_key, bs, embed_dims].
                Same as `MultiheadAttention.forward`.
            pos (Tensor): The positional encoding for the query. Default
                None. Same as `query_pos` in `MultiheadAttention.forward`.
            attn_mask (Tensor): ByteTensor mask with shape
                [num_key, num_key]. Default None.
            key_padding_mask (Tensor): ByteTensor with shape [bs, num_key].
                Default None.

        Returns:
            Tensor: Forwarded results, shape [num_key, bs, embed_dims].
        """
        norm_cnt = 0
        inp_residual = x
        for layer in self.order:
            if layer == 'selfattn':
                # Self attention: query, key and value all come from x.
                query = key = value = x
                x = self.self_attn(
                    query,
                    key,
                    value,
                    inp_residual if self.pre_norm else None,
                    query_pos=pos,
                    key_pos=pos,
                    attn_mask=attn_mask,
                    key_padding_mask=key_padding_mask)
                inp_residual = x
            elif layer == 'norm':
                x = self.norms[norm_cnt](x)
                norm_cnt += 1
            elif layer == 'ffn':
                x = self.ffn(x, inp_residual if self.pre_norm else None)
        return x

    def __repr__(self):
        """str: a string that describes the module"""
        repr_str = self.__class__.__name__
        repr_str += f'(embed_dims={self.embed_dims}, '
        repr_str += f'num_heads={self.num_heads}, '
        repr_str += f'feedforward_channels={self.feedforward_channels}, '
        repr_str += f'dropout={self.dropout}, '
        repr_str += f'order={self.order}, '
        repr_str += f'act_cfg={self.act_cfg}, '
        repr_str += f'norm_cfg={self.norm_cfg}, '
        repr_str += f'num_fcs={self.num_fcs})'
        return repr_str


class TransformerDecoderLayer(nn.Module):
    """Implements one decoder layer in the DETR transformer.

    Args:
        embed_dims (int): The feature dimension. Same as
            `TransformerEncoderLayer`.
        num_heads (int): Parallel attention heads.
        feedforward_channels (int): Same as `TransformerEncoderLayer`.
        dropout (float): Same as `TransformerEncoderLayer`. Default 0.0.
        order (tuple[str]): The order for the decoder layer. Valid examples
            are ('selfattn', 'norm', 'multiheadattn', 'norm', 'ffn', 'norm')
            and ('norm', 'selfattn', 'norm', 'multiheadattn', 'norm', 'ffn').
            Default the former.
        act_cfg (dict): Same as `TransformerEncoderLayer`. Default ReLU.
        norm_cfg (dict): Config dict for the normalization layer. Default
            layer normalization.
        num_fcs (int): The number of fully-connected layers in the FFN.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 dropout=0.0,
                 order=('selfattn', 'norm', 'multiheadattn', 'norm', 'ffn',
                        'norm'),
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'),
                 num_fcs=2):
        super(TransformerDecoderLayer, self).__init__()
        assert isinstance(order, tuple) and len(order) == 6
        assert set(order) == set(['selfattn', 'norm', 'multiheadattn', 'ffn'])
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.feedforward_channels = feedforward_channels
        self.dropout = dropout
        self.order = order
        self.act_cfg = act_cfg
        self.norm_cfg = norm_cfg
        self.num_fcs = num_fcs
        self.pre_norm = order[0] == 'norm'
        self.self_attn = MultiheadAttention(embed_dims, num_heads, dropout)
        self.multihead_attn = MultiheadAttention(embed_dims, num_heads,
                                                 dropout)
        self.ffn = FFN(embed_dims, feedforward_channels, num_fcs, act_cfg,
                       dropout)
        self.norms = nn.ModuleList()
        # 3 norm layers in official DETR's TransformerDecoderLayer
        for _ in range(3):
            self.norms.append(build_norm_layer(norm_cfg, embed_dims)[1])

    def forward(self,
                x,
                memory,
                memory_pos=None,
                query_pos=None,
                memory_attn_mask=None,
                target_attn_mask=None,
                memory_key_padding_mask=None,
                target_key_padding_mask=None):
        """Forward function for `TransformerDecoderLayer`.

        Args:
            x (Tensor): Input query with shape [num_query, bs, embed_dims].
            memory (Tensor): Output of `TransformerEncoder`, with shape
                [num_key, bs, embed_dims].
            memory_pos (Tensor): The positional encoding for `memory`.
                Default None. Same as `key_pos` in
                `MultiheadAttention.forward`.
            query_pos (Tensor): The positional encoding for `x`. Default
                None. Same as `query_pos` in `MultiheadAttention.forward`.
            memory_attn_mask (Tensor): ByteTensor mask for `memory`, with
                shape [num_key, num_key]. Default None.
            target_attn_mask (Tensor): ByteTensor mask for `x`, with shape
                [num_query, num_query]. Default None.
            memory_key_padding_mask (Tensor): ByteTensor for `memory`, with
                shape [bs, num_key]. Default None.
            target_key_padding_mask (Tensor): ByteTensor for `x`, with shape
                [bs, num_query]. Default None.

        Returns:
            Tensor: Forwarded results, shape [num_query, bs, embed_dims].
        """
        norm_cnt = 0
        inp_residual = x
        for layer in self.order:
            if layer == 'selfattn':
                query = key = value = x
                x = self.self_attn(
                    query,
                    key,
                    value,
                    inp_residual if self.pre_norm else None,
                    query_pos,
                    key_pos=query_pos,
                    attn_mask=target_attn_mask,
                    key_padding_mask=target_key_padding_mask)
                inp_residual = x
            elif layer == 'norm':
                x = self.norms[norm_cnt](x)
                norm_cnt += 1
            elif layer == 'multiheadattn':
                # Cross attention: queries attend to the encoder memory.
                query = x
                key = value = memory
                x = self.multihead_attn(
                    query,
                    key,
                    value,
                    inp_residual if self.pre_norm else None,
                    query_pos,
                    key_pos=memory_pos,
                    attn_mask=memory_attn_mask,
                    key_padding_mask=memory_key_padding_mask)
                inp_residual = x
            elif layer == 'ffn':
                x = self.ffn(x, inp_residual if self.pre_norm else None)
        return x

    def __repr__(self):
        """str: a string that describes the module"""
        repr_str = self.__class__.__name__
        repr_str += f'(embed_dims={self.embed_dims}, '
        repr_str += f'num_heads={self.num_heads}, '
        repr_str += f'feedforward_channels={self.feedforward_channels}, '
        repr_str += f'dropout={self.dropout}, '
        repr_str += f'order={self.order}, '
        repr_str += f'act_cfg={self.act_cfg}, '
        repr_str += f'norm_cfg={self.norm_cfg}, '
        repr_str += f'num_fcs={self.num_fcs})'
        return repr_str


class TransformerEncoder(nn.Module):
    """Implements the encoder in the DETR transformer.

    Args:
        num_layers (int): The number of `TransformerEncoderLayer`.
        embed_dims (int): Same as `TransformerEncoderLayer`.
        num_heads (int): Same as `TransformerEncoderLayer`.
        feedforward_channels (int): Same as `TransformerEncoderLayer`.
        dropout (float): Same as `TransformerEncoderLayer`. Default 0.0.
        order (tuple[str]): Same as `TransformerEncoderLayer`.
        act_cfg (dict): Same as `TransformerEncoderLayer`. Default ReLU.
        norm_cfg (dict): Same as `TransformerEncoderLayer`. Default
            layer normalization.
        num_fcs (int): Same as `TransformerEncoderLayer`. Default 2.
    """

    def __init__(self,
                 num_layers,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 dropout=0.0,
                 order=('selfattn', 'norm', 'ffn', 'norm'),
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'),
                 num_fcs=2):
        super(TransformerEncoder, self).__init__()
        assert isinstance(order, tuple) and len(order) == 4
        assert set(order) == set(['selfattn', 'norm', 'ffn'])
        self.num_layers = num_layers
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.feedforward_channels = feedforward_channels
        self.dropout = dropout
        self.order = order
        self.act_cfg = act_cfg
        self.norm_cfg = norm_cfg
        self.num_fcs = num_fcs
        self.pre_norm = order[0] == 'norm'
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(
                TransformerEncoderLayer(embed_dims, num_heads,
                                        feedforward_channels, dropout, order,
                                        act_cfg, norm_cfg, num_fcs))
        # A final norm is only needed for pre-norm ordering; post-norm
        # layers already end with a norm.
        self.norm = build_norm_layer(norm_cfg,
                                     embed_dims)[1] if self.pre_norm else None

    def forward(self, x, pos=None, attn_mask=None, key_padding_mask=None):
        """Forward function for `TransformerEncoder`.

        Args:
            x (Tensor): Input query. Same in
                `TransformerEncoderLayer.forward`.
            pos (Tensor): Positional encoding for the query. Default None.
            attn_mask (Tensor): ByteTensor attention mask. Default None.
            key_padding_mask (Tensor): Same in
                `TransformerEncoderLayer.forward`. Default None.

        Returns:
            Tensor: Results with shape [num_key, bs, embed_dims].
        """
        for layer in self.layers:
            x = layer(x, pos, attn_mask, key_padding_mask)
        if self.norm is not None:
            x = self.norm(x)
        return x

    def __repr__(self):
        """str: a string that describes the module"""
        repr_str = self.__class__.__name__
        repr_str += f'(num_layers={self.num_layers}, '
        repr_str += f'embed_dims={self.embed_dims}, '
        repr_str += f'num_heads={self.num_heads}, '
        repr_str += f'feedforward_channels={self.feedforward_channels}, '
        repr_str += f'dropout={self.dropout}, '
        repr_str += f'order={self.order}, '
        repr_str += f'act_cfg={self.act_cfg}, '
        repr_str += f'norm_cfg={self.norm_cfg}, '
        repr_str += f'num_fcs={self.num_fcs})'
        return repr_str


class TransformerDecoder(nn.Module):
    """Implements the decoder in the DETR transformer.

    Args:
        num_layers (int): The number of `TransformerDecoderLayer`.
        embed_dims (int): Same as `TransformerDecoderLayer`.
        num_heads (int): Same as `TransformerDecoderLayer`.
        feedforward_channels (int): Same as `TransformerDecoderLayer`.
        dropout (float): Same as `TransformerDecoderLayer`. Default 0.0.
        order (tuple[str]): Same as `TransformerDecoderLayer`.
        act_cfg (dict): Same as `TransformerDecoderLayer`. Default ReLU.
        norm_cfg (dict): Same as `TransformerDecoderLayer`. Default
            layer normalization.
        num_fcs (int): Same as `TransformerDecoderLayer`. Default 2.
        return_intermediate (bool): Whether to return the normalized output
            of every decoder layer (stacked) instead of only the last one.
            Default False.
    """

    def __init__(self,
                 num_layers,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 dropout=0.0,
                 order=('selfattn', 'norm', 'multiheadattn', 'norm', 'ffn',
                        'norm'),
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'),
                 num_fcs=2,
                 return_intermediate=False):
        super(TransformerDecoder, self).__init__()
        assert isinstance(order, tuple) and len(order) == 6
        assert set(order) == set(['selfattn', 'norm', 'multiheadattn', 'ffn'])
        self.num_layers = num_layers
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.feedforward_channels = feedforward_channels
        self.dropout = dropout
        self.order = order
        self.act_cfg = act_cfg
        self.norm_cfg = norm_cfg
        self.num_fcs = num_fcs
        self.return_intermediate = return_intermediate
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(
                TransformerDecoderLayer(embed_dims, num_heads,
                                        feedforward_channels, dropout, order,
                                        act_cfg, norm_cfg, num_fcs))
        self.norm = build_norm_layer(norm_cfg, embed_dims)[1]

    def forward(self,
                x,
                memory,
                memory_pos=None,
                query_pos=None,
                memory_attn_mask=None,
                target_attn_mask=None,
                memory_key_padding_mask=None,
                target_key_padding_mask=None):
        """Forward function for `TransformerDecoder`.

        Args:
            x (Tensor): Input query. Same in
                `TransformerDecoderLayer.forward`.
            memory (Tensor): Same in `TransformerDecoderLayer.forward`.
            memory_pos (Tensor): Same in `TransformerDecoderLayer.forward`.
                Default None.
            query_pos (Tensor): Same in `TransformerDecoderLayer.forward`.
                Default None.
            memory_attn_mask (Tensor): Same in
                `TransformerDecoderLayer.forward`. Default None.
            target_attn_mask (Tensor): Same in
                `TransformerDecoderLayer.forward`. Default None.
            memory_key_padding_mask (Tensor): Same in
                `TransformerDecoderLayer.forward`. Default None.
            target_key_padding_mask (Tensor): Same in
                `TransformerDecoderLayer.forward`. Default None.

        Returns:
            Tensor: Results with shape [num_query, bs, embed_dims]
                (stacked over layers when `return_intermediate` is True).
        """
        intermediate = []
        for layer in self.layers:
            x = layer(x, memory, memory_pos, query_pos, memory_attn_mask,
                      target_attn_mask, memory_key_padding_mask,
                      target_key_padding_mask)
            if self.return_intermediate:
                intermediate.append(self.norm(x))
        if self.norm is not None:
            x = self.norm(x)
            if self.return_intermediate:
                # Replace the last entry with the final normalized output.
                intermediate.pop()
                intermediate.append(x)
        if self.return_intermediate:
            return torch.stack(intermediate)
        return x.unsqueeze(0)

    def __repr__(self):
        """str: a string that describes the module"""
        repr_str = self.__class__.__name__
        repr_str += f'(num_layers={self.num_layers}, '
        repr_str += f'embed_dims={self.embed_dims}, '
        repr_str += f'num_heads={self.num_heads}, '
        repr_str += f'feedforward_channels={self.feedforward_channels}, '
        repr_str += f'dropout={self.dropout}, '
        repr_str += f'order={self.order}, '
        repr_str += f'act_cfg={self.act_cfg}, '
        repr_str += f'norm_cfg={self.norm_cfg}, '
        repr_str += f'num_fcs={self.num_fcs}, '
        repr_str += f'return_intermediate={self.return_intermediate})'
        return repr_str


@TRANSFORMER.register_module()
class Transformer(nn.Module):
    """Implements the DETR transformer.

    Following the official DETR implementation, this module is copy-pasted
    from torch.nn.Transformer with modifications:

    * positional encodings are passed in MultiheadAttention
    * extra LN at the end of the encoder is removed
    * decoder returns a stack of activations from all decoding layers

    See the paper "End-to-End Object Detection with Transformers"
    (https://arxiv.org/abs/2005.12872) for details.

    Args:
        embed_dims (int): The feature dimension.
        num_heads (int): Parallel attention heads. Same as
            ``nn.MultiheadAttention``.
        num_encoder_layers (int): Number of `TransformerEncoderLayer`.
        num_decoder_layers (int): Number of `TransformerDecoderLayer`.
        feedforward_channels (int): The hidden dimension for FFNs used in
            both encoder and decoder.
        dropout (float): Probability of an element to be zeroed. Default 0.0.
        act_cfg (dict): Activation config for FFNs. Default ReLU.
        norm_cfg (dict): Config dict for normalization in both encoder and
            decoder. Default layer normalization.
        num_fcs (int): The number of fully-connected layers in FFNs.
        pre_norm (bool): Whether normalization comes first in encoder and
            decoder layers. Default False.
        return_intermediate_dec (bool): Whether to return the output of each
            TransformerDecoderLayer or only the last one. If True, the
            returned `hs` has shape [num_decoder_layers, bs, num_query,
            embed_dims]; otherwise [1, bs, num_query, embed_dims].
            Default False.
    """

    def __init__(self,
                 embed_dims=512,
                 num_heads=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 feedforward_channels=2048,
                 dropout=0.0,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'),
                 num_fcs=2,
                 pre_norm=False,
                 return_intermediate_dec=False):
        super(Transformer, self).__init__()
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.num_encoder_layers = num_encoder_layers
        self.num_decoder_layers = num_decoder_layers
        self.feedforward_channels = feedforward_channels
        self.dropout = dropout
        self.act_cfg = act_cfg
        self.norm_cfg = norm_cfg
        self.num_fcs = num_fcs
        self.pre_norm = pre_norm
        self.return_intermediate_dec = return_intermediate_dec
        if self.pre_norm:
            encoder_order = ('norm', 'selfattn', 'norm', 'ffn')
            decoder_order = ('norm', 'selfattn', 'norm', 'multiheadattn',
                             'norm', 'ffn')
        else:
            encoder_order = ('selfattn', 'norm', 'ffn', 'norm')
            decoder_order = ('selfattn', 'norm', 'multiheadattn', 'norm',
                             'ffn', 'norm')
        self.encoder = TransformerEncoder(num_encoder_layers, embed_dims,
                                          num_heads, feedforward_channels,
                                          dropout, encoder_order, act_cfg,
                                          norm_cfg, num_fcs)
        self.decoder = TransformerDecoder(num_decoder_layers, embed_dims,
                                          num_heads, feedforward_channels,
                                          dropout, decoder_order, act_cfg,
                                          norm_cfg, num_fcs,
                                          return_intermediate_dec)

    def init_weights(self, distribution='uniform'):
        """Initialize the transformer weights."""
        # follow the official DETR to init parameters
        for m in self.modules():
            if hasattr(m, 'weight') and m.weight.dim() > 1:
                xavier_init(m, distribution=distribution)

    def forward(self, x, mask, query_embed, pos_embed):
        """Forward function for `Transformer`.

        Args:
            x (Tensor): Input query with shape [bs, c, h, w] where
                c = embed_dims.
            mask (Tensor): The key_padding_mask used for encoder and
                decoder, with shape [bs, h, w].
            query_embed (Tensor): The query embedding for the decoder, with
                shape [num_query, c].
            pos_embed (Tensor): The positional encoding for encoder and
                decoder, with the same shape as `x`.

        Returns:
            tuple[Tensor]: results of decoder containing the following.

                - out_dec: Output from decoder. If return_intermediate_dec
                  is True output has shape [num_dec_layers, bs, num_query,
                  embed_dims], else has shape [1, bs, num_query,
                  embed_dims].
                - memory: Output results from encoder, with shape
                  [bs, embed_dims, h, w].
        """
        bs, c, h, w = x.shape
        x = x.flatten(2).permute(2, 0, 1)  # [bs, c, h, w] -> [h*w, bs, c]
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        query_embed = query_embed.unsqueeze(1).repeat(
            1, bs, 1)  # [num_query, dim] -> [num_query, bs, dim]
        mask = mask.flatten(1)  # [bs, h, w] -> [bs, h*w]
        memory = self.encoder(
            x, pos=pos_embed, attn_mask=None, key_padding_mask=mask)
        # DETR decodes from all-zero target embeddings; the learned query
        # embedding enters only as the positional encoding.
        target = torch.zeros_like(query_embed)
        # out_dec: [num_layers, num_query, bs, dim]
        out_dec = self.decoder(
            target,
            memory,
            memory_pos=pos_embed,
            query_pos=query_embed,
            memory_attn_mask=None,
            target_attn_mask=None,
            memory_key_padding_mask=mask,
            target_key_padding_mask=None)
        out_dec = out_dec.transpose(1, 2)
        memory = memory.permute(1, 2, 0).reshape(bs, c, h, w)
        return out_dec, memory

    def __repr__(self):
        """str: a string that describes the module"""
        repr_str = self.__class__.__name__
        repr_str += f'(embed_dims={self.embed_dims}, '
        repr_str += f'num_heads={self.num_heads}, '
        repr_str += f'num_encoder_layers={self.num_encoder_layers}, '
        repr_str += f'num_decoder_layers={self.num_decoder_layers}, '
        repr_str += f'feedforward_channels={self.feedforward_channels}, '
        repr_str += f'dropout={self.dropout}, '
        repr_str += f'act_cfg={self.act_cfg}, '
        repr_str += f'norm_cfg={self.norm_cfg}, '
        repr_str += f'num_fcs={self.num_fcs}, '
        repr_str += f'pre_norm={self.pre_norm}, '
        repr_str += f'return_intermediate_dec={self.return_intermediate_dec})'
        return repr_str
+ +# yapf: disable +from mmcv.ops import (ContextBlock, Conv2d, ConvTranspose2d, ConvWS2d, + CornerPool, DeformConv, DeformConvPack, DeformRoIPooling, + DeformRoIPoolingPack, GeneralizedAttention, Linear, + MaskedConv2d, MaxPool2d, ModulatedDeformConv, + ModulatedDeformConvPack, ModulatedDeformRoIPoolingPack, + NonLocal2D, RoIAlign, RoIPool, SAConv2d, + SigmoidFocalLoss, SimpleRoIAlign, batched_nms, + build_plugin_layer, conv_ws_2d, deform_conv, + deform_roi_pooling, get_compiler_version, + get_compiling_cuda_version, modulated_deform_conv, nms, + nms_match, point_sample, rel_roi_point_to_rel_img_point, + roi_align, roi_pool, sigmoid_focal_loss, soft_nms) + +# yapf: enable + +__all__ = [ + 'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', + 'DeformConv', 'DeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', + 'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv', + 'ModulatedDeformConvPack', 'deform_conv', 'modulated_deform_conv', + 'deform_roi_pooling', 'SigmoidFocalLoss', 'sigmoid_focal_loss', + 'MaskedConv2d', 'ContextBlock', 'GeneralizedAttention', 'NonLocal2D', + 'get_compiler_version', 'get_compiling_cuda_version', 'ConvWS2d', + 'conv_ws_2d', 'build_plugin_layer', 'batched_nms', 'Conv2d', + 'ConvTranspose2d', 'MaxPool2d', 'Linear', 'nms_match', 'CornerPool', + 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', + 'SAConv2d' +] diff --git a/insightface/detection/scrfd/mmdet/utils/__init__.py b/insightface/detection/scrfd/mmdet/utils/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..ac489e2dbbc0e6fa87f5088b4edcc20f8cadc1a6 --- /dev/null +++ b/insightface/detection/scrfd/mmdet/utils/__init__.py @@ -0,0 +1,4 @@ +from .collect_env import collect_env +from .logger import get_root_logger + +__all__ = ['get_root_logger', 'collect_env'] diff --git a/insightface/detection/scrfd/mmdet/utils/collect_env.py b/insightface/detection/scrfd/mmdet/utils/collect_env.py new file mode 100755 index 
# --- mmdet/utils/collect_env.py ---
# Environment report used in logs and bug reports.
from mmcv.utils import collect_env as collect_base_env
from mmcv.utils import get_git_hash

import mmdet


def collect_env():
    """Collect the information of the running environments."""
    env_info = collect_base_env()
    # Tag the mmdet version with the short git hash of the checkout so the
    # exact source revision is recoverable from logs.
    env_info['MMDetection'] = mmdet.__version__ + '+' + get_git_hash()[:7]
    return env_info


if __name__ == '__main__':
    for name, val in collect_env().items():
        print(f'{name}: {val}')


# --- mmdet/utils/contextmanagers.py ---
# Async context managers used by the experimental async inference path.
import asyncio
import contextlib
import logging
import os
import time
from typing import List

import torch

logger = logging.getLogger(__name__)

# When set in the environment, CUDA events are created with timing enabled
# and per-stream elapsed times are logged on exit.
DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False))


@contextlib.asynccontextmanager
async def completed(trace_name='',
                    name='',
                    sleep_interval=0.05,
                    streams: List[torch.cuda.Stream] = None):
    """Async context manager that waits for work to complete on given CUDA
    streams."""
    # Without CUDA there is nothing to synchronize on.
    if not torch.cuda.is_available():
        yield
        return

    stream_before_context_switch = torch.cuda.current_stream()
    if not streams:
        streams = [stream_before_context_switch]
    else:
        # None entries default to the stream current at entry.
        streams = [s if s else stream_before_context_switch for s in streams]

    # One end-marker event per stream; timing only when debugging.
    end_events = [
        torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams
    ]

    if DEBUG_COMPLETED_TIME:
        start = torch.cuda.Event(enable_timing=True)
        stream_before_context_switch.record_event(start)

        cpu_start = time.monotonic()
    logger.debug('%s %s starting, streams: %s', trace_name, name, streams)
    grad_enabled_before = torch.is_grad_enabled()
    try:
        yield
    finally:
        # The body must not have left a different stream current.
        current_stream = torch.cuda.current_stream()
        assert current_stream == stream_before_context_switch

        if DEBUG_COMPLETED_TIME:
            cpu_end = time.monotonic()
        # Record an end event on every tracked stream, then poll below.
        for i, stream in enumerate(streams):
            event = end_events[i]
            stream.record_event(event)

        grad_enabled_after = torch.is_grad_enabled()

        # observed change of torch.is_grad_enabled() during concurrent run of
        # async_test_bboxes code
        assert (grad_enabled_before == grad_enabled_after
                ), 'Unexpected is_grad_enabled() value change'

        are_done = [e.query() for e in end_events]
        logger.debug('%s %s completed: %s streams: %s', trace_name, name,
                     are_done, streams)
        # Poll with non-blocking event queries, yielding to the event loop
        # between polls instead of blocking the whole loop on synchronize().
        with torch.cuda.stream(stream_before_context_switch):
            while not all(are_done):
                await asyncio.sleep(sleep_interval)
                are_done = [e.query() for e in end_events]
                logger.debug(
                    '%s %s completed: %s streams: %s',
                    trace_name,
                    name,
                    are_done,
                    streams,
                )

        current_stream = torch.cuda.current_stream()
        assert current_stream == stream_before_context_switch

        if DEBUG_COMPLETED_TIME:
            cpu_time = (cpu_end - cpu_start) * 1000
            stream_times_ms = ''
            for i, stream in enumerate(streams):
                elapsed_time = start.elapsed_time(end_events[i])
                stream_times_ms += f' {stream} {elapsed_time:.2f} ms'
            logger.info('%s %s %.2f ms %s', trace_name, name, cpu_time,
                        stream_times_ms)


@contextlib.asynccontextmanager
async def concurrent(streamqueue: asyncio.Queue,
                     trace_name='concurrent',
                     name='stream'):
    """Run code concurrently in different streams.

    :param streamqueue: asyncio.Queue instance.

    Queue tasks define the pool of streams used for concurrent execution.
    """
    if not torch.cuda.is_available():
        yield
        return

    initial_stream = torch.cuda.current_stream()

    with torch.cuda.stream(initial_stream):
        # Borrow a stream from the pool; returned in the finally below.
        stream = await streamqueue.get()
        assert isinstance(stream, torch.cuda.Stream)

        try:
            with torch.cuda.stream(stream):
                logger.debug('%s %s is starting, stream: %s', trace_name,
                             name, stream)
                yield
                current = torch.cuda.current_stream()
                assert current == stream
                logger.debug('%s %s has finished, stream: %s', trace_name,
                             name, stream)
        finally:
            # Always hand the stream back so the pool size stays constant.
            streamqueue.task_done()
            streamqueue.put_nowait(stream)


# --- mmdet/utils/logger.py ---
import logging

from mmcv.utils import get_logger


def get_root_logger(log_file=None, log_level=logging.INFO):
    """Get root logger.

    Args:
        log_file (str, optional): File path of log. Defaults to None.
        log_level (int, optional): The level of logger.
            Defaults to logging.INFO.

    Returns:
        :obj:`logging.Logger`: The obtained logger
    """
    # All mmdet loggers share the 'mmdet' root name so handlers configured
    # here apply across the package.
    logger = get_logger(name='mmdet', log_file=log_file, log_level=log_level)

    return logger


# --- mmdet/utils/profiling.py ---
import contextlib
import sys
import time

import torch

# contextlib.asynccontextmanager-era tooling; only defined on Python >= 3.7.
if sys.version_info >= (3, 7):

    @contextlib.contextmanager
    def profile_time(trace_name,
                     name,
                     enabled=True,
                     stream=None,
                     end_stream=None):
        """Print time spent by CPU and GPU.

        Useful as a temporary context manager to find sweet spots of code
        suitable for async implementation.
        """
        if (not enabled) or not torch.cuda.is_available():
            yield
            return
        stream = stream if stream else torch.cuda.current_stream()
        end_stream = end_stream if end_stream else stream
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        stream.record_event(start)
        try:
            cpu_start = time.monotonic()
            yield
        finally:
            cpu_end = time.monotonic()
            end_stream.record_event(end)
            # Block until the end event is reached so elapsed_time is valid.
            end.synchronize()
            cpu_time = (cpu_end - cpu_start) * 1000
            gpu_time = start.elapsed_time(end)
            msg = f'{trace_name} {name} cpu_time {cpu_time:.2f} ms '
            msg += f'gpu_time {gpu_time:.2f} ms stream {stream}'
            print(msg, end_stream)
return self.name + >>> s1 = Student('Alice') + >>> s2 = Student('Bob') + >>> print(f's1 = {s1}') + >>> print(f's2 = {s2}') + s1 = + s2 = + +Example: + >>> # Objects that define __len__ have a default __nice__ + >>> class Group(NiceRepr): + ... def __init__(self, data): + ... self.data = data + ... def __len__(self): + ... return len(self.data) + >>> g = Group([1, 2, 3]) + >>> print(f'g = {g}') + g = +""" +import warnings + + +class NiceRepr(object): + """Inherit from this class and define ``__nice__`` to "nicely" print your + objects. + + Defines ``__str__`` and ``__repr__`` in terms of ``__nice__`` function + Classes that inherit from :class:`NiceRepr` should redefine ``__nice__``. + If the inheriting class has a ``__len__``, method then the default + ``__nice__`` method will return its length. + + Example: + >>> class Foo(NiceRepr): + ... def __nice__(self): + ... return 'info' + >>> foo = Foo() + >>> assert str(foo) == '' + >>> assert repr(foo).startswith('>> class Bar(NiceRepr): + ... pass + >>> bar = Bar() + >>> import pytest + >>> with pytest.warns(None) as record: + >>> assert 'object at' in str(bar) + >>> assert 'object at' in repr(bar) + + Example: + >>> class Baz(NiceRepr): + ... def __len__(self): + ... 
return 5 + >>> baz = Baz() + >>> assert str(baz) == '' + """ + + def __nice__(self): + """str: a "nice" summary string describing this module""" + if hasattr(self, '__len__'): + # It is a common pattern for objects to use __len__ in __nice__ + # As a convenience we define a default __nice__ for these objects + return str(len(self)) + else: + # In all other cases force the subclass to overload __nice__ + raise NotImplementedError( + f'Define the __nice__ method for {self.__class__!r}') + + def __repr__(self): + """str: the string of the module""" + try: + nice = self.__nice__() + classname = self.__class__.__name__ + return f'<{classname}({nice}) at {hex(id(self))}>' + except NotImplementedError as ex: + warnings.warn(str(ex), category=RuntimeWarning) + return object.__repr__(self) + + def __str__(self): + """str: the string of the module""" + try: + classname = self.__class__.__name__ + nice = self.__nice__() + return f'<{classname}({nice})>' + except NotImplementedError as ex: + warnings.warn(str(ex), category=RuntimeWarning) + return object.__repr__(self) diff --git a/insightface/detection/scrfd/mmdet/version.py b/insightface/detection/scrfd/mmdet/version.py new file mode 100755 index 0000000000000000000000000000000000000000..753ab3c2970cd2c39eaea21464781b892d4a39af --- /dev/null +++ b/insightface/detection/scrfd/mmdet/version.py @@ -0,0 +1,19 @@ +# Copyright (c) Open-MMLab. All rights reserved. 
+ +__version__ = '2.7.0' +short_version = __version__ + + +def parse_version_info(version_str): + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/insightface/detection/scrfd/pytest.ini b/insightface/detection/scrfd/pytest.ini new file mode 100755 index 0000000000000000000000000000000000000000..9796e871e70c7c67345b1d6bcf708c0c82377a98 --- /dev/null +++ b/insightface/detection/scrfd/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +addopts = --xdoctest --xdoctest-style=auto +norecursedirs = .git ignore build __pycache__ data docker docs .eggs + +filterwarnings= default + ignore:.*No cfgstr given in Cacher constructor or call.*:Warning + ignore:.*Define the __nice__ method for.*:Warning diff --git a/insightface/detection/scrfd/requirements.txt b/insightface/detection/scrfd/requirements.txt new file mode 100755 index 0000000000000000000000000000000000000000..6981bd723391a980c0f22baeab39d0adbcb68679 --- /dev/null +++ b/insightface/detection/scrfd/requirements.txt @@ -0,0 +1,4 @@ +-r requirements/build.txt +-r requirements/optional.txt +-r requirements/runtime.txt +-r requirements/tests.txt diff --git a/insightface/detection/scrfd/requirements/build.txt b/insightface/detection/scrfd/requirements/build.txt new file mode 100755 index 0000000000000000000000000000000000000000..81558298594a9619f3187d220f1accede1865de7 --- /dev/null +++ b/insightface/detection/scrfd/requirements/build.txt @@ -0,0 +1,3 @@ +# These must be installed before building mmdetection +cython +numpy diff --git a/insightface/detection/scrfd/requirements/docs.txt b/insightface/detection/scrfd/requirements/docs.txt new file mode 100755 index 0000000000000000000000000000000000000000..89fbf86c01cb29f10f7e99c910248c4d5229da58 
--- /dev/null +++ b/insightface/detection/scrfd/requirements/docs.txt @@ -0,0 +1,4 @@ +recommonmark +sphinx +sphinx_markdown_tables +sphinx_rtd_theme diff --git a/insightface/detection/scrfd/requirements/optional.txt b/insightface/detection/scrfd/requirements/optional.txt new file mode 100755 index 0000000000000000000000000000000000000000..6654b5b96f369e441585a7365750dc66d50a5855 --- /dev/null +++ b/insightface/detection/scrfd/requirements/optional.txt @@ -0,0 +1,4 @@ +albumentations>=0.3.2 +cityscapesscripts +imagecorruptions +mmlvis diff --git a/insightface/detection/scrfd/requirements/readthedocs.txt b/insightface/detection/scrfd/requirements/readthedocs.txt new file mode 100755 index 0000000000000000000000000000000000000000..0542bfce6dff3b002a1d33e53c0be975e7feed4a --- /dev/null +++ b/insightface/detection/scrfd/requirements/readthedocs.txt @@ -0,0 +1,3 @@ +mmcv +torch +torchvision diff --git a/insightface/detection/scrfd/requirements/runtime.txt b/insightface/detection/scrfd/requirements/runtime.txt new file mode 100755 index 0000000000000000000000000000000000000000..f559b96124ea9893d7d9bd4da8553c8e034fc045 --- /dev/null +++ b/insightface/detection/scrfd/requirements/runtime.txt @@ -0,0 +1,7 @@ +matplotlib +mmpycocotools +numpy +six +terminaltables +tqdm +autotorch diff --git a/insightface/detection/scrfd/requirements/tests.txt b/insightface/detection/scrfd/requirements/tests.txt new file mode 100755 index 0000000000000000000000000000000000000000..acd94d38bdd257054e79841d2dd424ba822861fe --- /dev/null +++ b/insightface/detection/scrfd/requirements/tests.txt @@ -0,0 +1,12 @@ +asynctest +codecov +flake8 +interrogate +isort==4.3.21 +# Note: used for kwarray.group_items, this may be ported to mmcv in the future. 
+kwarray +pytest +ubelt +xdoctest>=0.10.0 +yapf +scipy diff --git a/insightface/detection/scrfd/search_tools/generate_configs_2.5g.py b/insightface/detection/scrfd/search_tools/generate_configs_2.5g.py new file mode 100755 index 0000000000000000000000000000000000000000..e118a7f9a5b6c09ee8312d95c1ce78b7809a5795 --- /dev/null +++ b/insightface/detection/scrfd/search_tools/generate_configs_2.5g.py @@ -0,0 +1,258 @@ + +import os +import os.path as osp +import io +import numpy as np +import argparse +import datetime +import importlib +import configparser +from tqdm import tqdm +from mmdet.models import build_detector + +import torch +import autotorch as at +from mmcv import Config + +from mmdet.models import build_detector + +try: + from mmcv.cnn import get_model_complexity_info +except ImportError: + raise ImportError('Please upgrade mmcv to >0.6.2') + + +@at.obj( + block=at.Choice('BasicBlock', 'Bottleneck'), + base_channels=at.Int(8, 64), + stage_blocks=at.List( + at.Int(1,10), + at.Int(1,10), + at.Int(1,10), + at.Int(1,10), + ), + stage_planes_ratio=at.List( + at.Real(1.0,4.0), + at.Real(1.0,4.0), + at.Real(1.0,4.0), + ), +) +class GenConfigBackbone: + def __init__(self, **kwargs): + d = {} + d.update(**kwargs) + for k, v in d.items(): + setattr(self, k, v) + self.m = 1.0 + + def stage_blocks_multi(self, m): + self.m = m + + def merge_cfg(self, det_cfg): + + base_channels = max(8, int(self.base_channels*self.m)//8 * 8) + stage_planes = [base_channels] + for ratio in self.stage_planes_ratio: + planes = int(stage_planes[-1] * ratio) //8 * 8 + stage_planes.append(planes) + stage_blocks = [max(1, int(x*self.m)) for x in self.stage_blocks] + #print('Blocks:', stage_blocks) + #print('Planes:', stage_planes) + block_cfg=dict(block=self.block, stage_blocks=tuple(stage_blocks), stage_planes=stage_planes) + det_cfg['model']['backbone']['block_cfg'] = block_cfg + det_cfg['model']['backbone']['base_channels'] = base_channels + neck_in_planes = stage_planes if 
self.block=='BasicBlock' else [4*x for x in stage_planes] + det_cfg['model']['neck']['in_channels'] = neck_in_planes + return det_cfg + +@at.obj( + stage_blocks_ratio=at.Real(0.5, 3.0), + base_channels_ratio=at.Real(0.5, 3.0), + fpn_channel=at.Int(8,128), + head_channel=at.Int(8,256), + head_stack=at.Int(1,4), +) +class GenConfigAll: + def __init__(self, **kwargs): + d = {} + d.update(**kwargs) + for k, v in d.items(): + setattr(self, k, v) + self.m = 1 + + def merge_cfg(self, det_cfg): + block_cfg = det_cfg['model']['backbone']['block_cfg'] + stage_blocks = tuple([int(np.round(x*self.stage_blocks_ratio)) for x in block_cfg['stage_blocks']]) + block_cfg['stage_blocks'] = stage_blocks + stage_planes = [int(np.round(x*self.base_channels_ratio))//8*8 for x in block_cfg['stage_planes']] + block_cfg['stage_planes'] = stage_planes + det_cfg['model']['backbone']['block_cfg'] = block_cfg + det_cfg['model']['backbone']['base_channels'] = stage_planes[0] + neck_in_planes = stage_planes if block_cfg['block']=='BasicBlock' else [4*x for x in stage_planes] + det_cfg['model']['neck']['in_channels'] = neck_in_planes + + fpn_channel = self.fpn_channel//8*8 + head_channel = self.head_channel//8*8 + head_stack = self.head_stack + det_cfg['model']['neck']['out_channels'] = fpn_channel + det_cfg['model']['bbox_head']['in_channels'] = fpn_channel + det_cfg['model']['bbox_head']['feat_channels'] = head_channel + det_cfg['model']['bbox_head']['stacked_convs'] = head_stack + gn_num_groups = 8 + for _gn_num_groups in [8, 16, 32, 64]: + if head_channel%_gn_num_groups!=0: + break + gn_num_groups = _gn_num_groups + det_cfg['model']['bbox_head']['norm_cfg']['num_groups'] = gn_num_groups + return det_cfg + +def get_args(): + parser = argparse.ArgumentParser(description='Auto-SCRFD') + # config files + parser.add_argument('--group', type=str, default='configs/scrfdgen2.5g', help='configs work dir') + parser.add_argument('--template', type=int, default=0, help='template config index') + 
parser.add_argument('--gflops', type=float, default=2.5, help='expected flops') + parser.add_argument('--mode', type=int, default=1, help='generation mode, 1 for searching backbone, 2 for search all') + # target flops + parser.add_argument('--eps', type=float, default=2e-2, + help='eps for expected flops') + # num configs + parser.add_argument('--num-configs', type=int, default=64, help='num of expected configs') + parser = parser + + args = parser.parse_args() + return args + + +def is_config_valid(cfg, target_flops, input_shape, eps): + model = build_detector( + cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) + if torch.cuda.is_available(): + model.cuda() + model.eval() + if hasattr(model, 'forward_dummy'): + model.forward = model.forward_dummy + else: + raise NotImplementedError( + 'FLOPs counter is currently not currently supported with {}'. + format(model.__class__.__name__)) + flops, params = get_model_complexity_info(model, input_shape, print_per_layer_stat=False, as_strings=False) + print('FLOPs:', flops/1e9) + return flops <= (1. + eps) * target_flops and \ + flops >= (1. - eps) * target_flops + +def get_flops(cfg, input_shape): + model = build_detector( + cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) + #if torch.cuda.is_available(): + # model.cuda() + model.eval() + if hasattr(model, 'forward_dummy'): + model.forward = model.forward_dummy + else: + raise NotImplementedError( + 'FLOPs counter is currently not currently supported with {}'. + format(model.__class__.__name__)) + buf = io.StringIO() + all_flops, params = get_model_complexity_info(model, input_shape, print_per_layer_stat=True, as_strings=False, ost=buf) + buf = buf.getvalue() + #print(buf) + lines = buf.split("\n") + names = ['(stem)', '(layer1)', '(layer2)', '(layer3)', '(layer4)', '(neck)', '(bbox_head)'] + name_ptr = 0 + line_num = 0 + _flops = [] + while name_ptr= (1. 
- eps) * target_flops + +def main(): + args = get_args() + print(datetime.datetime.now()) + + input_shape = (3,480,640) + runtime_input_shape = input_shape + flops_mult = 1.0 + + assert osp.exists(args.group) + group_name = args.group.split('/')[-1] + assert len(group_name)>0 + input_template = osp.join(args.group, "%s_%d.py"%(group_name, args.template)) + assert osp.exists(input_template) + write_index = args.template+1 + while True: + output_cfg = osp.join(args.group, "%s_%d.py"%(group_name, write_index)) + if not osp.exists(output_cfg): + break + write_index+=1 + print('write-index from:', write_index) + + if args.mode==1: + gen = GenConfigBackbone() + elif args.mode==2: + gen = GenConfigAll() + det_cfg = Config.fromfile(input_template) + _, template_backbone_flops, _, _= get_flops(det_cfg, runtime_input_shape) + template_backbone_ratios = list(map(lambda x:x/template_backbone_flops[0], template_backbone_flops)) + print('template_backbone_ratios:', template_backbone_ratios) + + + + pp = 0 + write_count = 0 + while write_count < args.num_configs: + pp+=1 + det_cfg = Config.fromfile(input_template) + config = gen.rand + det_cfg = config.merge_cfg(det_cfg) + all_flops, backbone_flops, neck_flops, head_flops = get_flops(det_cfg, runtime_input_shape) + assert len(backbone_flops)==5 + all_flops *= flops_mult + backbone_flops *= flops_mult + neck_flops *= flops_mult + head_flops *= flops_mult + is_valid = True + if pp%10==0: + print(pp, all_flops, backbone_flops, neck_flops, head_flops, datetime.datetime.now()) + if args.mode==2: + backbone_ratios = list(map(lambda x:x/backbone_flops[0], backbone_flops)) + #if head_flops*0.8=0 + cmd = "CUDA_VISIBLE_DEVICES='%d' PORT=%d bash ./tools/dist_train.sh ./configs/%s/%s_%d.py 1 --no-validate"%(gpuid,29100+idx, group, prefix, idx) + print(cmd) + os.system(cmd) + + +gpuid = int(sys.argv[1]) +idx_from = int(sys.argv[2]) +idx_to = int(sys.argv[3]) +group = 'scrfdgen' +if len(sys.argv)>4: + group = sys.argv[4] + +for idx in 
range(idx_from, idx_to): + train(group, group, idx, gpuid) + diff --git a/insightface/detection/scrfd/search_tools/search_train.sh b/insightface/detection/scrfd/search_tools/search_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..53bcd46f246de213c5af09866b3ceadafafa0e9e --- /dev/null +++ b/insightface/detection/scrfd/search_tools/search_train.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +GROUP=scrfdgen2.5g +TASKS_PER_GPU=8 +OFFSET=1 +for i in {0..7} +do + let a=TASKS_PER_GPU*i+OFFSET + let i2=i+1 + let b=TASKS_PER_GPU*i2+OFFSET + echo $i,$a,$b,$GROUP + python -u search_tools/search_train.py $i $a $b $GROUP > "gpu$i".log 2>&1 & +done + diff --git a/insightface/detection/scrfd/setup.cfg b/insightface/detection/scrfd/setup.cfg new file mode 100755 index 0000000000000000000000000000000000000000..873406e8f19ce243d5e010fcf34fbd6b43fced75 --- /dev/null +++ b/insightface/detection/scrfd/setup.cfg @@ -0,0 +1,13 @@ +[isort] +line_length = 79 +multi_line_output = 0 +known_standard_library = setuptools +known_first_party = mmdet +known_third_party = PIL,asynctest,cityscapesscripts,cv2,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,robustness_eval,scipy,seaborn,six,terminaltables,torch +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY + +[yapf] +BASED_ON_STYLE = pep8 +BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true +SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true diff --git a/insightface/detection/scrfd/setup.py b/insightface/detection/scrfd/setup.py new file mode 100755 index 0000000000000000000000000000000000000000..55eea6ba642e49e9a8e71e20aaecae2100542c3e --- /dev/null +++ b/insightface/detection/scrfd/setup.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python +import os +from setuptools import find_packages, setup + +import torch +from torch.utils.cpp_extension import (BuildExtension, CppExtension, + CUDAExtension) + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + 
+version_file = 'mmdet/version.py' + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def make_cuda_ext(name, module, sources, sources_cuda=[]): + + define_macros = [] + extra_compile_args = {'cxx': []} + + if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1': + define_macros += [('WITH_CUDA', None)] + extension = CUDAExtension + extra_compile_args['nvcc'] = [ + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__', + ] + sources += sources_cuda + else: + print(f'Compiling {name} without CUDA') + extension = CppExtension + + return extension( + name=f'{module}.{name}', + sources=[os.path.join(*module.split('.'), p) for p in sources], + define_macros=define_macros, + extra_compile_args=extra_compile_args) + + +def parse_requirements(fname='requirements.txt', with_version=True): + """Parse the package dependencies listed in a requirements file but strips + specific versioning information. 
+ + Args: + fname (str): path to requirements file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + import sys + from os.path import exists + import re + require_fpath = fname + + def parse_line(line): + """Parse information from a line in a requirements text file.""" + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + elif '@git+' in line: + info['package'] = line + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + info['version'] = (op, version) + yield info + + def parse_require_file(fpath): + with open(fpath, 'r') as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + for info in parse_line(line): + yield info + + def gen_packages_items(): + if exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = 
list(gen_packages_items()) + return packages + + +if __name__ == '__main__': + setup( + name='mmdet', + version=get_version(), + description='OpenMMLab Detection Toolbox and Benchmark', + long_description=readme(), + long_description_content_type='text/markdown', + author='OpenMMLab', + author_email='openmmlab@gmail.com', + keywords='computer vision, object detection', + url='https://github.com/open-mmlab/mmdetection', + packages=find_packages(exclude=('configs', 'tools', 'demo')), + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + ], + license='Apache License 2.0', + setup_requires=parse_requirements('requirements/build.txt'), + tests_require=parse_requirements('requirements/tests.txt'), + install_requires=parse_requirements('requirements/runtime.txt'), + extras_require={ + 'all': parse_requirements('requirements.txt'), + 'tests': parse_requirements('requirements/tests.txt'), + 'build': parse_requirements('requirements/build.txt'), + 'optional': parse_requirements('requirements/optional.txt'), + }, + ext_modules=[], + cmdclass={'build_ext': BuildExtension}, + zip_safe=False) diff --git a/insightface/detection/scrfd/tools/benchmark.py b/insightface/detection/scrfd/tools/benchmark.py new file mode 100755 index 0000000000000000000000000000000000000000..9c53a4291a2109db9948cf2736c56a7c19c321a6 --- /dev/null +++ b/insightface/detection/scrfd/tools/benchmark.py @@ -0,0 +1,116 @@ +import argparse +import os +import os.path as osp +import pickle +import numpy as np +import datetime +import warnings + +import mmcv +import torch +from mmcv import Config, DictAction +from mmcv.cnn import fuse_conv_bn +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import 
(get_dist_info, init_dist, load_checkpoint, + wrap_fp16_model) + +from mmdet.apis import multi_gpu_test, single_gpu_test +from mmdet.datasets import (build_dataloader, build_dataset, + replace_ImageToTensor) +from mmdet.models import build_detector +from mmdet.core.evaluation import wider_evaluation, get_widerface_gts +#from torch.utils import mkldnn as mkldnn_utils + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[480, 640], + help='input image size') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + + cfg = Config.fromfile(args.config) + cfg.model.pretrained = None + + # in case the test dataset is concatenated + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + + pipelines = cfg.data.test.pipeline + for pipeline in pipelines: + if pipeline.type=='MultiScaleFlipAug': + #pipeline.img_scale = (640, 640) + pipeline.img_scale = None + pipeline.scale_factor = 1.0 + transforms = pipeline.transforms + for transform in transforms: + if transform.type=='Pad': + #transform.size = pipeline.img_scale + transform.size = None + transform.size_divisor = 1 + #print(cfg.data.test.pipeline) + distributed = False + + # build the dataloader + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=samples_per_gpu, + 
workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + + device = torch.device("cpu" if args.cpu else "cuda") + + # build the model and load checkpoint + model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) + fp16_cfg = cfg.get('fp16', None) + checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') + if 'CLASSES' in checkpoint['meta']: + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + model.CLASSES = dataset.CLASSES + + model = model.to(device) + + model.eval() + dataset = data_loader.dataset + for i, data in enumerate(data_loader): + img = data['img'][0] + #print(img.shape) + img = img[:,:,:args.shape[0],:args.shape[1]] + img = img.to(device) + with torch.no_grad(): + ta = datetime.datetime.now() + result = model.feature_test(img) + tb = datetime.datetime.now() + print('cost:', (tb-ta).total_seconds()) + + + + + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/scrfd/tools/browse_dataset.py b/insightface/detection/scrfd/tools/browse_dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..55110e8949ca998f346a95e8e21629f8e5474bef --- /dev/null +++ b/insightface/detection/scrfd/tools/browse_dataset.py @@ -0,0 +1,68 @@ +import argparse +import os +from pathlib import Path + +import mmcv +from mmcv import Config + +from mmdet.datasets.builder import build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--skip-type', + type=str, + nargs='+', + default=['DefaultFormatBundle', 'Normalize', 'Collect'], + help='skip some useless pipeline') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--show-interval', + type=int, + default=999, + help='the 
interval of show (ms)') + args = parser.parse_args() + return args + + +def retrieve_data_cfg(config_path, skip_type): + cfg = Config.fromfile(config_path) + train_data_cfg = cfg.data.train + train_data_cfg['pipeline'] = [ + x for x in train_data_cfg.pipeline if x['type'] not in skip_type + ] + + return cfg + + +def main(): + args = parse_args() + cfg = retrieve_data_cfg(args.config, args.skip_type) + + dataset = build_dataset(cfg.data.train) + + progress_bar = mmcv.ProgressBar(len(dataset)) + for item in dataset: + filename = os.path.join(args.output_dir, + Path(item['filename']).name + ) if args.output_dir is not None else None + mmcv.imshow_det_bboxes( + item['img'], + item['gt_bboxes'], + item['gt_labels'], + class_names=dataset.CLASSES, + show=not args.not_show, + out_file=filename, + wait_time=args.show_interval) + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/scrfd/tools/convert_crowdhuman.py b/insightface/detection/scrfd/tools/convert_crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..e116c5126cd59d5f7c05b1b2152baa8d15b92573 --- /dev/null +++ b/insightface/detection/scrfd/tools/convert_crowdhuman.py @@ -0,0 +1,92 @@ +from __future__ import print_function +import cv2 +import argparse +import os +import os.path as osp +import shutil +import numpy as np +import json + +def parse_args(): + parser = argparse.ArgumentParser( + description='convert crowdhuman dataset to scrfd format') + parser.add_argument('--raw', help='raw dataset dir') + parser.add_argument('--save', default='data/crowdhuman', help='save path') + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + raw_image_dir = osp.join(args.raw, 'Images') + for subset in ['train', 'val']: + save_image_dir = osp.join(args.save, subset, 'images') + if not osp.exists(save_image_dir): + os.makedirs(save_image_dir) + anno_file = osp.join(args.raw, 'annotation_%s.odgt'%subset) + fullbody_anno_file = 
osp.join(osp.join(args.save, subset, "label_fullbody.txt")) + head_anno_file = osp.join(osp.join(args.save, subset, "label_head.txt")) + fullbody_f = open(fullbody_anno_file, 'w') + head_f = open(head_anno_file, 'w') + for line in open(anno_file, 'r'): + data = json.loads(line) + img_id = data['ID'] + img_name = "%s.jpg"%img_id + raw_image_file = osp.join(raw_image_dir, img_name) + target_image_file = osp.join(save_image_dir, img_name) + img = cv2.imread(raw_image_file) + print(raw_image_file, img.shape) + fullbody_f.write("# %s %d %d\n"%(img_name,img.shape[1],img.shape[0])) + head_f.write("# %s %d %d\n"%(img_name,img.shape[1],img.shape[0])) + shutil.copyfile(raw_image_file, target_image_file) + items = data['gtboxes'] + for item in items: + fbox = item['fbox'] + is_ignore = False + extra = item['extra'] + if 'ignore' in extra: + is_ignore = extra['ignore']==1 + bbox = np.array(fbox, dtype=np.float32) + bbox[2] += bbox[0] + bbox[3] += bbox[1] + if is_ignore: + fullbody_f.write("%.5f %.5f %.5f %.5f %d\n"%(bbox[0], bbox[1], bbox[2], bbox[3], is_ignore)) + else: + vbox = item['vbox'] + vbox = np.array(vbox, dtype=np.float32) + vbox[2] += vbox[0] + vbox[3] += vbox[1] + x1, y1, x2, y2 = vbox[0], vbox[1], vbox[2], vbox[3] + cx = (x1+x2)/2 + cy = (y1+y2)/2 + kps = np.ones( (5,3), dtype=np.float32) + kps[0,0] = x1 + kps[0,1] = y1 + kps[1,0] = x2 + kps[1,1] = y1 + kps[2,0] = cx + kps[2,1] = cy + kps[3,0] = x1 + kps[3,1] = y2 + kps[4,0] = x2 + kps[4,1] = y2 + kps_str = " ".join(["%.5f"%x for x in kps.flatten()]) + fullbody_f.write("%.5f %.5f %.5f %.5f %s\n"%(bbox[0], bbox[1], bbox[2], bbox[3], kps_str)) + + + hbox = item['hbox'] + is_ignore = False + extra = item['head_attr'] + if 'ignore' in extra: + is_ignore = extra['ignore']==1 + bbox = np.array(hbox, dtype=np.float32) + bbox[2] += bbox[0] + bbox[3] += bbox[1] + head_f.write("%.5f %.5f %.5f %.5f %d\n"%(bbox[0], bbox[1], bbox[2], bbox[3], is_ignore)) + fullbody_f.close() + head_f.close() + + +if __name__ == '__main__': + 
main() + diff --git a/insightface/detection/scrfd/tools/dist_test.sh b/insightface/detection/scrfd/tools/dist_test.sh new file mode 100755 index 0000000000000000000000000000000000000000..5488d938c28f7bd909146b5d7de0142a751b0de4 --- /dev/null +++ b/insightface/detection/scrfd/tools/dist_test.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +PORT=${PORT:-29500} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -u -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} diff --git a/insightface/detection/scrfd/tools/dist_train.sh b/insightface/detection/scrfd/tools/dist_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..5b43fffbf28fc9b8ba7c14efcd5e4f8b19279470 --- /dev/null +++ b/insightface/detection/scrfd/tools/dist_train.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +PORT=${PORT:-29500} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} diff --git a/insightface/detection/scrfd/tools/get_flops.py b/insightface/detection/scrfd/tools/get_flops.py new file mode 100755 index 0000000000000000000000000000000000000000..3f42b70648c15a655327106413c0be04f06df32c --- /dev/null +++ b/insightface/detection/scrfd/tools/get_flops.py @@ -0,0 +1,75 @@ +import argparse + +import torch +from mmcv import Config + +from mmdet.models import build_detector + +try: + from mmcv.cnn import get_model_complexity_info +except ImportError: + raise ImportError('Please upgrade mmcv to >0.6.2') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--shape', + type=int, + nargs='+', + #default=[1280, 800], + default=[640, 640], + help='input image size') + args = 
parser.parse_args() + return args + + +def main(): + + args = parse_args() + + if len(args.shape) == 1: + input_shape = (3, args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = (3, ) + tuple(args.shape) + else: + raise ValueError('invalid input shape') + + cfg = Config.fromfile(args.config) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + model = build_detector( + cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) + if torch.cuda.is_available(): + model.cuda() + model.eval() + #print(model.bbox_head) + + if hasattr(model, 'forward_dummy'): + model.forward = model.forward_dummy + else: + raise NotImplementedError( + 'FLOPs counter is currently not currently supported with {}'. + format(model.__class__.__name__)) + + #flops, params = get_model_complexity_info(model, input_shape) + flops, params = get_model_complexity_info(model, input_shape, print_per_layer_stat=True, as_strings=False) + flops += model.bbox_head.extra_flops + flops *= 0.75 + split_line = '=' * 30 + print('ON VGA:') + print(flops/1e9, 'G') + print(params/1e6, 'M') + #print(f'{split_line}\nInput shape: {input_shape}\n' + # f'Flops: {flops}\nParams: {params}\n{split_line}') + #print('!!!Please be cautious if you use the results in papers. 
' + # 'You may need to check if all ops are supported and verify that the ' + # 'flops computation is correct.') + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/scrfd/tools/print_config.py b/insightface/detection/scrfd/tools/print_config.py new file mode 100755 index 0000000000000000000000000000000000000000..2ba994fb388477c4d1cc4d174b4af0d84731b475 --- /dev/null +++ b/insightface/detection/scrfd/tools/print_config.py @@ -0,0 +1,26 @@ +import argparse + +from mmcv import Config, DictAction + + +def parse_args(): + parser = argparse.ArgumentParser(description='Print the whole config') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--options', nargs='+', action=DictAction, help='arguments in dict') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.options is not None: + cfg.merge_from_dict(args.options) + print(f'Config:\n{cfg.pretty_text}') + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/scrfd/tools/publish_model.py b/insightface/detection/scrfd/tools/publish_model.py new file mode 100755 index 0000000000000000000000000000000000000000..c20e7e38b6461bd1e0697eece6f128824189ff5f --- /dev/null +++ b/insightface/detection/scrfd/tools/publish_model.py @@ -0,0 +1,39 @@ +import argparse +import subprocess + +import torch + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Process a checkpoint to be published') + parser.add_argument('in_file', help='input checkpoint filename') + parser.add_argument('out_file', help='output checkpoint filename') + args = parser.parse_args() + return args + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + # remove optimizer for smaller file size + if 'optimizer' in checkpoint: + del checkpoint['optimizer'] + # if it is necessary to remove some sensitive data in checkpoint['meta'], + # add the code 
here. + torch.save(checkpoint, out_file) + sha = subprocess.check_output(['sha256sum', out_file]).decode() + if out_file.endswith('.pth'): + out_file_name = out_file[:-4] + else: + out_file_name = out_file + final_file = out_file_name + f'-{sha[:8]}.pth' + subprocess.Popen(['mv', out_file, final_file]) + + +def main(): + args = parse_args() + process_checkpoint(args.in_file, args.out_file) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/scrfd/tools/scrfd.py b/insightface/detection/scrfd/tools/scrfd.py new file mode 100644 index 0000000000000000000000000000000000000000..176d90e9d1dcab521d48dfc5f3149ba750c33a98 --- /dev/null +++ b/insightface/detection/scrfd/tools/scrfd.py @@ -0,0 +1,337 @@ +# -*- coding: utf-8 -*- +# @Organization : insightface.ai +# @Author : Jia Guo +# @Time : 2021-05-04 +# @Function : + +from __future__ import division +import datetime +import numpy as np +import onnx +import onnxruntime +import os +import os.path as osp +import cv2 +import sys + +def softmax(z): + assert len(z.shape) == 2 + s = np.max(z, axis=1) + s = s[:, np.newaxis] # necessary step to do broadcasting + e_x = np.exp(z - s) + div = np.sum(e_x, axis=1) + div = div[:, np.newaxis] # dito + return e_x / div + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. 
+ """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return np.stack([x1, y1, x2, y2], axis=-1) + +def distance2kps(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. + """ + preds = [] + for i in range(0, distance.shape[1], 2): + px = points[:, i%2] + distance[:, i] + py = points[:, i%2+1] + distance[:, i+1] + if max_shape is not None: + px = px.clamp(min=0, max=max_shape[1]) + py = py.clamp(min=0, max=max_shape[0]) + preds.append(px) + preds.append(py) + return np.stack(preds, axis=-1) + +class SCRFD: + def __init__(self, model_file=None, session=None): + import onnxruntime + self.model_file = model_file + self.session = session + self.taskname = 'detection' + self.batched = False + if self.session is None: + assert self.model_file is not None + assert osp.exists(self.model_file) + self.session = onnxruntime.InferenceSession(self.model_file, None) + self.center_cache = {} + self.nms_thresh = 0.4 + self._init_vars() + + def _init_vars(self): + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + if isinstance(input_shape[2], str): + self.input_size = None + else: + self.input_size = tuple(input_shape[2:4][::-1]) + input_name = input_cfg.name + outputs = self.session.get_outputs() + if len(outputs[0].shape) == 3: + self.batched = True + output_names = [] + for o in outputs: + output_names.append(o.name) + self.input_name = input_name + self.output_names = output_names + self.use_kps = False + 
self._num_anchors = 1 + if len(outputs)==6: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + elif len(outputs)==9: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + self.use_kps = True + elif len(outputs)==10: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + elif len(outputs)==15: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + self.use_kps = True + + def prepare(self, ctx_id, **kwargs): + if ctx_id<0: + self.session.set_providers(['CPUExecutionProvider']) + nms_thresh = kwargs.get('nms_thresh', None) + if nms_thresh is not None: + self.nms_thresh = nms_thresh + input_size = kwargs.get('input_size', None) + if input_size is not None: + if self.input_size is not None: + print('warning: det_size is already set in scrfd model, ignore') + else: + self.input_size = input_size + + def forward(self, img, thresh): + scores_list = [] + bboxes_list = [] + kpss_list = [] + input_size = tuple(img.shape[0:2][::-1]) + blob = cv2.dnn.blobFromImage(img, 1.0/128, input_size, (127.5, 127.5, 127.5), swapRB=True) + net_outs = self.session.run(self.output_names, {self.input_name : blob}) + + input_height = blob.shape[2] + input_width = blob.shape[3] + fmc = self.fmc + for idx, stride in enumerate(self._feat_stride_fpn): + # If model support batch dim, take first output + if self.batched: + scores = net_outs[idx][0] + bbox_preds = net_outs[idx + fmc][0] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2][0] * stride + # If model doesn't support batching take output as is + else: + scores = net_outs[idx] + bbox_preds = net_outs[idx + fmc] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2] * stride + + height = input_height // stride + width = input_width // stride + K = height * width + key = (height, width, stride) + if key in self.center_cache: + anchor_centers = 
self.center_cache[key] + else: + #solution-1, c style: + #anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 ) + #for i in range(height): + # anchor_centers[i, :, 1] = i + #for i in range(width): + # anchor_centers[:, i, 0] = i + + #solution-2: + #ax = np.arange(width, dtype=np.float32) + #ay = np.arange(height, dtype=np.float32) + #xv, yv = np.meshgrid(np.arange(width), np.arange(height)) + #anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32) + + #solution-3: + anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) + #print(anchor_centers.shape) + + anchor_centers = (anchor_centers * stride).reshape( (-1, 2) ) + if self._num_anchors>1: + anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) ) + if len(self.center_cache)<100: + self.center_cache[key] = anchor_centers + + pos_inds = np.where(scores>=thresh)[0] + bboxes = distance2bbox(anchor_centers, bbox_preds) + pos_scores = scores[pos_inds] + pos_bboxes = bboxes[pos_inds] + scores_list.append(pos_scores) + bboxes_list.append(pos_bboxes) + if self.use_kps: + kpss = distance2kps(anchor_centers, kps_preds) + #kpss = kps_preds + kpss = kpss.reshape( (kpss.shape[0], -1, 2) ) + pos_kpss = kpss[pos_inds] + kpss_list.append(pos_kpss) + return scores_list, bboxes_list, kpss_list + + def detect(self, img, thresh=0.5, input_size = None, max_num=0, metric='default'): + assert input_size is not None or self.input_size is not None + input_size = self.input_size if input_size is None else input_size + + im_ratio = float(img.shape[0]) / img.shape[1] + model_ratio = float(input_size[1]) / input_size[0] + if im_ratio>model_ratio: + new_height = input_size[1] + new_width = int(new_height / im_ratio) + else: + new_width = input_size[0] + new_height = int(new_width * im_ratio) + det_scale = float(new_height) / img.shape[0] + resized_img = cv2.resize(img, (new_width, new_height)) + det_img = np.zeros( (input_size[1], input_size[0], 3), 
dtype=np.uint8 ) + det_img[:new_height, :new_width, :] = resized_img + + scores_list, bboxes_list, kpss_list = self.forward(det_img, thresh) + + scores = np.vstack(scores_list) + scores_ravel = scores.ravel() + order = scores_ravel.argsort()[::-1] + bboxes = np.vstack(bboxes_list) / det_scale + if self.use_kps: + kpss = np.vstack(kpss_list) / det_scale + pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False) + pre_det = pre_det[order, :] + keep = self.nms(pre_det) + det = pre_det[keep, :] + if self.use_kps: + kpss = kpss[order,:,:] + kpss = kpss[keep,:,:] + else: + kpss = None + if max_num > 0 and det.shape[0] > max_num: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img.shape[0] // 2, img.shape[1] // 2 + offsets = np.vstack([ + (det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0] + ]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + if metric=='max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + bindex = np.argsort( + values)[::-1] # some extra weight on the centering + bindex = bindex[0:max_num] + det = det[bindex, :] + if kpss is not None: + kpss = kpss[bindex, :] + return det, kpss + + def nms(self, dets): + thresh = self.nms_thresh + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + +def get_scrfd(name, download=False, 
root='~/.insightface/models', **kwargs): + if not download: + assert os.path.exists(name) + return SCRFD(name) + else: + from .model_store import get_model_file + _file = get_model_file("scrfd_%s" % name, root=root) + return SCRFD(_file) + + +def scrfd_2p5gkps(**kwargs): + return get_scrfd("2p5gkps", download=True, **kwargs) + + +if __name__ == '__main__': + import glob + #detector = SCRFD(model_file='./det.onnx') + detector = SCRFD(model_file='./det.onnx') + detector.prepare(-1) + img_paths = ['tests/data/t3.jpg'] + for img_path in img_paths: + img = cv2.imread(img_path) + + for _ in range(1): + ta = datetime.datetime.now() + bboxes, kpss = detector.detect(img, 0.5, input_size = (640, 640)) + #bboxes, kpss = detector.detect(img, 0.5) + tb = datetime.datetime.now() + print('all cost:', (tb-ta).total_seconds()*1000) + print(img_path, bboxes.shape) + if kpss is not None: + print(kpss.shape) + for i in range(bboxes.shape[0]): + bbox = bboxes[i] + x1,y1,x2,y2,score = bbox.astype(np.int) + cv2.rectangle(img, (x1,y1) , (x2,y2) , (255,0,0) , 2) + if kpss is not None: + kps = kpss[i] + for kp in kps: + kp = kp.astype(np.int) + cv2.circle(img, tuple(kp) , 1, (0,0,255) , 2) + filename = img_path.split('/')[-1] + print('output:', filename) + cv2.imwrite('./outputs/%s'%filename, img) + diff --git a/insightface/detection/scrfd/tools/scrfd2onnx.py b/insightface/detection/scrfd/tools/scrfd2onnx.py new file mode 100755 index 0000000000000000000000000000000000000000..2186515748152e24e0821495717ae7eb6ca7c6d1 --- /dev/null +++ b/insightface/detection/scrfd/tools/scrfd2onnx.py @@ -0,0 +1,201 @@ +import argparse +import os.path as osp + +import numpy as np +import onnx +import os +#import onnxruntime as rt +import torch + +from mmdet.core import (build_model_from_cfg, generate_inputs_and_wrap_model, + preprocess_example_input) + +#from mmdet.models import build + +def pytorch2onnx(config_path, + checkpoint_path, + input_img, + input_shape, + opset_version=11, + show=False, + 
output_file='tmp.onnx', + verify=False, + simplify = True, + dynamic = True, + normalize_cfg=None, + dataset='coco', + test_img=None): + + input_config = { + 'input_shape': input_shape, + 'input_path': input_img, + 'normalize_cfg': normalize_cfg + } + checkpoint = torch.load(checkpoint_path, map_location='cpu') + tmp_ckpt_file = None + # remove optimizer for smaller file size + if 'optimizer' in checkpoint: + del checkpoint['optimizer'] + tmp_ckpt_file = checkpoint_path+"_slim.pth" + torch.save(checkpoint, tmp_ckpt_file) + print('remove optimizer params and save to', tmp_ckpt_file) + checkpoint_path = tmp_ckpt_file + + model, tensor_data = generate_inputs_and_wrap_model( + config_path, checkpoint_path, input_config) + + if tmp_ckpt_file is not None: + os.remove(tmp_ckpt_file) + + if simplify or dynamic: + ori_output_file = output_file.split('.')[0]+"_ori.onnx" + else: + ori_output_file = output_file + + # Define input and outputs names, which are required to properly define + # dynamic axes + input_names = ['input.1'] + output_names = ['score_8', 'score_16', 'score_32', + 'bbox_8', 'bbox_16', 'bbox_32', + ] + + # If model graph contains keypoints strides add keypoints to outputs + if 'stride_kps' in str(model): + output_names += ['kps_8', 'kps_16', 'kps_32'] + + # Define dynamic axes for export + dynamic_axes = None + if dynamic: + dynamic_axes = {out: {0: '?', 1: '?'} for out in output_names} + dynamic_axes[input_names[0]] = { + 0: '?', + 2: '?', + 3: '?' 
+ } + + torch.onnx.export( + model, + tensor_data, + ori_output_file, + keep_initializers_as_inputs=False, + verbose=False, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + opset_version=opset_version) + + if simplify or dynamic: + model = onnx.load(ori_output_file) + if simplify: + from onnxsim import simplify + #print(model.graph.input[0]) + if dynamic: + input_shapes = {model.graph.input[0].name : list(input_shape)} + model, check = simplify(model, input_shapes=input_shapes, dynamic_input_shape=True) + else: + model, check = simplify(model) + assert check, "Simplified ONNX model could not be validated" + onnx.save(model, output_file) + os.remove(ori_output_file) + + + print(f'Successfully exported ONNX model: {output_file}') + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert MMDetection models to ONNX') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--input-img', type=str, help='Images for input') + parser.add_argument('--show', action='store_true', help='show onnx graph') + parser.add_argument('--output-file', type=str, default='') + parser.add_argument('--opset-version', type=int, default=11) + parser.add_argument( + '--test-img', type=str, default=None, help='Images for test') + parser.add_argument( + '--dataset', type=str, default='coco', help='Dataset name') + parser.add_argument( + '--verify', + action='store_true', + help='verify the onnx model output against pytorch output') + parser.add_argument( + '--shape', + type=int, + nargs='+', + #default=[640, 640], + #default=[384, 384], + default=[-1, -1], + help='input image size') + parser.add_argument( + '--mean', + type=float, + nargs='+', + default=[127.5, 127.5, 127.5], + help='mean value used for preprocess input data') + parser.add_argument( + '--std', + type=float, + nargs='+', + default=[128.0, 128.0, 128.0], + help='variance value used 
for preprocess input data') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + + assert args.opset_version == 11, 'MMDet only support opset 11 now' + + if not args.input_img: + args.input_img = osp.join( + osp.dirname(__file__), '../tests/data/t1.jpg') + + if len(args.shape) == 1: + input_shape = (1, 3, args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = (1, 3) + tuple(args.shape) + else: + raise ValueError('invalid input shape') + + assert len(args.mean) == 3 + assert len(args.std) == 3 + + simplify = True + dynamic = False + if input_shape[2]<=0 or input_shape[3]<=0: + input_shape = (1,3,640,640) + dynamic = True + #simplify = False + print('set to dynamic input with dummy shape:', input_shape) + + normalize_cfg = {'mean': args.mean, 'std': args.std} + + if len(args.output_file)==0: + output_dir = osp.join(osp.dirname(__file__), '../onnx') + if not osp.exists(output_dir): + os.makedirs(output_dir) + cfg_name = args.config.split('/')[-1] + pos = cfg_name.rfind('.') + cfg_name = cfg_name[:pos] + if dynamic: + args.output_file = osp.join(output_dir, "%s.onnx"%cfg_name) + else: + args.output_file = osp.join(output_dir, "%s_shape%dx%d.onnx"%(cfg_name,input_shape[2],input_shape[3])) + + # convert model to onnx file + pytorch2onnx( + args.config, + args.checkpoint, + args.input_img, + input_shape, + opset_version=args.opset_version, + show=args.show, + output_file=args.output_file, + verify=args.verify, + simplify = simplify, + dynamic = dynamic, + normalize_cfg=normalize_cfg, + dataset=args.dataset, + test_img=args.test_img) diff --git a/insightface/detection/scrfd/tools/test.py b/insightface/detection/scrfd/tools/test.py new file mode 100755 index 0000000000000000000000000000000000000000..8dcd305e155858a89d6da5c054e796ef2c419daa --- /dev/null +++ b/insightface/detection/scrfd/tools/test.py @@ -0,0 +1,208 @@ +import argparse +import os +import warnings + +import mmcv +import torch +from mmcv import 
Config, DictAction +from mmcv.cnn import fuse_conv_bn +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, + wrap_fp16_model) + +from mmdet.apis import multi_gpu_test, single_gpu_test +from mmdet.datasets import (build_dataloader, build_dataset, + replace_ImageToTensor) +from mmdet.models import build_detector + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--out', help='output result file in pickle format') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--format-only', + action='store_true', + help='Format the output results without perform evaluation. It is' + 'useful when you want to format the result to a specific format and ' + 'submit it to the test server') + parser.add_argument( + '--eval', + type=str, + nargs='+', + help='evaluation metrics, which depends on the dataset, e.g., "bbox",' + ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') + parser.add_argument('--show', action='store_true', help='show results') + parser.add_argument( + '--show-dir', help='directory where painted images will be saved') + parser.add_argument( + '--show-score-thr', + type=float, + default=0.3, + help='score threshold (default: 0.3)') + parser.add_argument( + '--gpu-collect', + action='store_true', + help='whether to use gpu to collect results.') + parser.add_argument( + '--tmpdir', + help='tmp directory used for collecting results from multiple ' + 'workers, available when gpu-collect is not specified') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value 
pair ' + 'in xxx=yyy format will be merged into config file.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function (deprecate), ' + 'change to --eval-options instead.') + parser.add_argument( + '--eval-options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.eval_options: + raise ValueError( + '--options and --eval-options cannot be both ' + 'specified, --options is deprecated in favor of --eval-options') + if args.options: + warnings.warn('--options is deprecated in favor of --eval-options') + args.eval_options = args.options + return args + + +def main(): + args = parse_args() + + assert args.out or args.eval or args.format_only or args.show \ + or args.show_dir, \ + ('Please specify at least one operation (save/eval/format/show the ' + 'results / save the results) with the argument "--out", "--eval"' + ', "--format-only", "--show" or "--show-dir"') + + if args.eval and args.format_only: + raise ValueError('--eval and --format_only cannot be both specified') + + if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): + raise ValueError('The output file must be a pkl file.') + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. 
+ if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.model.pretrained = None + if cfg.model.get('neck'): + if isinstance(cfg.model.neck, list): + for neck_cfg in cfg.model.neck: + if neck_cfg.get('rfp_backbone'): + if neck_cfg.rfp_backbone.get('pretrained'): + neck_cfg.rfp_backbone.pretrained = None + elif cfg.model.neck.get('rfp_backbone'): + if cfg.model.neck.rfp_backbone.get('pretrained'): + cfg.model.neck.rfp_backbone.pretrained = None + + # in case the test dataset is concatenated + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # build the dataloader + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + + # build the model and load checkpoint + model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + # old versions did not save class info in checkpoints, this walkaround is + # for backward compatibility + if 'CLASSES' in checkpoint['meta']: + model.CLASSES = 
checkpoint['meta']['CLASSES'] + else: + model.CLASSES = dataset.CLASSES + + if not distributed: + model = MMDataParallel(model, device_ids=[0]) + outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, + args.show_score_thr) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect) + + rank, _ = get_dist_info() + if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + mmcv.dump(outputs, args.out) + kwargs = {} if args.eval_options is None else args.eval_options + if args.format_only: + dataset.format_results(outputs, **kwargs) + if args.eval: + eval_kwargs = cfg.get('evaluation', {}).copy() + # hard-code way to remove EvalHook args + for key in [ + 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', + 'rule' + ]: + eval_kwargs.pop(key, None) + eval_kwargs.update(dict(metric=args.eval, **kwargs)) + print(dataset.evaluate(outputs, **eval_kwargs)) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/scrfd/tools/test_example.sh b/insightface/detection/scrfd/tools/test_example.sh new file mode 100755 index 0000000000000000000000000000000000000000..a3a1935fa3239f731e21ae3ac8a42e3ed0bd55ee --- /dev/null +++ b/insightface/detection/scrfd/tools/test_example.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +GPU=1 +GROUP=scrfd +TASK=scrfd_2.5g_bnkps + +#CUDA_VISIBLE_DEVICES="$GPU" python -u tools/benchmark_vga.py ./configs/"$GROUP"/"$TASK".py ./work_dirs/"$TASK"/latest.pth #--cpu +CUDA_VISIBLE_DEVICES="$GPU" python -u tools/test_widerface.py ./configs/"$GROUP"/"$TASK".py ./work_dirs/"$TASK"/model.pth --mode 0 --out wouts --save-preds diff --git a/insightface/detection/scrfd/tools/test_widerface.py b/insightface/detection/scrfd/tools/test_widerface.py new file mode 100755 index 0000000000000000000000000000000000000000..6a388b8c79925419da1b80ba00bbb823d90ef40e --- 
/dev/null +++ b/insightface/detection/scrfd/tools/test_widerface.py @@ -0,0 +1,256 @@ +import argparse +import os +import os.path as osp +import pickle +import numpy as np +import warnings + +import mmcv +import torch +from mmcv import Config, DictAction +from mmcv.cnn import fuse_conv_bn +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, + wrap_fp16_model) + +from mmdet.apis import multi_gpu_test, single_gpu_test +from mmdet.datasets import (build_dataloader, build_dataset, + replace_ImageToTensor) +from mmdet.models import build_detector +from mmdet.core.evaluation import wider_evaluation, get_widerface_gts + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--out', default='wout', help='output folder') + parser.add_argument( + '--eval', + type=str, + nargs='+', + help='evaluation metrics, which depends on the dataset, e.g., "bbox",' + ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') + parser.add_argument('--show', action='store_true', help='show results') + parser.add_argument('--save-preds', action='store_true', help='save results') + parser.add_argument('--show-assign', action='store_true', help='show bbox assign') + parser.add_argument('--debug', action='store_true', help='debug flag') + parser.add_argument( + '--show-dir', help='directory where painted images will be saved') + parser.add_argument( + '--show-score-thr', + type=float, + default=0.3, + help='score threshold (default: 0.3)') + parser.add_argument( + '--thr', + type=float, + default=0.02, + help='score threshold') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--mode', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + 
os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + + cfg = Config.fromfile(args.config) + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.model.pretrained = None + if cfg.model.get('neck'): + if isinstance(cfg.model.neck, list): + for neck_cfg in cfg.model.neck: + if neck_cfg.get('rfp_backbone'): + if neck_cfg.rfp_backbone.get('pretrained'): + neck_cfg.rfp_backbone.pretrained = None + elif cfg.model.neck.get('rfp_backbone'): + if cfg.model.neck.rfp_backbone.get('pretrained'): + cfg.model.neck.rfp_backbone.pretrained = None + + # in case the test dataset is concatenated + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + + gt_path = os.path.join(os.path.dirname(cfg.data.test.ann_file), 'gt') + pipelines = cfg.data.test.pipeline + for pipeline in pipelines: + if pipeline.type=='MultiScaleFlipAug': + if args.mode==0: #640 scale + pipeline.img_scale = (640, 640) + elif args.mode==1: #for single scale in other pages + pipeline.img_scale = (1100, 1650) + elif args.mode==2: #original scale + pipeline.img_scale = None + pipeline.scale_factor = 1.0 + elif args.mode>30: + pipeline.img_scale = (args.mode, args.mode) + transforms = pipeline.transforms + for transform in transforms: + if transform.type=='Pad': + if args.mode!=2: + transform.size = pipeline.img_scale + else: + transform.size = None + transform.size_divisor = 32 + print(cfg.data.test.pipeline) + distributed = False + + # build the dataloader + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = 
replace_ImageToTensor(cfg.data.test.pipeline) + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + + cfg.test_cfg.score_thr = args.thr + + # build the model and load checkpoint + model = build_detector(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) + fp16_cfg = cfg.get('fp16', None) + checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') + if 'CLASSES' in checkpoint['meta']: + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + model.CLASSES = dataset.CLASSES + + if args.show_assign: + gts_easy, gts_medium, gts_hard = get_widerface_gts(gt_path) + assign_stat = [0, 0] + gts_size = [] + model = MMDataParallel(model, device_ids=[0]) + model.eval() + results = {} + output_folder = args.out + if not os.path.exists(output_folder): + os.makedirs(output_folder) + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + assert len(result)==1 + batch_size = 1 + result = result[0][0] + img_metas = data['img_metas'][0].data[0][0] + filepath = img_metas['ori_filename'] + det_scale = img_metas['scale_factor'][0] + #print(img_metas) + ori_shape = img_metas['ori_shape'] + img_width = ori_shape[1] + img_height = ori_shape[0] + _vec = filepath.split('/') + pa, pb = _vec[-2], _vec[1] + if pa not in results: + results[pa] = {} + xywh = result.copy() + w = xywh[:,2] - xywh[:,0] + h = xywh[:,3] - xywh[:,1] + xywh[:,2] = w + xywh[:,3] = h + + event_name = pa + img_name = pb.rstrip('.jpg') + results[event_name][img_name] = xywh + if args.save_preds: + out_dir = os.path.join(output_folder, pa) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + out_file = os.path.join(out_dir, pb.replace('jpg', 'txt')) + boxes = result + with open(out_file, 'w') as f: + name = 
'/'.join([pa, pb]) + f.write("%s\n"%(name)) + f.write("%d\n"%(boxes.shape[0])) + for b in range(boxes.shape[0]): + box = boxes[b] + f.write("%.5f %.5f %.5f %.5f %g\n"%(box[0], box[1], box[2]-box[0], box[3]-box[1], box[4])) + + if args.show_assign: + assert args.mode==0 + input_height, input_width = 640, 640 + gt_hard = gts_hard[event_name][img_name] + #print(event_name, img_name, gt_hard.shape) + gt_bboxes = gt_hard * det_scale + bbox_width = gt_bboxes[:,2] - gt_bboxes[:,0] + bbox_height = gt_bboxes[:,3] - gt_bboxes[:,1] + bbox_area = bbox_width * bbox_height + gt_size = np.sqrt(bbox_area+0.0001) + gts_size += list(gt_size) + anchor_cxs = [] + anchor_cys = [] + for idx, stride in enumerate([8,16,32,64,128]): + height = input_height // stride + width = input_width // stride + anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) + anchor_centers = (anchor_centers * stride).reshape( (-1, 2) ) + anchor_cx = anchor_centers[:,0] + anchor_cy = anchor_centers[:,1] + anchor_cxs += list(anchor_cx) + anchor_cys += list(anchor_cy) + anchor_cx = np.array(anchor_cxs, dtype=np.float32) + anchor_cy = np.array(anchor_cys, dtype=np.float32) + + num_gts = gt_bboxes.shape[0] + num_anchors = anchor_cx.shape[0] + anchor_cx = np.broadcast_to(anchor_cx.reshape((1,-1)), (num_gts, num_anchors)).reshape(num_anchors, num_gts) + anchor_cy = np.broadcast_to(anchor_cy.reshape((1,-1)), (num_gts, num_anchors)).reshape(num_anchors, num_gts) + gt_x1 = gt_bboxes[:,0] + gt_y1 = gt_bboxes[:,1] + gt_x2 = gt_bboxes[:,2] + gt_y2 = gt_bboxes[:,3] + gt_cover = np.zeros( (gt_bboxes.shape[0], ), dtype=np.float32) + l_ = anchor_cx - gt_x1 + t_ = anchor_cy - gt_y1 + r_ = gt_x2 - anchor_cx + b_ = gt_y2 - anchor_cy + dist = np.stack([l_, t_, r_, b_], axis=1).min(axis=1) + gt_dist = dist.max(axis=0) + gt_dist = gt_dist / gt_size + center_thres = 0.01 + #center_thres = -0.25 + gt_cover_inds = np.where(gt_dist>center_thres)[0] + num_assigned = len(gt_cover_inds) + assign_stat[0] += 
num_gts + assign_stat[1] += num_assigned + + + + + + for _ in range(batch_size): + prog_bar.update() + aps = wider_evaluation(results, gt_path, 0.5, args.debug) + with open(os.path.join(output_folder, 'aps'), 'w') as f: + f.write("%f,%f,%f\n"%(aps[0],aps[1],aps[2])) + print('APS:', aps) + if args.show_assign: + print('ASSIGN:', assign_stat) + gts_size = np.array(gts_size, dtype=np.float32) + gts_size = np.sort(gts_size) + assert len(gts_size)==assign_stat[0] + print(gts_size[assign_stat[0]//2]) + + +if __name__ == '__main__': + main() diff --git a/insightface/detection/scrfd/tools/train.py b/insightface/detection/scrfd/tools/train.py new file mode 100755 index 0000000000000000000000000000000000000000..e7c215481e4078fccf543a276254722027fcda23 --- /dev/null +++ b/insightface/detection/scrfd/tools/train.py @@ -0,0 +1,182 @@ +import argparse +import copy +import os +import os.path as osp +import time +import warnings + +import mmcv +import torch +from mmcv import Config, DictAction +from mmcv.runner import get_dist_info, init_dist +from mmcv.utils import get_git_hash + +from mmdet import __version__ +from mmdet.apis import set_random_seed, train_detector +from mmdet.datasets import build_dataset +from mmdet.models import build_detector +from mmdet.utils import collect_env, get_root_logger + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + 
nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=None, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file (deprecate), ' + 'change to --cfg-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.cfg_options: + raise ValueError( + '--options and --cfg-options cannot be both ' + 'specified, --options is deprecated in favor of --cfg-options') + if args.options: + warnings.warn('--options is deprecated in favor of --cfg-options') + args.cfg_options = args.options + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. 
+ if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + if args.resume_from is not None: + cfg.resume_from = args.resume_from + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # dump config + cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = 
cfg.pretty_text + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + meta['exp_name'] = osp.basename(args.config) + + model = build_detector( + cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) + + datasets = [build_dataset(cfg.data.train)] + #print('ds count', len(datasets)) + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + val_dataset.pipeline = cfg.data.train.pipeline + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=__version__ + get_git_hash()[:7], + CLASSES=datasets[0].CLASSES) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + train_detector( + model, + datasets, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main() diff --git a/insightface/examples/README.md b/insightface/examples/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a414efb79c53be3a482d48c461513cff4084002f --- /dev/null +++ b/insightface/examples/README.md @@ -0,0 +1,4 @@ +InsightFace Example +--- + +Before running the examples, please install insightface package via `pip install -U insightface` diff --git a/insightface/examples/demo_analysis.py b/insightface/examples/demo_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..28045da015aa79c2077bf2eef3f8314c8fded203 --- /dev/null +++ b/insightface/examples/demo_analysis.py @@ -0,0 +1,34 @@ +import argparse +import cv2 +import 
sys +import numpy as np +import insightface +from insightface.app import FaceAnalysis +from insightface.data import get_image as ins_get_image + +assert insightface.__version__>='0.3' + +parser = argparse.ArgumentParser(description='insightface app test') +# general +parser.add_argument('--ctx', default=0, type=int, help='ctx id, <0 means using cpu') +parser.add_argument('--det-size', default=640, type=int, help='detection size') +args = parser.parse_args() + +app = FaceAnalysis() +app.prepare(ctx_id=args.ctx, det_size=(args.det_size,args.det_size)) + +img = ins_get_image('t1') +faces = app.get(img) +assert len(faces)==6 +rimg = app.draw_on(img, faces) +cv2.imwrite("./t1_output.jpg", rimg) + +# then print all-to-all face similarity +feats = [] +for face in faces: + feats.append(face.normed_embedding) +feats = np.array(feats, dtype=np.float32) +sims = np.dot(feats, feats.T) +print(sims) + + diff --git a/insightface/examples/edge_inference/README.md b/insightface/examples/edge_inference/README.md new file mode 100644 index 0000000000000000000000000000000000000000..68fdd16aae1709fb44a9b71e854499dbba8aebeb --- /dev/null +++ b/insightface/examples/edge_inference/README.md @@ -0,0 +1,33 @@ +# InsightFace Edge Inference and Deployment + +In this tutorial, we give examples and benchmarks of running insightface models on edge devices, mainly using 8-bits quantization technologies to make acceleration. + +## Recognition + +In recognition tutorial, we use an open-source model: *IR50@Glint360K*, and use a hard private 1:N testset(N=50000). The metric contains Rank1 and TAR@FAR<=e-3. + + + +Granularity and symmetry both stand for quantization setting, and mostly defined by hardware providers. Symmetric uses INT8 to save quantization results while Asymmetric uses UINT8 type. 
+ +| Hardware | Provider | Type | Backend | Time | Granularity | Symmetry | Rank1-Acc | TAR@FAR<=e-3 | +| ----------- | -------- | ---- | ----------- | ---- | ----------- | ---------- | --------- | ------------ | +| V100 | NVIDIA | GPU | onnxruntime | 4ms | - | - | 80.94 | 30.77 | +| Jetson NX | NVIDIA | GPU | TensorRT | 16ms | Per-channel | Symmetric | 79.26 | 31.07 | +| A311D | Khadas | ASIC | Tengine | 26ms | Per-tensor | Asymmetric | 77.83 | 26.58 | +| A311D* | Khadas | ASIC | Tengine | 26ms | Per-tensor | Asymmetric | 79.38 | 28.59 | +| NXP-IMX8P | NXP | ASIC | Tengine | 24ms | Per-tensor | Asymmetric | 77.87 | 26.80 | +| NXP-IMX8P* | NXP | ASIC | Tengine | 24ms | Per-tensor | Asymmetric | 79.42 | 28.39 | +| RV1126 | Rockchip | ASIC | RKNN | 38ms | Per-tensor | Asymmetric | 75.60 | 24.23 | +| RV1126* | Rockchip | ASIC | RKNN | 38ms | Per-tensor | Asymmetric | 77.82 | 26.30 | + +Suffix-* means mixed mode: using float32 model for gallery while using quantized model for probe images. Result features are all in float32 type. + +The example code of running quantized networks can be now found at [Tengine](https://github.com/OAID/Tengine/tree/tengine-lite/demos). Later, we will put a copy here and give full tutorial on how to quantize recognition models from 0 to 1. 
+ + + +## Detection + +TODO + diff --git a/insightface/examples/face_detection/README.md b/insightface/examples/face_detection/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d0942a86b3fcfac84e735716a39446c0506c212f --- /dev/null +++ b/insightface/examples/face_detection/README.md @@ -0,0 +1,2 @@ + +# diff --git a/insightface/examples/face_recognition/README.md b/insightface/examples/face_recognition/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d0942a86b3fcfac84e735716a39446c0506c212f --- /dev/null +++ b/insightface/examples/face_recognition/README.md @@ -0,0 +1,2 @@ + +# diff --git a/insightface/examples/in_swapper/README.md b/insightface/examples/in_swapper/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd4ec10628c272e7508bb44757274276dbbbf296 --- /dev/null +++ b/insightface/examples/in_swapper/README.md @@ -0,0 +1,44 @@ +# InsightFace Swapper + +## Update + +Please use our discord web demo instead. This python example is temporarily removed. + + +In this example, we provide one-line simple code for subject agnostic identity transfer from source face to the target face. + +The input and output resolution of this tool is 128x128. + + +## Usage + +Firstly install insightface python library, with version>=0.7: + +``` +pip install -U insightface +``` + +Second, download the `inswapper_128.onnx` swapping model from [googledrive]() and put it under `~/.insightface/models/`. + +Then use the recognition model from our `buffalo_l` pack and initialize the INSwapper class. + +Note that now we can only accept latent embedding from the `buffalo_l` arcface model, otherwise the result will be not normal. + +For detail code, please check the [example](inswapper_main.py). 
+ +## Result: + +Input: + + + +---Then we change the identity to Ross for all faces in this image.--- + +Direct Outputs: + + + +Paste Back: + + + diff --git a/insightface/examples/in_swapper/inswapper_main.py b/insightface/examples/in_swapper/inswapper_main.py new file mode 100644 index 0000000000000000000000000000000000000000..058e174bc97e8c3f975f8e42ff4c4f07ee93f454 --- /dev/null +++ b/insightface/examples/in_swapper/inswapper_main.py @@ -0,0 +1,36 @@ +import datetime +import numpy as np +import os +import os.path as osp +import glob +import cv2 +import insightface +from insightface.app import FaceAnalysis +from insightface.data import get_image as ins_get_image + + +assert insightface.__version__>='0.7' + +if __name__ == '__main__': + app = FaceAnalysis(name='buffalo_l') + app.prepare(ctx_id=0, det_size=(640, 640)) + swapper = insightface.model_zoo.get_model('inswapper_128.onnx', download=True, download_zip=True) + + + img = ins_get_image('t1') + faces = app.get(img) + faces = sorted(faces, key = lambda x : x.bbox[0]) + assert len(faces)==6 + source_face = faces[2] + res = img.copy() + for face in faces: + res = swapper.get(res, face, source_face, paste_back=True) + cv2.imwrite("./t1_swapped.jpg", res) + res = [] + for face in faces: + _img, _ = swapper.get(img, face, source_face, paste_back=False) + res.append(_img) + res = np.concatenate(res, axis=1) + cv2.imwrite("./t1_swapped2.jpg", res) + + diff --git a/insightface/examples/mask_renderer.py b/insightface/examples/mask_renderer.py new file mode 100644 index 0000000000000000000000000000000000000000..5b047423b614c5ef226b88041e606b3543f2c8fd --- /dev/null +++ b/insightface/examples/mask_renderer.py @@ -0,0 +1,22 @@ +import os, sys, datetime +import numpy as np +import os.path as osp +import cv2 +import insightface +from insightface.app import MaskRenderer +from insightface.data import get_image as ins_get_image + + +if __name__ == "__main__": + #make sure that you have download correct insightface model pack. 
+ #make sure that BFM.mat and BFM_UV.mat have been generated + tool = MaskRenderer() + tool.prepare(ctx_id=0, det_size=(128,128)) + image = ins_get_image('Tom_Hanks_54745') + mask_image = "mask_blue" + params = tool.build_params(image) + mask_out = tool.render_mask(image, mask_image, params) + + cv2.imwrite('output_mask.jpg', mask_out) + + diff --git a/insightface/examples/mxnet_to_onnx.py b/insightface/examples/mxnet_to_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..e1fd04b154899126f6a75173d8f7f3a2e85ab9f7 --- /dev/null +++ b/insightface/examples/mxnet_to_onnx.py @@ -0,0 +1,178 @@ +import sys +import os +import argparse +import onnx +import json +import mxnet as mx +from onnx import helper +from onnx import TensorProto +from onnx import numpy_helper +import onnxruntime +import cv2 + +print('mxnet version:', mx.__version__) +print('onnx version:', onnx.__version__) + +assert mx.__version__ >= '1.8', 'mxnet version should >= 1.8' +assert onnx.__version__ >= '1.2.1', 'onnx version should >= 1.2.1' + +import numpy as np +from mxnet.contrib import onnx as onnx_mxnet + +def create_map(graph_member_list): + member_map={} + for n in graph_member_list: + member_map[n.name]=n + return member_map + + +parser = argparse.ArgumentParser(description='convert mxnet model to onnx') +# general +parser.add_argument('params', default='./r100a/model-0000.params', help='mxnet params to load.') +parser.add_argument('output', default='./r100a.onnx', help='path to write onnx model.') +parser.add_argument('--eps', default=1.0e-8, type=float, help='eps for weights.') +parser.add_argument('--input-shape', default='3,112,112', help='input shape.') +parser.add_argument('--check', action='store_true') +parser.add_argument('--batch', action='store_true') +parser.add_argument('--input-mean', default=0.0, type=float, help='input mean for checking.') +parser.add_argument('--input-std', default=1.0, type=float, help='input std for checking.') +args = parser.parse_args() 
+input_shape = (1,) + tuple( [int(x) for x in args.input_shape.split(',')] ) + +params_file = args.params +pos = params_file.rfind('-') +prefix = params_file[:pos] +epoch = int(params_file[pos+1:pos+5]) +sym_file = prefix + "-symbol.json" +assert os.path.exists(sym_file) +assert os.path.exists(params_file) + +sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + +nodes = json.loads(sym.tojson())['nodes'] +bn_fixgamma_list = [] +for nodeid, node in enumerate(nodes): + if node['op'] == 'BatchNorm': + attr = node['attrs'] + fix_gamma = False + if attr is not None and 'fix_gamma' in attr: + if str(attr['fix_gamma']).lower()=='true': + fix_gamma = True + if fix_gamma: + bn_fixgamma_list.append(node['name']) + #print(node, fix_gamma) + +print('fixgamma list:', bn_fixgamma_list) +layer = None +#layer = 'conv_2_dw_relu' #for debug + +if layer is not None: + all_layers = sym.get_internals() + sym = all_layers[layer + '_output'] + + +eps = args.eps + +arg = {} +aux = {} +invalid = 0 +ac = 0 +for k in arg_params: + v = arg_params[k] + nv = v.asnumpy() + nv = nv.astype(np.float32) + #print(k, nv.shape) + if k.endswith('_gamma'): + bnname = k[:-6] + if bnname in bn_fixgamma_list: + nv[:] = 1.0 + ac += nv.size + invalid += np.count_nonzero(np.abs(nv) + + + + + + diff --git a/insightface/examples/person_detection/scrfd_person.py b/insightface/examples/person_detection/scrfd_person.py new file mode 100644 index 0000000000000000000000000000000000000000..e2422b0a14f64c4c08fbd8913ac456e9f07d64f6 --- /dev/null +++ b/insightface/examples/person_detection/scrfd_person.py @@ -0,0 +1,49 @@ +import datetime +import numpy as np +import os +import os.path as osp +import glob +import cv2 +import insightface + +assert insightface.__version__>='0.4' + + +def detect_person(img, detector): + bboxes, kpss = detector.detect(img) + bboxes = np.round(bboxes[:,:4]).astype(np.int) + kpss = np.round(kpss).astype(np.int) + kpss[:,:,0] = np.clip(kpss[:,:,0], 0, img.shape[1]) + 
kpss[:,:,1] = np.clip(kpss[:,:,1], 0, img.shape[0]) + vbboxes = bboxes.copy() + vbboxes[:,0] = kpss[:, 0, 0] + vbboxes[:,1] = kpss[:, 0, 1] + vbboxes[:,2] = kpss[:, 4, 0] + vbboxes[:,3] = kpss[:, 4, 1] + return bboxes, vbboxes + +if __name__ == '__main__': + import glob + detector = insightface.model_zoo.get_model('scrfd_person_2.5g.onnx', download=True) + detector.prepare(0, nms_thresh=0.5, input_size=(640, 640)) + img_paths = glob.glob('data/images/*.jpg') + for img_path in img_paths: + img = cv2.imread(img_path) + bboxes, vbboxes = detect_person(img, detector) + for i in range(bboxes.shape[0]): + bbox = bboxes[i] + vbbox = vbboxes[i] + x1,y1,x2,y2 = bbox + vx1,vy1,vx2,vy2 = vbbox + cv2.rectangle(img, (x1,y1) , (x2,y2) , (0,255,0) , 1) + alpha = 0.8 + color = (255, 0, 0) + for c in range(3): + img[vy1:vy2,vx1:vx2,c] = img[vy1:vy2, vx1:vx2, c]*alpha + color[c]*(1.0-alpha) + cv2.circle(img, (vx1,vy1) , 1, color , 2) + cv2.circle(img, (vx1,vy2) , 1, color , 2) + cv2.circle(img, (vx2,vy1) , 1, color , 2) + cv2.circle(img, (vx2,vy2) , 1, color , 2) + filename = img_path.split('/')[-1] + cv2.imwrite('./outputs/%s'%filename, img) + diff --git a/insightface/generation/README.md b/insightface/generation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9f8b7ecafd0ca812a9b53f533265447aaa21b4cc --- /dev/null +++ b/insightface/generation/README.md @@ -0,0 +1 @@ +# InsightFace Generation Projects diff --git a/insightface/model_zoo/README.md b/insightface/model_zoo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1991b989b54e1fe673c9833e813d0340a577a9fd --- /dev/null +++ b/insightface/model_zoo/README.md @@ -0,0 +1,187 @@ +# InsightFace Model Zoo + +:bell: **ALL models are available for non-commercial research purposes only.** + +## 0. Python Package models + +To check the detail of insightface python package, please see [here](../python-package). 
+ +To install: ``pip install -U insightface`` + +To use the specific model pack: + +``` +model_pack_name = 'buffalo_l' +app = FaceAnalysis(name=model_pack_name) +``` + +Name in **bold** is the default model pack in latest version. + + +| Name | Detection Model | Recognition Model | Alignment | Attributes | Model-Size | +| -------------- | --------------- | ------------------- | ------------ | ---------- | ---------- | +| antelopev2 | RetinaFace-10GF | ResNet100@Glint360K | 2d106 & 3d68 | Gender&Age | 407MB | +| **buffalo_l** | RetinaFace-10GF | ResNet50@WebFace600K | 2d106 & 3d68 | Gender&Age | 326MB | +| buffalo_m | RetinaFace-2.5GF | ResNet50@WebFace600K | 2d106 & 3d68 | Gender&Age | 313MB | +| buffalo_s | RetinaFace-500MF | MBF@WebFace600K | 2d106 & 3d68 | Gender&Age | 159MB | +| buffalo_sc | RetinaFace-500MF | MBF@WebFace600K | - | - | 16MB | + +### Recognition accuracy of python library model packs: + +| Name | MR-ALL | African | Caucasian | South Asian | East Asian | LFW | CFP-FP | AgeDB-30 | IJB-C(E4) | +| :-------- | ------ | ------- | --------- | ----------- | ---------- | ------ | ------ | -------- | --------- | +| buffalo_l | 91.25 | 90.29 | 94.70 | 93.16 | 74.96 | 99.83 | 99.33 | 98.23 | 97.25 | +| buffalo_s | 71.87 | 69.45 | 80.45 | 73.39 | 51.03 | 99.70 | 98.00 | 96.58 | 95.02 | + +*buffalo_m has the same accuracy with buffalo_l.* + +*buffalo_sc has the same accuracy with buffalo_s.* + +(Note that almost all ONNX models in our model_zoo can be called by python library.) + +## 1. Face Recognition models. + +### Definition: + +The default training loss is margin based softmax if not specified. + +``MFN``: MobileFaceNet + +``MS1MV2``: MS1M-ArcFace + +``MS1MV3``: MS1M-RetinaFace + +``MS1M_MegaFace``: MS1MV2+MegaFace_train + +``_pfc``: using Partial FC, with sample-ratio=0.1 + +``MegaFace``: MegaFace identification test, with gallery=1e6. + +``IJBC``: IJBC 1:1 test, under FAR<=1e-4. 
+ +``BDrive``: BaiduDrive + +``GDrive``: GoogleDrive + +### List of models by MXNet and PaddlePaddle: + +| Backbone | Dataset | Method | LFW | CFP-FP | AgeDB-30 | MegaFace | Link. | +| -------- | ------- | ------- | ----- | ------ | -------- | -------- | ------------------------------------------------------------ | +| R100 (mxnet) | MS1MV2 | ArcFace | 99.77 | 98.27 | 98.28 | 98.47 | [BDrive](https://pan.baidu.com/s/1wuRTf2YIsKt76TxFufsRNA), [GDrive](https://drive.google.com/file/d/1Hc5zUfBATaXUgcU2haUNa7dcaZSw95h2/view?usp=sharing) | +| MFN (mxnet) | MS1MV1 | ArcFace | 99.50 | 88.94 | 95.91 | - | [BDrive](https://pan.baidu.com/s/1If28BkHde4fiuweJrbicVA), [GDrive](https://drive.google.com/file/d/1RHyJIeYuHduVDDBTn3ffpYEZoXWRamWI/view?usp=sharing) | +| MFN (paddle) | MS1MV2 | ArcFace | 99.45 | 93.43 | 96.13 | - | [pretrained model](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/MobileFaceNet_128_v1.0_pretrained.tar), [inference model](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/mobileface_v1.0_infer.tar) | +| iResNet50 (paddle) | MS1MV2 | ArcFace | 99.73 | 97.43 | 97.88 | - | [pretrained model](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/arcface_iresnet50_v1.0_pretrained.tar), [inference model](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/arcface_iresnet50_v1.0_infer.tar) | + + + +### List of models by various depth IResNet and training datasets: + +| Backbone | Dataset | MR-ALL | African | Caucasian | South Asian | East Asian | Link(onnx) | +|----------|-----------|--------|---------|-----------|-------------|------------|-----------------------------------------------------------------------| +| R100 | Casia | 42.735 | 39.666 | 53.933 | 47.807 | 21.572 | [GDrive](https://drive.google.com/file/d/1WOrOK-qZO5FcagscCI3td6nnABUPPepD/view?usp=sharing) | +| R100 | MS1MV2 | 80.725 | 79.117 | 87.176 | 85.501 | 55.807 | 
[GDrive](https://drive.google.com/file/d/1772DTho9EG047KNUIv2lop2e7EobiCFn/view?usp=sharing) | +| R18 | MS1MV3 | 68.326 | 62.613 | 75.125 | 70.213 | 43.859 | [GDrive](https://drive.google.com/file/d/1dWZb0SLcdzr-toUzsVZ1zogn9dEIW1Dk/view?usp=sharing) | +| R34 | MS1MV3 | 77.365 | 71.644 | 83.291 | 80.084 | 53.712 | [GDrive](https://drive.google.com/file/d/1ON6ImX-AigDKAi4pelFPf12vkJVyGFKl/view?usp=sharing) | +| R50 | MS1MV3 | 80.533 | 75.488 | 86.115 | 84.305 | 57.352 | [GDrive](https://drive.google.com/file/d/1FPldzmZ6jHfaC-R-jLkxvQRP-cLgxjCT/view?usp=sharing) | +| R100 | MS1MV3 | 84.312 | 81.083 | 89.040 | 88.082 | 62.193 | [GDrive](https://drive.google.com/file/d/1fZOfvfnavFYjzfFoKTh5j1YDcS8KCnio/view?usp=sharing) | +| R18 | Glint360K | 72.074 | 68.230 | 80.575 | 75.852 | 47.831 | [GDrive](https://drive.google.com/file/d/1Z0eoO1Wqv32K8TdFHKqrlrxv46_W4390/view?usp=sharing) | +| R34 | Glint360K | 83.015 | 79.907 | 88.620 | 86.815 | 60.604 | [GDrive](https://drive.google.com/file/d/1G1oeLkp_b3JA_z4wGs62RdLpg-u_Ov2Y/view?usp=sharing) | +| R50 | Glint360K | 87.077 | 85.272 | 91.617 | 90.541 | 66.813 | [GDrive](https://drive.google.com/file/d/1MpRhM76OQ6cTzpr2ZSpHp2_CP19Er4PI/view?usp=sharing) | +| R100 | Glint360K | 90.659 | 89.488 | 94.285 | 93.434 | 72.528 | [GDrive](https://drive.google.com/file/d/1Gh8C-bwl2B90RDrvKJkXafvZC3q4_H_z/view?usp=sharing) | + + +### List of models by IResNet-50 and different training datasets: + +| Dataset | MR-ALL | African | Caucasian | South Asian | East Asian | LFW | CFP-FP | AgeDB-30 | IJB-C(E4) | Link(onnx) | +| :-------- | ------ | ------- | ---- | ------ | -------- | ----- | ------ | -------- | --------- | --- | +| CISIA | 36.794 | 42.550 | 55.825 | 49.618 | 19.611 | 99.450| 95.214 | 94.900 | 87.220 | [GDrive](https://drive.google.com/file/d/1km-cVFvUAPU1UumLLi1fIRasdg6VA-vM/view?usp=sharing) | +| CISIA_pfc | 37.107 | 38.934 | 53.823 | 48.674 | 19.927 | 99.367| 95.429 | 94.600 | 84.970 | 
[GDrive](https://drive.google.com/file/d/1z8linstTZopL5Yy7NOUgVVtgzGtsu1LM/view?usp=sharing) | +| VGG2 | 38.578 | 35.259 | 54.304 | 44.081 | 24.095 | 99.550| 97.410 | 95.080 | 91.220 | [GDrive](https://drive.google.com/file/d/1UwyVIDSNDkHKClBANrWi8qpMU4nXizT6/view?usp=sharing) | +| VGG2_pfc | 40.673 | 36.767 | 60.180 | 49.039 | 24.255 | 99.683| 98.529 | 95.400 | 92.490 | [GDrive](https://drive.google.com/file/d/1uW0EsctVyPklSyXMXF39AniIhSRXCRtp/view?usp=sharing) | +| GlintAsia | 62.663 | 49.531 | 64.829 | 57.984 | 61.743 | 99.583| 93.186 | 95.400 | 91.500 | [GDrive](https://drive.google.com/file/d/1IyXh7m1HMwTZw4B5N1WMPIsN-S9kdS95/view?usp=sharing) | +| GlintAsia_pfc | 63.149 | 50.366 | 65.227 | 57.936 | 61.820 | 99.650| 93.029 | 95.233 | 91.140 | [GDrive](https://drive.google.com/file/d/1CTjalggNucgPkmpFi5ij-NGG1Fy9sL5r/view?usp=sharing) | +| MS1MV2 | 77.696 | 74.596 | 84.126 | 82.041 | 51.105 | 99.833| 98.083 | 98.083 | 96.140 | [GDrive](https://drive.google.com/file/d/1rd4kbiXtXBTWE8nP7p4OTv_CAp2FUa1i/view?usp=sharing) | +| MS1MV2_pfc | 77.738 | 74.728 | 84.883 | 82.798 | 52.507 | 99.783| 98.071 | 98.017 | 96.080 | [GDrive](https://drive.google.com/file/d/1ryrXenGQa-EGyk64mVaG136ihNUBmNMW/view?usp=sharing) | +| MS1M_MegaFace | 78.372 | 74.138 | 82.251 | 77.223 | 60.203 | 99.750| 97.557 | 97.400 | 95.350 | [GDrive](https://drive.google.com/file/d/1c2JG0StcTMDrL4ywz3qWTN_9io3lo_ER/view?usp=sharing) | +| MS1M_MegaFace_pfc | 78.773 | 73.690 | 82.947 | 78.793 | 57.566 | 99.800| 97.870 | 97.733 | 95.400 | [GDrive](https://drive.google.com/file/d/1BnG48LS_HIvYlSbSnP_LzpO3xjx0_rpu/view?usp=sharing) | +| MS1MV3 | 82.522 | 77.172 | 87.028 | 86.006 | 60.625 | 99.800| 98.529 | 98.267 | 96.580 | [GDrive](https://drive.google.com/file/d/1Tqorubgcl0qfjbjEM_Y9EDmjG5tCWzbr/view?usp=sharing) | +| MS1MV3_pfc | 81.683 | 78.126 | 87.286 | 85.542 | 58.925 | 99.800| 98.443 | 98.167 | 96.430 | [GDrive](https://drive.google.com/file/d/15jrHCqhEmoSZ93kKL9orVMhbKfNWAhp-/view?usp=sharing) 
| +| Glint360k | 86.789 | 84.749 | 91.414 | 90.088 | 66.168 | 99.817| 99.143 | 98.450 | 97.130 | [GDrive](https://drive.google.com/file/d/1gnt6P3jaiwfevV4hreWHPu0Mive5VRyP/view?usp=sharing) | +| Glint360k_pfc | 87.077 | 85.272 | 91.616 | 90.541 | 66.813 | 99.817| 99.143 | 98.450 | 97.020 | [GDrive](https://drive.google.com/file/d/164o2Ct42tyJdQjckeMJH2-7KTXolu-EP/view?usp=sharing) | +| WebFace600K | 90.566 | 89.355 | 94.177 | 92.358 | 73.852 | 99.800| 99.200 | 98.100 | 97.120 | [GDrive](https://drive.google.com/file/d/1N0GL-8ehw_bz2eZQWz2b0A5XBdXdxZhg/view?usp=sharing) | +| WebFace600K_pfc | 89.951 | 89.301 | 94.016 | 92.381 | 73.007 | 99.817| 99.143 | 98.117 | 97.010 | [GDrive](https://drive.google.com/file/d/11TASXssTnwLY1ZqKlRjsJiV-1nWu9pDY/view?usp=sharing) | +| Average | 69.247 | 65.908 | 77.121 | 72.819 | 52.014 | 99.706| 97.374 | 96.962 | 93.925 | | +| Average_pfc | 69.519 | 65.898 | 77.497 | 73.213 | 51.853 | 99.715| 97.457 | 96.965 | 93.818 | | + +### List of models by MobileFaceNet and different training datasets: + +**``FLOPS``:** 450M FLOPs + +**``Model-Size``:** 13MB + +| Dataset | MR-ALL | African | Caucasian | South Asian | East Asian | LFW | CFP-FP | AgeDB-30 | IJB-C(E4) | Link(onnx) | +| :-------- | ------ | ------- | ---- | ------ | -------- | ----- | ------ | -------- | --------- | --- | +| WebFace600K | 71.865 | 69.449 | 80.454 | 73.394 | 51.026 | 99.70 | 98.00 | 96.58 | 95.02 | - | + + +## 2. Face Detection models. + +### 2.1 RetinaFace + +In RetinaFace, mAP was evaluated with multi-scale testing. 
+ +``m025``: means MobileNet-0.25 + +| Impelmentation | Easy-Set | Medium-Set | Hard-Set | Link | +| ------------------------ | -------- | ---------- | -------- | ------------------------------------------------------------ | +| RetinaFace-R50 | 96.5 | 95.6 | 90.4 | [BDrive](https://pan.baidu.com/s/1C6nKq122gJxRhb37vK0_LQ), [GDrive](https://drive.google.com/file/d/1wm-6K688HQEx_H90UdAIuKv-NAsKBu85/view?usp=sharing) | +| RetinaFace-m025(yangfly) | - | - | 82.5 | [BDrive](https://pan.baidu.com/s/1P1ypO7VYUbNAezdvLm2m9w)(nzof), [GDrive](https://drive.google.com/drive/folders/1OTXuAUdkLVaf78iz63D1uqGLZi4LbPeL?usp=sharing) | +| BlazeFace-FPN-SSH (paddle) | 91.9 | 89.8 | 81.7% | [pretrained model](https://paddledet.bj.bcebos.com/models/blazeface_fpn_ssh_1000e.pdparams), [inference model](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/blazeface_fpn_ssh_1000e_v1.0_infer.tar) | + +### 2.2 SCRFD + +In SCRFD, mAP was evaluated with single scale testing, VGA resolution. + +``2.5G``: means the model cost ``2.5G`` FLOPs while the input image is in VGA(640x480) resolution. + +``_KPS``: means this model can detect five facial keypoints. 
+ +| Name | Easy | Medium | Hard | FLOPs | Params(M) | Infer(ms) | Link(pth) | +| :------------: | ----- | ------ | ----- | ----- | --------- | --------- | ------------------------------------------------------------ | +| SCRFD_500M | 90.57 | 88.12 | 68.51 | 500M | 0.57 | 3.6 | [GDrive](https://drive.google.com/file/d/1OX0i_vWDp1Fp-ZynOUMZo-q1vB5g1pTN/view?usp=sharing) | +| SCRFD_1G | 92.38 | 90.57 | 74.80 | 1G | 0.64 | 4.1 | [GDrive](https://drive.google.com/file/d/1acd5wKjWnl1zMgS5YJBtCh13aWtw9dej/view?usp=sharing) | +| SCRFD_2.5G | 93.78 | 92.16 | 77.87 | 2.5G | 0.67 | 4.2 | [GDrive](https://drive.google.com/file/d/1wgg8GY2vyP3uUTaAKT0_MSpAPIhmDsCQ/view?usp=sharing) | +| SCRFD_10G | 95.16 | 93.87 | 83.05 | 10G | 3.86 | 4.9 | [GDrive](https://drive.google.com/file/d/1kUYa0s1XxLW37ZFRGeIfKNr9L_4ScpOg/view?usp=sharing) | +| SCRFD_34G | 96.06 | 94.92 | 85.29 | 34G | 9.80 | 11.7 | [GDrive](https://drive.google.com/file/d/1w9QOPilC9EhU0JgiVJoX0PLvfNSlm1XE/view?usp=sharing) | +| SCRFD_500M_KPS | 90.97 | 88.44 | 69.49 | 500M | 0.57 | 3.6 | [GDrive](https://drive.google.com/file/d/1TXvKmfLTTxtk7tMd2fEf-iWtAljlWDud/view?usp=sharing) | +| SCRFD_2.5G_KPS | 93.80 | 92.02 | 77.13 | 2.5G | 0.82 | 4.3 | [GDrive](https://drive.google.com/file/d/1KtOB9TocdPG9sk_S_-1QVG21y7OoLIIf/view?usp=sharing) | +| SCRFD_10G_KPS | 95.40 | 94.01 | 82.80 | 10G | 4.23 | 5.0 | [GDrive](https://drive.google.com/file/d/1-2uy0tgkenw6ZLxfKV1qVhmkb5Ep_5yx/view?usp=sharing) | + + + +## 3. Face Alignment models. 
+ +### 3.1 2D Face Alignment + +| Implementation | Points | Backbone | Params(M) | Link(onnx) | +| --------------------- | ------ | ------------- | --------- | ------------------------------------------------------------ | +| Coordinate-regression | 106 | MobileNet-0.5 | 1.2 | [GDrive](https://drive.google.com/file/d/1M5685m-bKnMCt0u2myJoEK5gUY3TDt_1/view?usp=sharing) | + +### 3.2 3D Face Alignment + +| Implementation | Points | Backbone | Params(M) | Link(onnx) | +| -------------- | ------ | --------- | --------- | ------------------------------------------------------------ | +| - | 68 | ResNet-50 | 34.2 | [GDrive](https://drive.google.com/file/d/1aJe5Rzoqrtf_a9U84E-V1b0rUi8-QbCI/view?usp=sharing) | + +### 3.3 Dense Face Alignment + +## 4. Face Attribute models. + +### 4.1 Gender&Age + +| Training-Set | Backbone | Params(M) | Link(onnx) | +| ------------ | -------------- | --------- | ------------------------------------------------------------ | +| CelebA | MobileNet-0.25 | 0.3 | [GDrive](https://drive.google.com/file/d/1Mm3TeUuaZOwmEMp0nGOddvgXCjpRodPU/view?usp=sharing) | + + +### 4.2 Expression diff --git a/insightface/parsing/dml_csr/README.md b/insightface/parsing/dml_csr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b2ffceef6668f2d80a075f2e05315b02b7b4dba --- /dev/null +++ b/insightface/parsing/dml_csr/README.md @@ -0,0 +1,61 @@ +# Decoupled Multi-task Learning with Cyclical Self-Regulation for Face Parsing. + +The official repository of *[Decoupled Multi-task Learning with Cyclical Self-Regulation for Face Parsing. (CVPR 2022)](https://arxiv.org/abs/2203.14448)*. + +## Installation + +Our model is based on Pytorch 1.7.1 with Python 3.6.2. 
+ +```sh +pip install -r requirements.txt +``` + +## Data +You can download original datasets: +- **Helen** : [https://www.sifeiliu.net/face-parsing](https://www.sifeiliu.net/face-parsing) +- **LaPa** : [https://github.com/JDAI-CV/lapa-dataset](https://github.com/JDAI-CV/lapa-dataset) +- **CelebAMask-HQ** : [https://github.com/switchablenorms/CelebAMask-HQ](https://github.com/switchablenorms/CelebAMask-HQ) + +and put them in ./dataset folder as below +``` +dataset/ + images/ + labels/ + edges/ + train_list.txt + test_list.txt + each line: 'images/100032540_1.jpg labels/100032540_1.png' +``` +Besides, we provide the edge genearation code in the *generate_edge.py*. + +## Usage + +If you need imagenet pretrained resent-101, please download from [baidu drive]() or [Google drive](https://drive.google.com/open?id=1rzLU-wK6rEorCNJfwrmIu5hY2wRMyKTK), and put it into snapshot folder. + +For dstributed(multi-gpu) training. Inplace-abn requires pytorch distributed data parallel. +``` +GPU=4,5,6,7 +Node=4 +dataset=./datasets/CelebAMask-HQ/ +snapshot=./work_dirs/ +CUDA_VISIBLE_DEVICES="$GPU" python -m torch.distributed.launch --nproc_per_node="$Node" --master_port=295002 train.py --data-dir "$dataset" --random-mirror --random-scale \ +--gpu "$GPU" --batch-size 7 --input-size 473,473 --snapshot-dir "$snapshot" --num-classes 19 --epochs 200 --schp-start 150 +``` + +For testing [pretrained models](https://drive.google.com/file/d/1-PjUts1AMzXNyvw3VaJQmg43GJbfEpEQ/view?usp=sharing) +``` +python test.py --data-dir "$dataset" --out-dir "$out_dir" --restore-from "$snapshot" --gpu "$GPU" --batch-size 7 --input-size 473,473 --dataset test --num-classes 19 +``` + +## Reference + +If you consider use our code, please cite our paper: + +``` +@inproceedings{Zheng2022DecoupledML, + title={Decoupled Multi-task Learning with Cyclical Self-Regulation for Face Parsing}, + author={Qi Zheng and Jiankang Deng and Zheng Zhu and Ying Li and Stefanos Zafeiriou}, + booktitle={Computer Vision and Pattern 
Recognition}, + year={2022} +} +``` diff --git a/insightface/parsing/dml_csr/dataset/datasets.py b/insightface/parsing/dml_csr/dataset/datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..9f8e211a96f4ebc6ae8eed47c26195928b64af0e --- /dev/null +++ b/insightface/parsing/dml_csr/dataset/datasets.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : datasets.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : Copyright 2015 The Authors. All Rights Reserved. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cv2 +import numpy as np +import os.path as osp +import pickle +import random + +import torch +import torchvision.transforms.functional as TF + +from glob import glob +from typing import Tuple +from utils import transforms + + +class FaceDataSet(torch.utils.data.Dataset): + """Face data set for model training and validating + + Examples: + + ./CelebAMask + |---test + |---train + |---images + |---0.jpg + |---1.jpg + |---labels + |---0.png + |---1.png + |---edges + |---0.png + |---1.png + |---valid + |---label_names.txt + |---test_list.txt + |---train_list.txt + |---images/0.jpg labels/0.png + |---images/1.jpg labels/1.png + |---valid_list.txt + + Args: + root: A string, training/validating dataset path, e.g. "./CelebAMask" + dataset: A string, one of `"train"`, `"test"`, `"valid"`. + crop_size: A list of two intergers. + scale_factor: A float number. + rotation_factor: An integer number. + ignore_label: An integer number, default is 255. 
+ transformer: A function of torchvision.transforms.Compose([]) + """ + def __init__(self, + root: str, + dataset: str, + crop_size: list=[473, 473], + scale_factor: float=0.25, + rotation_factor: int=30, + ignore_label: int =255, + transform=None) -> None: + + self.root = root + self.dataset = dataset + self.crop_size = np.asarray(crop_size) + self.scale_factor = scale_factor + self.rotation_factor = rotation_factor + self.ignore_label = ignore_label + self.transform = transform + + self.flip_prob = 0.5 + self.flip_pairs = [[4, 5], [6, 7]] + self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0] + + self.file_list_name = osp.join(root, dataset + '_list.txt') + self.im_list = [line.split()[0][7:-4] for line in open(self.file_list_name).readlines()] + self.number_samples = len(self.im_list) + + + def __len__(self) -> int: + return self.number_samples + + def _box2cs(self, box: list) -> tuple: + x, y, w, h = box[:4] + return self._xywh2cs(x, y, w, h) + + def _xywh2cs(self, x: float, y: float, w: float, h: float) -> tuple: + center = np.zeros((2), dtype=np.float32) + center[0] = x + w * 0.5 + center[1] = y + h * 0.5 + if w > self.aspect_ratio * h: + h = w * 1.0 / self.aspect_ratio + elif w < self.aspect_ratio * h: + w = h * self.aspect_ratio + scale = np.array([w * 1.0, h * 1.0], dtype=np.float32) + + return center, scale + + def __getitem__(self, index: int) -> tuple: + # Load training image + im_name = self.im_list[index] + im_path = osp.join(self.root, self.dataset, 'images', im_name + '.jpg') + im = cv2.imread(im_path, cv2.IMREAD_COLOR) + h, w, _ = im.shape + parsing_anno = np.zeros((h, w), dtype=np.long) + # Get center and scale + center, s = self._box2cs([0, 0, w - 1, h - 1]) + r = 0 + + if self.dataset not in ['test', 'valid']: + edge_path = osp.join(self.root, self.dataset, 'edges', im_name + '.png') + edge = cv2.imread(edge_path, cv2.IMREAD_GRAYSCALE) + parsing_anno_path = osp.join(self.root, self.dataset, 'labels', im_name + '.png') + parsing_anno = 
cv2.imread(parsing_anno_path, cv2.IMREAD_GRAYSCALE) + + if self.dataset in 'train': + + sf = self.scale_factor + rf = self.rotation_factor + s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) + r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) \ + if random.random() <= 0.6 else 0 + + trans = transforms.get_affine_transform(center, s, r, self.crop_size) + image = cv2.warpAffine( + im, + trans, + (int(self.crop_size[1]), int(self.crop_size[0])), + flags=cv2.INTER_LINEAR, + borderMode=cv2.BORDER_CONSTANT, + borderValue=(0, 0, 0)) + + if self.dataset not in ['test', 'valid']: + edge = cv2.warpAffine( + edge, + trans, + (int(self.crop_size[1]), int(self.crop_size[0])), + flags=cv2.INTER_LINEAR, + borderMode=cv2.BORDER_CONSTANT, + borderValue=(0, 0, 0)) + + if self.transform: + image = self.transform(image) + + meta = { + 'name': im_name, + 'center': center, + 'height': h, + 'width': w, + 'scale': s, + 'rotation': r, + 'origin': image + } + + if self.dataset not in 'train': + return image, meta + else: + + label_parsing = cv2.warpAffine( + parsing_anno, + trans, + (int(self.crop_size[1]), int(self.crop_size[0])), + flags=cv2.INTER_NEAREST, + borderMode=cv2.BORDER_CONSTANT, + borderValue=(255)) + + label_parsing = torch.from_numpy(label_parsing) + + return image, label_parsing, edge, meta + diff --git a/insightface/parsing/dml_csr/loss/consistency_loss.py b/insightface/parsing/dml_csr/loss/consistency_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..2b6ac5992cabaf48024c36a2613e25da7632f620 --- /dev/null +++ b/insightface/parsing/dml_csr/loss/consistency_loss.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : consistency_loss.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : Copyright 2015 The Authors. All Rights Reserved. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn.functional as F +from torch import nn + + +def generate_edge_tensor(label, edge_width=3): + label = label.type(torch.cuda.FloatTensor) + if len(label.shape) == 2: + label = label.unsqueeze(0) + n, h, w = label.shape + edge = torch.zeros(label.shape, dtype=torch.float).cuda() + # right + edge_right = edge[:, 1:h, :] + edge_right[(label[:, 1:h, :] != label[:, :h - 1, :]) & (label[:, 1:h, :] != 255) + & (label[:, :h - 1, :] != 255)] = 1 + + # up + edge_up = edge[:, :, :w - 1] + edge_up[(label[:, :, :w - 1] != label[:, :, 1:w]) + & (label[:, :, :w - 1] != 255) + & (label[:, :, 1:w] != 255)] = 1 + + # upright + edge_upright = edge[:, :h - 1, :w - 1] + edge_upright[(label[:, :h - 1, :w - 1] != label[:, 1:h, 1:w]) + & (label[:, :h - 1, :w - 1] != 255) + & (label[:, 1:h, 1:w] != 255)] = 1 + + # bottomright + edge_bottomright = edge[:, :h - 1, 1:w] + edge_bottomright[(label[:, :h - 1, 1:w] != label[:, 1:h, :w - 1]) + & (label[:, :h - 1, 1:w] != 255) + & (label[:, 1:h, :w - 1] != 255)] = 1 + + kernel = torch.ones((1, 1, edge_width, edge_width), dtype=torch.float).cuda() + with torch.no_grad(): + edge = edge.unsqueeze(1) + edge = F.conv2d(edge, kernel, stride=1, padding=1) + edge[edge!=0] = 1 + edge = edge.squeeze() + return edge + + +class ConsistencyLoss(nn.Module): + def __init__(self, ignore_index=255): + super(ConsistencyLoss, self).__init__() + self.ignore_index=ignore_index + + def forward(self, parsing, edge, label): + parsing_pre = torch.argmax(parsing, dim=1) + parsing_pre[label==self.ignore_index]=self.ignore_index + generated_edge = generate_edge_tensor(parsing_pre) + edge_pre = torch.argmax(edge, dim=1) + v_generate_edge = generated_edge[label!=255] + v_edge_pre = edge_pre[label!=255] + one = torch.ones_like(v_edge_pre) + v_edge_pre = torch.where(v_edge_pre > 0, one, v_edge_pre) + v_edge_pre = 
v_edge_pre.type(torch.cuda.FloatTensor) + positive_union = (v_generate_edge==1)&(v_edge_pre==1) # only the positive values count + return F.smooth_l1_loss(v_generate_edge[positive_union].squeeze(0), v_edge_pre[positive_union].squeeze(0)) diff --git a/insightface/parsing/dml_csr/loss/criterion.py b/insightface/parsing/dml_csr/loss/criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..9fa1bdc65aba369650c81351c6dce82cd2f2d02b --- /dev/null +++ b/insightface/parsing/dml_csr/loss/criterion.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : criterion.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : Copyright 2015 The Authors. All Rights Reserved. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch.nn as nn +import torch +import numpy as np + +from torch.nn import functional as F +from torch.nn.modules.loss import _Loss +from .lovasz_softmax import LovaszSoftmax +from .kl_loss import KLDivergenceLoss +from .consistency_loss import ConsistencyLoss + + +class Criterion(nn.Module): + """DML_CSR loss for face parsing. 
+ + Put more focus on facial components like eyes, eyebrow, nose and mouth + """ + def __init__(self, loss_weight=[1.0, 1.0, 1.0, 1.0, 1.0], ignore_index=255, lambda_1=1, lambda_2=1, lambda_3=1, num_classes=11): + super(Criterion, self).__init__() + self.ignore_index = ignore_index + self.loss_weight = loss_weight + self.criterion = torch.nn.CrossEntropyLoss(ignore_index=ignore_index) + self.criterion_weight = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=ignore_index) + # self.sed = WeightedCrossEntropyWithLogits() + self.lovasz = LovaszSoftmax(ignore_index=ignore_index) + self.kldiv = KLDivergenceLoss(ignore_index=ignore_index) + self.reg = ConsistencyLoss(ignore_index=ignore_index) + self.lamda_1 = lambda_1 + self.lamda_2 = lambda_2 + self.lamda_3 = lambda_3 + self.num_classes = num_classes + + def forward(self, preds, target, cycle_n=None): + h, w = target[0].size(1), target[0].size(2) + + # binary edge + input_binary_labels = target[1].data.cpu().numpy().astype(np.int64) + binary_pos_num = np.sum(input_binary_labels==1).astype(np.float) + binary_neg_num = np.sum(input_binary_labels==0).astype(np.float) + + binary_weight_pos = binary_neg_num/(binary_pos_num + binary_neg_num) + binary_weight_neg = binary_pos_num/(binary_pos_num + binary_neg_num) + binary_weights = (binary_weight_neg, binary_weight_pos) + binary_weights = torch.from_numpy(np.array(binary_weights)).float().cuda() + + # print('target', target[0].size(), target[1].size()) + binary_edge_p_num = target[1].cpu().numpy().reshape(target[1].size(0),-1).sum(axis=1) + # print('edge_p_num_1', edge_p_num.shape) + binary_edge_p_num = np.tile(binary_edge_p_num, [h, w, 1]).transpose(2,1,0) + # print('edge_p_num_2', edge_p_num.shape) + binary_edge_p_num = torch.from_numpy(binary_edge_p_num).cuda().float() + + # semantic edge + input_semantic_labels = target[2].data.cpu().numpy().astype(np.int64) + semantic_weights = [] + semantic_pos_num = np.sum(input_semantic_labels>0).astype(np.float) + 
semantic_neg_num = np.sum(input_semantic_labels==0).astype(np.float) + + for lbl in range(self.num_classes): + lbl_num = np.sum(input_semantic_labels==lbl).astype(np.float) + weight_lbl = lbl_num/(semantic_pos_num + semantic_neg_num) + semantic_weights.append(weight_lbl) + semantic_weights = torch.from_numpy(np.array(semantic_weights)).float().cuda() + + # print('target', target[0].size(), target[1].size()) + semantic_edge_p_num = np.count_nonzero(target[2].cpu().numpy().reshape(target[2].size(0),-1), axis=1) + # print('edge_p_num_1', edge_p_num.shape) + semantic_edge_p_num = np.tile(semantic_edge_p_num, [h, w, 1]).transpose(2,1,0) + # print('edge_p_num_2', edge_p_num.shape) + semantic_edge_p_num = torch.from_numpy(semantic_edge_p_num).cuda().float() + + loss_binary_edge = 0; loss_semantic_edge = 0; loss_parse = 0; loss_att_parse = 0; loss_att_binary_edge = 0; loss_att_semantic_edge = 0; loss_consistency = 0 + # print(preds[1].size(), target[1].size(), weights, len(preds)) + + # loss for parsing + scale_parse = F.interpolate(input=preds[0], size=(h, w), mode='bilinear', align_corners=True) # parsing + loss_parse += 0.5 * self.lamda_1 * self.lovasz(scale_parse, target[0]) + + if target[3] is None: + loss_parse += 0.5 * self.lamda_1 * self.criterion(scale_parse, target[0]) + else: + soft_scale_parse = F.interpolate(input=target[2], size=(h, w), mode='bilinear', align_corners=True) + soft_scale_parse = moving_average(soft_scale_parse, to_one_hot(target[0], num_cls=self.num_classes), + 1.0 / (cycle_n + 1.0)) + loss_parse += 0.5 * self.lamda_1 * self.kldiv(scale_parse, soft_scale_parse, target[0]) + + # loss for binary edge + scale_binary_edge = F.interpolate(input=preds[1], size=(h, w), mode='bilinear', align_corners=True) # edge + + if target[4] is None: + loss_binary_edge = self.lamda_2 * F.cross_entropy(scale_binary_edge, target[1], binary_weights) + else: + soft_scale_binary_edge = F.interpolate(input=target[4], size=(h, w), mode='bilinear', align_corners=True) + 
soft_scale_binary_edge = moving_average(soft_scale_binary_edge, to_one_hot(target[1], num_cls=2), + 1.0 / (cycle_n + 1.0)) + loss_binary_edge += self.lamda_2 * self.kldiv(scale_binary_edge, soft_scale_binary_edge, target[0]) + + # loss for semantic edge + scale_semantic_edge = F.interpolate(input=preds[2], size=(h, w), mode='bilinear', align_corners=True) # edge + + if target[5] is None: + loss_semantic_edge = self.lamda_3 * F.cross_entropy(scale_semantic_edge, target[2], semantic_weights) + # loss_edge = self.lamda_2 * self.sed(scale_edge, target[1]) + else: + soft_scale_semantic_edge = F.interpolate(input=target[5], size=(h, w), mode='bilinear', align_corners=True) + soft_scale_semantic_edge = moving_average(soft_scale_semantic_edge, to_one_hot(target[2], num_cls=self.num_classes), + 1.0 / (cycle_n + 1.0)) + loss_semantic_edge += self.lamda_3 * self.kldiv(scale_semantic_edge, soft_scale_semantic_edge, target[0]) + + # binary edge attention loss + loss_att_binary_edge_ = self.criterion_weight(scale_parse, target[0]) * target[1].float() + loss_att_binary_edge_ = loss_att_binary_edge_ / binary_edge_p_num # only compute the edge pixels + loss_att_binary_edge_ = torch.sum(loss_att_binary_edge_) / target[1].size(0) # mean for batchsize + + loss_parse += loss_parse + loss_att_binary_edge += loss_att_binary_edge + loss_att_binary_edge += loss_att_binary_edge_ + + # semantic edge attention loss + loss_att_semantic_edge_ = self.criterion_weight(scale_parse, target[0]) * target[2].float() + loss_att_semantic_edge_ = loss_att_semantic_edge_ / semantic_edge_p_num # only compute the edge pixels + loss_att_semantic_edge_ = torch.sum(loss_att_semantic_edge_) / target[2].size(0) # mean for batchsize + + loss_parse += loss_parse + loss_semantic_edge += loss_semantic_edge + loss_att_semantic_edge += loss_att_semantic_edge_ + # loss_consistency += loss_consistency + + # print('loss_parse: {}\t loss_edge: {}\t loss_att_edge: {}\t loss_semantic_edge: 
{}'.format(loss_parse,loss_edge,loss_att_edge, loss_consistency)) + return self.loss_weight[0]*loss_parse + self.loss_weight[1]*loss_binary_edge + self.loss_weight[2]*loss_semantic_edge \ + + self.loss_weight[3]*loss_att_binary_edge + self.loss_weight[4]*loss_att_semantic_edge + + +def moving_average(target1, target2, alpha=1.0): + target = 0 + target += (1.0 - alpha) * target1 + target += target2 * alpha + return target + + +def to_one_hot(tensor, num_cls, dim=1, ignore_index=255): + b, h, w = tensor.shape + tensor[tensor == ignore_index] = 0 + onehot_tensor = torch.zeros(b, num_cls, h, w).cuda() + onehot_tensor.scatter_(dim, tensor.unsqueeze(dim), 1) + return onehot_tensor diff --git a/insightface/parsing/dml_csr/loss/kl_loss.py b/insightface/parsing/dml_csr/loss/kl_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..eb63cb1bb898e5859e04f45279563f2be6d85106 --- /dev/null +++ b/insightface/parsing/dml_csr/loss/kl_loss.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : kl_loss.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : Copyright 2015 The Authors. All Rights Reserved. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn.functional as F +from torch import nn + + +def flatten_probas(input, target, labels, ignore=255): + """ + Flattens predictions in the batch. 
+ """ + B, C, H, W = input.size() + input = input.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C + target = target.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C + labels = labels.view(-1) + if ignore is None: + return input, target + valid = (labels != ignore) + vinput = input[torch.nonzero(valid, as_tuple=False).squeeze()] + vtarget = target[torch.nonzero(valid, as_tuple=False).squeeze()] + return vinput, vtarget + + +class KLDivergenceLoss(nn.Module): + def __init__(self, ignore_index=255, T=1): + super(KLDivergenceLoss, self).__init__() + self.ignore_index=ignore_index + self.T = T + + def forward(self, input, target, label): + log_input_prob = F.log_softmax(input / self.T, dim=1) + target_porb = F.softmax(target / self.T, dim=1) + loss = F.kl_div(*flatten_probas(log_input_prob, target_porb, label, ignore=self.ignore_index), reduction='batchmean') + return self.T*self.T*loss # balanced diff --git a/insightface/parsing/dml_csr/loss/lovasz_softmax.py b/insightface/parsing/dml_csr/loss/lovasz_softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..73916507292e1b2a28a5870f96c32e3e4566a506 --- /dev/null +++ b/insightface/parsing/dml_csr/loss/lovasz_softmax.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : lovaz_softmax.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : Copyright 2015 The Authors. All Rights Reserved. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import torch +import torch.nn.functional as F + +from torch import nn +from torch.autograd import Variable + +try: + from itertools import ifilterfalse +except ImportError: # py3k + from itertools import filterfalse as ifilterfalse + + +def lovasz_grad(gt_sorted): + """ + Computes gradient of the Lovasz extension w.r.t sorted errors + See Alg. 1 in paper + """ + p = len(gt_sorted) + gts = gt_sorted.sum() + intersection = gts - gt_sorted.float().cumsum(0) + union = gts + (1 - gt_sorted).float().cumsum(0) + jaccard = 1. - intersection / union + if p > 1: # cover 1-pixel case + jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] + return jaccard + + +def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True): + """ + IoU for foreground class + binary: 1 foreground, 0 background + """ + if not per_image: + preds, labels = (preds,), (labels,) + ious = [] + for pred, label in zip(preds, labels): + intersection = ((label == 1) & (pred == 1)).sum() + union = ((label == 1) | ((pred == 1) & (label != ignore))).sum() + if not union: + iou = EMPTY + else: + iou = float(intersection) / float(union) + ious.append(iou) + iou = mean(ious) # mean accross images if per_image + return 100 * iou + + +def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False): + """ + Array of IoU for each (non ignored) class + """ + if not per_image: + preds, labels = (preds,), (labels,) + ious = [] + for pred, label in zip(preds, labels): + iou = [] + for i in range(C): + if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes) + intersection = ((label == i) & (pred == i)).sum() + union = ((label == i) | ((pred == i) & (label != ignore))).sum() + if not union: + iou.append(EMPTY) + else: + iou.append(float(intersection) / float(union)) + ious.append(iou) + ious = [mean(iou) for iou in zip(*ious)] # mean accross images 
if per_image + return 100 * np.array(ious) + + +# --------------------------- BINARY LOSSES --------------------------- + + +def lovasz_hinge(logits, labels, per_image=True, ignore=None): + """ + Binary Lovasz hinge loss + logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) + labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) + per_image: compute the loss per image instead of per batch + ignore: void class id + """ + if per_image: + loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore)) + for log, lab in zip(logits, labels)) + else: + loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore)) + return loss + + +def lovasz_hinge_flat(logits, labels): + """ + Binary Lovasz hinge loss + logits: [P] Variable, logits at each prediction (between -\infty and +\infty) + labels: [P] Tensor, binary ground truth labels (0 or 1) + ignore: label to ignore + """ + if len(labels) == 0: + # only void pixels, the gradients should be 0 + return logits.sum() * 0. + signs = 2. * labels.float() - 1. + errors = (1. 
- logits * Variable(signs)) + errors_sorted, perm = torch.sort(errors, dim=0, descending=True) + perm = perm.data + gt_sorted = labels[perm] + grad = lovasz_grad(gt_sorted) + loss = torch.dot(F.relu(errors_sorted), Variable(grad)) + return loss + + +def flatten_binary_scores(scores, labels, ignore=None): + """ + Flattens predictions in the batch (binary case) + Remove labels equal to 'ignore' + """ + scores = scores.view(-1) + labels = labels.view(-1) + if ignore is None: + return scores, labels + valid = (labels != ignore) + vscores = scores[valid] + vlabels = labels[valid] + return vscores, vlabels + + +class StableBCELoss(torch.nn.modules.Module): + def __init__(self): + super(StableBCELoss, self).__init__() + + def forward(self, input, target): + neg_abs = - input.abs() + loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log() + return loss.mean() + + +def binary_xloss(logits, labels, ignore=None): + """ + Binary Cross entropy loss + logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) + labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) + ignore: void class id + """ + logits, labels = flatten_binary_scores(logits, labels, ignore) + loss = StableBCELoss()(logits, Variable(labels.float())) + return loss + + +# --------------------------- MULTICLASS LOSSES --------------------------- + + +def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=255, weighted=None): + """ + Multi-class Lovasz-Softmax loss + probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). + Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. + labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. 
+ per_image: compute the loss per image instead of per batch + ignore: void class labels + """ + if per_image: + loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes, weighted=weighted) + for prob, lab in zip(probas, labels)) + else: + loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes, weighted=weighted ) + return loss + + +def lovasz_softmax_flat(probas, labels, classes='present', weighted=None): + """ + Multi-class Lovasz-Softmax loss + probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1) + labels: [P] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. + """ + if probas.numel() == 0: + # only void pixels, the gradients should be 0 + return probas * 0. + C = probas.size(1) + losses = [] + class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes + for c in class_to_sum: + fg = (labels == c).float() # foreground for class c + if (classes is 'present' and fg.sum() == 0): + continue + if C == 1: + if len(classes) > 1: + raise ValueError('Sigmoid output possible only with 1 class') + class_pred = probas[:, 0] + else: + class_pred = probas[:, c] + errors = (Variable(fg) - class_pred).abs() + errors_sorted, perm = torch.sort(errors, 0, descending=True) + perm = perm.data + fg_sorted = fg[perm] + if weighted is not None: + losses.append(weighted[c]*torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted)))) + else: + losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted)))) + return mean(losses) + + +def flatten_probas(probas, labels, ignore=None): + """ + Flattens predictions in the batch + """ + if probas.dim() == 3: + # assumes output of a sigmoid layer + B, H, W = probas.size() + probas = probas.view(B, 1, H, W) + B, C, H, W = probas.size() + probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = 
P, C + labels = labels.view(-1) + if ignore is None: + return probas, labels + valid = (labels != ignore) + # vprobas = probas[valid.nonzero().squeeze()] + vprobas = probas[torch.nonzero(valid, as_tuple =False).squeeze()] + vlabels = labels[valid] + return vprobas, vlabels + + +def xloss(logits, labels, ignore=None): + """ + Cross entropy loss + """ + return F.cross_entropy(logits, Variable(labels), ignore_index=255) + + +# --------------------------- HELPER FUNCTIONS --------------------------- +def isnan(x): + return x != x + + +def mean(l, ignore_nan=False, empty=0): + """ + nanmean compatible with generators. + """ + l = iter(l) + if ignore_nan: + l = ifilterfalse(isnan, l) + try: + n = 1 + acc = next(l) + except StopIteration: + if empty == 'raise': + raise ValueError('Empty mean') + return empty + for n, v in enumerate(l, 2): + acc += v + if n == 1: + return acc + return acc / n + +# --------------------------- Class --------------------------- +class LovaszSoftmax(nn.Module): + def __init__(self, per_image=False, ignore_index=255, weighted=None): + super(LovaszSoftmax, self).__init__() + self.lovasz_softmax = lovasz_softmax + self.per_image = per_image + self.ignore_index=ignore_index + self.weighted = weighted + + def forward(self, pred, label): + pred = F.softmax(pred, dim=1) + return self.lovasz_softmax(pred, label, per_image=self.per_image, ignore=self.ignore_index, weighted=self.weighted) diff --git a/insightface/parsing/dml_csr/networks/dml_csr.py b/insightface/parsing/dml_csr/networks/dml_csr.py new file mode 100644 index 0000000000000000000000000000000000000000..89b9408ec3959951cfdc88c9bfa5390709079aff --- /dev/null +++ b/insightface/parsing/dml_csr/networks/dml_csr.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : dml_csr.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : 
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding=1 and no bias."""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class DML_CSR(nn.Module):
    """DML-CSR face-parsing network: ResNet-101-style stem/backbone, a dual
    graph-convolution context head, a parsing decoder and (in training mode)
    an auxiliary edge branch.

    Module attribute names are kept exactly as in the reference code so that
    pretrained state dicts load unchanged.
    """

    def __init__(self,
                 num_classes,
                 abn=InPlaceABNSync,
                 trained=True):
        super().__init__()
        self.inplanes = 128
        self.is_trained = trained

        # Stem: three 3x3 convs (stride 2 on the first) + max-pool.
        self.conv1 = conv3x3(3, 64, stride=2)
        self.bn1 = abn(64)
        self.relu1 = nn.ReLU(inplace=False)
        self.conv2 = conv3x3(64, 64)
        self.bn2 = abn(64)
        self.relu2 = nn.ReLU(inplace=False)
        self.conv3 = conv3x3(64, 128)
        self.bn3 = abn(128)
        self.relu3 = nn.ReLU(inplace=False)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # ResNet-101 stage configuration.
        self.layers = [3, 4, 23, 3]
        self.abn = abn
        stage_strides = [1, 2, 1, 1]
        stage_dilations = [1, 1, 1, 2]

        self.layer1 = self._make_layer(Bottleneck, 64, self.layers[0],
                                       stride=stage_strides[0], dilation=stage_dilations[0])
        self.layer2 = self._make_layer(Bottleneck, 128, self.layers[1],
                                       stride=stage_strides[1], dilation=stage_dilations[1])
        self.layer3 = self._make_layer(Bottleneck, 256, self.layers[2],
                                       stride=stage_strides[2], dilation=stage_dilations[2])
        self.layer4 = self._make_layer(Bottleneck, 512, self.layers[3],
                                       stride=stage_strides[3], dilation=stage_dilations[3],
                                       multi_grid=(1, 1, 1))
        # Context-aware head and parsing decoder.
        self.context = DDualGCNHead(2048, 512, abn)
        self.layer6 = Parsing(512, 256, num_classes, abn)
        # Edge branch only exists in training mode.
        if self.is_trained:
            self.edge_layer = Edges(abn, out_fea=num_classes)

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
        """Build one residual stage of `blocks` Bottleneck units."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Projection shortcut when the shape changes.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                self.abn(planes * block.expansion, affine=True))

        def grid_at(index, grids):
            # Per-unit multi-grid dilation factor; 1 unless a tuple is given.
            return grids[index % len(grids)] if isinstance(grids, tuple) else 1

        units = [block(self.inplanes, planes, stride, abn=self.abn,
                       dilation=dilation, downsample=downsample,
                       multi_grid=grid_at(0, multi_grid))]
        self.inplanes = planes * block.expansion
        for idx in range(1, blocks):
            units.append(block(self.inplanes, planes, abn=self.abn,
                               dilation=dilation, multi_grid=grid_at(idx, multi_grid)))
        return nn.Sequential(*units)

    def forward(self, x):
        input = x
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.relu2(self.bn2(self.conv2(x)))
        x1 = self.relu3(self.bn3(self.conv3(x)))
        x = self.maxpool(x1)
        x2 = self.layer1(x)   # 119 x 119
        x3 = self.layer2(x2)  # 60 x 60
        x4 = self.layer3(x3)  # 60 x 60
        x5 = self.layer4(x4)  # 60 x 60
        x = self.context(x5)
        seg, x = self.layer6(x, x2)

        if self.is_trained:
            # Auxiliary edge supervision from intermediate backbone features.
            binary_edge, semantic_edge, edge_fea = self.edge_layer(x2, x3, x4)
            return seg, binary_edge, semantic_edge

        return seg
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn.functional as F +import torch.nn as nn + +from inplace_abn import InPlaceABNSync + + +class SpatialGCN(nn.Module): + def __init__(self, plane, abn=InPlaceABNSync): + super(SpatialGCN, self).__init__() + inter_plane = plane // 2 + self.node_k = nn.Conv2d(plane, inter_plane, kernel_size=1) + self.node_v = nn.Conv2d(plane, inter_plane, kernel_size=1) + self.node_q = nn.Conv2d(plane, inter_plane, kernel_size=1) + + self.conv_wg = nn.Conv1d(inter_plane, inter_plane, kernel_size=1, bias=False) + self.bn_wg = nn.BatchNorm1d(inter_plane) + self.softmax = nn.Softmax(dim=2) + + self.out = nn.Sequential(nn.Conv2d(inter_plane, plane, kernel_size=1), + abn(plane)) + + self.gamma = nn.Parameter(torch.zeros(1)) + + def forward(self, x): + # b, c, h, w = x.size() + node_k = self.node_k(x) + node_v = self.node_v(x) + node_q = self.node_q(x) + b,c,h,w = node_k.size() + node_k = node_k.view(b, c, -1).permute(0, 2, 1) + node_q = node_q.view(b, c, -1) + node_v = node_v.view(b, c, -1).permute(0, 2, 1) + # A = k * q + # AV = k * q * v + # AVW = k *(q *v) * w + AV = torch.bmm(node_q,node_v) + AV = self.softmax(AV) + AV = torch.bmm(node_k, AV) + AV = AV.transpose(1, 2).contiguous() + AVW = self.conv_wg(AV) + AVW = self.bn_wg(AVW) + AVW = AVW.view(b, c, h, -1) + # out = F.relu_(self.out(AVW) + x) + out = self.gamma * self.out(AVW) + x + return out + + +class DDualGCN(nn.Module): + """ + Feature GCN with coordinate GCN + """ + def __init__(self, planes, abn=InPlaceABNSync, ratio=4): + super(DDualGCN, self).__init__() + + self.phi = nn.Conv2d(planes, planes // ratio * 2, kernel_size=1, bias=False) + self.bn_phi = abn(planes // ratio * 2) + self.theta = nn.Conv2d(planes, planes // ratio, kernel_size=1, bias=False) + self.bn_theta = abn(planes // ratio) + + # Interaction Space + # Adjacency Matrix: (-)A_g + self.conv_adj = nn.Conv1d(planes // 
ratio, planes // ratio, kernel_size=1, bias=False) + self.bn_adj = nn.BatchNorm1d(planes // ratio) + + # State Update Function: W_g + self.conv_wg = nn.Conv1d(planes // ratio * 2, planes // ratio * 2, kernel_size=1, bias=False) + self.bn_wg = nn.BatchNorm1d(planes // ratio * 2) + + # last fc + self.conv3 = nn.Conv2d(planes // ratio * 2, planes, kernel_size=1, bias=False) + self.bn3 = abn(planes) + + self.local = nn.Sequential( + nn.Conv2d(planes, planes, 3, groups=planes, stride=2, padding=1, bias=False), + abn(planes), + nn.Conv2d(planes, planes, 3, groups=planes, stride=2, padding=1, bias=False), + abn(planes), + nn.Conv2d(planes, planes, 3, groups=planes, stride=2, padding=1, bias=False), + abn(planes)) + self.gcn_local_attention = SpatialGCN(planes, abn) + + self.final = nn.Sequential(nn.Conv2d(planes * 2, planes, kernel_size=1, bias=False), + abn(planes)) + + self.gamma1 = nn.Parameter(torch.zeros(1)) + + def to_matrix(self, x): + n, c, h, w = x.size() + x = x.view(n, c, -1) + return x + + def forward(self, feat): + # # # # Local # # # # + x = feat + local = self.local(feat) + local = self.gcn_local_attention(local) + local = F.interpolate(local, size=x.size()[2:], mode='bilinear', align_corners=True) + spatial_local_feat = x * local + x + + # # # # Projection Space # # # # + x_sqz, b = x, x + + x_sqz = self.phi(x_sqz) + x_sqz = self.bn_phi(x_sqz) + x_sqz = self.to_matrix(x_sqz) + + b = self.theta(b) + b = self.bn_theta(b) + b = self.to_matrix(b) + + # Project + z_idt = torch.matmul(x_sqz, b.transpose(1, 2)) # channel + + # # # # Interaction Space # # # # + z = z_idt.transpose(1, 2).contiguous() + + z = self.conv_adj(z) + z = self.bn_adj(z) + + z = z.transpose(1, 2).contiguous() + # Laplacian smoothing: (I - A_g)Z => Z - A_gZ + z += z_idt + + z = self.conv_wg(z) + z = self.bn_wg(z) + + # # # # Re-projection Space # # # # + # Re-project + y = torch.matmul(z, b) + + n, _, h, w = x.size() + y = y.view(n, -1, h, w) + + y = self.conv3(y) + y = self.bn3(y) + + # 
g_out = x + y + # g_out = F.relu_(x+y) + g_out = self.gamma1*y + x + + # cat or sum, nearly the same results + out = self.final(torch.cat((spatial_local_feat, g_out), 1)) + + return out + + +class DDualGCNHead(nn.Module): + def __init__(self, inplanes, interplanes, abn=InPlaceABNSync): + super(DDualGCNHead, self).__init__() + self.conva = nn.Sequential(nn.Conv2d(inplanes, interplanes, 3, padding=1, bias=False), + abn(interplanes)) + self.dualgcn = DDualGCN(interplanes, abn) + self.convb = nn.Sequential(nn.Conv2d(interplanes, interplanes, 3, padding=1, bias=False), + abn(interplanes)) + + self.bottleneck = nn.Sequential( + nn.Conv2d(inplanes + interplanes, interplanes, kernel_size=3, padding=1, dilation=1, bias=False), + abn(interplanes) + ) + + def forward(self, x): + output = self.conva(x) + output = self.dualgcn(output) + output = self.convb(output) + output = self.bottleneck(torch.cat([x, output], 1)) + return output diff --git a/insightface/parsing/dml_csr/networks/modules/edges.py b/insightface/parsing/dml_csr/networks/modules/edges.py new file mode 100644 index 0000000000000000000000000000000000000000..5059f235dbf8f9866b99764f46af0c3738d3e675 --- /dev/null +++ b/insightface/parsing/dml_csr/networks/modules/edges.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : edges.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : Copyright 2022 The Authors. All Rights Reserved. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn.functional as F +import torch.nn as nn + +from inplace_abn import InPlaceABNSync + + +class Edges(nn.Module): + + def __init__(self, abn=InPlaceABNSync, in_fea=[256,512,1024], mid_fea=256, out_fea=2): + super(Edges, self).__init__() + + self.conv1 = nn.Sequential( + nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False), + abn(mid_fea) + ) + self.conv2 = nn.Sequential( + nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False), + abn(mid_fea) + ) + self.conv3 = nn.Sequential( + nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False), + abn(mid_fea) + ) + self.conv4 = nn.Conv2d(mid_fea,out_fea, kernel_size=3, padding=1, dilation=1, bias=True) + self.conv5_b = nn.Conv2d(out_fea*3,2, kernel_size=1, padding=0, dilation=1, bias=True) + self.conv5 = nn.Conv2d(out_fea*3,out_fea, kernel_size=1, padding=0, dilation=1, bias=True) + + + def forward(self, x1, x2, x3): + _, _, h, w = x1.size() + + edge1_fea = self.conv1(x1) + edge1 = self.conv4(edge1_fea) + edge2_fea = self.conv2(x2) + edge2 = self.conv4(edge2_fea) + edge3_fea = self.conv3(x3) + edge3 = self.conv4(edge3_fea) + + edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear',align_corners=True) + edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear',align_corners=True) + edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear',align_corners=True) + edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear',align_corners=True) + + edge = torch.cat([edge1, edge2, edge3], dim=1) + edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1) + semantic_edge = self.conv5(edge) + binary_edge = self.conv5_b(edge) + + return binary_edge, semantic_edge, edge_fea + diff --git a/insightface/parsing/dml_csr/networks/modules/parsing.py 
b/insightface/parsing/dml_csr/networks/modules/parsing.py new file mode 100644 index 0000000000000000000000000000000000000000..ed1addb31907193c1c3642ba2c2db81bdf4aa813 --- /dev/null +++ b/insightface/parsing/dml_csr/networks/modules/parsing.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : parsing.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : Copyright 2022 The Authors. All Rights Reserved. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn.functional as F +import torch.nn as nn + +from inplace_abn import InPlaceABNSync + + +class Parsing(nn.Module): + def __init__(self, in_plane1, in_plane2, num_classes, abn=InPlaceABNSync): + super(Parsing, self).__init__() + self.conv1 = nn.Sequential( + nn.Conv2d(in_plane1, 256, kernel_size=1, padding=0, dilation=1, bias=False), + abn(256) + ) + self.conv2 = nn.Sequential( + nn.Conv2d(in_plane2, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False), + abn(48) + ) + self.conv3 = nn.Sequential( + nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False), + abn(256), + nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False), + abn(256) + ) + self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True) + + def forward(self, xt, xl): + _, _, h, w = xl.size() + + xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True) + xl = self.conv2(xl) + x = torch.cat([xt, xl], dim=1) + x = self.conv3(x) + seg = self.conv4(x) + return seg, x + diff --git a/insightface/parsing/dml_csr/networks/modules/util.py b/insightface/parsing/dml_csr/networks/modules/util.py new file mode 100644 index 0000000000000000000000000000000000000000..fbe23dd8e5ac40a469783972a9694263b4962a73 --- 
/dev/null +++ b/insightface/parsing/dml_csr/networks/modules/util.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : util.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : Copyright 2022 The Authors. All Rights Reserved. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch.nn as nn + +from inplace_abn import InPlaceABNSync + + +class Bottleneck(nn.Module): + expansion = 4 + def __init__(self, inplanes, planes, stride=1, abn=InPlaceABNSync, dilation=1, downsample=None, fist_dilation=1, multi_grid=1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = abn(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=dilation*multi_grid, dilation=dilation*multi_grid, bias=False) + self.bn2 = abn(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = abn(planes * 4) + self.relu = nn.ReLU(inplace=False) + self.relu_inplace = nn.ReLU(inplace=True) + self.downsample = downsample + self.dilation = dilation + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out = out + residual + out = self.relu_inplace(out) + + return out diff --git a/insightface/parsing/dml_csr/test.py b/insightface/parsing/dml_csr/test.py new file mode 100644 index 0000000000000000000000000000000000000000..3aad2527a142e30795e09016ce4c2a0c61a27d86 --- /dev/null +++ b/insightface/parsing/dml_csr/test.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python +# -*- 
torch.multiprocessing.set_start_method("spawn", force=True)

DATA_DIRECTORY = './datasets/Helen'
IGNORE_LABEL = 255
NUM_CLASSES = 20
SNAPSHOT_DIR = './snapshots/'
# BUGFIX: this is the argparse default for --input-size, and main() parses it
# with `args.input_size.split(',')`. The previous tuple default (473, 473)
# raised AttributeError whenever the flag was omitted; keep it a string.
INPUT_SIZE = '473,473'


def get_arguments():
    """Parse all the arguments provided from the CLI.

    Returns:
      A list of parsed arguments.
    """
    parser = argparse.ArgumentParser(description="DML_CSR Network")
    parser.add_argument("--batch-size", type=int, default=1,
                        help="Number of images sent to the network in one step.")
    parser.add_argument("--data-dir", type=str, default=DATA_DIRECTORY,
                        help="Path to the directory containing the PASCAL VOC dataset.")
    parser.add_argument("--out-dir", type=str, default=DATA_DIRECTORY,
                        help="Path to the directory containing the PASCAL VOC dataset.")
    parser.add_argument("--dataset", type=str, default='val',
                        help="Path to the file listing the images in the dataset.")
    parser.add_argument("--ignore-label", type=int, default=IGNORE_LABEL,
                        help="The index of the label to ignore during the training.")
    parser.add_argument("--num-classes", type=int, default=NUM_CLASSES,
                        help="Number of classes to predict (including background).")
    parser.add_argument("--restore-from", type=str,
                        help="Where restore model parameters from.")
    parser.add_argument("--gpu", type=str, default='7',
                        help="choose gpu device.")
    parser.add_argument("--input-size", type=str, default=INPUT_SIZE,
                        help="Comma-separated string with height and width of images.")
    parser.add_argument("--local_rank", type=int, default=0,
                        help="choose gpu numbers")
    parser.add_argument('--dist-backend', default='nccl', type=str,
                        help='distributed backend')
    parser.add_argument("--model_type", type=int, default=0,
                        help="choose model type")
    return parser.parse_args()


def valid(model, valloader, input_size, num_samples, dir=None, dir_edge=None, dir_img=None):
    """Run the model over `valloader` and collect resized argmax predictions.

    Returns (parsing_preds, scales, centers); predictions are uint8 label maps
    of shape (num_samples, H, W). `dir`, when given, receives per-image PNGs.
    NOTE(review): `dir_edge`/`dir_img` are unused; kept for caller compatibility.
    """
    height = input_size[0]
    width = input_size[1]
    with torch.autograd.profiler.profile(enabled=True, use_cuda=True,
                                         record_shapes=False, profile_memory=False) as prof:
        model.eval()
        parsing_preds = np.zeros((num_samples, height, width), dtype=np.uint8)
        scales = np.zeros((num_samples, 2), dtype=np.float32)
        centers = np.zeros((num_samples, 2), dtype=np.int32)

        idx = 0
        interp = torch.nn.Upsample(size=(height, width), mode='bilinear', align_corners=True)

        with torch.no_grad():
            for index, batch in enumerate(valloader):
                image, meta = batch
                num_images = image.size(0)
                if index % 10 == 0:
                    print('%d processd' % (index * num_images))

                c = meta['center'].numpy()
                s = meta['scale'].numpy()
                scales[idx:idx + num_images, :] = s[:, :]
                centers[idx:idx + num_images, :] = c[:, :]

                results = model(image.cuda())
                outputs = results

                if isinstance(results, list):
                    outputs = results[0]

                if isinstance(outputs, list):
                    for k, output in enumerate(outputs):
                        parsing = output
                        nums = len(parsing)
                        parsing = interp(parsing).data.cpu().numpy()
                        parsing = parsing.transpose(0, 2, 3, 1)  # NCHW -> NHWC
                        parsing_preds[idx:idx + nums, :, :] = np.asarray(np.argmax(parsing, axis=3), dtype=np.uint8)
                        idx += nums
                else:
                    parsing = outputs
                    parsing = interp(parsing).data.cpu().numpy()
                    parsing = parsing.transpose(0, 2, 3, 1)  # NCHW -> NHWC
                    parsing_preds[idx:idx + num_images, :, :] = np.asarray(np.argmax(parsing, axis=3), dtype=np.uint8)

                    if dir is not None:
                        for i in range(len(meta['name'])):
                            cv2.imwrite(os.path.join(dir, meta['name'][i] + '.png'),
                                        np.asarray(np.argmax(parsing, axis=3))[i])
                    idx += num_images
        parsing_preds = parsing_preds[:num_samples, :, :]

    return parsing_preds, scales, centers


def main():
    """Create the model and start the evaluation process."""

    args = get_arguments()

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    gpus = [int(i) for i in args.gpu.split(',')]

    print(args.gpu)

    # Works for both the string default and CLI-provided "H,W" values.
    h, w = map(int, args.input_size.split(','))

    input_size = (h, w)

    cudnn.benchmark = True
    cudnn.enabled = True

    model = dml_csr.DML_CSR(args.num_classes, InPlaceABN, False)
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    transform = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    dataset = datasets.FaceDataSet(args.data_dir, args.dataset,
                                   crop_size=input_size, transform=transform)
    num_samples = len(dataset)

    valloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                            shuffle=False, pin_memory=True)

    restore_from = args.restore_from
    print(restore_from)
    state_dict = torch.load(restore_from, map_location='cuda:0')
    model.load_state_dict(state_dict)

    model.cuda()
    model.eval()

    save_path = os.path.join(args.out_dir, args.dataset, 'parsing')
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    parsing_preds, scales, centers = valid(model, valloader, input_size, num_samples, save_path)
    mIoU, f1 = miou.compute_mean_ioU(parsing_preds, scales, centers, args.num_classes,
                                     args.data_dir, input_size, args.dataset, reverse=True)

    print(mIoU)
    print(f1)


if __name__ == '__main__':
    main()
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import os.path as osp +import timeit +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim as optim +import torchvision.transforms as transforms +import torchvision.utils as vutils +import utils.schp as schp + +from datetime import datetime +from tensorboardX import SummaryWriter +from torch.utils.data.distributed import DistributedSampler +from inplace_abn import InPlaceABNSync +from inplace_abn import InPlaceABN + +from dataset import datasets +from networks import dml_csr +from utils.logging import get_root_logger +from utils.utils import decode_parsing, inv_preprocess, SingleGPU +from utils.encoding import DataParallelModel, DataParallelCriterion +from utils.miou import compute_mean_ioU +from utils.warmup_scheduler import SGDRScheduler +from loss.criterion import Criterion +from test import valid + +torch.multiprocessing.set_start_method("spawn", force=True) + +RESTORE_FROM = 'resnet101-imagenet.pth' + + +def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +def get_arguments(): + """Parse all the arguments provided from the CLI. + + Returns: + A list of parsed arguments. 
+ """ + parser = argparse.ArgumentParser(description="Training Network") + # Data Preference + parser.add_argument("--data-dir", type=str, default='./datasets', + help="Path to the directory containing the dataset.") + parser.add_argument("--train-dataset", type=str, default='train', choices=['train', 'valid', 'test'], + help="Path to the file listing the images in the dataset.") + parser.add_argument("--valid-dataset", type=str, default='test', choices=['valid', 'test_resize', 'test'], + help="Path to the file listing the images in the dataset.") + parser.add_argument("--batch-size", type=int, default=8, + help="Number of images sent to the network in one step.") + parser.add_argument("--input-size", type=str, default='473,473', + help="Comma-separated string with height and width of images.") + parser.add_argument("--num-classes", type=int, default=11, + help="Number of classes to predict (including background).") + parser.add_argument("--edge-classes", type=int, default=2, + help="Number of classes to predict (including background).") + parser.add_argument("--ignore-label", type=int, default=255, + help="The index of the label to ignore during the training.") + parser.add_argument("--random-mirror", action="store_true", + help="Whether to randomly mirror the inputs during the training.") + parser.add_argument("--random-scale", action="store_true", + help="Whether to randomly scale the inputs during the training.") + parser.add_argument("--random-seed", type=int, default=1234, + help="Random seed to have reproducible results.") + # Model + parser.add_argument("--restore-from", type=str, default=None, + help="Where restore model parameters from.") + parser.add_argument("--snapshot-dir", type=str, default='./snapshots/', + help="Where to save snapshots of the model.") + # Training Strategy + parser.add_argument("--save-num-images", type=int, default=2, + help="How many images to save.") + parser.add_argument("--learning-rate", type=float, default=1e-3, + help="Base 
learning rate for training with polynomial decay.") + parser.add_argument("--momentum", type=float, default=0.9, + help="Momentum component of the optimiser.") + parser.add_argument("--weight-decay", type=float, default=0.0005, + help="Regularisation parameter for L2-loss.") + parser.add_argument("--power", type=float, default=0.9, + help="Decay parameter to compute the learning rate.") + parser.add_argument("--eval_epochs", type=int, default=1, + help="Number of classes to predict (including background).") + parser.add_argument("--start-epoch", type=int, default=0, + help="choose the number of recurrence.") + parser.add_argument("--epochs", type=int, default=150, + help="choose the number of recurrence.") + # Distributed Training + parser.add_argument("--gpu", type=str, default='None', + help="choose gpu numbers") + parser.add_argument("--local_rank", type=int, default=0, + help="choose gpu numbers") + parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') + # self correlation training + parser.add_argument("--schp-start", type=int, default=100, + help='schp start epoch') + parser.add_argument("--cycle-epochs", type=int, default=10, + help='schp cyclical epoch') + parser.add_argument("--schp-restore", type=str, default=None, + help="Where restore schp model parameters from.") + parser.add_argument("--lambda-s", type=float, default=1, + help='segmentation loss weight') + parser.add_argument("--lambda-e", type=float, default=1, + help='edge loss weight') + parser.add_argument("--lambda-c", type=float, default=0.1, + help='segmentation-edge consistency loss weight') + return parser.parse_args() + + +args = get_arguments() +TIMESTAMP = "{0:%Y_%m_%dT%H_%M_%S/}".format(datetime.now()) +global writer + +def lr_poly(base_lr, iter, max_iter, power): + return base_lr * ((1 - float(iter) / max_iter) ** (power)) + + +def adjust_learning_rate(optimizer, i_iter, total_iters): + """Sets the learning rate to the initial LR divided by 5 at 
60th, 120th and 160th epochs""" + lr = lr_poly(args.learning_rate, i_iter, total_iters, args.power) + optimizer.param_groups[0]['lr'] = lr + return lr + + +def adjust_learning_rate_pose(optimizer, epoch): + decay = 0 + if epoch + 1 >= 230: + decay = 0.05 + elif epoch + 1 >= 200: + decay = 0.1 + elif epoch + 1 >= 120: + decay = 0.25 + elif epoch + 1 >= 90: + decay = 0.5 + else: + decay = 1 + + lr = args.learning_rate * decay + for param_group in optimizer.param_groups: + param_group['lr'] = lr + return lr + + +def set_bn_eval(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1: + m.eval() + + +def set_bn_momentum(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1 or classname.find('InPlaceABN') != -1: + m.momentum = 0.0003 + + +def main(): + """Create the model and start the training.""" + cycle_n = 0 + start_epoch = args.start_epoch + writer = SummaryWriter(osp.join(args.snapshot_dir, TIMESTAMP)) + if not os.path.exists(args.snapshot_dir): + os.makedirs(args.snapshot_dir) + + h, w = map(int, args.input_size.split(',')) + input_size = [h, w] + best_f1 = 0 + + torch.cuda.set_device(args.local_rank) + + try: + world_size = int(os.environ['WORLD_SIZE']) + distributed = world_size > 1 + except: + distributed = False + world_size = 1 + if distributed: + dist.init_process_group(backend=args.dist_backend, init_method='env://') + rank = 0 if not distributed else dist.get_rank() + + log_file = args.snapshot_dir + '/' + TIMESTAMP + 'output.log' + logger = get_root_logger(log_file=log_file, log_level='INFO') + logger.info(f'Distributed training: {distributed}') + + cudnn.enabled = True + cudnn.benchmark = True + torch.backends.cudnn.deterministic = False + torch.backends.cudnn.enabled = True + + if distributed: + model = dml_csr.DML_CSR(args.num_classes) + schp_model = dml_csr.DML_CSR(args.num_classes) + else: + model = dml_csr.DML_CSR(args.num_classes, InPlaceABN) + schp_model = dml_csr.DML_CSR(args.num_classes, InPlaceABN) 
+ + if args.restore_from is not None: + print('Resume training from {}'.format(args.restore_from)) + model.load_state_dict(torch.load(args.restore_from), True) + start_epoch = int(float(args.restore_from.split('.')[0].split('_')[-1])) + 1 + else: + resnet_params = torch.load(RESTORE_FROM) + new_params = model.state_dict().copy() + for i in resnet_params: + i_parts = i.split('.') + if not i_parts[0] == 'fc': + new_params['.'.join(i_parts[0:])] = resnet_params[i] + model.load_state_dict(new_params) + model.cuda() + + args.schp_restore = osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth') + if os.path.exists(args.schp_restore): + print('Resume schp checkpoint from {}'.format(args.schp_restore)) + schp_model.load_state_dict(torch.load(args.schp_restore), True) + else: + schp_resnet_params = torch.load(RESTORE_FROM) + schp_new_params = schp_model.state_dict().copy() + for i in schp_resnet_params: + i_parts = i.split('.') + if not i_parts[0] == 'fc': + schp_new_params['.'.join(i_parts[0:])] = schp_resnet_params[i] + schp_model.load_state_dict(schp_new_params) + schp_model.cuda() + + if distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) + schp_model = torch.nn.parallel.DistributedDataParallel(schp_model, device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) + else: + model = SingleGPU(model) + schp_model = SingleGPU(schp_model) + + criterion = Criterion(loss_weight=[1, 1, 1, 4, 1], + lambda_1=args.lambda_s, lambda_2=args.lambda_e, lambda_3=args.lambda_c, num_classes=args.num_classes) + criterion.cuda() + + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + transform = transforms.Compose([transforms.ToTensor(), normalize]) + + train_dataset = FaceDataSet(args.data_dir, args.train_dataset, crop_size=input_size, transform=transform) + if distributed: + train_sampler = 
torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + trainloader = data.DataLoader(train_dataset, batch_size=args.batch_size , shuffle=False, num_workers=2, + pin_memory=True, drop_last=True, sampler=train_sampler) + + val_dataset = datasets[str(args.model_type)](args.data_dir, args.valid_dataset, crop_size=input_size, transform=transform) + num_samples = len(val_dataset) + valloader = data.DataLoader(val_dataset, batch_size=args.batch_size , shuffle=False, pin_memory=True, drop_last=False) + + # Optimizer Initialization + optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, + weight_decay=args.weight_decay) + lr_scheduler = SGDRScheduler(optimizer, total_epoch=args.epochs, + eta_min=args.learning_rate / 100, warmup_epoch=10, + start_cyclical=args.schp_start, cyclical_base_lr=args.learning_rate / 2, + cyclical_epoch=args.cycle_epochs) + + optimizer.zero_grad() + + total_iters = args.epochs * len(trainloader) + start = timeit.default_timer() + for epoch in range(start_epoch, args.epochs): + model.train() + if distributed: + train_sampler.set_epoch(epoch) + for i_iter, batch in enumerate(trainloader): + i_iter += len(trainloader) * epoch + + if epoch < args.schp_start: + lr = adjust_learning_rate(optimizer, i_iter, total_iters) + else: + lr = lr_scheduler.get_lr()[0] + + images, labels, edges, semantic_edges, _ = batch + labels = labels.long().cuda(non_blocking=True) + edges = edges.long().cuda(non_blocking=True) + semantic_edges = semantic_edges.long().cuda(non_blocking=True) + + preds = model(images) + + if cycle_n >= 1: + with torch.no_grad(): + soft_preds, soft_edges, soft_semantic_edges = schp_model(images) + else: + soft_preds = None + soft_edges = None + soft_semantic_edges = None + + loss = criterion(preds, [labels, edges, semantic_edges, soft_preds, soft_edges, soft_semantic_edges], cycle_n) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + lr_scheduler.step() + + 
with torch.no_grad(): + loss = loss.detach() * labels.shape[0] + count = labels.new_tensor([labels.shape[0]], dtype=torch.long) + if dist.is_initialized(): + dist.all_reduce(count, dist.ReduceOp.SUM) + dist.all_reduce(loss, dist.ReduceOp.SUM) + loss /= count.item() + + if not dist.is_initialized() or dist.get_rank() == 0: + if i_iter % 50 == 0: + writer.add_scalar('learning_rate', lr, i_iter) + writer.add_scalar('loss', loss.data.cpu().numpy(), i_iter) + + if i_iter % 500 == 0: + images_inv = inv_preprocess(images, args.save_num_images) + labels_colors = decode_parsing(labels, args.save_num_images, args.num_classes, is_pred=False) + edges_colors = decode_parsing(edges, args.save_num_images, 2, is_pred=False) + semantic_edges_colors = decode_parsing(semantic_edges, args.save_num_images, args.num_classes, is_pred=False) + + if isinstance(preds, list): + preds = preds[0] + preds_colors = decode_parsing(preds[0], args.save_num_images, args.num_classes, is_pred=True) + pred_edges = decode_parsing(preds[1], args.save_num_images, 2, is_pred=True) + pred_semantic_edges_colors = decode_parsing(preds[2], args.save_num_images, args.num_classes, is_pred=True) + + img = vutils.make_grid(images_inv, normalize=False, scale_each=True) + lab = vutils.make_grid(labels_colors, normalize=False, scale_each=True) + pred = vutils.make_grid(preds_colors, normalize=False, scale_each=True) + edge = vutils.make_grid(edges_colors, normalize=False, scale_each=True) + pred_edge = vutils.make_grid(pred_edges, normalize=False, scale_each=True) + pred_semantic_edges = vutils.make_grid(pred_semantic_edges_colors, normalize=False, scale_each=True) + + + writer.add_image('Images/', img, i_iter) + writer.add_image('Labels/', lab, i_iter) + writer.add_image('Preds/', pred, i_iter) + writer.add_image('Edge/', edge, i_iter) + writer.add_image('Pred_edge/', pred_edge, i_iter) + + cur_loss = loss.data.cpu().numpy() + logger.info(f'iter = {i_iter} of {total_iters} completed, loss = {cur_loss}, lr = {lr}') + 
+ if (epoch + 1) % (args.eval_epochs) == 0: + parsing_preds, scales, centers = valid(model, valloader, input_size, num_samples) + mIoU, f1 = compute_mean_ioU(parsing_preds, scales, centers, args.num_classes, args.data_dir, input_size, args.valid_dataset, True) + + if not dist.is_initialized() or dist.get_rank() == 0: + torch.save(model.module.state_dict(), osp.join(args.snapshot_dir, TIMESTAMP, 'checkpoint_{}.pth'.format(epoch + 1))) + if 'Helen' in args.data_dir: + if f1['overall'] > best_f1: + torch.save(model.module.state_dict(), osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth')) + best_f1 = f1['overall'] + else: + if f1['Mean_F1'] > best_f1: + torch.save(model.module.state_dict(), osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth')) + best_f1 = f1['Mean_F1'] + + writer.add_scalars('mIoU', mIoU, epoch) + writer.add_scalars('f1', f1, epoch) + logger.info(f'mIoU = {mIoU}, and f1 = {f1} of epoch = {epoch}, util now, best_f1 = {best_f1}') + + if (epoch + 1) >= args.schp_start and (epoch + 1 - args.schp_start) % args.cycle_epochs == 0: + logger.info(f'Self-correction cycle number {cycle_n}') + schp.moving_average(schp_model, model, 1.0 / (cycle_n + 1)) + cycle_n += 1 + schp.bn_re_estimate(trainloader, schp_model) + parsing_preds, scales, centers = valid(schp_model, valloader, input_size, num_samples) + mIoU, f1 = compute_mean_ioU(parsing_preds, scales, centers, args.num_classes, args.data_dir, input_size, args.valid_dataset, True) + + if not dist.is_initialized() or dist.get_rank() == 0: + torch.save(schp_model.module.state_dict(), osp.join(args.snapshot_dir, TIMESTAMP, 'schp_{}_checkpoint.pth'.format(cycle_n))) + + if 'Helen' in args.data_dir: + if f1['overall'] > best_f1: + torch.save(schp_model.module.state_dict(), osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth')) + best_f1 = f1['overall'] + else: + if f1['Mean_F1'] > best_f1: + torch.save(schp_model.module.state_dict(), osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth')) + best_f1 = f1['Mean_F1'] + 
writer.add_scalars('mIoU', mIoU, epoch) + writer.add_scalars('f1', f1, epoch) + logger.info(f'mIoU = {mIoU}, and f1 = {f1} of epoch = {epoch}, util now, best_f1 = {best_f1}') + + torch.cuda.empty_cache() + end = timeit.default_timer() + print('epoch = {} of {} completed using {} s'.format(epoch, args.epochs, + (end - start) / (epoch - start_epoch + 1))) + + end = timeit.default_timer() + print(end - start, 'seconds') + + +if __name__ == '__main__': + main() diff --git a/insightface/parsing/dml_csr/utils/encoding.py b/insightface/parsing/dml_csr/utils/encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..098ca8185d4695584522a5a1abccc58a6609c1b3 --- /dev/null +++ b/insightface/parsing/dml_csr/utils/encoding.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +""" +@Author : Qingping Zheng +@Contact : qingpingzheng2014@gmail.com +@File : encoding.py +@Time : 10/01/21 00:00 PM +@Desc : +@License : Licensed under the Apache License, Version 2.0 (the "License"); +@Copyright : Copyright 2022 The Authors. All Rights Reserved. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import threading +import torch +import torch.cuda.comm as comm + +from torch.autograd import Variable, Function +from torch.nn.parallel.data_parallel import DataParallel +from torch.nn.parallel.parallel_apply import get_a_var +from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast + +torch_ver = torch.__version__[:3] + +__all__ = ['allreduce', 'DataParallelModel', 'DataParallelCriterion', + 'patch_replication_callback'] + +def allreduce(*inputs): + """Cross GPU all reduce autograd operation for calculate mean and + variance in SyncBN. 
+ """ + return AllReduce.apply(*inputs) + +class AllReduce(Function): + @staticmethod + def forward(ctx, num_inputs, *inputs): + ctx.num_inputs = num_inputs + ctx.target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)] + inputs = [inputs[i:i + num_inputs] + for i in range(0, len(inputs), num_inputs)] + # sort before reduce sum + inputs = sorted(inputs, key=lambda i: i[0].get_device()) + results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0]) + outputs = comm.broadcast_coalesced(results, ctx.target_gpus) + return tuple([t for tensors in outputs for t in tensors]) + + @staticmethod + def backward(ctx, *inputs): + inputs = [i.data for i in inputs] + inputs = [inputs[i:i + ctx.num_inputs] + for i in range(0, len(inputs), ctx.num_inputs)] + results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0]) + outputs = comm.broadcast_coalesced(results, ctx.target_gpus) + return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors]) + + +class Reduce(Function): + @staticmethod + def forward(ctx, *inputs): + ctx.target_gpus = [inputs[i].get_device() for i in range(len(inputs))] + inputs = sorted(inputs, key=lambda i: i.get_device()) + return comm.reduce_add(inputs) + + @staticmethod + def backward(ctx, gradOutput): + return Broadcast.apply(ctx.target_gpus, gradOutput) + + +class DataParallelModel(DataParallel): + """Implements data parallelism at the module level. + + This container parallelizes the application of the given module by + splitting the input across the specified devices by chunking in the + batch dimension. + In the forward pass, the module is replicated on each device, + and each replica handles a portion of the input. During the backwards pass, gradients from each replica are summed into the original module. + Note that the outputs are not gathered, please use compatible + :class:`encoding.parallel.DataParallelCriterion`. + + The batch size should be larger than the number of GPUs used. 
It should + also be an integer multiple of the number of GPUs so that each chunk is + the same size (so that each GPU processes the same number of samples). + + Args: + module: module to be parallelized + device_ids: CUDA devices (default: all devices) + + Reference: + Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, + Amit Agrawal. “Context Encoding for Semantic Segmentation. + *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018* + + Example:: + + >>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2]) + >>> y = net(x) + """ + def gather(self, outputs, output_device): + return outputs + + def replicate(self, module, device_ids): + modules = super(DataParallelModel, self).replicate(module, device_ids) + execute_replication_callbacks(modules) + return modules + + +class DataParallelCriterion(DataParallel): + """ + Calculate loss in multiple-GPUs, which balance the memory usage for + Semantic Segmentation. + + The targets are splitted across the specified devices by chunking in + the batch dimension. Please use together with :class:`encoding.parallel.DataParallelModel`. + + Reference: + Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, + Amit Agrawal. “Context Encoding for Semantic Segmentation. 
+ *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018* + + Example:: + + >>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2]) + >>> criterion = encoding.nn.DataParallelCriterion(criterion, device_ids=[0, 1, 2]) + >>> y = net(x) + >>> loss = criterion(y, target) + """ + def forward(self, inputs, *targets, **kwargs): + # input should be already scatterd + # scattering the targets instead + if not self.device_ids: + return self.module(inputs, *targets, **kwargs) + targets, kwargs = self.scatter(targets, kwargs, self.device_ids) + if len(self.device_ids) == 1: + return self.module(inputs, *targets[0], **kwargs[0]) + replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) + outputs = _criterion_parallel_apply(replicas, inputs, targets, kwargs) + return Reduce.apply(*outputs) / len(outputs) + #return self.gather(outputs, self.output_device).mean() + + +def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None): + assert len(modules) == len(inputs) + assert len(targets) == len(inputs) + if kwargs_tup: + assert len(modules) == len(kwargs_tup) + else: + kwargs_tup = ({},) * len(modules) + if devices is not None: + assert len(modules) == len(devices) + else: + devices = [None] * len(modules) + + lock = threading.Lock() + results = {} + if torch_ver != "0.3": + grad_enabled = torch.is_grad_enabled() + + def _worker(i, module, input, target, kwargs, device=None): + if torch_ver != "0.3": + torch.set_grad_enabled(grad_enabled) + if device is None: + device = get_a_var(input).get_device() + try: + if not isinstance(input, tuple): + input = (input,) + with torch.cuda.device(device): + output = module(*(input + target), **kwargs) + with lock: + results[i] = output + except Exception as e: + with lock: + results[i] = e + + if len(modules) > 1: + threads = [threading.Thread(target=_worker, + args=(i, module, input, target, + kwargs, device),) + for i, (module, input, target, kwargs, device) in + 
enumerate(zip(modules, inputs, targets, kwargs_tup, devices))] + + for thread in threads: + thread.start() + for thread in threads: + thread.join() + else: + _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0]) + + outputs = [] + for i in range(len(inputs)): + output = results[i] + if isinstance(output, Exception): + raise output + outputs.append(output) + return outputs + + +########################################################################### +# Adapted from Synchronized-BatchNorm-PyTorch. +# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch +# +class CallbackContext(object): + pass + + +def execute_replication_callbacks(modules): + """ + Execute an replication callback `__data_parallel_replicate__` on each module created + by original replication. + + The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` + + Note that, as all modules are isomorphism, we assign each sub-module with a context + (shared among multiple copies of this module on different devices). + Through this context, different copies can share some information. + + We guarantee that the callback on the master copy (the first copy) will be called ahead + of calling the callback of any slave copies. + """ + master_copy = modules[0] + nr_modules = len(list(master_copy.modules())) + ctxs = [CallbackContext() for _ in range(nr_modules)] + + for i, module in enumerate(modules): + for j, m in enumerate(module.modules()): + if hasattr(m, '__data_parallel_replicate__'): + m.__data_parallel_replicate__(ctxs[j], i) + + +def patch_replication_callback(data_parallel): + """ + Monkey-patch an existing `DataParallel` object. Add the replication callback. + Useful when you have customized `DataParallel` implementation. 
def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
    """Create or fetch a named logger, configuring it exactly once.

    On first use a StreamHandler is always attached; if ``log_file`` is set
    and the process is rank 0, a FileHandler is added as well. Loggers whose
    name is a child of an already-initialized one (e.g. "a.b" after "a") are
    returned untouched. Non-zero ranks are clamped to ERROR so multi-process
    runs stay quiet on the console.

    Args:
        name (str): Logger name.
        log_file (str | None): Optional path for the rank-0 FileHandler.
        log_level (int): Level applied on rank 0; other ranks get ERROR.
        file_mode (str): Mode used to open ``log_file``. Defaults to 'w'.

    Returns:
        logging.Logger: The configured logger.
    """
    logger = logging.getLogger(name)

    # Already configured — either directly or through an initialized ancestor.
    if name in logger_initialized:
        return logger
    for initialized_name in logger_initialized:
        if name.startswith(initialized_name):
            return logger

    # PyTorch DDP (>= 1.8) attaches a NOTSET StreamHandler to the root
    # logger; as propagate is True by default, that handler echoes rank>0
    # messages to the console. Demote any such handler to ERROR.
    for root_handler in logger.root.handlers:
        if type(root_handler) is logging.StreamHandler:
            root_handler.setLevel(logging.ERROR)

    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank()
    else:
        rank = 0

    handlers = [logging.StreamHandler()]
    # Only rank 0 writes a log file. The stock FileHandler default mode is
    # 'a'; file_mode lets callers opt back into that behaviour.
    if rank == 0 and log_file is not None:
        handlers.append(logging.FileHandler(log_file, file_mode))

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    for handler in handlers:
        handler.setFormatter(formatter)
        handler.setLevel(log_level)
        logger.addHandler(handler)

    logger.setLevel(log_level if rank == 0 else logging.ERROR)

    logger_initialized[name] = True

    return logger
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import cv2 +import json +import numpy as np +import os + +from collections import OrderedDict +from PIL import Image as PILImage +from utils.transforms import transform_parsing + + +LABELS = ['background', 'skin', 'nose', 'eye_g', 'l_eye', 'r_eye', \ + 'l_brow', 'r_brow', 'l_ear', 'r_ear', 'mouth', 'u_lip', \ + 'l_lip', 'hair', 'hat', 'ear_r', 'neck_l', 'neck', 'cloth'] + + +def get_palette(num_cls): + """ Returns the color map for visualizing the segmentation mask. + Args: + num_cls: Number of classes + Returns: + The color map + """ + + n = num_cls + palette = [0] * (n * 3) + for j in range(0, n): + lab = j + palette[j * 3 + 0] = 0 + palette[j * 3 + 1] = 0 + palette[j * 3 + 2] = 0 + i = 0 + while lab: + palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i)) + palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i)) + palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i)) + i += 1 + lab >>= 3 + return palette + + +def get_confusion_matrix(gt_label, pred_label, num_classes): + """ + Calcute the confusion matrix by given label and pred + :param gt_label: the ground truth label + :param pred_label: the pred label + :param num_classes: the nunber of class + :return: the confusion matrix + """ + index = (gt_label * num_classes + pred_label).astype('int32') + label_count = np.bincount(index) + confusion_matrix = np.zeros((num_classes, num_classes)) + + for i_label in range(num_classes): + for i_pred_label in range(num_classes): + cur_index = i_label * num_classes + i_pred_label + if cur_index < len(label_count): + confusion_matrix[i_label, i_pred_label] = label_count[cur_index] + + return confusion_matrix + + +def fast_histogram(a, b, na, nb): + ''' + fast histogram calculation + --- + * a, b: non negative label ids, a.shape == b.shape, a in [0, ... 
na-1], b in [0, ..., nb-1] + ''' + assert a.shape == b.shape + assert np.all((a >= 0) & (a < na) & (b >= 0) & (b < nb)) + # k = (a >= 0) & (a < na) & (b >= 0) & (b < nb) + hist = np.bincount( + nb * a.reshape([-1]).astype(int) + b.reshape([-1]).astype(int), + minlength=na * nb).reshape(na, nb) + assert np.sum(hist) == a.size + return hist + + +def _read_names(file_name): + label_names = [] + for name in open(file_name, 'r'): + name = name.strip() + if len(name) > 0: + label_names.append(name) + return label_names + + +def _merge(*list_pairs): + a = [] + b = [] + for al, bl in list_pairs: + a += al + b += bl + return a, b + + +def compute_mean_ioU(preds, scales, centers, num_classes, datadir, input_size=[473, 473], dataset='val', reverse=False): + file_list_name = os.path.join(datadir, dataset + '_list.txt') + val_id = [line.split()[0][7:-4] for line in open(file_list_name).readlines()] + + confusion_matrix = np.zeros((num_classes, num_classes)) + + label_names_file = os.path.join(datadir, 'label_names.txt') + gt_label_names = pred_label_names = _read_names(label_names_file) + + assert gt_label_names[0] == pred_label_names[0] == 'bg' + + hists = [] + for i, im_name in enumerate(val_id): + gt_path = os.path.join(datadir, dataset, 'labels', im_name + '.png') + gt = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE) + h, w = gt.shape + pred_out = preds[i] + if scales is not None: + s = scales[i] + c = centers[i] + else: + s = None + c = None + pred_old = transform_parsing(pred_out, c, s, w, h, input_size) + gt = np.asarray(gt, dtype=np.int32) + pred = np.asarray(pred_old, dtype=np.int32) + ignore_index = gt != 255 + + gt = gt[ignore_index] + pred = pred[ignore_index] + + hist = fast_histogram(gt, pred, len(gt_label_names), len(pred_label_names)) + hists.append(hist) + + confusion_matrix += get_confusion_matrix(gt, pred, num_classes) + + hist_sum = np.sum(np.stack(hists, axis=0), axis=0) + + eval_names = dict() + for label_name in gt_label_names: + gt_ind = 
gt_label_names.index(label_name) + pred_ind = pred_label_names.index(label_name) + eval_names[label_name] = ([gt_ind], [pred_ind]) + + if 'le' in eval_names and 're' in eval_names: + eval_names['eyes'] = _merge(eval_names['le'], eval_names['re']) + if 'lb' in eval_names and 'rb' in eval_names: + eval_names['brows'] = _merge(eval_names['lb'], eval_names['rb']) + if 'ulip' in eval_names and 'imouth' in eval_names and 'llip' in eval_names: + eval_names['mouth'] = _merge( + eval_names['ulip'], eval_names['imouth'], eval_names['llip']) + + # Helen + if 'eyes' in eval_names and 'brows' in eval_names and 'nose' in eval_names and 'mouth' in eval_names: + eval_names['overall'] = _merge( + eval_names['eyes'], eval_names['brows'], eval_names['nose'], eval_names['mouth']) + + pos = confusion_matrix.sum(1) + res = confusion_matrix.sum(0) + tp = np.diag(confusion_matrix) + + pixel_accuracy = (tp.sum() / pos.sum()) * 100 + mean_accuracy = ((tp / np.maximum(1.0, pos)).mean()) * 100 + IoU_array = (tp / np.maximum(1.0, pos + res - tp)) + IoU_array = IoU_array * 100 + mean_IoU = IoU_array.mean() + print('Pixel accuracy: %f \n' % pixel_accuracy) + print('Mean accuracy: %f \n' % mean_accuracy) + print('Mean IU: %f \n' % mean_IoU) + mIoU_value = [] + f1_value = [] + mf1_value = [] + + for i, (label, iou) in enumerate(zip(LABELS, IoU_array)): + mIoU_value.append((label, iou)) + + mIoU_value.append(('Pixel accuracy', pixel_accuracy)) + mIoU_value.append(('Mean accuracy', mean_accuracy)) + mIoU_value.append(('Mean IU', mean_IoU)) + mIoU_value = OrderedDict(mIoU_value) + + for eval_name, (gt_inds, pred_inds) in eval_names.items(): + A = hist_sum[gt_inds, :].sum() + B = hist_sum[:, pred_inds].sum() + intersected = hist_sum[gt_inds, :][:, pred_inds].sum() + f1 = 2 * intersected / (A + B) + + if eval_name in gt_label_names[1:]: + mf1_value.append(f1) + f1_value.append((eval_name, f1)) + + f1_value.append(('Mean_F1', np.array(mf1_value).mean())) + f1_value = OrderedDict(f1_value) + + return 
def get_arguments():
    """Parse all the arguments provided from the CLI.

    Returns:
        The parsed argument namespace (``--pred-path``, ``--gt-path``).
    """
    arg_parser = argparse.ArgumentParser(description="DeepLabLFOV NetworkEv")
    arg_parser.add_argument("--pred-path", type=str, default='',
                            help="Path to predicted segmentation.")
    arg_parser.add_argument("--gt-path", type=str, default='',
                            help="Path to the groundtruth dir.")
    return arg_parser.parse_args()
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import torch + + +def moving_average(net1, net2, alpha=1): + for param1, param2 in zip(net1.parameters(), net2.parameters()): + param1.data *= (1.0 - alpha) + param1.data += param2.data * alpha + + +def _check_bn(module, flag): + classname = module.__class__.__name__ + if classname.find('BatchNorm') != -1 or classname.find('InPlaceABNSync') != -1: + flag[0] = True + + +def check_bn(model): + flag = [False] + model.apply(lambda module: _check_bn(module, flag)) + return flag[0] + + +def reset_bn(module): + classname = module.__class__.__name__ + if classname.find('BatchNorm') != -1 or classname.find('InPlaceABNSync') != -1: + module.running_mean = torch.zeros_like(module.running_mean) + module.running_var = torch.ones_like(module.running_var) + + +def _get_momenta(module, momenta): + classname = module.__class__.__name__ + if classname.find('BatchNorm') != -1 or classname.find('InPlaceABNSync') != -1: + momenta[module] = module.momentum + + +def _set_momenta(module, momenta): + classname = module.__class__.__name__ + if classname.find('BatchNorm') != -1 or classname.find('InPlaceABNSync') != -1: + module.momentum = momenta[module] + + +def bn_re_estimate(loader, model): + if not check_bn(model): + print('No batch norm layer detected') + return + model.train() + momenta = {} + model.apply(reset_bn) + model.apply(lambda module: _get_momenta(module, momenta)) + n = 0 + for i_iter, batch in enumerate(loader): + # images, labels, edges, _ = batch + images = batch[0] + b = images.data.size(0) + momentum = b / (n + b) + for module in momenta.keys(): + module.momentum = momentum + model(images) + n += b + model.apply(lambda module: _set_momenta(module, momenta)) + + +def save_schp_checkpoint(states, is_best_parsing, output_dir, filename='schp_checkpoint.pth.tar'): + save_path = os.path.join(output_dir, filename) + # if os.path.exists(save_path): 
def flip_back(output_flipped, matched_parts):
    """Undo a horizontal test-time flip of joint heatmaps.

    Reverses the width axis, then swaps each left/right joint pair from
    ``matched_parts``. Note the result is a reversed view of the caller's
    array, so the pair swaps also mutate the input's storage.

    Args:
        output_flipped: numpy array of shape
            (batch_size, num_joints, height, width).
        matched_parts: iterable of (left_idx, right_idx) joint pairs.

    Returns:
        The un-flipped array, same shape as the input.
    """
    assert output_flipped.ndim == 4,\
        'output_flipped should be [batch_size, num_joints, height, width]'

    flipped = output_flipped[:, :, :, ::-1]

    for left, right in matched_parts:
        tmp = flipped[:, left, :, :].copy()
        flipped[:, left, :, :] = flipped[:, right, :, :]
        flipped[:, right, :, :] = tmp

    return flipped
def get_dir(src_point, rot_rad):
    """Rotate a 2-D point by ``rot_rad`` radians (counter-clockwise).

    Args:
        src_point: sequence of two coordinates (x, y).
        rot_rad: rotation angle in radians.

    Returns:
        List [x', y'] — the rotated point.
    """
    sin_r = np.sin(rot_rad)
    cos_r = np.cos(rot_rad)
    x, y = src_point[0], src_point[1]
    return [x * cos_r - y * sin_r,
            x * sin_r + y * cos_r]
# Colour palette: COLORS[i] is the RGB colour used for class label i
# (Pascal-VOC style, 21 entries; index 0 = background).
COLORS = [(0, 0, 0),
          (128, 0, 0), (0, 128, 0), (128, 128, 0), (0, 0, 128), (128, 0, 128),
          (0, 128, 128), (128, 128, 128), (64, 0, 0), (192, 0, 0), (64, 128, 0),
          (192, 128, 0), (64, 0, 128), (192, 0, 128), (64, 128, 128), (192, 128, 128),
          (0, 64, 0), (128, 64, 0), (0, 192, 0), (128, 192, 0), (0, 64, 128)]


def decode_parsing(labels, num_images=1, num_classes=21, is_pred=False):
    """Colourize a batch of segmentation label maps for visualization.

    Args:
        labels: label maps (N, H, W), or raw logits (N, C, H, W) when
            ``is_pred`` is True.
        num_images: number of images from the batch to decode.
        num_classes: kept for API compatibility; the palette length governs
            how many classes receive a colour.
        is_pred: if True, take the channel-wise argmax first.

    Returns:
        uint8 tensor of shape (num_images, 3, H, W) holding RGB images.
    """
    selected = labels[:num_images].clone().cpu().data
    if is_pred:
        selected = torch.argmax(selected, dim=1)
    batch, height, width = selected.size()

    colorized = torch.zeros([batch, 3, height, width], dtype=torch.uint8)
    for cls_idx, (red, green, blue) in enumerate(COLORS):
        hit = selected == cls_idx
        colorized[:, 0, :, :][hit] = red
        colorized[:, 1, :, :][hit] = green
        colorized[:, 2, :, :][hit] = blue

    return colorized


def inv_preprocess(imgs, num_images):
    """Undo ImageNet-style normalization on a batch of images.

    Args:
        imgs: normalized input batch.
        num_images: number of leading images to restore.

    Returns:
        A batch of ``num_images`` images with the normalization inverted.
    """
    restored = imgs[:num_images].clone().cpu().data
    denorm = NormalizeInverse(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])
    for idx in range(num_images):
        restored[idx] = denorm(restored[idx])

    return restored


class NormalizeInverse(torchvision.transforms.Normalize):
    """Invert of torchvision's Normalize: maps a normalized tensor back to
    the original value range (up to the 1e-7 stabilizer on std)."""

    def __init__(self, mean, std):
        mean = torch.as_tensor(mean)
        std = torch.as_tensor(std)
        inv_std = 1 / (std + 1e-7)
        inv_mean = -mean * inv_std
        super().__init__(mean=inv_mean, std=inv_std)


class SingleGPU(nn.Module):
    """Thin wrapper that moves inputs to the current CUDA device before
    forwarding them to the wrapped module."""

    def __init__(self, module):
        super(SingleGPU, self).__init__()
        self.module = module

    def forward(self, x):
        return self.module(x.cuda(non_blocking=True))
+ """ + + def __init__(self, optimizer, total_epoch, eta_min=0, warmup_epoch=10, last_epoch=-1): + self.total_epoch = total_epoch + self.eta_min = eta_min + self.warmup_epoch = warmup_epoch + super(GradualWarmupScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch <= self.warmup_epoch: + return [self.eta_min + self.last_epoch*(base_lr - self.eta_min)/self.warmup_epoch for base_lr in self.base_lrs] + else: + return [self.eta_min + (base_lr-self.eta_min)*(1+math.cos(math.pi*(self.last_epoch-self.warmup_epoch)/(self.total_epoch-self.warmup_epoch))) / 2 for base_lr in self.base_lrs] + + +class SGDRScheduler(_LRScheduler): + """ Consine annealing with warm up and restarts. + Proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts`. + """ + def __init__(self, optimizer, total_epoch=150, start_cyclical=100, cyclical_base_lr=7e-4, cyclical_epoch=10, eta_min=0, warmup_epoch=10, last_epoch=-1): + self.total_epoch = total_epoch + self.start_cyclical = start_cyclical + self.cyclical_epoch = cyclical_epoch + self.cyclical_base_lr = cyclical_base_lr + self.eta_min = eta_min + self.warmup_epoch = warmup_epoch + super(SGDRScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch < self.warmup_epoch: + return [self.eta_min + self.last_epoch*(base_lr - self.eta_min)/self.warmup_epoch for base_lr in self.base_lrs] + elif self.last_epoch < self.start_cyclical: + return [self.eta_min + (base_lr-self.eta_min)*(1+math.cos(math.pi*(self.last_epoch-self.warmup_epoch)/(self.start_cyclical-self.warmup_epoch))) / 2 for base_lr in self.base_lrs] + else: + return [self.eta_min + (self.cyclical_base_lr-self.eta_min)*(1+math.cos(math.pi* ((self.last_epoch-self.start_cyclical)% self.cyclical_epoch)/self.cyclical_epoch)) / 2 for base_lr in self.base_lrs] + + +if __name__ == '__main__': + import matplotlib.pyplot as plt + import torch + model = torch.nn.Linear(10, 2) + optimizer = 
torch.optim.SGD(params=model.parameters(), lr=7e-3, momentum=0.9, weight_decay=5e-4) + scheduler_warmup = SGDRScheduler(optimizer, total_epoch=150, eta_min=7e-5, warmup_epoch=10, start_cyclical=100, cyclical_base_lr=3.5e-3, cyclical_epoch=10) + lr = [] + for epoch in range(0,150): + scheduler_warmup.step(epoch) + lr.append(scheduler_warmup.get_lr()) + plt.style.use('ggplot') + plt.plot(list(range(0,150)), lr) + plt.show() diff --git a/insightface/python-package/README.md b/insightface/python-package/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69c199b5669dfe32a6d6ce6d264c62e468bc167f --- /dev/null +++ b/insightface/python-package/README.md @@ -0,0 +1,153 @@ +# InsightFace Python Library + +## License + +The code of InsightFace Python Library is released under the MIT License. There is no limitation for both academic and commercial usage. + +**The pretrained models we provided with this library are available for non-commercial research purposes only, including both auto-downloading models and manual-downloading models.** + +## Install + +### Install Inference Backend + +For ``insightface<=0.1.5``, we use MXNet as inference backend. + +Starting from insightface>=0.2, we use onnxruntime as inference backend. + +You have to install ``onnxruntime-gpu`` manually to enable GPU inference, or install ``onnxruntime`` to use CPU only inference. + +## Change Log + +### [0.7.1] - 2022-12-14 + +#### Changed + +- Change model downloading provider to cloudfront. + +### [0.7] - 2022-11-28 + +#### Added + +- Add face swapping model and example. + +#### Changed + +- Set default ORT provider to CUDA and CPU. + +### [0.6] - 2022-01-29 + +#### Added + +- Add pose estimation in face-analysis app. + +#### Changed + +- Change model automated downloading url, to ucloud. 
+ + +## Quick Example + +``` +import cv2 +import numpy as np +import insightface +from insightface.app import FaceAnalysis +from insightface.data import get_image as ins_get_image + +app = FaceAnalysis(providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) +app.prepare(ctx_id=0, det_size=(640, 640)) +img = ins_get_image('t1') +faces = app.get(img) +rimg = app.draw_on(img, faces) +cv2.imwrite("./t1_output.jpg", rimg) +``` + +This quick example will detect faces from the ``t1.jpg`` image and draw detection results on it. + + + +## Model Zoo + +In the latest version of insightface library, we provide following model packs: + +Name in **bold** is the default model pack. **Auto** means we can download the model pack through the python library directly. + +Once you manually downloaded the zip model pack, unzip it under `~/.insightface/models/` first before you call the program. + +| Name | Detection Model | Recognition Model | Alignment | Attributes | Model-Size | Link | Auto | +| ------------- | --------------- | -------------------- | ------------ | ---------- | ---------- | ------------------------------------------------------------ | ------------- | +| antelopev2 | SCRFD-10GF | ResNet100@Glint360K | 2d106 & 3d68 | Gender&Age | 407MB | [link](https://drive.google.com/file/d/18wEUfMNohBJ4K3Ly5wpTejPfDzp-8fI8/view?usp=sharing) | N | +| **buffalo_l** | SCRFD-10GF | ResNet50@WebFace600K | 2d106 & 3d68 | Gender&Age | 326MB | [link](https://drive.google.com/file/d/1qXsQJ8ZT42_xSmWIYy85IcidpiZudOCB/view?usp=sharing) | Y | +| buffalo_m | SCRFD-2.5GF | ResNet50@WebFace600K | 2d106 & 3d68 | Gender&Age | 313MB | [link](https://drive.google.com/file/d/1net68yNxF33NNV6WP7k56FS6V53tq-64/view?usp=sharing) | N | +| buffalo_s | SCRFD-500MF | MBF@WebFace600K | 2d106 & 3d68 | Gender&Age | 159MB | [link](https://drive.google.com/file/d/1pKIusApEfoHKDjeBTXYB3yOQ0EtTonNE/view?usp=sharing) | N | +| buffalo_sc | SCRFD-500MF | MBF@WebFace600K | - | - | 16MB | 
[link](https://drive.google.com/file/d/19I-MZdctYKmVf3nu5Da3HS6KH5LBfdzG/view?usp=sharing) | N | + + + +Recognition Accuracy: + +| Name | MR-ALL | African | Caucasian | South Asian | East Asian | LFW | CFP-FP | AgeDB-30 | IJB-C(E4) | +| :-------- | ------ | ------- | --------- | ----------- | ---------- | ----- | ------ | -------- | --------- | +| buffalo_l | 91.25 | 90.29 | 94.70 | 93.16 | 74.96 | 99.83 | 99.33 | 98.23 | 97.25 | +| buffalo_s | 71.87 | 69.45 | 80.45 | 73.39 | 51.03 | 99.70 | 98.00 | 96.58 | 95.02 | + +*buffalo_m has the same accuracy with buffalo_l.* + +*buffalo_sc has the same accuracy with buffalo_s.* + + + +**Note that these models are available for non-commercial research purposes only.** + + + +For insightface>=0.3.3, models will be downloaded automatically once we init ``app = FaceAnalysis()`` instance. + +For insightface==0.3.2, you must first download the model package by command: + +``` +insightface-cli model.download buffalo_l +``` + +## Use Your Own Licensed Model + +You can simply create a new model directory under ``~/.insightface/models/`` and replace the pretrained models we provide with your own models. And then call ``app = FaceAnalysis(name='your_model_zoo')`` to load these models. + +## Call Models + +The latest insightface libary only supports onnx models. Once you have trained detection or recognition models by PyTorch, MXNet or any other frameworks, you can convert it to the onnx format and then they can be called with insightface library. 
+ +### Call Detection Models + +``` +import cv2 +import numpy as np +import insightface +from insightface.app import FaceAnalysis +from insightface.data import get_image as ins_get_image + +# Method-1, use FaceAnalysis +app = FaceAnalysis(allowed_modules=['detection']) # enable detection model only +app.prepare(ctx_id=0, det_size=(640, 640)) + +# Method-2, load model directly +detector = insightface.model_zoo.get_model('your_detection_model.onnx') +detector.prepare(ctx_id=0, input_size=(640, 640)) + +``` + +### Call Recognition Models + +``` +import cv2 +import numpy as np +import insightface +from insightface.app import FaceAnalysis +from insightface.data import get_image as ins_get_image + +handler = insightface.model_zoo.get_model('your_recognition_model.onnx') +handler.prepare(ctx_id=0) + +``` + + diff --git a/insightface/python-package/insightface/__init__.py b/insightface/python-package/insightface/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7ddc9e0a0687521eba04ee02b6652dddb39a80e2 --- /dev/null +++ b/insightface/python-package/insightface/__init__.py @@ -0,0 +1,21 @@ +# coding: utf-8 +# pylint: disable=wrong-import-position +"""InsightFace: A Face Analysis Toolkit.""" +from __future__ import absolute_import + +try: + #import mxnet as mx + import onnxruntime +except ImportError: + raise ImportError( + "Unable to import dependency onnxruntime. " + ) + +__version__ = '0.7.3' + +from . import model_zoo +from . import utils +from . import app +from . import data +from . 
import thirdparty + diff --git a/insightface/python-package/insightface/app/__init__.py b/insightface/python-package/insightface/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2a0e492020053bfe930479752ec828aa1db45c6b --- /dev/null +++ b/insightface/python-package/insightface/app/__init__.py @@ -0,0 +1,2 @@ +from .face_analysis import * +from .mask_renderer import * diff --git a/insightface/python-package/insightface/app/common.py b/insightface/python-package/insightface/app/common.py new file mode 100644 index 0000000000000000000000000000000000000000..82ca987aeede35510b3aef72b4edf2390ad84e65 --- /dev/null +++ b/insightface/python-package/insightface/app/common.py @@ -0,0 +1,49 @@ +import numpy as np +from numpy.linalg import norm as l2norm +#from easydict import EasyDict + +class Face(dict): + + def __init__(self, d=None, **kwargs): + if d is None: + d = {} + if kwargs: + d.update(**kwargs) + for k, v in d.items(): + setattr(self, k, v) + # Class attributes + #for k in self.__class__.__dict__.keys(): + # if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'): + # setattr(self, k, getattr(self, k)) + + def __setattr__(self, name, value): + if isinstance(value, (list, tuple)): + value = [self.__class__(x) + if isinstance(x, dict) else x for x in value] + elif isinstance(value, dict) and not isinstance(value, self.__class__): + value = self.__class__(value) + super(Face, self).__setattr__(name, value) + super(Face, self).__setitem__(name, value) + + __setitem__ = __setattr__ + + def __getattr__(self, name): + return None + + @property + def embedding_norm(self): + if self.embedding is None: + return None + return l2norm(self.embedding) + + @property + def normed_embedding(self): + if self.embedding is None: + return None + return self.embedding / self.embedding_norm + + @property + def sex(self): + if self.gender is None: + return None + return 'M' if self.gender==1 else 'F' diff --git 
class FaceAnalysis:
    """Face pipeline built from a folder of ONNX models: one detector plus
    optional per-face models (recognition, landmarks, gender/age, ...)."""

    def __init__(self, name=DEFAULT_MP_NAME, root='~/.insightface', allowed_modules=None, **kwargs):
        onnxruntime.set_default_logger_severity(3)
        self.models = {}
        # Locate (and download if needed) the model pack, then load every
        # .onnx file inside it, keyed by task name.
        self.model_dir = ensure_available('models', name, root=root)
        model_files = sorted(glob.glob(osp.join(self.model_dir, '*.onnx')))
        for model_file in model_files:
            model = model_zoo.get_model(model_file, **kwargs)
            if model is None:
                print('model not recognized:', model_file)
                continue
            if allowed_modules is not None and model.taskname not in allowed_modules:
                print('model ignore:', model_file, model.taskname)
                del model
                continue
            if model.taskname not in self.models and (allowed_modules is None or model.taskname in allowed_modules):
                print('find model:', model_file, model.taskname, model.input_shape, model.input_mean, model.input_std)
                self.models[model.taskname] = model
            else:
                print('duplicated model task type, ignore:', model_file, model.taskname)
                del model
        assert 'detection' in self.models
        self.det_model = self.models['detection']

    def prepare(self, ctx_id, det_thresh=0.5, det_size=(640, 640)):
        """Bind every model to device ``ctx_id`` and configure the detector."""
        self.det_thresh = det_thresh
        assert det_size is not None
        print('set det-size:', det_size)
        self.det_size = det_size
        for taskname, model in self.models.items():
            if taskname == 'detection':
                model.prepare(ctx_id, input_size=det_size, det_thresh=det_thresh)
            else:
                model.prepare(ctx_id)

    def get(self, img, max_num=0):
        """Detect faces in ``img`` and run every non-detection model on
        each one; returns a list of Face objects (empty when none found)."""
        bboxes, kpss = self.det_model.detect(img,
                                             max_num=max_num,
                                             metric='default')
        if bboxes.shape[0] == 0:
            return []
        results = []
        for idx in range(bboxes.shape[0]):
            kps = kpss[idx] if kpss is not None else None
            face = Face(bbox=bboxes[idx, 0:4], kps=kps, det_score=bboxes[idx, 4])
            for taskname, model in self.models.items():
                if taskname == 'detection':
                    continue
                model.get(img, face)
            results.append(face)
        return results

    def draw_on(self, img, faces):
        """Return a copy of ``img`` with boxes, keypoints and (when known)
        sex/age labels drawn for each face."""
        import cv2
        canvas = img.copy()
        for face in faces:
            box = face.bbox.astype(int)
            cv2.rectangle(canvas, (box[0], box[1]), (box[2], box[3]), (0, 0, 255), 2)
            if face.kps is not None:
                kps = face.kps.astype(int)
                for l in range(kps.shape[0]):
                    # Keypoints 0 and 3 drawn green, the rest red.
                    color = (0, 255, 0) if l in (0, 3) else (0, 0, 255)
                    cv2.circle(canvas, (kps[l][0], kps[l][1]), 1, color, 2)
            if face.gender is not None and face.age is not None:
                cv2.putText(canvas, '%s,%d' % (face.sex, face.age), (box[0] - 1, box[1] - 4),
                            cv2.FONT_HERSHEY_COMPLEX, 0.7, (0, 255, 0), 1)
        return canvas
class MaskRenderer:
    """Renders a synthetic face mask onto an image by fitting a BFM 3D
    morphable model to detected landmarks and texture-mapping the mask.
    """

    def __init__(self, name=DEFAULT_MP_NAME, root='~/.insightface', insfa=None):
        #if insfa is None, enter render_only mode
        self.mp_name = name
        self.root = root
        self.insfa = insfa
        model_dir = get_model_dir(name, root)
        # Basel Face Model files must be present alongside the model pack.
        bfm_file = osp.join(model_dir, 'BFM.mat')
        assert osp.exists(bfm_file), 'should contains BFM.mat in your model directory'
        self.bfm = face3d.morphable_model.MorphabelModel(bfm_file)
        self.index_ind = self.bfm.kpt_ind
        bfm_uv_file = osp.join(model_dir, 'BFM_UV.mat')
        assert osp.exists(bfm_uv_file), 'should contains BFM_UV.mat in your model directory'
        uv_coords = face3d.morphable_model.load.load_uv_coords(bfm_uv_file)
        self.uv_size = (224,224)
        # Default mask placement in relative UV coordinates (x_start,
        # y_start, x_end, y_end).
        self.mask_stxr = 0.1
        self.mask_styr = 0.33
        self.mask_etxr = 0.9
        self.mask_etyr = 0.7
        self.tex_h , self.tex_w, self.tex_c = self.uv_size[1] , self.uv_size[0],3
        # Scale UV coords to texel indices and flip the v axis.
        texcoord = np.zeros_like(uv_coords)
        texcoord[:, 0] = uv_coords[:, 0] * (self.tex_h - 1)
        texcoord[:, 1] = uv_coords[:, 1] * (self.tex_w - 1)
        texcoord[:, 1] = self.tex_w - texcoord[:, 1] - 1
        self.texcoord = np.hstack((texcoord, np.zeros((texcoord.shape[0], 1))))
        self.X_ind = self.bfm.kpt_ind
        self.mask_image_names = ['mask_white', 'mask_blue', 'mask_black', 'mask_green']
        self.mask_aug_probs = [0.4, 0.4, 0.1, 0.1]
        #self.mask_images = []
        #self.mask_images_rgb = []
        #for image_name in mask_image_names:
        #    mask_image = ins_get_image(image_name)
        #    self.mask_images.append(mask_image)
        #    mask_image_rgb = mask_image[:,:,::-1]
        #    self.mask_images_rgb.append(mask_image_rgb)

    def prepare(self, ctx_id=0, det_thresh=0.5, det_size=(128, 128)):
        # Stores settings only; the FaceAnalysis pipeline is created lazily
        # in build_params.
        self.pre_ctx_id = ctx_id
        self.pre_det_thresh = det_thresh
        self.pre_det_size = det_size

    def transform(self, shape3D, R):
        # Apply rotation R to a 3xN vertex array (scale fixed at 1.0).
        s = 1.0
        shape3D[:2, :] = shape3D[:2, :]
        shape3D = s * np.dot(R, shape3D)
        return shape3D

    def preprocess(self, vertices, w, h):
        # Recenter vertices on the image center, then rotate 180deg about
        # y and z to match the renderer's coordinate convention.
        R1 = face3d.mesh.transform.angle2matrix([0, 180, 180])
        t = np.array([-w // 2, -h // 2, 0])
        vertices = vertices.T
        vertices += t
        vertices = self.transform(vertices.T, R1).T
        return vertices

    def project_to_2d(self, vertices, s, angles, t):
        # Return the 2D positions of the BFM keypoint vertices under an
        # orthographic projection of the transformed mesh.
        transformed_vertices = self.bfm.transform(vertices, s, angles, t)
        projected_vertices = transformed_vertices.copy()  # using stantard camera & orth projection
        return projected_vertices[self.bfm.kpt_ind, :2]

    def params_to_vertices(self, params, H, W):
        # Rebuild image-space mesh vertices from fitted BFM parameters.
        fitted_sp, fitted_ep, fitted_s, fitted_angles, fitted_t = params
        fitted_vertices = self.bfm.generate_vertices(fitted_sp, fitted_ep)
        transformed_vertices = self.bfm.transform(fitted_vertices, fitted_s, fitted_angles,
                                                  fitted_t)
        transformed_vertices = self.preprocess(transformed_vertices.T, W, H)
        image_vertices = face3d.mesh.transform.to_image(transformed_vertices, H, W)
        return image_vertices

    def draw_lmk(self, face_image):
        # Debug helper: draw detection results on a copy of the image.
        faces = self.insfa.get(face_image, max_num=1)
        if len(faces)==0:
            return face_image
        return self.insfa.draw_on(face_image, faces)

    def build_params(self, face_image):
        """Detect one face and fit BFM params to its 3D-68 landmarks.
        Returns [sp, ep, s, angles, t] or None when no face is found.
        """
        #landmark = self.if3d68_handler.get(face_image)
        #if landmark is None:
        #    return None #face not found
        if self.insfa is None:
            # NOTE(review): uses self.pre_* set by prepare(); assumes
            # prepare() was called first — confirm callers do so.
            self.insfa = FaceAnalysis(name=self.mp_name, root=self.root, allowed_modules=['detection', 'landmark_3d_68'])
            self.insfa.prepare(ctx_id=self.pre_ctx_id, det_thresh=self.pre_det_thresh, det_size=self.pre_det_size)

        faces = self.insfa.get(face_image, max_num=1)
        if len(faces)==0:
            return None
        landmark = faces[0].landmark_3d_68[:,:2]
        fitted_sp, fitted_ep, fitted_s, fitted_angles, fitted_t = self.bfm.fit(landmark, self.X_ind, max_iter = 3)
        return [fitted_sp, fitted_ep, fitted_s, fitted_angles, fitted_t]

    def generate_mask_uv(self, mask, positions):
        # Paste the mask image into a blank UV texture at the rectangle
        # given by relative positions (stx, sty, etx, ety).
        uv_size = (self.uv_size[1], self.uv_size[0], 3)
        h, w, c = uv_size
        uv = np.zeros(shape=(self.uv_size[1], self.uv_size[0], 3), dtype=np.uint8)
        stxr, styr = positions[0], positions[1]
        etxr, etyr = positions[2], positions[3]
        stx, sty = int(w * stxr), int(h * styr)
        etx, ety = int(w * etxr), int(h * etyr)
        height = ety - sty
        width = etx - stx
        mask = cv2.resize(mask, (width, height))
        uv[sty:ety, stx:etx] = mask
        return uv

    def render_mask(self, face_image, mask_image, params, input_is_rgb=False, auto_blend = True, positions=[0.1, 0.33, 0.9, 0.7]):
        """Render ``mask_image`` (array or bundled image name) onto
        ``face_image`` using previously fitted ``params``.
        NOTE(review): mutable default ``positions`` — never mutated here,
        but worth confirming before relying on it.
        """
        if isinstance(mask_image, str):
            to_rgb = True if input_is_rgb else False
            mask_image = ins_get_image(mask_image, to_rgb=to_rgb)
        uv_mask_image = self.generate_mask_uv(mask_image, positions)
        h,w,c = face_image.shape
        image_vertices = self.params_to_vertices(params ,h,w)
        # Rasterize the UV-textured mesh; invert so the background is white.
        output = (1-face3d.mesh.render.render_texture(image_vertices, self.bfm.full_triangles , uv_mask_image, self.texcoord, self.bfm.full_triangles, h , w ))*255
        output = output.astype(np.uint8)
        if auto_blend:
            # Keep original pixels wherever the rendered output is pure white.
            mask_bd = (output==255).astype(np.uint8)
            final = face_image*mask_bd + (1-mask_bd)*output
            return final
        return output
positions=[0.1, h_pos, 0.9, 0.7]) + return image + + @property + def targets_as_params(self): + return ["image", "hlabel"] + + def get_params_dependent_on_targets(self, params): + hlabel = params['hlabel'] + mask_name = np.random.choice(self.mask_names, p=self.mask_probs) + h_pos = np.random.uniform(self.h_low, self.h_high) + return {'hlabel': hlabel, 'mask_name': mask_name, 'h_pos': h_pos} + + def get_transform_init_args_names(self): + #return ("hlabel", 'mask_names', 'mask_probs', 'h_low', 'h_high') + return ('mask_names', 'mask_probs', 'h_low', 'h_high') + + +if __name__ == "__main__": + tool = MaskRenderer('antelope') + tool.prepare(det_size=(128,128)) + image = cv2.imread("Tom_Hanks_54745.png") + params = tool.build_params(image) + #out = tool.draw_lmk(image) + #cv2.imwrite('output_lmk.jpg', out) + #mask_image = cv2.imread("masks/mask1.jpg") + #mask_image = cv2.imread("masks/black-mask.png") + #mask_image = cv2.imread("masks/mask2.jpg") + mask_out = tool.render_mask(image, 'mask_blue', params)# use single thread to test the time cost + + cv2.imwrite('output_mask.jpg', mask_out) + + diff --git a/insightface/python-package/insightface/commands/__init__.py b/insightface/python-package/insightface/commands/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7fedaa0dbb14464243e1ebef1b8134c7fe524cd8 --- /dev/null +++ b/insightface/python-package/insightface/commands/__init__.py @@ -0,0 +1,13 @@ +from abc import ABC, abstractmethod +from argparse import ArgumentParser + + +class BaseInsightFaceCLICommand(ABC): + @staticmethod + @abstractmethod + def register_subcommand(parser: ArgumentParser): + raise NotImplementedError() + + @abstractmethod + def run(self): + raise NotImplementedError() diff --git a/insightface/python-package/insightface/commands/insightface_cli.py b/insightface/python-package/insightface/commands/insightface_cli.py new file mode 100644 index 
0000000000000000000000000000000000000000..b8adbefcf0371c35689b2978c69510c1d3306846 --- /dev/null +++ b/insightface/python-package/insightface/commands/insightface_cli.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +from argparse import ArgumentParser + +from .model_download import ModelDownloadCommand +from .rec_add_mask_param import RecAddMaskParamCommand + +def main(): + parser = ArgumentParser("InsightFace CLI tool", usage="insightface-cli []") + commands_parser = parser.add_subparsers(help="insightface-cli command-line helpers") + + # Register commands + ModelDownloadCommand.register_subcommand(commands_parser) + RecAddMaskParamCommand.register_subcommand(commands_parser) + + args = parser.parse_args() + + if not hasattr(args, "func"): + parser.print_help() + exit(1) + + # Run + service = args.func(args) + service.run() + + +if __name__ == "__main__": + main() + diff --git a/insightface/python-package/insightface/commands/model_download.py b/insightface/python-package/insightface/commands/model_download.py new file mode 100644 index 0000000000000000000000000000000000000000..c5a14a33d7767b5fdc0a780f326e2e39ca619823 --- /dev/null +++ b/insightface/python-package/insightface/commands/model_download.py @@ -0,0 +1,36 @@ +from argparse import ArgumentParser + +from . 
import BaseInsightFaceCLICommand +import os +import os.path as osp +import zipfile +import glob +from ..utils import download + + +def model_download_command_factory(args): + return ModelDownloadCommand(args.model, args.root, args.force) + + +class ModelDownloadCommand(BaseInsightFaceCLICommand): + #_url_format = '{repo_url}models/{file_name}.zip' + @staticmethod + def register_subcommand(parser: ArgumentParser): + download_parser = parser.add_parser("model.download") + download_parser.add_argument( + "--root", type=str, default='~/.insightface', help="Path to location to store the models" + ) + download_parser.add_argument( + "--force", action="store_true", help="Force the model to be download even if already in root-dir" + ) + download_parser.add_argument("model", type=str, help="Name of the model to download") + download_parser.set_defaults(func=model_download_command_factory) + + def __init__(self, model: str, root: str, force: bool): + self._model = model + self._root = root + self._force = force + + def run(self): + download('models', self._model, force=self._force, root=self._root) + diff --git a/insightface/python-package/insightface/commands/rec_add_mask_param.py b/insightface/python-package/insightface/commands/rec_add_mask_param.py new file mode 100644 index 0000000000000000000000000000000000000000..b00ee0b9a31c6426ea0804ec00fc68981d88a324 --- /dev/null +++ b/insightface/python-package/insightface/commands/rec_add_mask_param.py @@ -0,0 +1,94 @@ + +import numbers +import os +from argparse import ArgumentParser, Namespace + +import mxnet as mx +import numpy as np + +from ..app import MaskRenderer +from ..data.rec_builder import RecBuilder +from . 
import BaseInsightFaceCLICommand + + +def rec_add_mask_param_command_factory(args: Namespace): + + return RecAddMaskParamCommand( + args.input, args.output + ) + + +class RecAddMaskParamCommand(BaseInsightFaceCLICommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + _parser = parser.add_parser("rec.addmaskparam") + _parser.add_argument("input", type=str, help="input rec") + _parser.add_argument("output", type=str, help="output rec, with mask param") + _parser.set_defaults(func=rec_add_mask_param_command_factory) + + def __init__( + self, + input: str, + output: str, + ): + self._input = input + self._output = output + + + def run(self): + tool = MaskRenderer() + tool.prepare(ctx_id=0, det_size=(128,128)) + root_dir = self._input + path_imgrec = os.path.join(root_dir, 'train.rec') + path_imgidx = os.path.join(root_dir, 'train.idx') + imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + save_path = self._output + wrec=RecBuilder(path=save_path) + s = imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + if header.flag > 0: + if len(header.label)==2: + imgidx = np.array(range(1, int(header.label[0]))) + else: + imgidx = np.array(list(self.imgrec.keys)) + else: + imgidx = np.array(list(self.imgrec.keys)) + stat = [0, 0] + print('total:', len(imgidx)) + for iid, idx in enumerate(imgidx): + #if iid==500000: + # break + if iid%1000==0: + print('processing:', iid) + s = imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + sample = mx.image.imdecode(img).asnumpy() + bgr = sample[:,:,::-1] + params = tool.build_params(bgr) + #if iid<10: + # mask_out = tool.render_mask(bgr, 'mask_blue', params) + # cv2.imwrite('maskout_%d.jpg'%iid, mask_out) + stat[1] += 1 + if params is None: + wlabel = [label] + [-1.0]*236 + stat[0] += 1 + else: + #print(0, params[0].shape, params[0].dtype) + #print(1, params[1].shape, params[1].dtype) + #print(2, 
params[2]) + #print(3, len(params[3]), params[3][0].__class__) + #print(4, params[4].shape, params[4].dtype) + mask_label = tool.encode_params(params) + wlabel = [label, 0.0]+mask_label # 237 including idlabel, total mask params size is 235 + if iid==0: + print('param size:', len(mask_label), len(wlabel), label) + assert len(wlabel)==237 + wrec.add_image(img, wlabel) + #print(len(params)) + + wrec.close() + print('finished on', self._output, ', failed:', stat[0]) + diff --git a/insightface/python-package/insightface/data/__init__.py b/insightface/python-package/insightface/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..665c59ec99b6ebf12822015e0350969c7903e243 --- /dev/null +++ b/insightface/python-package/insightface/data/__init__.py @@ -0,0 +1,2 @@ +from .image import get_image +from .pickle_object import get_object diff --git a/insightface/python-package/insightface/data/image.py b/insightface/python-package/insightface/data/image.py new file mode 100644 index 0000000000000000000000000000000000000000..343ccc580afb5f9d45a341b17fb41c7ab9621500 --- /dev/null +++ b/insightface/python-package/insightface/data/image.py @@ -0,0 +1,28 @@ +import cv2 +import os +import os.path as osp +from pathlib import Path + +class ImageCache: + data = {} + +def get_image(name, to_rgb=False, use_cache=True): + key = (name, to_rgb) + if key in ImageCache.data: + return ImageCache.data[key] + images_dir = osp.join(Path(__file__).parent.absolute(), 'images') + ext_names = ['.jpg', '.png', '.jpeg'] + image_file = None + for ext_name in ext_names: + _image_file = osp.join(images_dir, "%s%s"%(name, ext_name)) + if osp.exists(_image_file): + image_file = _image_file + break + assert image_file is not None, '%s not found'%name + img = cv2.imread(image_file) + if to_rgb: + img = img[:,:,::-1] + if use_cache: + ImageCache.data[key] = img + return img + diff --git a/insightface/python-package/insightface/data/images/Tom_Hanks_54745.png 
b/insightface/python-package/insightface/data/images/Tom_Hanks_54745.png new file mode 100644 index 0000000000000000000000000000000000000000..83387c3572e335441b617d0ba1f6a7f897b87f37 --- /dev/null +++ b/insightface/python-package/insightface/data/images/Tom_Hanks_54745.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8545da294e8c7c79911169c3915fed8528f1960cd0ed99b92453788ca4275083 +size 12123 diff --git a/insightface/python-package/insightface/data/images/mask_black.jpg b/insightface/python-package/insightface/data/images/mask_black.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0eab0df555c23f1e033537fe39f3c0c8303dd369 Binary files /dev/null and b/insightface/python-package/insightface/data/images/mask_black.jpg differ diff --git a/insightface/python-package/insightface/data/images/mask_blue.jpg b/insightface/python-package/insightface/data/images/mask_blue.jpg new file mode 100755 index 0000000000000000000000000000000000000000..f71336b9a0d3038ebd84e6995ebfbe54946fcbb4 Binary files /dev/null and b/insightface/python-package/insightface/data/images/mask_blue.jpg differ diff --git a/insightface/python-package/insightface/data/images/mask_green.jpg b/insightface/python-package/insightface/data/images/mask_green.jpg new file mode 100755 index 0000000000000000000000000000000000000000..ac2ad55f4fc580c915dfa4c157ca3bfc84e453f4 Binary files /dev/null and b/insightface/python-package/insightface/data/images/mask_green.jpg differ diff --git a/insightface/python-package/insightface/data/images/mask_white.jpg b/insightface/python-package/insightface/data/images/mask_white.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2148ab2d09fdee6e3f59315470e98ecfc54339e4 Binary files /dev/null and b/insightface/python-package/insightface/data/images/mask_white.jpg differ diff --git a/insightface/python-package/insightface/data/images/t1.jpg b/insightface/python-package/insightface/data/images/t1.jpg new file 
mode 100644 index 0000000000000000000000000000000000000000..0d1d64a59675c9590fd12429db647eb169cecff8 Binary files /dev/null and b/insightface/python-package/insightface/data/images/t1.jpg differ diff --git a/insightface/python-package/insightface/data/objects/meanshape_68.pkl b/insightface/python-package/insightface/data/objects/meanshape_68.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d5297e9e8ea5574298ddd287b058252e03aa18c1 --- /dev/null +++ b/insightface/python-package/insightface/data/objects/meanshape_68.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39ffecf84ba73f0d0d7e49380833ba88713c9fcdec51df4f7ac45a48b8f4cc51 +size 974 diff --git a/insightface/python-package/insightface/data/pickle_object.py b/insightface/python-package/insightface/data/pickle_object.py new file mode 100644 index 0000000000000000000000000000000000000000..fbd87030ea15e1d01af1cd4cff1be2bc54cc82dd --- /dev/null +++ b/insightface/python-package/insightface/data/pickle_object.py @@ -0,0 +1,17 @@ +import cv2 +import os +import os.path as osp +from pathlib import Path +import pickle + +def get_object(name): + objects_dir = osp.join(Path(__file__).parent.absolute(), 'objects') + if not name.endswith('.pkl'): + name = name+".pkl" + filepath = osp.join(objects_dir, name) + if not osp.exists(filepath): + return None + with open(filepath, 'rb') as f: + obj = pickle.load(f) + return obj + diff --git a/insightface/python-package/insightface/data/rec_builder.py b/insightface/python-package/insightface/data/rec_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..e02abc969da2f882639326f5bad3c7e8d08c1fde --- /dev/null +++ b/insightface/python-package/insightface/data/rec_builder.py @@ -0,0 +1,71 @@ +import pickle +import numpy as np +import os +import os.path as osp +import sys +import mxnet as mx + + +class RecBuilder(): + def __init__(self, path, image_size=(112, 112)): + self.path = path + self.image_size = image_size + 
self.widx = 0 + self.wlabel = 0 + self.max_label = -1 + assert not osp.exists(path), '%s exists' % path + os.makedirs(path) + self.writer = mx.recordio.MXIndexedRecordIO(os.path.join(path, 'train.idx'), + os.path.join(path, 'train.rec'), + 'w') + self.meta = [] + + def add(self, imgs): + #!!! img should be BGR!!!! + #assert label >= 0 + #assert label > self.last_label + assert len(imgs) > 0 + label = self.wlabel + for img in imgs: + idx = self.widx + image_meta = {'image_index': idx, 'image_classes': [label]} + header = mx.recordio.IRHeader(0, label, idx, 0) + if isinstance(img, np.ndarray): + s = mx.recordio.pack_img(header,img,quality=95,img_fmt='.jpg') + else: + s = mx.recordio.pack(header, img) + self.writer.write_idx(idx, s) + self.meta.append(image_meta) + self.widx += 1 + self.max_label = label + self.wlabel += 1 + + + def add_image(self, img, label): + #!!! img should be BGR!!!! + #assert label >= 0 + #assert label > self.last_label + idx = self.widx + header = mx.recordio.IRHeader(0, label, idx, 0) + if isinstance(label, list): + idlabel = label[0] + else: + idlabel = label + image_meta = {'image_index': idx, 'image_classes': [idlabel]} + if isinstance(img, np.ndarray): + s = mx.recordio.pack_img(header,img,quality=95,img_fmt='.jpg') + else: + s = mx.recordio.pack(header, img) + self.writer.write_idx(idx, s) + self.meta.append(image_meta) + self.widx += 1 + self.max_label = max(self.max_label, idlabel) + + def close(self): + with open(osp.join(self.path, 'train.meta'), 'wb') as pfile: + pickle.dump(self.meta, pfile, protocol=pickle.HIGHEST_PROTOCOL) + print('stat:', self.widx, self.wlabel) + with open(os.path.join(self.path, 'property'), 'w') as f: + f.write("%d,%d,%d\n" % (self.max_label+1, self.image_size[0], self.image_size[1])) + f.write("%d\n" % (self.widx)) + diff --git a/insightface/python-package/insightface/model_zoo/__init__.py b/insightface/python-package/insightface/model_zoo/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..225623d6142c968b4040f391039bfab88bdd1b2a --- /dev/null +++ b/insightface/python-package/insightface/model_zoo/__init__.py @@ -0,0 +1,6 @@ +from .model_zoo import get_model +from .arcface_onnx import ArcFaceONNX +from .retinaface import RetinaFace +from .scrfd import SCRFD +from .landmark import Landmark +from .attribute import Attribute diff --git a/insightface/python-package/insightface/model_zoo/arcface_onnx.py b/insightface/python-package/insightface/model_zoo/arcface_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..b537ce2ee15d4a1834d54e185f34e336aab30a77 --- /dev/null +++ b/insightface/python-package/insightface/model_zoo/arcface_onnx.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +# @Organization : insightface.ai +# @Author : Jia Guo +# @Time : 2021-05-04 +# @Function : + +from __future__ import division +import numpy as np +import cv2 +import onnx +import onnxruntime +from ..utils import face_align + +__all__ = [ + 'ArcFaceONNX', +] + + +class ArcFaceONNX: + def __init__(self, model_file=None, session=None): + assert model_file is not None + self.model_file = model_file + self.session = session + self.taskname = 'recognition' + find_sub = False + find_mul = False + model = onnx.load(self.model_file) + graph = model.graph + for nid, node in enumerate(graph.node[:8]): + #print(nid, node.name) + if node.name.startswith('Sub') or node.name.startswith('_minus'): + find_sub = True + if node.name.startswith('Mul') or node.name.startswith('_mul'): + find_mul = True + if find_sub and find_mul: + #mxnet arcface model + input_mean = 0.0 + input_std = 1.0 + else: + input_mean = 127.5 + input_std = 127.5 + self.input_mean = input_mean + self.input_std = input_std + #print('input mean and std:', self.input_mean, self.input_std) + if self.session is None: + self.session = onnxruntime.InferenceSession(self.model_file, None) + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + 
input_name = input_cfg.name + self.input_size = tuple(input_shape[2:4][::-1]) + self.input_shape = input_shape + outputs = self.session.get_outputs() + output_names = [] + for out in outputs: + output_names.append(out.name) + self.input_name = input_name + self.output_names = output_names + assert len(self.output_names)==1 + self.output_shape = outputs[0].shape + + def prepare(self, ctx_id, **kwargs): + if ctx_id<0: + self.session.set_providers(['CPUExecutionProvider']) + + def get(self, img, face): + aimg = face_align.norm_crop(img, landmark=face.kps, image_size=self.input_size[0]) + face.embedding = self.get_feat(aimg).flatten() + return face.embedding + + def compute_sim(self, feat1, feat2): + from numpy.linalg import norm + feat1 = feat1.ravel() + feat2 = feat2.ravel() + sim = np.dot(feat1, feat2) / (norm(feat1) * norm(feat2)) + return sim + + def get_feat(self, imgs): + if not isinstance(imgs, list): + imgs = [imgs] + input_size = self.input_size + + blob = cv2.dnn.blobFromImages(imgs, 1.0 / self.input_std, input_size, + (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] + return net_out + + def forward(self, batch_data): + blob = (batch_data - self.input_mean) / self.input_std + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] + return net_out + + diff --git a/insightface/python-package/insightface/model_zoo/attribute.py b/insightface/python-package/insightface/model_zoo/attribute.py new file mode 100644 index 0000000000000000000000000000000000000000..40c34de3f0995499448cf5779004cc1e5f3564fb --- /dev/null +++ b/insightface/python-package/insightface/model_zoo/attribute.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +# @Organization : insightface.ai +# @Author : Jia Guo +# @Time : 2021-06-19 +# @Function : + +from __future__ import division +import numpy as np +import cv2 +import onnx +import onnxruntime +from ..utils import face_align + +__all__ = 
[ + 'Attribute', +] + + +class Attribute: + def __init__(self, model_file=None, session=None): + assert model_file is not None + self.model_file = model_file + self.session = session + find_sub = False + find_mul = False + model = onnx.load(self.model_file) + graph = model.graph + for nid, node in enumerate(graph.node[:8]): + #print(nid, node.name) + if node.name.startswith('Sub') or node.name.startswith('_minus'): + find_sub = True + if node.name.startswith('Mul') or node.name.startswith('_mul'): + find_mul = True + if nid<3 and node.name=='bn_data': + find_sub = True + find_mul = True + if find_sub and find_mul: + #mxnet arcface model + input_mean = 0.0 + input_std = 1.0 + else: + input_mean = 127.5 + input_std = 128.0 + self.input_mean = input_mean + self.input_std = input_std + #print('input mean and std:', model_file, self.input_mean, self.input_std) + if self.session is None: + self.session = onnxruntime.InferenceSession(self.model_file, None) + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + input_name = input_cfg.name + self.input_size = tuple(input_shape[2:4][::-1]) + self.input_shape = input_shape + outputs = self.session.get_outputs() + output_names = [] + for out in outputs: + output_names.append(out.name) + self.input_name = input_name + self.output_names = output_names + assert len(self.output_names)==1 + output_shape = outputs[0].shape + #print('init output_shape:', output_shape) + if output_shape[1]==3: + self.taskname = 'genderage' + else: + self.taskname = 'attribute_%d'%output_shape[1] + + def prepare(self, ctx_id, **kwargs): + if ctx_id<0: + self.session.set_providers(['CPUExecutionProvider']) + + def get(self, img, face): + bbox = face.bbox + w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1]) + center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2 + rotate = 0 + _scale = self.input_size[0] / (max(w, h)*1.5) + #print('param:', img.shape, bbox, center, self.input_size, _scale, rotate) + aimg, M = 
face_align.transform(img, center, self.input_size[0], _scale, rotate) + input_size = tuple(aimg.shape[0:2][::-1]) + #assert input_size==self.input_size + blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + pred = self.session.run(self.output_names, {self.input_name : blob})[0][0] + if self.taskname=='genderage': + assert len(pred)==3 + gender = np.argmax(pred[:2]) + age = int(np.round(pred[2]*100)) + face['gender'] = gender + face['age'] = age + return gender, age + else: + return pred + + diff --git a/insightface/python-package/insightface/model_zoo/inswapper.py b/insightface/python-package/insightface/model_zoo/inswapper.py new file mode 100644 index 0000000000000000000000000000000000000000..dcaceb1f2dc590e066431bed5b708af66f6c74e0 --- /dev/null +++ b/insightface/python-package/insightface/model_zoo/inswapper.py @@ -0,0 +1,105 @@ +import time +import numpy as np +import onnxruntime +import cv2 +import onnx +from onnx import numpy_helper +from ..utils import face_align + + + + +class INSwapper(): + def __init__(self, model_file=None, session=None): + self.model_file = model_file + self.session = session + model = onnx.load(self.model_file) + graph = model.graph + self.emap = numpy_helper.to_array(graph.initializer[-1]) + self.input_mean = 0.0 + self.input_std = 255.0 + #print('input mean and std:', model_file, self.input_mean, self.input_std) + if self.session is None: + self.session = onnxruntime.InferenceSession(self.model_file, None) + inputs = self.session.get_inputs() + self.input_names = [] + for inp in inputs: + self.input_names.append(inp.name) + outputs = self.session.get_outputs() + output_names = [] + for out in outputs: + output_names.append(out.name) + self.output_names = output_names + assert len(self.output_names)==1 + output_shape = outputs[0].shape + input_cfg = inputs[0] + input_shape = input_cfg.shape + self.input_shape = input_shape + print('inswapper-shape:', 
self.input_shape) + self.input_size = tuple(input_shape[2:4][::-1]) + + def forward(self, img, latent): + img = (img - self.input_mean) / self.input_std + pred = self.session.run(self.output_names, {self.input_names[0]: img, self.input_names[1]: latent})[0] + return pred + + def get(self, img, target_face, source_face, paste_back=True): + aimg, M = face_align.norm_crop2(img, target_face.kps, self.input_size[0]) + blob = cv2.dnn.blobFromImage(aimg, 1.0 / self.input_std, self.input_size, + (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + latent = source_face.normed_embedding.reshape((1,-1)) + latent = np.dot(latent, self.emap) + latent /= np.linalg.norm(latent) + pred = self.session.run(self.output_names, {self.input_names[0]: blob, self.input_names[1]: latent})[0] + #print(latent.shape, latent.dtype, pred.shape) + img_fake = pred.transpose((0,2,3,1))[0] + bgr_fake = np.clip(255 * img_fake, 0, 255).astype(np.uint8)[:,:,::-1] + if not paste_back: + return bgr_fake, M + else: + target_img = img + fake_diff = bgr_fake.astype(np.float32) - aimg.astype(np.float32) + fake_diff = np.abs(fake_diff).mean(axis=2) + fake_diff[:2,:] = 0 + fake_diff[-2:,:] = 0 + fake_diff[:,:2] = 0 + fake_diff[:,-2:] = 0 + IM = cv2.invertAffineTransform(M) + img_white = np.full((aimg.shape[0],aimg.shape[1]), 255, dtype=np.float32) + bgr_fake = cv2.warpAffine(bgr_fake, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0) + img_white = cv2.warpAffine(img_white, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0) + fake_diff = cv2.warpAffine(fake_diff, IM, (target_img.shape[1], target_img.shape[0]), borderValue=0.0) + img_white[img_white>20] = 255 + fthresh = 10 + fake_diff[fake_diff=fthresh] = 255 + img_mask = img_white + mask_h_inds, mask_w_inds = np.where(img_mask==255) + mask_h = np.max(mask_h_inds) - np.min(mask_h_inds) + mask_w = np.max(mask_w_inds) - np.min(mask_w_inds) + mask_size = int(np.sqrt(mask_h*mask_w)) + k = max(mask_size//10, 10) + #k = 
max(mask_size//20, 6) + #k = 6 + kernel = np.ones((k,k),np.uint8) + img_mask = cv2.erode(img_mask,kernel,iterations = 1) + kernel = np.ones((2,2),np.uint8) + fake_diff = cv2.dilate(fake_diff,kernel,iterations = 1) + k = max(mask_size//20, 5) + #k = 3 + #k = 3 + kernel_size = (k, k) + blur_size = tuple(2*i+1 for i in kernel_size) + img_mask = cv2.GaussianBlur(img_mask, blur_size, 0) + k = 5 + kernel_size = (k, k) + blur_size = tuple(2*i+1 for i in kernel_size) + fake_diff = cv2.GaussianBlur(fake_diff, blur_size, 0) + img_mask /= 255 + fake_diff /= 255 + #img_mask = fake_diff + img_mask = np.reshape(img_mask, [img_mask.shape[0],img_mask.shape[1],1]) + fake_merged = img_mask * bgr_fake + (1-img_mask) * target_img.astype(np.float32) + fake_merged = fake_merged.astype(np.uint8) + return fake_merged + diff --git a/insightface/python-package/insightface/model_zoo/landmark.py b/insightface/python-package/insightface/model_zoo/landmark.py new file mode 100644 index 0000000000000000000000000000000000000000..598b4b29a2d0674d8bb25b681f921c61460d101c --- /dev/null +++ b/insightface/python-package/insightface/model_zoo/landmark.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +# @Organization : insightface.ai +# @Author : Jia Guo +# @Time : 2021-05-04 +# @Function : + +from __future__ import division +import numpy as np +import cv2 +import onnx +import onnxruntime +from ..utils import face_align +from ..utils import transform +from ..data import get_object + +__all__ = [ + 'Landmark', +] + + +class Landmark: + def __init__(self, model_file=None, session=None): + assert model_file is not None + self.model_file = model_file + self.session = session + find_sub = False + find_mul = False + model = onnx.load(self.model_file) + graph = model.graph + for nid, node in enumerate(graph.node[:8]): + #print(nid, node.name) + if node.name.startswith('Sub') or node.name.startswith('_minus'): + find_sub = True + if node.name.startswith('Mul') or node.name.startswith('_mul'): + find_mul = True + 
if nid<3 and node.name=='bn_data': + find_sub = True + find_mul = True + if find_sub and find_mul: + #mxnet arcface model + input_mean = 0.0 + input_std = 1.0 + else: + input_mean = 127.5 + input_std = 128.0 + self.input_mean = input_mean + self.input_std = input_std + #print('input mean and std:', model_file, self.input_mean, self.input_std) + if self.session is None: + self.session = onnxruntime.InferenceSession(self.model_file, None) + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + input_name = input_cfg.name + self.input_size = tuple(input_shape[2:4][::-1]) + self.input_shape = input_shape + outputs = self.session.get_outputs() + output_names = [] + for out in outputs: + output_names.append(out.name) + self.input_name = input_name + self.output_names = output_names + assert len(self.output_names)==1 + output_shape = outputs[0].shape + self.require_pose = False + #print('init output_shape:', output_shape) + if output_shape[1]==3309: + self.lmk_dim = 3 + self.lmk_num = 68 + self.mean_lmk = get_object('meanshape_68.pkl') + self.require_pose = True + else: + self.lmk_dim = 2 + self.lmk_num = output_shape[1]//self.lmk_dim + self.taskname = 'landmark_%dd_%d'%(self.lmk_dim, self.lmk_num) + + def prepare(self, ctx_id, **kwargs): + if ctx_id<0: + self.session.set_providers(['CPUExecutionProvider']) + + def get(self, img, face): + bbox = face.bbox + w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1]) + center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2 + rotate = 0 + _scale = self.input_size[0] / (max(w, h)*1.5) + #print('param:', img.shape, bbox, center, self.input_size, _scale, rotate) + aimg, M = face_align.transform(img, center, self.input_size[0], _scale, rotate) + input_size = tuple(aimg.shape[0:2][::-1]) + #assert input_size==self.input_size + blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + pred = self.session.run(self.output_names, 
{self.input_name : blob})[0][0] + if pred.shape[0] >= 3000: + pred = pred.reshape((-1, 3)) + else: + pred = pred.reshape((-1, 2)) + if self.lmk_num < pred.shape[0]: + pred = pred[self.lmk_num*-1:,:] + pred[:, 0:2] += 1 + pred[:, 0:2] *= (self.input_size[0] // 2) + if pred.shape[1] == 3: + pred[:, 2] *= (self.input_size[0] // 2) + + IM = cv2.invertAffineTransform(M) + pred = face_align.trans_points(pred, IM) + face[self.taskname] = pred + if self.require_pose: + P = transform.estimate_affine_matrix_3d23d(self.mean_lmk, pred) + s, R, t = transform.P2sRt(P) + rx, ry, rz = transform.matrix2angle(R) + pose = np.array( [rx, ry, rz], dtype=np.float32 ) + face['pose'] = pose #pitch, yaw, roll + return pred + + diff --git a/insightface/python-package/insightface/model_zoo/model_store.py b/insightface/python-package/insightface/model_zoo/model_store.py new file mode 100644 index 0000000000000000000000000000000000000000..50bb85d314f5b7a0ea8211d2cd21186e32791592 --- /dev/null +++ b/insightface/python-package/insightface/model_zoo/model_store.py @@ -0,0 +1,103 @@ +""" +This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/model_store.py +""" +from __future__ import print_function + +__all__ = ['get_model_file'] +import os +import zipfile +import glob + +from ..utils import download, check_sha1 + +_model_sha1 = { + name: checksum + for checksum, name in [ + ('95be21b58e29e9c1237f229dae534bd854009ce0', 'arcface_r100_v1'), + ('', 'arcface_mfn_v1'), + ('39fd1e087a2a2ed70a154ac01fecaa86c315d01b', 'retinaface_r50_v1'), + ('2c9de8116d1f448fd1d4661f90308faae34c990a', 'retinaface_mnet025_v1'), + ('0db1d07921d005e6c9a5b38e059452fc5645e5a4', 'retinaface_mnet025_v2'), + ('7dd8111652b7aac2490c5dcddeb268e53ac643e6', 'genderage_v1'), + ] +} + +base_repo_url = 'https://insightface.ai/files/' +_url_format = '{repo_url}models/{file_name}.zip' + + +def short_hash(name): + if name not in _model_sha1: + raise ValueError( + 'Pretrained model for {name} is 
not available.'.format(name=name)) + return _model_sha1[name][:8] + + +def find_params_file(dir_path): + if not os.path.exists(dir_path): + return None + paths = glob.glob("%s/*.params" % dir_path) + if len(paths) == 0: + return None + paths = sorted(paths) + return paths[-1] + + +def get_model_file(name, root=os.path.join('~', '.insightface', 'models')): + r"""Return location for the pretrained on local file system. + + This function will download from online model zoo when model cannot be found or has mismatch. + The root directory will be created if it doesn't exist. + + Parameters + ---------- + name : str + Name of the model. + root : str, default '~/.mxnet/models' + Location for keeping the model parameters. + + Returns + ------- + file_path + Path to the requested pretrained model file. + """ + + file_name = name + root = os.path.expanduser(root) + dir_path = os.path.join(root, name) + file_path = find_params_file(dir_path) + #file_path = os.path.join(root, file_name + '.params') + sha1_hash = _model_sha1[name] + if file_path is not None: + if check_sha1(file_path, sha1_hash): + return file_path + else: + print( + 'Mismatch in the content of model file detected. Downloading again.' + ) + else: + print('Model file is not found. Downloading.') + + if not os.path.exists(root): + os.makedirs(root) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + + zip_file_path = os.path.join(root, file_name + '.zip') + repo_url = base_repo_url + if repo_url[-1] != '/': + repo_url = repo_url + '/' + download(_url_format.format(repo_url=repo_url, file_name=file_name), + path=zip_file_path, + overwrite=True) + with zipfile.ZipFile(zip_file_path) as zf: + zf.extractall(dir_path) + os.remove(zip_file_path) + file_path = find_params_file(dir_path) + + if check_sha1(file_path, sha1_hash): + return file_path + else: + raise ValueError( + 'Downloaded file has different hash. 
Please try again.') + diff --git a/insightface/python-package/insightface/model_zoo/model_zoo.py b/insightface/python-package/insightface/model_zoo/model_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..fc6283114f7b550f084749e65d8b55e1393fe62f --- /dev/null +++ b/insightface/python-package/insightface/model_zoo/model_zoo.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +# @Organization : insightface.ai +# @Author : Jia Guo +# @Time : 2021-05-04 +# @Function : + +import os +import os.path as osp +import glob +import onnxruntime +from .arcface_onnx import * +from .retinaface import * +#from .scrfd import * +from .landmark import * +from .attribute import Attribute +from .inswapper import INSwapper +from ..utils import download_onnx + +__all__ = ['get_model'] + + +class PickableInferenceSession(onnxruntime.InferenceSession): + # This is a wrapper to make the current InferenceSession class pickable. + def __init__(self, model_path, **kwargs): + super().__init__(model_path, **kwargs) + self.model_path = model_path + + def __getstate__(self): + return {'model_path': self.model_path} + + def __setstate__(self, values): + model_path = values['model_path'] + self.__init__(model_path) + +class ModelRouter: + def __init__(self, onnx_file): + self.onnx_file = onnx_file + + def get_model(self, **kwargs): + session = PickableInferenceSession(self.onnx_file, **kwargs) + print(f'Applied providers: {session._providers}, with options: {session._provider_options}') + inputs = session.get_inputs() + input_cfg = inputs[0] + input_shape = input_cfg.shape + outputs = session.get_outputs() + + if len(outputs)>=5: + return RetinaFace(model_file=self.onnx_file, session=session) + elif input_shape[2]==192 and input_shape[3]==192: + return Landmark(model_file=self.onnx_file, session=session) + elif input_shape[2]==96 and input_shape[3]==96: + return Attribute(model_file=self.onnx_file, session=session) + elif len(inputs)==2 and input_shape[2]==128 and 
input_shape[3]==128: + return INSwapper(model_file=self.onnx_file, session=session) + elif input_shape[2]==input_shape[3] and input_shape[2]>=112 and input_shape[2]%16==0: + return ArcFaceONNX(model_file=self.onnx_file, session=session) + else: + #raise RuntimeError('error on model routing') + return None + +def find_onnx_file(dir_path): + if not os.path.exists(dir_path): + return None + paths = glob.glob("%s/*.onnx" % dir_path) + if len(paths) == 0: + return None + paths = sorted(paths) + return paths[-1] + +def get_default_providers(): + return ['CUDAExecutionProvider', 'CPUExecutionProvider'] + +def get_default_provider_options(): + return None + +def get_model(name, **kwargs): + root = kwargs.get('root', '~/.insightface') + root = os.path.expanduser(root) + model_root = osp.join(root, 'models') + allow_download = kwargs.get('download', False) + download_zip = kwargs.get('download_zip', False) + if not name.endswith('.onnx'): + model_dir = os.path.join(model_root, name) + model_file = find_onnx_file(model_dir) + if model_file is None: + return None + else: + model_file = name + if not osp.exists(model_file) and allow_download: + model_file = download_onnx('models', model_file, root=root, download_zip=download_zip) + assert osp.exists(model_file), 'model_file %s should exist'%model_file + assert osp.isfile(model_file), 'model_file %s should be a file'%model_file + router = ModelRouter(model_file) + providers = kwargs.get('providers', get_default_providers()) + provider_options = kwargs.get('provider_options', get_default_provider_options()) + model = router.get_model(providers=providers, provider_options=provider_options) + return model + diff --git a/insightface/python-package/insightface/model_zoo/retinaface.py b/insightface/python-package/insightface/model_zoo/retinaface.py new file mode 100644 index 0000000000000000000000000000000000000000..fc4ad91ed70688b38503127137e928dc7e5433e1 --- /dev/null +++ 
b/insightface/python-package/insightface/model_zoo/retinaface.py @@ -0,0 +1,301 @@ +# -*- coding: utf-8 -*- +# @Organization : insightface.ai +# @Author : Jia Guo +# @Time : 2021-09-18 +# @Function : + +from __future__ import division +import datetime +import numpy as np +import onnx +import onnxruntime +import os +import os.path as osp +import cv2 +import sys + +def softmax(z): + assert len(z.shape) == 2 + s = np.max(z, axis=1) + s = s[:, np.newaxis] # necessary step to do broadcasting + e_x = np.exp(z - s) + div = np.sum(e_x, axis=1) + div = div[:, np.newaxis] # dito + return e_x / div + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. + """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return np.stack([x1, y1, x2, y2], axis=-1) + +def distance2kps(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. 
+ """ + preds = [] + for i in range(0, distance.shape[1], 2): + px = points[:, i%2] + distance[:, i] + py = points[:, i%2+1] + distance[:, i+1] + if max_shape is not None: + px = px.clamp(min=0, max=max_shape[1]) + py = py.clamp(min=0, max=max_shape[0]) + preds.append(px) + preds.append(py) + return np.stack(preds, axis=-1) + +class RetinaFace: + def __init__(self, model_file=None, session=None): + import onnxruntime + self.model_file = model_file + self.session = session + self.taskname = 'detection' + if self.session is None: + assert self.model_file is not None + assert osp.exists(self.model_file) + self.session = onnxruntime.InferenceSession(self.model_file, None) + self.center_cache = {} + self.nms_thresh = 0.4 + self.det_thresh = 0.5 + self._init_vars() + + def _init_vars(self): + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + #print(input_shape) + if isinstance(input_shape[2], str): + self.input_size = None + else: + self.input_size = tuple(input_shape[2:4][::-1]) + #print('image_size:', self.image_size) + input_name = input_cfg.name + self.input_shape = input_shape + outputs = self.session.get_outputs() + output_names = [] + for o in outputs: + output_names.append(o.name) + self.input_name = input_name + self.output_names = output_names + self.input_mean = 127.5 + self.input_std = 128.0 + #print(self.output_names) + #assert len(outputs)==10 or len(outputs)==15 + self.use_kps = False + self._anchor_ratio = 1.0 + self._num_anchors = 1 + if len(outputs)==6: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + elif len(outputs)==9: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + self.use_kps = True + elif len(outputs)==10: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + elif len(outputs)==15: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + self.use_kps = True + + def prepare(self, ctx_id, **kwargs): 
+ if ctx_id<0: + self.session.set_providers(['CPUExecutionProvider']) + nms_thresh = kwargs.get('nms_thresh', None) + if nms_thresh is not None: + self.nms_thresh = nms_thresh + det_thresh = kwargs.get('det_thresh', None) + if det_thresh is not None: + self.det_thresh = det_thresh + input_size = kwargs.get('input_size', None) + if input_size is not None: + if self.input_size is not None: + print('warning: det_size is already set in detection model, ignore') + else: + self.input_size = input_size + + def forward(self, img, threshold): + scores_list = [] + bboxes_list = [] + kpss_list = [] + input_size = tuple(img.shape[0:2][::-1]) + blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_outs = self.session.run(self.output_names, {self.input_name : blob}) + + input_height = blob.shape[2] + input_width = blob.shape[3] + fmc = self.fmc + for idx, stride in enumerate(self._feat_stride_fpn): + scores = net_outs[idx] + bbox_preds = net_outs[idx+fmc] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx+fmc*2] * stride + height = input_height // stride + width = input_width // stride + K = height * width + key = (height, width, stride) + if key in self.center_cache: + anchor_centers = self.center_cache[key] + else: + #solution-1, c style: + #anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 ) + #for i in range(height): + # anchor_centers[i, :, 1] = i + #for i in range(width): + # anchor_centers[:, i, 0] = i + + #solution-2: + #ax = np.arange(width, dtype=np.float32) + #ay = np.arange(height, dtype=np.float32) + #xv, yv = np.meshgrid(np.arange(width), np.arange(height)) + #anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32) + + #solution-3: + anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) + #print(anchor_centers.shape) + + anchor_centers = (anchor_centers * stride).reshape( (-1, 2) ) + if 
self._num_anchors>1: + anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) ) + if len(self.center_cache)<100: + self.center_cache[key] = anchor_centers + + pos_inds = np.where(scores>=threshold)[0] + bboxes = distance2bbox(anchor_centers, bbox_preds) + pos_scores = scores[pos_inds] + pos_bboxes = bboxes[pos_inds] + scores_list.append(pos_scores) + bboxes_list.append(pos_bboxes) + if self.use_kps: + kpss = distance2kps(anchor_centers, kps_preds) + #kpss = kps_preds + kpss = kpss.reshape( (kpss.shape[0], -1, 2) ) + pos_kpss = kpss[pos_inds] + kpss_list.append(pos_kpss) + return scores_list, bboxes_list, kpss_list + + def detect(self, img, input_size = None, max_num=0, metric='default'): + assert input_size is not None or self.input_size is not None + input_size = self.input_size if input_size is None else input_size + + im_ratio = float(img.shape[0]) / img.shape[1] + model_ratio = float(input_size[1]) / input_size[0] + if im_ratio>model_ratio: + new_height = input_size[1] + new_width = int(new_height / im_ratio) + else: + new_width = input_size[0] + new_height = int(new_width * im_ratio) + det_scale = float(new_height) / img.shape[0] + resized_img = cv2.resize(img, (new_width, new_height)) + det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 ) + det_img[:new_height, :new_width, :] = resized_img + + scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh) + + scores = np.vstack(scores_list) + scores_ravel = scores.ravel() + order = scores_ravel.argsort()[::-1] + bboxes = np.vstack(bboxes_list) / det_scale + if self.use_kps: + kpss = np.vstack(kpss_list) / det_scale + pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False) + pre_det = pre_det[order, :] + keep = self.nms(pre_det) + det = pre_det[keep, :] + if self.use_kps: + kpss = kpss[order,:,:] + kpss = kpss[keep,:,:] + else: + kpss = None + if max_num > 0 and det.shape[0] > max_num: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - + 
det[:, 1]) + img_center = img.shape[0] // 2, img.shape[1] // 2 + offsets = np.vstack([ + (det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0] + ]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + if metric=='max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + bindex = np.argsort( + values)[::-1] # some extra weight on the centering + bindex = bindex[0:max_num] + det = det[bindex, :] + if kpss is not None: + kpss = kpss[bindex, :] + return det, kpss + + def nms(self, dets): + thresh = self.nms_thresh + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + +def get_retinaface(name, download=False, root='~/.insightface/models', **kwargs): + if not download: + assert os.path.exists(name) + return RetinaFace(name) + else: + from .model_store import get_model_file + _file = get_model_file("retinaface_%s" % name, root=root) + return retinaface(_file) + + diff --git a/insightface/python-package/insightface/model_zoo/scrfd.py b/insightface/python-package/insightface/model_zoo/scrfd.py new file mode 100644 index 0000000000000000000000000000000000000000..674db4bba761157592dfb95c5d1638da1099f89c --- /dev/null +++ b/insightface/python-package/insightface/model_zoo/scrfd.py @@ -0,0 +1,348 @@ +# -*- coding: utf-8 -*- +# @Organization : insightface.ai +# @Author : Jia Guo +# @Time : 2021-05-04 +# @Function : 
+ +from __future__ import division +import datetime +import numpy as np +import onnx +import onnxruntime +import os +import os.path as osp +import cv2 +import sys + +def softmax(z): + assert len(z.shape) == 2 + s = np.max(z, axis=1) + s = s[:, np.newaxis] # necessary step to do broadcasting + e_x = np.exp(z - s) + div = np.sum(e_x, axis=1) + div = div[:, np.newaxis] # dito + return e_x / div + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. + """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return np.stack([x1, y1, x2, y2], axis=-1) + +def distance2kps(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. 
+ """ + preds = [] + for i in range(0, distance.shape[1], 2): + px = points[:, i%2] + distance[:, i] + py = points[:, i%2+1] + distance[:, i+1] + if max_shape is not None: + px = px.clamp(min=0, max=max_shape[1]) + py = py.clamp(min=0, max=max_shape[0]) + preds.append(px) + preds.append(py) + return np.stack(preds, axis=-1) + +class SCRFD: + def __init__(self, model_file=None, session=None): + import onnxruntime + self.model_file = model_file + self.session = session + self.taskname = 'detection' + self.batched = False + if self.session is None: + assert self.model_file is not None + assert osp.exists(self.model_file) + self.session = onnxruntime.InferenceSession(self.model_file, None) + self.center_cache = {} + self.nms_thresh = 0.4 + self.det_thresh = 0.5 + self._init_vars() + + def _init_vars(self): + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + #print(input_shape) + if isinstance(input_shape[2], str): + self.input_size = None + else: + self.input_size = tuple(input_shape[2:4][::-1]) + #print('image_size:', self.image_size) + input_name = input_cfg.name + self.input_shape = input_shape + outputs = self.session.get_outputs() + if len(outputs[0].shape) == 3: + self.batched = True + output_names = [] + for o in outputs: + output_names.append(o.name) + self.input_name = input_name + self.output_names = output_names + self.input_mean = 127.5 + self.input_std = 128.0 + #print(self.output_names) + #assert len(outputs)==10 or len(outputs)==15 + self.use_kps = False + self._anchor_ratio = 1.0 + self._num_anchors = 1 + if len(outputs)==6: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + elif len(outputs)==9: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + self.use_kps = True + elif len(outputs)==10: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + elif len(outputs)==15: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + 
self._num_anchors = 1 + self.use_kps = True + + def prepare(self, ctx_id, **kwargs): + if ctx_id<0: + self.session.set_providers(['CPUExecutionProvider']) + nms_thresh = kwargs.get('nms_thresh', None) + if nms_thresh is not None: + self.nms_thresh = nms_thresh + det_thresh = kwargs.get('det_thresh', None) + if det_thresh is not None: + self.det_thresh = det_thresh + input_size = kwargs.get('input_size', None) + if input_size is not None: + if self.input_size is not None: + print('warning: det_size is already set in scrfd model, ignore') + else: + self.input_size = input_size + + def forward(self, img, threshold): + scores_list = [] + bboxes_list = [] + kpss_list = [] + input_size = tuple(img.shape[0:2][::-1]) + blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_outs = self.session.run(self.output_names, {self.input_name : blob}) + + input_height = blob.shape[2] + input_width = blob.shape[3] + fmc = self.fmc + for idx, stride in enumerate(self._feat_stride_fpn): + # If model support batch dim, take first output + if self.batched: + scores = net_outs[idx][0] + bbox_preds = net_outs[idx + fmc][0] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2][0] * stride + # If model doesn't support batching take output as is + else: + scores = net_outs[idx] + bbox_preds = net_outs[idx + fmc] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2] * stride + + height = input_height // stride + width = input_width // stride + K = height * width + key = (height, width, stride) + if key in self.center_cache: + anchor_centers = self.center_cache[key] + else: + #solution-1, c style: + #anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 ) + #for i in range(height): + # anchor_centers[i, :, 1] = i + #for i in range(width): + # anchor_centers[:, i, 0] = i + + #solution-2: + #ax = np.arange(width, dtype=np.float32) 
+ #ay = np.arange(height, dtype=np.float32) + #xv, yv = np.meshgrid(np.arange(width), np.arange(height)) + #anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32) + + #solution-3: + anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) + #print(anchor_centers.shape) + + anchor_centers = (anchor_centers * stride).reshape( (-1, 2) ) + if self._num_anchors>1: + anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) ) + if len(self.center_cache)<100: + self.center_cache[key] = anchor_centers + + pos_inds = np.where(scores>=threshold)[0] + bboxes = distance2bbox(anchor_centers, bbox_preds) + pos_scores = scores[pos_inds] + pos_bboxes = bboxes[pos_inds] + scores_list.append(pos_scores) + bboxes_list.append(pos_bboxes) + if self.use_kps: + kpss = distance2kps(anchor_centers, kps_preds) + #kpss = kps_preds + kpss = kpss.reshape( (kpss.shape[0], -1, 2) ) + pos_kpss = kpss[pos_inds] + kpss_list.append(pos_kpss) + return scores_list, bboxes_list, kpss_list + + def detect(self, img, input_size = None, max_num=0, metric='default'): + assert input_size is not None or self.input_size is not None + input_size = self.input_size if input_size is None else input_size + + im_ratio = float(img.shape[0]) / img.shape[1] + model_ratio = float(input_size[1]) / input_size[0] + if im_ratio>model_ratio: + new_height = input_size[1] + new_width = int(new_height / im_ratio) + else: + new_width = input_size[0] + new_height = int(new_width * im_ratio) + det_scale = float(new_height) / img.shape[0] + resized_img = cv2.resize(img, (new_width, new_height)) + det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 ) + det_img[:new_height, :new_width, :] = resized_img + + scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh) + + scores = np.vstack(scores_list) + scores_ravel = scores.ravel() + order = scores_ravel.argsort()[::-1] + bboxes = np.vstack(bboxes_list) / det_scale + if self.use_kps: 
+ kpss = np.vstack(kpss_list) / det_scale + pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False) + pre_det = pre_det[order, :] + keep = self.nms(pre_det) + det = pre_det[keep, :] + if self.use_kps: + kpss = kpss[order,:,:] + kpss = kpss[keep,:,:] + else: + kpss = None + if max_num > 0 and det.shape[0] > max_num: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img.shape[0] // 2, img.shape[1] // 2 + offsets = np.vstack([ + (det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0] + ]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + if metric=='max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + bindex = np.argsort( + values)[::-1] # some extra weight on the centering + bindex = bindex[0:max_num] + det = det[bindex, :] + if kpss is not None: + kpss = kpss[bindex, :] + return det, kpss + + def nms(self, dets): + thresh = self.nms_thresh + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + +def get_scrfd(name, download=False, root='~/.insightface/models', **kwargs): + if not download: + assert os.path.exists(name) + return SCRFD(name) + else: + from .model_store import get_model_file + _file = get_model_file("scrfd_%s" % name, root=root) + return SCRFD(_file) + + +def scrfd_2p5gkps(**kwargs): + return get_scrfd("2p5gkps", download=True, **kwargs) + + +if 
__name__ == '__main__': + import glob + detector = SCRFD(model_file='./det.onnx') + detector.prepare(-1) + img_paths = ['tests/data/t1.jpg'] + for img_path in img_paths: + img = cv2.imread(img_path) + + for _ in range(1): + ta = datetime.datetime.now() + #bboxes, kpss = detector.detect(img, 0.5, input_size = (640, 640)) + bboxes, kpss = detector.detect(img, 0.5) + tb = datetime.datetime.now() + print('all cost:', (tb-ta).total_seconds()*1000) + print(img_path, bboxes.shape) + if kpss is not None: + print(kpss.shape) + for i in range(bboxes.shape[0]): + bbox = bboxes[i] + x1,y1,x2,y2,score = bbox.astype(np.int) + cv2.rectangle(img, (x1,y1) , (x2,y2) , (255,0,0) , 2) + if kpss is not None: + kps = kpss[i] + for kp in kps: + kp = kp.astype(np.int) + cv2.circle(img, tuple(kp) , 1, (0,0,255) , 2) + filename = img_path.split('/')[-1] + print('output:', filename) + cv2.imwrite('./outputs/%s'%filename, img) + diff --git a/insightface/python-package/insightface/thirdparty/__init__.py b/insightface/python-package/insightface/thirdparty/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/python-package/insightface/thirdparty/face3d/__init__.py b/insightface/python-package/insightface/thirdparty/face3d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..68b284753ecf951292cf69347f797db4ec1ad41e --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/__init__.py @@ -0,0 +1,4 @@ +#import mesh +#import morphable_model +from . import mesh +from . 
import morphable_model diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/__init__.cpp b/insightface/python-package/insightface/thirdparty/face3d/mesh/__init__.cpp new file mode 100644 index 0000000000000000000000000000000000000000..595c8b2c7e9f594970af6788510fa27b78042f4f --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/__init__.cpp @@ -0,0 +1,2872 @@ +/* Generated by Cython 0.29.23 */ + +/* BEGIN: Cython Metadata +{ + "distutils": { + "language": "c++", + "name": "insightface.thirdparty.face3d.mesh.cython.mesh_core_cython", + "sources": [ + "insightface/thirdparty/face3d/morphable_model/__init__.py", + "insightface/thirdparty/face3d/morphable_model/morphabel_model.py", + "insightface/thirdparty/face3d/morphable_model/load.py", + "insightface/thirdparty/face3d/morphable_model/fit.py", + "insightface/thirdparty/face3d/mesh_numpy/vis.py", + "insightface/thirdparty/face3d/mesh_numpy/__init__.py", + "insightface/thirdparty/face3d/mesh_numpy/transform.py", + "insightface/thirdparty/face3d/mesh_numpy/io.py", + "insightface/thirdparty/face3d/mesh_numpy/render.py", + "insightface/thirdparty/face3d/mesh_numpy/light.py", + "insightface/thirdparty/face3d/mesh/vis.py", + "insightface/thirdparty/face3d/mesh/__init__.py", + "insightface/thirdparty/face3d/mesh/transform.py", + "insightface/thirdparty/face3d/mesh/io.py", + "insightface/thirdparty/face3d/mesh/render.py", + "insightface/thirdparty/face3d/mesh/light.py" + ] + }, + "module_name": "insightface.thirdparty.face3d.mesh.cython.mesh_core_cython" +} +END: Cython Metadata */ + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif /* PY_SSIZE_T_CLEAN */ +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000) + #error Cython requires Python 2.6+ or Python 3.3+. 
+#else +#define CYTHON_ABI "0_29_23" +#define CYTHON_HEX_VERSION 0x001D17F0 +#define CYTHON_FUTURE_DIVISION 0 +#include +#ifndef offsetof + #define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#define __PYX_COMMA , +#ifndef HAVE_LONG_LONG + #if PY_VERSION_HEX >= 0x02070000 + #define HAVE_LONG_LONG + #endif +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION + #define CYTHON_COMPILING_IN_PYPY 1 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #undef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 0 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #if PY_VERSION_HEX < 0x03050000 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #undef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #undef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 1 + #undef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 0 + #undef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 0 + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define 
CYTHON_USE_TP_FINALIZE 0 + #undef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 +#elif defined(PYSTON_VERSION) + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 1 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 + #undef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 +#else + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 1 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #elif !defined(CYTHON_USE_PYTYPE_LOOKUP) + #define CYTHON_USE_PYTYPE_LOOKUP 1 + #endif + #if PY_MAJOR_VERSION < 3 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + 
#elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #elif !defined(CYTHON_USE_PYLONG_INTERNALS) + #define CYTHON_USE_PYLONG_INTERNALS 1 + #endif + #ifndef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 1 + #endif + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #if PY_VERSION_HEX < 0x030300F0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #elif !defined(CYTHON_USE_UNICODE_WRITER) + #define CYTHON_USE_UNICODE_WRITER 1 + #endif + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #ifndef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 1 + #endif + #ifndef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 1 + #endif + #ifndef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT (PY_VERSION_HEX >= 0x03050000) + #endif + #ifndef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1) + #endif + #ifndef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS (PY_VERSION_HEX >= 0x030600B1) + #endif + #ifndef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK (PY_VERSION_HEX >= 0x030700A3) + #endif +#endif +#if !defined(CYTHON_FAST_PYCCALL) +#define CYTHON_FAST_PYCCALL (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1) +#endif +#if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #undef SHIFT + #undef BASE + #undef MASK + #ifdef SIZEOF_VOID_P + enum { __pyx_check_sizeof_voidp = 1 / (int)(SIZEOF_VOID_P == sizeof(void*)) }; + #endif +#endif +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif +#ifndef __has_cpp_attribute + #define __has_cpp_attribute(x) 0 +#endif 
+#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +#ifndef CYTHON_MAYBE_UNUSED_VAR +# if defined(__cplusplus) + template void CYTHON_MAYBE_UNUSED_VAR( const T& ) { } +# else +# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x) +# endif +#endif +#ifndef CYTHON_NCP_UNUSED +# if CYTHON_COMPILING_IN_CPYTHON +# define CYTHON_NCP_UNUSED +# else +# define CYTHON_NCP_UNUSED CYTHON_UNUSED +# endif +#endif +#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None) +#ifdef _MSC_VER + #ifndef _MSC_STDINT_H_ + #if _MSC_VER < 1300 + typedef unsigned char uint8_t; + typedef unsigned int uint32_t; + #else + typedef unsigned __int8 uint8_t; + typedef unsigned __int32 uint32_t; + #endif + #endif +#else + #include +#endif +#ifndef CYTHON_FALLTHROUGH + #if defined(__cplusplus) && __cplusplus >= 201103L + #if __has_cpp_attribute(fallthrough) + #define CYTHON_FALLTHROUGH [[fallthrough]] + #elif __has_cpp_attribute(clang::fallthrough) + #define CYTHON_FALLTHROUGH [[clang::fallthrough]] + #elif __has_cpp_attribute(gnu::fallthrough) + #define CYTHON_FALLTHROUGH [[gnu::fallthrough]] + #endif + #endif + #ifndef CYTHON_FALLTHROUGH + #if __has_attribute(fallthrough) + #define CYTHON_FALLTHROUGH __attribute__((fallthrough)) + #else + #define CYTHON_FALLTHROUGH + #endif + #endif + #if 
defined(__clang__ ) && defined(__apple_build_version__) + #if __apple_build_version__ < 7000000 + #undef CYTHON_FALLTHROUGH + #define CYTHON_FALLTHROUGH + #endif + #endif +#endif + +#ifndef __cplusplus + #error "Cython files generated with the C++ option must be compiled with a C++ compiler." +#endif +#ifndef CYTHON_INLINE + #if defined(__clang__) + #define CYTHON_INLINE __inline__ __attribute__ ((__unused__)) + #else + #define CYTHON_INLINE inline + #endif +#endif +template +void __Pyx_call_destructor(T& x) { + x.~T(); +} +template +class __Pyx_FakeReference { + public: + __Pyx_FakeReference() : ptr(NULL) { } + __Pyx_FakeReference(const T& ref) : ptr(const_cast(&ref)) { } + T *operator->() { return ptr; } + T *operator&() { return ptr; } + operator T&() { return *ptr; } + template bool operator ==(U other) { return *ptr == other; } + template bool operator !=(U other) { return *ptr != other; } + private: + T *ptr; +}; + +#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag) + #define Py_OptimizeFlag 0 +#endif +#define __PYX_BUILD_PY_SSIZE_T "n" +#define CYTHON_FORMAT_SSIZE_T "z" +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" +#if PY_VERSION_HEX >= 0x030800A4 && PY_VERSION_HEX < 0x030800B2 + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a, 0, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) +#else + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) +#endif + #define __Pyx_DefaultClassType PyType_Type +#endif +#ifndef Py_TPFLAGS_CHECKTYPES + #define 
Py_TPFLAGS_CHECKTYPES 0 +#endif +#ifndef Py_TPFLAGS_HAVE_INDEX + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#ifndef Py_TPFLAGS_HAVE_NEWBUFFER + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#ifndef Py_TPFLAGS_HAVE_FINALIZE + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#ifndef METH_STACKLESS + #define METH_STACKLESS 0 +#endif +#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL) + #ifndef METH_FASTCALL + #define METH_FASTCALL 0x80 + #endif + typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs); + typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args, + Py_ssize_t nargs, PyObject *kwnames); +#else + #define __Pyx_PyCFunctionFast _PyCFunctionFast + #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords +#endif +#if CYTHON_FAST_PYCCALL +#define __Pyx_PyFastCFunction_Check(func)\ + ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS | METH_STACKLESS))))) +#else +#define __Pyx_PyFastCFunction_Check(func) 0 +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) + #define PyObject_Malloc(s) PyMem_Malloc(s) + #define PyObject_Free(p) PyMem_Free(p) + #define PyObject_Realloc(p) PyMem_Realloc(p) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030400A1 + #define PyMem_RawMalloc(n) PyMem_Malloc(n) + #define PyMem_RawRealloc(p, n) PyMem_Realloc(p, n) + #define PyMem_RawFree(p) PyMem_Free(p) +#endif +#if CYTHON_COMPILING_IN_PYSTON + #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno) +#else + #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno) +#endif +#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#elif 
PY_VERSION_HEX >= 0x03060000 + #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet() +#elif PY_VERSION_HEX >= 0x03000000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#else + #define __Pyx_PyThreadState_Current _PyThreadState_Current +#endif +#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT) +#include "pythread.h" +#define Py_tss_NEEDS_INIT 0 +typedef int Py_tss_t; +static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) { + *key = PyThread_create_key(); + return 0; +} +static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) { + Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t)); + *key = Py_tss_NEEDS_INIT; + return key; +} +static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) { + PyObject_Free(key); +} +static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) { + return *key != Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) { + PyThread_delete_key(*key); + *key = Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) { + return PyThread_set_key_value(*key, value); +} +static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { + return PyThread_get_key_value(*key); +} +#endif +#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized) +#define __Pyx_PyDict_NewPresized(n) ((n <= 8) ? 
PyDict_New() : _PyDict_NewPresized(n)) +#else +#define __Pyx_PyDict_NewPresized(n) PyDict_New() +#endif +#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS +#define __Pyx_PyDict_GetItemStr(dict, name) _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash) +#else +#define __Pyx_PyDict_GetItemStr(dict, name) PyDict_GetItem(dict, name) +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) PyUnicode_MAX_CHAR_VALUE(u) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) + #if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? 
PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) + #else + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_LENGTH(u)) + #endif +#else + #define CYTHON_PEP393_ENABLED 0 + #define PyUnicode_1BYTE_KIND 1 + #define PyUnicode_2BYTE_KIND 2 + #define PyUnicode_4BYTE_KIND 4 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) ((sizeof(Py_UNICODE) == 2) ? 65535 : 1114111) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) (((void)(k)), ((Py_UNICODE*)d)[i] = ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_SIZE(u)) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains) + #define PyUnicode_Contains(u, s) PySequence_Contains(u, s) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check) + #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format) + #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyString_Check(b) && !PyString_CheckExact(b)))) ? 
PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyUnicode_Check(b) && !PyUnicode_CheckExact(b)))) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII) + #define PyObject_ASCII(o) PyObject_Repr(o) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact +#ifndef PyObject_Unicode + #define PyObject_Unicode PyObject_Str +#endif +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#if PY_VERSION_HEX >= 0x030900A4 + #define __Pyx_SET_REFCNT(obj, refcnt) Py_SET_REFCNT(obj, refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SET_SIZE(obj, size) +#else + #define __Pyx_SET_REFCNT(obj, refcnt) Py_REFCNT(obj) = (refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SIZE(obj) = (size) +#endif +#if CYTHON_ASSUME_SAFE_MACROS + #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq) +#else + #define __Pyx_PySequence_SIZE(seq) PySequence_Size(seq) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode 
PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject +#endif +#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY + #ifndef PyUnicode_InternFromString + #define PyUnicode_InternFromString(s) PyUnicode_FromString(s) + #endif +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t PyInt_AsLong +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyMethod_New(func, self, klass) ((self) ? 
((void)(klass), PyMethod_New(func, self)) : __Pyx_NewRef(func)) +#else + #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass) +#endif +#if CYTHON_USE_ASYNC_SLOTS + #if PY_VERSION_HEX >= 0x030500B1 + #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods + #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async) + #else + #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved)) + #endif +#else + #define __Pyx_PyType_AsAsync(obj) NULL +#endif +#ifndef __Pyx_PyAsyncMethodsStruct + typedef struct { + unaryfunc am_await; + unaryfunc am_aiter; + unaryfunc am_anext; + } __Pyx_PyAsyncMethodsStruct; +#endif + +#if defined(WIN32) || defined(MS_WINDOWS) + #define _USE_MATH_DEFINES +#endif +#include +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif +#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL) +#define __Pyx_truncl trunc +#else +#define __Pyx_truncl truncl +#endif + +#define __PYX_MARK_ERR_POS(f_index, lineno) \ + { __pyx_filename = __pyx_f[f_index]; (void)__pyx_filename; __pyx_lineno = lineno; (void)__pyx_lineno; __pyx_clineno = __LINE__; (void)__pyx_clineno; } +#define __PYX_ERR(f_index, lineno, Ln_error) \ + { __PYX_MARK_ERR_POS(f_index, lineno) goto Ln_error; } + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#define __PYX_HAVE__face3d__mesh____init__ +#define __PYX_HAVE_API__face3d__mesh____init__ +/* Early includes */ +#ifdef _OPENMP +#include +#endif /* _OPENMP */ + +#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS) +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; + +#define 
__PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_UTF8 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT (PY_MAJOR_VERSION >= 3 && __PYX_DEFAULT_STRING_ENCODING_IS_UTF8) +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_uchar_cast(c) ((unsigned char)c) +#define __Pyx_long_cast(x) ((long)x) +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) (\ + (sizeof(type) < sizeof(Py_ssize_t)) ||\ + (sizeof(type) > sizeof(Py_ssize_t) &&\ + likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX) &&\ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\ + v == (type)PY_SSIZE_T_MIN))) ||\ + (sizeof(type) == sizeof(Py_ssize_t) &&\ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX))) ) +static CYTHON_INLINE int __Pyx_is_valid_index(Py_ssize_t i, Py_ssize_t limit) { + return (size_t) i < (size_t) limit; +} +#if defined (__cplusplus) && __cplusplus >= 201103L + #include + #define __Pyx_sst_abs(value) std::abs(value) +#elif SIZEOF_INT >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) abs(value) +#elif SIZEOF_LONG >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) labs(value) +#elif defined (_MSC_VER) + #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value)) +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define __Pyx_sst_abs(value) llabs(value) +#elif defined (__GNUC__) + #define __Pyx_sst_abs(value) __builtin_llabs(value) +#else + #define __Pyx_sst_abs(value) ((value<0) ? 
-value : value) +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyBytes_AsWritableString(s) ((char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableSString(s) ((signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableUString(s) ((unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsString(s) ((const char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsSString(s) ((const signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsUString(s) ((const unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyObject_AsWritableString(s) ((char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsSString(s) ((const signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((const unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromCString(s) __Pyx_PyObject_FromString((const char*)s) +#define __Pyx_PyBytes_FromCString(s) __Pyx_PyBytes_FromString((const char*)s) +#define 
__Pyx_PyByteArray_FromCString(s) __Pyx_PyByteArray_FromString((const char*)s) +#define __Pyx_PyStr_FromCString(s) __Pyx_PyStr_FromString((const char*)s) +#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s) +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) { + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return (size_t)(u_end - u - 1); +} +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj) +#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None) +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b); +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x); +#define __Pyx_PySequence_Tuple(obj)\ + (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj)) +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +#if CYTHON_ASSUME_SAFE_MACROS +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION >= 3 +#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x)) +#else +#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x)) +#endif +#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? 
__Pyx_NewRef(x) : PyNumber_Float(x)) +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII +static int __Pyx_sys_getdefaultencoding_not_ascii; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + PyObject* ascii_chars_u = NULL; + PyObject* ascii_chars_b = NULL; + const char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + if (strcmp(default_encoding_c, "ascii") == 0) { + __Pyx_sys_getdefaultencoding_not_ascii = 0; + } else { + char ascii_chars[128]; + int c; + for (c = 0; c < 128; c++) { + ascii_chars[c] = c; + } + __Pyx_sys_getdefaultencoding_not_ascii = 1; + ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL); + if (!ascii_chars_u) goto bad; + ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); + if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { + PyErr_Format( + PyExc_ValueError, + "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.", + default_encoding_c); + goto bad; + } + Py_DECREF(ascii_chars_u); + Py_DECREF(ascii_chars_b); + } + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return -1; +} +#endif +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3 +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL) +#else +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL) +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +static char* 
__PYX_DEFAULT_STRING_ENCODING; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1); + if (!__PYX_DEFAULT_STRING_ENCODING) goto bad; + strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c); + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + return -1; +} +#endif +#endif + + +/* Test for GCC > 2.95 */ +#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) +#else /* !__GNUC__ or GCC < 2.95 */ + #define likely(x) (x) + #define unlikely(x) (x) +#endif /* __GNUC__ */ +static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; } + +static PyObject *__pyx_m = NULL; +static PyObject *__pyx_d; +static PyObject *__pyx_b; +static PyObject *__pyx_cython_runtime = NULL; +static PyObject *__pyx_empty_tuple; +static PyObject *__pyx_empty_bytes; +static PyObject *__pyx_empty_unicode; +static int __pyx_lineno; +static int __pyx_clineno = 0; +static const char * __pyx_cfilenm= __FILE__; +static const char *__pyx_filename; + + +static const char *__pyx_f[] = { + "insightface/thirdparty/face3d/mesh/__init__.py", +}; + +/*--- Type declarations ---*/ + +/* --- Runtime support code (head) --- */ +/* Refnanny.proto */ +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, PyObject*, int); + 
void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + if (acquire_gil) {\ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + PyGILState_Release(__pyx_gilstate_save);\ + } else {\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext()\ + __Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + #define __Pyx_XDECREF(r) 
Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define __Pyx_XGIVEREF(r) +#endif +#define __Pyx_XDECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_XDECREF(tmp);\ + } while (0) +#define __Pyx_DECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_DECREF(tmp);\ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +/* PyObjectGetAttrStr.proto */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) +#endif + +/* SetPackagePathFromImportLib.proto */ +#if PY_MAJOR_VERSION >= 3 && !CYTHON_PEP489_MULTI_PHASE_INIT +static int __Pyx_SetPackagePathFromImportLib(const char* parent_package_name, PyObject *module_name); +#else +#define __Pyx_SetPackagePathFromImportLib(a, b) 0 +#endif + +/* Import.proto */ +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); + +/* ImportFrom.proto */ +static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name); + +/* PyDictVersioning.proto */ +#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS +#define __PYX_DICT_VERSION_INIT ((PY_UINT64_T) -1) +#define __PYX_GET_DICT_VERSION(dict) (((PyDictObject*)(dict))->ma_version_tag) +#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)\ + (version_var) = __PYX_GET_DICT_VERSION(dict);\ + (cache_var) = (value); +#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) {\ + static PY_UINT64_T __pyx_dict_version = 0;\ + static PyObject *__pyx_dict_cached_value = NULL;\ + if (likely(__PYX_GET_DICT_VERSION(DICT) == __pyx_dict_version)) {\ + (VAR) = __pyx_dict_cached_value;\ + } else {\ + (VAR) = __pyx_dict_cached_value = (LOOKUP);\ + __pyx_dict_version = __PYX_GET_DICT_VERSION(DICT);\ + }\ +} +static 
CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj); +static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj); +static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version); +#else +#define __PYX_GET_DICT_VERSION(dict) (0) +#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var) +#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) (VAR) = (LOOKUP); +#endif + +/* PyThreadStateGet.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyThreadState_declare PyThreadState *__pyx_tstate; +#define __Pyx_PyThreadState_assign __pyx_tstate = __Pyx_PyThreadState_Current; +#define __Pyx_PyErr_Occurred() __pyx_tstate->curexc_type +#else +#define __Pyx_PyThreadState_declare +#define __Pyx_PyThreadState_assign +#define __Pyx_PyErr_Occurred() PyErr_Occurred() +#endif + +/* PyErrFetchRestore.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL) +#define __Pyx_ErrRestoreWithState(type, value, tb) __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL)) +#else +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#endif +#else +#define __Pyx_PyErr_Clear() PyErr_Clear() +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#define 
__Pyx_ErrRestoreWithState(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestoreInState(tstate, type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchInState(tstate, type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) PyErr_Fetch(type, value, tb) +#endif + +/* CLineInTraceback.proto */ +#ifdef CYTHON_CLINE_IN_TRACEBACK +#define __Pyx_CLineForTraceback(tstate, c_line) (((CYTHON_CLINE_IN_TRACEBACK)) ? c_line : 0) +#else +static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line); +#endif + +/* CodeObjectCache.proto */ +typedef struct { + PyCodeObject* code_object; + int code_line; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +/* AddTraceback.proto */ +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); + +/* GCCDiagnostics.proto */ +#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) +#define __Pyx_HAS_GCC_DIAGNOSTIC +#endif + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +/* CIntFromPy.proto */ +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +/* FastTypeChecks.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type) +static CYTHON_INLINE int 
__Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2); +#else +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type) +#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2)) +#endif +#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception) + +/* CheckBinaryVersion.proto */ +static int __Pyx_check_binary_version(void); + +/* InitStrings.proto */ +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); + + +/* Module declarations from 'face3d.mesh.__init__' */ +#define __Pyx_MODULE_NAME "face3d.mesh.__init__" +extern int __pyx_module_is_main_face3d__mesh____init__; +int __pyx_module_is_main_face3d__mesh____init__ = 0; + +/* Implementation of 'face3d.mesh.__init__' */ +static const char __pyx_k_[] = ""; +static const char __pyx_k_io[] = "io"; +static const char __pyx_k_vis[] = "vis"; +static const char __pyx_k_main[] = "__main__"; +static const char __pyx_k_mesh[] = "mesh"; +static const char __pyx_k_name[] = "__name__"; +static const char __pyx_k_test[] = "__test__"; +static const char __pyx_k_light[] = "light"; +static const char __pyx_k_cython[] = "cython"; +static const char __pyx_k_import[] = "__import__"; +static const char __pyx_k_render[] = "render"; +static const char __pyx_k_transform[] = "transform"; +static const char __pyx_k_mesh_core_cython[] = "mesh_core_cython"; +static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback"; +static PyObject *__pyx_n_s_; +static PyObject *__pyx_n_s_cline_in_traceback; +static PyObject *__pyx_n_s_cython; +static PyObject *__pyx_n_s_import; +static PyObject *__pyx_n_s_io; +static PyObject 
*__pyx_n_s_light; +static PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_u_mesh; +static PyObject *__pyx_n_s_mesh_core_cython; +static PyObject *__pyx_n_s_name; +static PyObject *__pyx_n_s_render; +static PyObject *__pyx_n_s_test; +static PyObject *__pyx_n_s_transform; +static PyObject *__pyx_n_s_vis; +/* Late includes */ + +static PyMethodDef __pyx_methods[] = { + {0, 0, 0, 0} +}; + +#if PY_MAJOR_VERSION >= 3 +#if CYTHON_PEP489_MULTI_PHASE_INIT +static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); /*proto*/ +static int __pyx_pymod_exec_mesh(PyObject* module); /*proto*/ +static PyModuleDef_Slot __pyx_moduledef_slots[] = { + {Py_mod_create, (void*)__pyx_pymod_create}, + {Py_mod_exec, (void*)__pyx_pymod_exec_mesh}, + {0, NULL} +}; +#endif + +static struct PyModuleDef __pyx_moduledef = { + PyModuleDef_HEAD_INIT, + "mesh", + 0, /* m_doc */ + #if CYTHON_PEP489_MULTI_PHASE_INIT + 0, /* m_size */ + #else + -1, /* m_size */ + #endif + __pyx_methods /* m_methods */, + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_moduledef_slots, /* m_slots */ + #else + NULL, /* m_reload */ + #endif + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; +#endif +#ifndef CYTHON_SMALL_CODE +#if defined(__clang__) + #define CYTHON_SMALL_CODE +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define CYTHON_SMALL_CODE __attribute__((cold)) +#else + #define CYTHON_SMALL_CODE +#endif +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_n_s_, __pyx_k_, sizeof(__pyx_k_), 0, 0, 1, 1}, + {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1}, + {&__pyx_n_s_cython, __pyx_k_cython, sizeof(__pyx_k_cython), 0, 0, 1, 1}, + {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, + {&__pyx_n_s_io, __pyx_k_io, sizeof(__pyx_k_io), 0, 0, 1, 1}, + {&__pyx_n_s_light, __pyx_k_light, sizeof(__pyx_k_light), 0, 0, 1, 1}, + {&__pyx_n_s_main, __pyx_k_main, 
sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_u_mesh, __pyx_k_mesh, sizeof(__pyx_k_mesh), 0, 1, 0, 1}, + {&__pyx_n_s_mesh_core_cython, __pyx_k_mesh_core_cython, sizeof(__pyx_k_mesh_core_cython), 0, 0, 1, 1}, + {&__pyx_n_s_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 1, 1}, + {&__pyx_n_s_render, __pyx_k_render, sizeof(__pyx_k_render), 0, 0, 1, 1}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {&__pyx_n_s_transform, __pyx_k_transform, sizeof(__pyx_k_transform), 0, 0, 1, 1}, + {&__pyx_n_s_vis, __pyx_k_vis, sizeof(__pyx_k_vis), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) { + return 0; +} + +static CYTHON_SMALL_CODE int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + __Pyx_RefNannyFinishContext(); + return 0; +} + +static CYTHON_SMALL_CODE int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + return 0; + __pyx_L1_error:; + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_modinit_global_init_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_variable_export_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_function_export_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_type_init_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_type_import_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_variable_import_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_function_import_code(void); /*proto*/ + +static int __Pyx_modinit_global_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0); + /*--- Global init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_variable_export_code(void) { + __Pyx_RefNannyDeclarations + 
__Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0); + /*--- Variable export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0); + /*--- Function export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0); + /*--- Type init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0); + /*--- Type import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_variable_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0); + /*--- Variable import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0); + /*--- Function import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + + +#ifndef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#elif PY_MAJOR_VERSION < 3 +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" void +#else +#define __Pyx_PyMODINIT_FUNC void +#endif +#else +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" PyObject * +#else +#define __Pyx_PyMODINIT_FUNC PyObject * +#endif +#endif + + +#if PY_MAJOR_VERSION < 3 +__Pyx_PyMODINIT_FUNC initmesh(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC initmesh(void) +#else +__Pyx_PyMODINIT_FUNC PyInit_mesh(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC PyInit_mesh(void) +#if CYTHON_PEP489_MULTI_PHASE_INIT +{ + return 
PyModuleDef_Init(&__pyx_moduledef); +} +static CYTHON_SMALL_CODE int __Pyx_check_single_interpreter(void) { + #if PY_VERSION_HEX >= 0x030700A1 + static PY_INT64_T main_interpreter_id = -1; + PY_INT64_T current_id = PyInterpreterState_GetID(PyThreadState_Get()->interp); + if (main_interpreter_id == -1) { + main_interpreter_id = current_id; + return (unlikely(current_id == -1)) ? -1 : 0; + } else if (unlikely(main_interpreter_id != current_id)) + #else + static PyInterpreterState *main_interpreter = NULL; + PyInterpreterState *current_interpreter = PyThreadState_Get()->interp; + if (!main_interpreter) { + main_interpreter = current_interpreter; + } else if (unlikely(main_interpreter != current_interpreter)) + #endif + { + PyErr_SetString( + PyExc_ImportError, + "Interpreter change detected - this module can only be loaded into one interpreter per process."); + return -1; + } + return 0; +} +static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name, int allow_none) { + PyObject *value = PyObject_GetAttrString(spec, from_name); + int result = 0; + if (likely(value)) { + if (allow_none || value != Py_None) { + result = PyDict_SetItemString(moddict, to_name, value); + } + Py_DECREF(value); + } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + } else { + result = -1; + } + return result; +} +static CYTHON_SMALL_CODE PyObject* __pyx_pymod_create(PyObject *spec, CYTHON_UNUSED PyModuleDef *def) { + PyObject *module = NULL, *moddict, *modname; + if (__Pyx_check_single_interpreter()) + return NULL; + if (__pyx_m) + return __Pyx_NewRef(__pyx_m); + modname = PyObject_GetAttrString(spec, "name"); + if (unlikely(!modname)) goto bad; + module = PyModule_NewObject(modname); + Py_DECREF(modname); + if (unlikely(!module)) goto bad; + moddict = PyModule_GetDict(module); + if (unlikely(!moddict)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "loader", "__loader__", 1) 
< 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "origin", "__file__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "parent", "__package__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "submodule_search_locations", "__path__", 0) < 0)) goto bad; + return module; +bad: + Py_XDECREF(module); + return NULL; +} + + +static CYTHON_SMALL_CODE int __pyx_pymod_exec_mesh(PyObject *__pyx_pyinit_module) +#endif +#endif +{ + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannyDeclarations + #if CYTHON_PEP489_MULTI_PHASE_INIT + if (__pyx_m) { + if (__pyx_m == __pyx_pyinit_module) return 0; + PyErr_SetString(PyExc_RuntimeError, "Module 'mesh' has already been imported. Re-initialisation is not supported."); + return -1; + } + #elif PY_MAJOR_VERSION >= 3 + if (__pyx_m) return __Pyx_NewRef(__pyx_m); + #endif + #if CYTHON_REFNANNY +__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); +if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); +} +#endif + __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit_mesh(void)", 0); + if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pxy_PyFrame_Initialize_Offsets + __Pxy_PyFrame_Initialize_Offsets(); + #endif + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pyx_CyFunction_USED + if (__pyx_CyFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef 
__Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Coroutine_USED + if (__pyx_Coroutine_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_AsyncGen_USED + if (__pyx_AsyncGen_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_StopAsyncIteration_USED + if (__pyx_StopAsyncIteration_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + #ifdef WITH_THREAD /* Python build with threading support? */ + PyEval_InitThreads(); + #endif + #endif + /*--- Module creation code ---*/ + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_m = __pyx_pyinit_module; + Py_INCREF(__pyx_m); + #else + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4("mesh", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_b); + __pyx_cython_runtime = PyImport_AddModule((char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_cython_runtime); + if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + /*--- Initialize various global constants etc. 
---*/ + if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + if (__pyx_module_is_main_face3d__mesh____init__) { + if (PyObject_SetAttr(__pyx_m, __pyx_n_s_name, __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + } + if (!CYTHON_PEP489_MULTI_PHASE_INIT) { + if (unlikely(__Pyx_SetPackagePathFromImportLib("face3d", __pyx_n_u_mesh) < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error) + if (!PyDict_GetItemString(modules, "face3d.mesh")) { + if (unlikely(PyDict_SetItemString(modules, "face3d.mesh", __pyx_m) < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + } + } + #endif + /*--- Builtin init code ---*/ + if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Constants init code ---*/ + if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Global type/function init code ---*/ + (void)__Pyx_modinit_global_init_code(); + (void)__Pyx_modinit_variable_export_code(); + (void)__Pyx_modinit_function_export_code(); + (void)__Pyx_modinit_type_init_code(); + (void)__Pyx_modinit_type_import_code(); + (void)__Pyx_modinit_variable_import_code(); + (void)__Pyx_modinit_function_import_code(); + /*--- Execution code ---*/ + #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) + if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + + /* "face3d/mesh/__init__.py":9 + * #import render + * + * from .cython import mesh_core_cython # <<<<<<<<<<<<<< + * from . import io + * from . 
import vis + */ + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_n_s_mesh_core_cython); + __Pyx_GIVEREF(__pyx_n_s_mesh_core_cython); + PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_mesh_core_cython); + __pyx_t_2 = __Pyx_Import(__pyx_n_s_cython, __pyx_t_1, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 9, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_mesh_core_cython); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_mesh_core_cython, __pyx_t_1) < 0) __PYX_ERR(0, 9, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "face3d/mesh/__init__.py":10 + * + * from .cython import mesh_core_cython + * from . import io # <<<<<<<<<<<<<< + * from . import vis + * from . import transform + */ + __pyx_t_2 = PyList_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_INCREF(__pyx_n_s_io); + __Pyx_GIVEREF(__pyx_n_s_io); + PyList_SET_ITEM(__pyx_t_2, 0, __pyx_n_s_io); + __pyx_t_1 = __Pyx_Import(__pyx_n_s_, __pyx_t_2, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_ImportFrom(__pyx_t_1, __pyx_n_s_io); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_io, __pyx_t_2) < 0) __PYX_ERR(0, 10, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "face3d/mesh/__init__.py":11 + * from .cython import mesh_core_cython + * from . import io + * from . import vis # <<<<<<<<<<<<<< + * from . import transform + * from . 
import light + */ + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_n_s_vis); + __Pyx_GIVEREF(__pyx_n_s_vis); + PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_vis); + __pyx_t_2 = __Pyx_Import(__pyx_n_s_, __pyx_t_1, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 11, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_vis); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_vis, __pyx_t_1) < 0) __PYX_ERR(0, 11, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "face3d/mesh/__init__.py":12 + * from . import io + * from . import vis + * from . import transform # <<<<<<<<<<<<<< + * from . import light + * from . import render + */ + __pyx_t_2 = PyList_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 12, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_INCREF(__pyx_n_s_transform); + __Pyx_GIVEREF(__pyx_n_s_transform); + PyList_SET_ITEM(__pyx_t_2, 0, __pyx_n_s_transform); + __pyx_t_1 = __Pyx_Import(__pyx_n_s_, __pyx_t_2, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 12, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_ImportFrom(__pyx_t_1, __pyx_n_s_transform); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 12, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_transform, __pyx_t_2) < 0) __PYX_ERR(0, 12, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "face3d/mesh/__init__.py":13 + * from . import vis + * from . import transform + * from . import light # <<<<<<<<<<<<<< + * from . 
import render + * + */ + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 13, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_n_s_light); + __Pyx_GIVEREF(__pyx_n_s_light); + PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_light); + __pyx_t_2 = __Pyx_Import(__pyx_n_s_, __pyx_t_1, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 13, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_light); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 13, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_light, __pyx_t_1) < 0) __PYX_ERR(0, 13, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "face3d/mesh/__init__.py":14 + * from . import transform + * from . import light + * from . import render # <<<<<<<<<<<<<< + * + */ + __pyx_t_2 = PyList_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 14, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_INCREF(__pyx_n_s_render); + __Pyx_GIVEREF(__pyx_n_s_render); + PyList_SET_ITEM(__pyx_t_2, 0, __pyx_n_s_render); + __pyx_t_1 = __Pyx_Import(__pyx_n_s_, __pyx_t_2, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 14, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_ImportFrom(__pyx_t_1, __pyx_n_s_render); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 14, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_render, __pyx_t_2) < 0) __PYX_ERR(0, 14, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "face3d/mesh/__init__.py":1 + * #from __future__ import absolute_import # <<<<<<<<<<<<<< + * #from cython import mesh_core_cython + * #import io + */ + __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) 
__PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /*--- Wrapped vars code ---*/ + + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + if (__pyx_m) { + if (__pyx_d) { + __Pyx_AddTraceback("init face3d.mesh.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename); + } + Py_CLEAR(__pyx_m); + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init face3d.mesh.__init__"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if CYTHON_PEP489_MULTI_PHASE_INIT + return (__pyx_m != NULL) ? 0 : -1; + #elif PY_MAJOR_VERSION >= 3 + return __pyx_m; + #else + return; + #endif +} + +/* --- Runtime support code --- */ +/* Refnanny */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule(modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, "RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif + +/* PyObjectGetAttrStr */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#endif + +/* SetPackagePathFromImportLib */ +#if PY_MAJOR_VERSION >= 3 && !CYTHON_PEP489_MULTI_PHASE_INIT +static int __Pyx_SetPackagePathFromImportLib(const char* parent_package_name, PyObject *module_name) { + PyObject *importlib, *loader, *osmod, *ossep, *parts, *package_path; + PyObject *path = NULL, *file_path = NULL; + int result; + if (parent_package_name) { + PyObject *package = PyImport_ImportModule(parent_package_name); + if (unlikely(!package)) 
+ goto bad; + path = PyObject_GetAttrString(package, "__path__"); + Py_DECREF(package); + if (unlikely(!path) || unlikely(path == Py_None)) + goto bad; + } else { + path = Py_None; Py_INCREF(Py_None); + } + importlib = PyImport_ImportModule("importlib"); + if (unlikely(!importlib)) + goto bad; + loader = PyObject_CallMethod(importlib, "find_loader", "(OO)", module_name, path); + Py_DECREF(importlib); + Py_DECREF(path); path = NULL; + if (unlikely(!loader)) + goto bad; + file_path = PyObject_GetAttrString(loader, "path"); + Py_DECREF(loader); + if (unlikely(!file_path)) + goto bad; + if (unlikely(PyObject_SetAttrString(__pyx_m, "__file__", file_path) < 0)) + goto bad; + osmod = PyImport_ImportModule("os"); + if (unlikely(!osmod)) + goto bad; + ossep = PyObject_GetAttrString(osmod, "sep"); + Py_DECREF(osmod); + if (unlikely(!ossep)) + goto bad; + parts = PyObject_CallMethod(file_path, "rsplit", "(Oi)", ossep, 1); + Py_DECREF(file_path); file_path = NULL; + Py_DECREF(ossep); + if (unlikely(!parts)) + goto bad; + package_path = Py_BuildValue("[O]", PyList_GET_ITEM(parts, 0)); + Py_DECREF(parts); + if (unlikely(!package_path)) + goto bad; + goto set_path; +bad: + PyErr_WriteUnraisable(module_name); + Py_XDECREF(path); + Py_XDECREF(file_path); + PyErr_Clear(); + package_path = PyList_New(0); + if (unlikely(!package_path)) + return -1; +set_path: + result = PyObject_SetAttrString(__pyx_m, "__path__", package_path); + Py_DECREF(package_path); + return result; +} +#endif + +/* Import */ +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { + PyObject *empty_list = 0; + PyObject *module = 0; + PyObject *global_dict = 0; + PyObject *empty_dict = 0; + PyObject *list; + #if PY_MAJOR_VERSION < 3 + PyObject *py_import; + py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import); + if (!py_import) + goto bad; + #endif + if (from_list) + list = from_list; + else { + empty_list = PyList_New(0); + if (!empty_list) + goto bad; + list = empty_list; + } 
+ global_dict = PyModule_GetDict(__pyx_m); + if (!global_dict) + goto bad; + empty_dict = PyDict_New(); + if (!empty_dict) + goto bad; + { + #if PY_MAJOR_VERSION >= 3 + if (level == -1) { + if ((1) && (strchr(__Pyx_MODULE_NAME, '.'))) { + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, 1); + if (!module) { + if (!PyErr_ExceptionMatches(PyExc_ImportError)) + goto bad; + PyErr_Clear(); + } + } + level = 0; + } + #endif + if (!module) { + #if PY_MAJOR_VERSION < 3 + PyObject *py_level = PyInt_FromLong(level); + if (!py_level) + goto bad; + module = PyObject_CallFunctionObjArgs(py_import, + name, global_dict, empty_dict, list, py_level, (PyObject *)NULL); + Py_DECREF(py_level); + #else + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, level); + #endif + } + } +bad: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(py_import); + #endif + Py_XDECREF(empty_list); + Py_XDECREF(empty_dict); + return module; +} + +/* ImportFrom */ +static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) { + PyObject* value = __Pyx_PyObject_GetAttrStr(module, name); + if (unlikely(!value) && PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Format(PyExc_ImportError, + #if PY_MAJOR_VERSION < 3 + "cannot import name %.230s", PyString_AS_STRING(name)); + #else + "cannot import name %S", name); + #endif + } + return value; +} + +/* PyDictVersioning */ +#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj) { + PyObject *dict = Py_TYPE(obj)->tp_dict; + return likely(dict) ? __PYX_GET_DICT_VERSION(dict) : 0; +} +static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj) { + PyObject **dictptr = NULL; + Py_ssize_t offset = Py_TYPE(obj)->tp_dictoffset; + if (offset) { +#if CYTHON_COMPILING_IN_CPYTHON + dictptr = (likely(offset > 0)) ? 
(PyObject **) ((char *)obj + offset) : _PyObject_GetDictPtr(obj); +#else + dictptr = _PyObject_GetDictPtr(obj); +#endif + } + return (dictptr && *dictptr) ? __PYX_GET_DICT_VERSION(*dictptr) : 0; +} +static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version) { + PyObject *dict = Py_TYPE(obj)->tp_dict; + if (unlikely(!dict) || unlikely(tp_dict_version != __PYX_GET_DICT_VERSION(dict))) + return 0; + return obj_dict_version == __Pyx_get_object_dict_version(obj); +} +#endif + +/* PyErrFetchRestore */ +#if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + tmp_type = tstate->curexc_type; + tmp_value = tstate->curexc_value; + tmp_tb = tstate->curexc_traceback; + tstate->curexc_type = type; + tstate->curexc_value = value; + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + *type = tstate->curexc_type; + *value = tstate->curexc_value; + *tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +} +#endif + +/* CLineInTraceback */ +#ifndef CYTHON_CLINE_IN_TRACEBACK +static int __Pyx_CLineForTraceback(CYTHON_NCP_UNUSED PyThreadState *tstate, int c_line) { + PyObject *use_cline; + PyObject *ptype, *pvalue, *ptraceback; +#if CYTHON_COMPILING_IN_CPYTHON + PyObject **cython_runtime_dict; +#endif + if (unlikely(!__pyx_cython_runtime)) { + return c_line; + } + __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback); +#if CYTHON_COMPILING_IN_CPYTHON + cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime); + if (likely(cython_runtime_dict)) { + __PYX_PY_DICT_LOOKUP_IF_MODIFIED( + use_cline, *cython_runtime_dict, + 
__Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback)) + } else +#endif + { + PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback); + if (use_cline_obj) { + use_cline = PyObject_Not(use_cline_obj) ? Py_False : Py_True; + Py_DECREF(use_cline_obj); + } else { + PyErr_Clear(); + use_cline = NULL; + } + } + if (!use_cline) { + c_line = 0; + PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False); + } + else if (use_cline == Py_False || (use_cline != Py_True && PyObject_Not(use_cline) != 0)) { + c_line = 0; + } + __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback); + return c_line; +} +#endif + +/* CodeObjectCache */ +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + int start = 0, mid = 0, end = count - 1; + if (end >= 0 && code_line > entries[end].code_line) { + return count; + } + while (start < end) { + mid = start + (end - start) / 2; + if (code_line < entries[mid].code_line) { + end = mid; + } else if (code_line > entries[mid].code_line) { + start = mid + 1; + } else { + return mid; + } + } + if (code_line <= entries[mid].code_line) { + return mid; + } else { + return mid + 1; + } +} +static PyCodeObject *__pyx_find_code_object(int code_line) { + PyCodeObject* code_object; + int pos; + if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) { + return NULL; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) { + return NULL; + } + code_object = __pyx_code_cache.entries[pos].code_object; + Py_INCREF(code_object); + return code_object; +} +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { + int pos, i; + __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries; + if (unlikely(!code_line)) { + return; + } + if 
(unlikely(!entries)) { + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry)); + if (likely(entries)) { + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = 64; + __pyx_code_cache.count = 1; + entries[0].code_line = code_line; + entries[0].code_object = code_object; + Py_INCREF(code_object); + } + return; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) { + PyCodeObject* tmp = entries[pos].code_object; + entries[pos].code_object = code_object; + Py_DECREF(tmp); + return; + } + if (__pyx_code_cache.count == __pyx_code_cache.max_count) { + int new_max = __pyx_code_cache.max_count + 64; + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( + __pyx_code_cache.entries, ((size_t)new_max) * sizeof(__Pyx_CodeObjectCacheEntry)); + if (unlikely(!entries)) { + return; + } + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = new_max; + } + for (i=__pyx_code_cache.count; i>pos; i--) { + entries[i] = entries[i-1]; + } + entries[pos].code_line = code_line; + entries[pos].code_object = code_object; + __pyx_code_cache.count++; + Py_INCREF(code_object); +} + +/* AddTraceback */ +#include "compile.h" +#include "frameobject.h" +#include "traceback.h" +static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( + const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_srcfile = 0; + PyObject *py_funcname = 0; + #if PY_MAJOR_VERSION < 3 + py_srcfile = PyString_FromString(filename); + #else + py_srcfile = PyUnicode_FromString(filename); + #endif + if (!py_srcfile) goto bad; + if (c_line) { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #else + py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #endif + } + 
else { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromString(funcname); + #else + py_funcname = PyUnicode_FromString(funcname); + #endif + } + if (!py_funcname) goto bad; + py_code = __Pyx_PyCode_New( + 0, + 0, + 0, + 0, + 0, + __pyx_empty_bytes, /*PyObject *code,*/ + __pyx_empty_tuple, /*PyObject *consts,*/ + __pyx_empty_tuple, /*PyObject *names,*/ + __pyx_empty_tuple, /*PyObject *varnames,*/ + __pyx_empty_tuple, /*PyObject *freevars,*/ + __pyx_empty_tuple, /*PyObject *cellvars,*/ + py_srcfile, /*PyObject *filename,*/ + py_funcname, /*PyObject *name,*/ + py_line, + __pyx_empty_bytes /*PyObject *lnotab*/ + ); + Py_DECREF(py_srcfile); + Py_DECREF(py_funcname); + return py_code; +bad: + Py_XDECREF(py_srcfile); + Py_XDECREF(py_funcname); + return NULL; +} +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyFrameObject *py_frame = 0; + PyThreadState *tstate = __Pyx_PyThreadState_Current; + if (c_line) { + c_line = __Pyx_CLineForTraceback(tstate, c_line); + } + py_code = __pyx_find_code_object(c_line ? -c_line : py_line); + if (!py_code) { + py_code = __Pyx_CreateCodeObjectForTraceback( + funcname, c_line, py_line, filename); + if (!py_code) goto bad; + __pyx_insert_code_object(c_line ? 
-c_line : py_line, py_code); + } + py_frame = PyFrame_New( + tstate, /*PyThreadState *tstate,*/ + py_code, /*PyCodeObject *code,*/ + __pyx_d, /*PyObject *globals,*/ + 0 /*PyObject *locals*/ + ); + if (!py_frame) goto bad; + __Pyx_PyFrame_SetLineNumber(py_frame, py_line); + PyTraceBack_Here(py_frame); +bad: + Py_XDECREF(py_code); + Py_XDECREF(py_frame); +} + +/* CIntToPy */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const long neg_one = (long) -1, const_zero = (long) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +/* CIntFromPyVerify */ +#define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0) +#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1) +#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\ + {\ + func_type value = func_value;\ + if (sizeof(target_type) < sizeof(func_type)) 
{\ + if (unlikely(value != (func_type) (target_type) value)) {\ + func_type zero = 0;\ + if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\ + return (target_type) -1;\ + if (is_unsigned && unlikely(value < zero))\ + goto raise_neg_overflow;\ + else\ + goto raise_overflow;\ + }\ + }\ + return (target_type) value;\ + } + +/* CIntFromPy */ +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const long neg_one = (long) -1, const_zero = (long) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case 1: __PYX_VERIFY_RETURN_INT(long, digit, digits[0]) + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 2 * PyLong_SHIFT) { + return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 3 * PyLong_SHIFT) { + return (long) 
(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 4 * PyLong_SHIFT) { + return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (long) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case -1: __PYX_VERIFY_RETURN_INT(long, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(long, digit, +digits[0]) + case -2: + if (8 * sizeof(long) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(long) > 1 * 
PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + 
__PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + } +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to long"); + return (long) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; +} + +/* CIntFromPy */ +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC 
diagnostic ignored "-Wconversion" +#endif + const int neg_one = (int) -1, const_zero = (int) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case 1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0]) + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) { + return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) { + return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) { + return (int) 
(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (int) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(int, digit, +digits[0]) + case -2: + if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << 
PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + } +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if 
(sizeof(int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (int) -1; + } + } else { + int val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to int"); + return (int) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; +} + +/* FastTypeChecks */ +#if CYTHON_COMPILING_IN_CPYTHON +static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) { + while (a) { + a = a->tp_base; + if (a == b) + return 1; + } + return b == &PyBaseObject_Type; +} +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) { + PyObject *mro; + if (a == b) return 1; + mro = a->tp_mro; + if (likely(mro)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(mro); + for (i = 0; i < n; i++) { + if (PyTuple_GET_ITEM(mro, i) == (PyObject *)b) + return 1; + } + return 0; + } + return __Pyx_InBases(a, b); +} +#if PY_MAJOR_VERSION == 2 +static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) { + PyObject *exception, *value, *tb; + int res; + 
__Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&exception, &value, &tb); + res = exc_type1 ? PyObject_IsSubclass(err, exc_type1) : 0; + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + if (!res) { + res = PyObject_IsSubclass(err, exc_type2); + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + } + __Pyx_ErrRestore(exception, value, tb); + return res; +} +#else +static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) { + int res = exc_type1 ? __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type1) : 0; + if (!res) { + res = __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type2); + } + return res; +} +#endif +static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + assert(PyExceptionClass_Check(exc_type)); + n = PyTuple_GET_SIZE(tuple); +#if PY_MAJOR_VERSION >= 3 + for (i=0; ip) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + if (PyObject_Hash(*t->p) == -1) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, (Py_ssize_t)strlen(c_str)); +} +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} 
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +#if !CYTHON_PEP393_ENABLED +static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +} +#else +static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (likely(PyUnicode_IS_ASCII(o))) { + *length = PyUnicode_GET_LENGTH(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else + return PyUnicode_AsUTF8AndSize(o, length); +#endif +} +#endif +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && +#endif + PyUnicode_Check(o)) { + return __Pyx_PyUnicode_AsStringAndSize(o, length); + } else +#endif +#if (!CYTHON_COMPILING_IN_PYPY) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE)) + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + 
if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject* x) { + int retval; + if (unlikely(!x)) return -1; + retval = __Pyx_PyObject_IsTrue(x); + Py_DECREF(x); + return retval; +} +static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) { +#if PY_MAJOR_VERSION >= 3 + if (PyLong_Check(result)) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "__int__ returned non-int (type %.200s). " + "The ability to return an instance of a strict subclass of int " + "is deprecated, and may be removed in a future version of Python.", + Py_TYPE(result)->tp_name)) { + Py_DECREF(result); + return NULL; + } + return result; + } +#endif + PyErr_Format(PyExc_TypeError, + "__%.4s__ returned non-%.4s (type %.200s)", + type_name, type_name, Py_TYPE(result)->tp_name); + Py_DECREF(result); + return NULL; +} +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) { +#if CYTHON_USE_TYPE_SLOTS + PyNumberMethods *m; +#endif + const char *name = NULL; + PyObject *res = NULL; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x) || PyLong_Check(x))) +#else + if (likely(PyLong_Check(x))) +#endif + return __Pyx_NewRef(x); +#if CYTHON_USE_TYPE_SLOTS + m = Py_TYPE(x)->tp_as_number; + #if PY_MAJOR_VERSION < 3 + if (m && m->nb_int) { + name = "int"; + res = m->nb_int(x); + } + else if (m && m->nb_long) { + name = "long"; + res = m->nb_long(x); + } + #else + if (likely(m && m->nb_int)) { + name = "int"; + res = m->nb_int(x); + } + #endif +#else + if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) { + res = PyNumber_Int(x); + } +#endif + if (likely(res)) { +#if PY_MAJOR_VERSION < 3 + if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) { +#else + if (unlikely(!PyLong_CheckExact(res))) { +#endif + return __Pyx_PyNumber_IntOrLongWrongResultType(res, name); + } + } + else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, 
+ "an integer is required"); + } + return res; +} +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { + Py_ssize_t ival; + PyObject *x; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(b))) { + if (sizeof(Py_ssize_t) >= sizeof(long)) + return PyInt_AS_LONG(b); + else + return PyInt_AsSsize_t(b); + } +#endif + if (likely(PyLong_CheckExact(b))) { + #if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)b)->ob_digit; + const Py_ssize_t size = Py_SIZE(b); + if (likely(__Pyx_sst_abs(size) <= 1)) { + ival = likely(size) ? digits[0] : 0; + if (size == -1) ival = -ival; + return ival; + } else { + switch (size) { + case 2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + } + } + #endif + return PyLong_AsSsize_t(b); + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); 
+ return ival; +} +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) { + return b ? __Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False); +} +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { + return PyInt_FromSize_t(ival); +} + + +#endif /* Py_PYTHON_H */ diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/__init__.py b/insightface/python-package/insightface/thirdparty/face3d/mesh/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e432960656282c76c1393f63d06c62e3a04c465 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/__init__.py @@ -0,0 +1,15 @@ +#from __future__ import absolute_import +#from cython import mesh_core_cython +#import io +#import vis +#import transform +#import light +#import render + +from .cython import mesh_core_cython +from . import io +from . import vis +from . import transform +from . import light +from . import render + diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core.cpp b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core.cpp new file mode 100644 index 0000000000000000000000000000000000000000..aeea4dae814a1c43c2cfc59f3636f8d43ca9093d --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core.cpp @@ -0,0 +1,375 @@ +/* +functions that can not be optimazed by vertorization in python. +1. rasterization.(need process each triangle) +2. normal of each vertex.(use one-ring, need process each vertex) +3. write obj(seems that it can be verctorized? anyway, writing it in c++ is simple, so also add function here. --> however, why writting in c++ is still slow?) + +Author: Yao Feng +Mail: yaofeng1995@gmail.com +*/ + +#include "mesh_core.h" + + +/* Judge whether the point is in the triangle +Method: + http://blackpawn.com/texts/pointinpoly/ +Args: + point: [x, y] + tri_points: three vertices(2d points) of a triangle. 
2 coords x 3 vertices +Returns: + bool: true for in triangle +*/ +bool isPointInTri(point p, point p0, point p1, point p2) +{ + // vectors + point v0, v1, v2; + v0 = p2 - p0; + v1 = p1 - p0; + v2 = p - p0; + + // dot products + float dot00 = v0.dot(v0); //v0.x * v0.x + v0.y * v0.y //np.dot(v0.T, v0) + float dot01 = v0.dot(v1); //v0.x * v1.x + v0.y * v1.y //np.dot(v0.T, v1) + float dot02 = v0.dot(v2); //v0.x * v2.x + v0.y * v2.y //np.dot(v0.T, v2) + float dot11 = v1.dot(v1); //v1.x * v1.x + v1.y * v1.y //np.dot(v1.T, v1) + float dot12 = v1.dot(v2); //v1.x * v2.x + v1.y * v2.y//np.dot(v1.T, v2) + + // barycentric coordinates + float inverDeno; + if(dot00*dot11 - dot01*dot01 == 0) + inverDeno = 0; + else + inverDeno = 1/(dot00*dot11 - dot01*dot01); + + float u = (dot11*dot02 - dot01*dot12)*inverDeno; + float v = (dot00*dot12 - dot01*dot02)*inverDeno; + + // check if point in triangle + return (u >= 0) && (v >= 0) && (u + v < 1); +} + + +void get_point_weight(float* weight, point p, point p0, point p1, point p2) +{ + // vectors + point v0, v1, v2; + v0 = p2 - p0; + v1 = p1 - p0; + v2 = p - p0; + + // dot products + float dot00 = v0.dot(v0); //v0.x * v0.x + v0.y * v0.y //np.dot(v0.T, v0) + float dot01 = v0.dot(v1); //v0.x * v1.x + v0.y * v1.y //np.dot(v0.T, v1) + float dot02 = v0.dot(v2); //v0.x * v2.x + v0.y * v2.y //np.dot(v0.T, v2) + float dot11 = v1.dot(v1); //v1.x * v1.x + v1.y * v1.y //np.dot(v1.T, v1) + float dot12 = v1.dot(v2); //v1.x * v2.x + v1.y * v2.y//np.dot(v1.T, v2) + + // barycentric coordinates + float inverDeno; + if(dot00*dot11 - dot01*dot01 == 0) + inverDeno = 0; + else + inverDeno = 1/(dot00*dot11 - dot01*dot01); + + float u = (dot11*dot02 - dot01*dot12)*inverDeno; + float v = (dot00*dot12 - dot01*dot02)*inverDeno; + + // weight + weight[0] = 1 - u - v; + weight[1] = v; + weight[2] = u; +} + + +void _get_normal_core( + float* normal, float* tri_normal, int* triangles, + int ntri) +{ + int i, j; + int tri_p0_ind, tri_p1_ind, tri_p2_ind; + + for(i = 
0; i < ntri; i++) + { + tri_p0_ind = triangles[3*i]; + tri_p1_ind = triangles[3*i + 1]; + tri_p2_ind = triangles[3*i + 2]; + + for(j = 0; j < 3; j++) + { + normal[3*tri_p0_ind + j] = normal[3*tri_p0_ind + j] + tri_normal[3*i + j]; + normal[3*tri_p1_ind + j] = normal[3*tri_p1_ind + j] + tri_normal[3*i + j]; + normal[3*tri_p2_ind + j] = normal[3*tri_p2_ind + j] + tri_normal[3*i + j]; + } + } +} + + +void _rasterize_triangles_core( + float* vertices, int* triangles, + float* depth_buffer, int* triangle_buffer, float* barycentric_weight, + int nver, int ntri, + int h, int w) +{ + int i; + int x, y, k; + int tri_p0_ind, tri_p1_ind, tri_p2_ind; + point p0, p1, p2, p; + int x_min, x_max, y_min, y_max; + float p_depth, p0_depth, p1_depth, p2_depth; + float weight[3]; + + for(i = 0; i < ntri; i++) + { + tri_p0_ind = triangles[3*i]; + tri_p1_ind = triangles[3*i + 1]; + tri_p2_ind = triangles[3*i + 2]; + + p0.x = vertices[3*tri_p0_ind]; p0.y = vertices[3*tri_p0_ind + 1]; p0_depth = vertices[3*tri_p0_ind + 2]; + p1.x = vertices[3*tri_p1_ind]; p1.y = vertices[3*tri_p1_ind + 1]; p1_depth = vertices[3*tri_p1_ind + 2]; + p2.x = vertices[3*tri_p2_ind]; p2.y = vertices[3*tri_p2_ind + 1]; p2_depth = vertices[3*tri_p2_ind + 2]; + + x_min = max((int)ceil(min(p0.x, min(p1.x, p2.x))), 0); + x_max = min((int)floor(max(p0.x, max(p1.x, p2.x))), w - 1); + + y_min = max((int)ceil(min(p0.y, min(p1.y, p2.y))), 0); + y_max = min((int)floor(max(p0.y, max(p1.y, p2.y))), h - 1); + + if(x_max < x_min || y_max < y_min) + { + continue; + } + + for(y = y_min; y <= y_max; y++) //h + { + for(x = x_min; x <= x_max; x++) //w + { + p.x = x; p.y = y; + if(p.x < 2 || p.x > w - 3 || p.y < 2 || p.y > h - 3 || isPointInTri(p, p0, p1, p2)) + { + get_point_weight(weight, p, p0, p1, p2); + p_depth = weight[0]*p0_depth + weight[1]*p1_depth + weight[2]*p2_depth; + + if((p_depth > depth_buffer[y*w + x])) + { + depth_buffer[y*w + x] = p_depth; + triangle_buffer[y*w + x] = i; + for(k = 0; k < 3; k++) + { + 
barycentric_weight[y*w*3 + x*3 + k] = weight[k]; + } + } + } + } + } + } +} + + +void _render_colors_core( + float* image, float* vertices, int* triangles, + float* colors, + float* depth_buffer, + int nver, int ntri, + int h, int w, int c) +{ + int i; + int x, y, k; + int tri_p0_ind, tri_p1_ind, tri_p2_ind; + point p0, p1, p2, p; + int x_min, x_max, y_min, y_max; + float p_depth, p0_depth, p1_depth, p2_depth; + float p_color, p0_color, p1_color, p2_color; + float weight[3]; + + for(i = 0; i < ntri; i++) + { + tri_p0_ind = triangles[3*i]; + tri_p1_ind = triangles[3*i + 1]; + tri_p2_ind = triangles[3*i + 2]; + + p0.x = vertices[3*tri_p0_ind]; p0.y = vertices[3*tri_p0_ind + 1]; p0_depth = vertices[3*tri_p0_ind + 2]; + p1.x = vertices[3*tri_p1_ind]; p1.y = vertices[3*tri_p1_ind + 1]; p1_depth = vertices[3*tri_p1_ind + 2]; + p2.x = vertices[3*tri_p2_ind]; p2.y = vertices[3*tri_p2_ind + 1]; p2_depth = vertices[3*tri_p2_ind + 2]; + + x_min = max((int)ceil(min(p0.x, min(p1.x, p2.x))), 0); + x_max = min((int)floor(max(p0.x, max(p1.x, p2.x))), w - 1); + + y_min = max((int)ceil(min(p0.y, min(p1.y, p2.y))), 0); + y_max = min((int)floor(max(p0.y, max(p1.y, p2.y))), h - 1); + + if(x_max < x_min || y_max < y_min) + { + continue; + } + + for(y = y_min; y <= y_max; y++) //h + { + for(x = x_min; x <= x_max; x++) //w + { + p.x = x; p.y = y; + if(p.x < 2 || p.x > w - 3 || p.y < 2 || p.y > h - 3 || isPointInTri(p, p0, p1, p2)) + { + get_point_weight(weight, p, p0, p1, p2); + p_depth = weight[0]*p0_depth + weight[1]*p1_depth + weight[2]*p2_depth; + + if((p_depth > depth_buffer[y*w + x])) + { + for(k = 0; k < c; k++) // c + { + p0_color = colors[c*tri_p0_ind + k]; + p1_color = colors[c*tri_p1_ind + k]; + p2_color = colors[c*tri_p2_ind + k]; + + p_color = weight[0]*p0_color + weight[1]*p1_color + weight[2]*p2_color; + image[y*w*c + x*c + k] = p_color; + } + + depth_buffer[y*w + x] = p_depth; + } + } + } + } + } +} + + +void _render_texture_core( + float* image, float* vertices, int* 
triangles, + float* texture, float* tex_coords, int* tex_triangles, + float* depth_buffer, + int nver, int tex_nver, int ntri, + int h, int w, int c, + int tex_h, int tex_w, int tex_c, + int mapping_type) +{ + int i; + int x, y, k; + int tri_p0_ind, tri_p1_ind, tri_p2_ind; + int tex_tri_p0_ind, tex_tri_p1_ind, tex_tri_p2_ind; + point p0, p1, p2, p; + point tex_p0, tex_p1, tex_p2, tex_p; + int x_min, x_max, y_min, y_max; + float weight[3]; + float p_depth, p0_depth, p1_depth, p2_depth; + float xd, yd; + float ul, ur, dl, dr; + for(i = 0; i < ntri; i++) + { + // mesh + tri_p0_ind = triangles[3*i]; + tri_p1_ind = triangles[3*i + 1]; + tri_p2_ind = triangles[3*i + 2]; + + p0.x = vertices[3*tri_p0_ind]; p0.y = vertices[3*tri_p0_ind + 1]; p0_depth = vertices[3*tri_p0_ind + 2]; + p1.x = vertices[3*tri_p1_ind]; p1.y = vertices[3*tri_p1_ind + 1]; p1_depth = vertices[3*tri_p1_ind + 2]; + p2.x = vertices[3*tri_p2_ind]; p2.y = vertices[3*tri_p2_ind + 1]; p2_depth = vertices[3*tri_p2_ind + 2]; + + // texture + tex_tri_p0_ind = tex_triangles[3*i]; + tex_tri_p1_ind = tex_triangles[3*i + 1]; + tex_tri_p2_ind = tex_triangles[3*i + 2]; + + tex_p0.x = tex_coords[3*tex_tri_p0_ind]; tex_p0.y = tex_coords[3*tri_p0_ind + 1]; + tex_p1.x = tex_coords[3*tex_tri_p1_ind]; tex_p1.y = tex_coords[3*tri_p1_ind + 1]; + tex_p2.x = tex_coords[3*tex_tri_p2_ind]; tex_p2.y = tex_coords[3*tri_p2_ind + 1]; + + + x_min = max((int)ceil(min(p0.x, min(p1.x, p2.x))), 0); + x_max = min((int)floor(max(p0.x, max(p1.x, p2.x))), w - 1); + + y_min = max((int)ceil(min(p0.y, min(p1.y, p2.y))), 0); + y_max = min((int)floor(max(p0.y, max(p1.y, p2.y))), h - 1); + + + if(x_max < x_min || y_max < y_min) + { + continue; + } + + for(y = y_min; y <= y_max; y++) //h + { + for(x = x_min; x <= x_max; x++) //w + { + p.x = x; p.y = y; + if(p.x < 2 || p.x > w - 3 || p.y < 2 || p.y > h - 3 || isPointInTri(p, p0, p1, p2)) + { + get_point_weight(weight, p, p0, p1, p2); + p_depth = weight[0]*p0_depth + weight[1]*p1_depth + 
weight[2]*p2_depth; + + if((p_depth > depth_buffer[y*w + x])) + { + // -- color from texture + // cal weight in mesh tri + get_point_weight(weight, p, p0, p1, p2); + // cal coord in texture + tex_p = tex_p0*weight[0] + tex_p1*weight[1] + tex_p2*weight[2]; + tex_p.x = max(min(tex_p.x, float(tex_w - 1)), float(0)); + tex_p.y = max(min(tex_p.y, float(tex_h - 1)), float(0)); + + yd = tex_p.y - floor(tex_p.y); + xd = tex_p.x - floor(tex_p.x); + for(k = 0; k < c; k++) + { + if(mapping_type==0)// nearest + { + image[y*w*c + x*c + k] = texture[int(round(tex_p.y))*tex_w*tex_c + int(round(tex_p.x))*tex_c + k]; + } + else//bilinear interp + { + ul = texture[(int)floor(tex_p.y)*tex_w*tex_c + (int)floor(tex_p.x)*tex_c + k]; + ur = texture[(int)floor(tex_p.y)*tex_w*tex_c + (int)ceil(tex_p.x)*tex_c + k]; + dl = texture[(int)ceil(tex_p.y)*tex_w*tex_c + (int)floor(tex_p.x)*tex_c + k]; + dr = texture[(int)ceil(tex_p.y)*tex_w*tex_c + (int)ceil(tex_p.x)*tex_c + k]; + + image[y*w*c + x*c + k] = ul*(1-xd)*(1-yd) + ur*xd*(1-yd) + dl*(1-xd)*yd + dr*xd*yd; + } + + } + + depth_buffer[y*w + x] = p_depth; + } + } + } + } + } +} + + + +// ------------------------------------------------- write +// obj write +// Ref: https://github.com/patrikhuber/eos/blob/master/include/eos/core/Mesh.hpp +void _write_obj_with_colors_texture(string filename, string mtl_name, + float* vertices, int* triangles, float* colors, float* uv_coords, + int nver, int ntri, int ntexver) +{ + int i; + + ofstream obj_file(filename.c_str()); + + // first line of the obj file: the mtl name + obj_file << "mtllib " << mtl_name << endl; + + // write vertices + for (i = 0; i < nver; ++i) + { + obj_file << "v " << vertices[3*i] << " " << vertices[3*i + 1] << " " << vertices[3*i + 2] << colors[3*i] << " " << colors[3*i + 1] << " " << colors[3*i + 2] << endl; + } + + // write uv coordinates + for (i = 0; i < ntexver; ++i) + { + //obj_file << "vt " << uv_coords[2*i] << " " << (1 - uv_coords[2*i + 1]) << endl; + obj_file << "vt " << 
// ------------------------------------------------- write
// obj write
// Ref: https://github.com/patrikhuber/eos/blob/master/include/eos/core/Mesh.hpp
//
// Write a mesh as a Wavefront .obj with per-vertex colors and texture coordinates.
//
// Params:
//   filename  - output .obj path
//   mtl_name  - material library referenced by the `mtllib` record
//   vertices  - nver (x, y, z) triples
//   triangles - ntri triangles, 3 (1-based) vertex indices each; faces are
//               emitted with reversed winding (v2 v1 v0), and each face uses
//               the same index for vertex and texture coordinate (v/vt)
//   colors    - nver (r, g, b) triples appended to each `v` record
//   uv_coords - ntexver (u, v) pairs written as `vt` records
void _write_obj_with_colors_texture(std::string filename, std::string mtl_name,
    float* vertices, int* triangles, float* colors, float* uv_coords,
    int nver, int ntri, int ntexver)
{
    int i;

    std::ofstream obj_file(filename.c_str());

    // first line of the obj file: the mtl name
    obj_file << "mtllib " << mtl_name << std::endl;

    // write vertices with their per-vertex RGB.
    // BUG FIX: the separator between the z coordinate and the red channel was
    // missing, fusing them into a single malformed token (e.g. "v 1 2 30.5 ...").
    for (i = 0; i < nver; ++i)
    {
        obj_file << "v " << vertices[3*i] << " " << vertices[3*i + 1] << " " << vertices[3*i + 2]
                 << " " << colors[3*i] << " " << colors[3*i + 1] << " " << colors[3*i + 2] << std::endl;
    }

    // write uv coordinates
    for (i = 0; i < ntexver; ++i)
    {
        obj_file << "vt " << uv_coords[2*i] << " " << uv_coords[2*i + 1] << std::endl;
    }

    obj_file << "usemtl FaceTexture" << std::endl;
    // write triangles (reversed winding, vertex index doubles as texcoord index)
    for (i = 0; i < ntri; ++i)
    {
        obj_file << "f " << triangles[3*i + 2] << "/" << triangles[3*i + 2]
                 << " " << triangles[3*i + 1] << "/" << triangles[3*i + 1]
                 << " " << triangles[3*i] << "/" << triangles[3*i] << std::endl;
    }

}
_render_colors_core( + float* image, float* vertices, int* triangles, + float* colors, + float* depth_buffer, + int nver, int ntri, + int h, int w, int c); + +void _render_texture_core( + float* image, float* vertices, int* triangles, + float* texture, float* tex_coords, int* tex_triangles, + float* depth_buffer, + int nver, int tex_nver, int ntri, + int h, int w, int c, + int tex_h, int tex_w, int tex_c, + int mapping_type); + +void _write_obj_with_colors_texture(string filename, string mtl_name, + float* vertices, int* triangles, float* colors, float* uv_coords, + int nver, int ntri, int ntexver); + +#endif \ No newline at end of file diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.c b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.c new file mode 100644 index 0000000000000000000000000000000000000000..e113ac54ae03e4a79ea2a975ef33e67deb9d93d6 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.c @@ -0,0 +1,9091 @@ +/* Generated by Cython 0.28.2 */ + +/* BEGIN: Cython Metadata +{ + "distutils": { + "depends": [ + "insightface/thirdparty/face3d/mesh/cython/mesh_core.h" + ], + "include_dirs": [ + "insightface/thirdparty/face3d/mesh/cython" + ], + "name": "mesh_core_cython", + "sources": [ + "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx" + ] + }, + "module_name": "mesh_core_cython" +} +END: Cython Metadata */ + +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000) + #error Cython requires Python 2.6+ or Python 3.3+. 
+#else +#define CYTHON_ABI "0_28_2" +#define CYTHON_FUTURE_DIVISION 0 +#include +#ifndef offsetof + #define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#define __PYX_COMMA , +#ifndef HAVE_LONG_LONG + #if PY_VERSION_HEX >= 0x02070000 + #define HAVE_LONG_LONG + #endif +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION + #define CYTHON_COMPILING_IN_PYPY 1 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #undef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 0 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #if PY_VERSION_HEX < 0x03050000 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #undef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #undef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 1 + #undef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 0 + #undef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 0 + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 +#elif 
defined(PYSTON_VERSION) + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 1 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 +#else + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 1 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #elif !defined(CYTHON_USE_PYTYPE_LOOKUP) + #define CYTHON_USE_PYTYPE_LOOKUP 1 + #endif + #if PY_MAJOR_VERSION < 3 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #elif !defined(CYTHON_USE_PYLONG_INTERNALS) + #define CYTHON_USE_PYLONG_INTERNALS 1 + #endif + #ifndef 
CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 1 + #endif + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #if PY_VERSION_HEX < 0x030300F0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #elif !defined(CYTHON_USE_UNICODE_WRITER) + #define CYTHON_USE_UNICODE_WRITER 1 + #endif + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #ifndef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 1 + #endif + #ifndef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 1 + #endif + #ifndef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT (0 && PY_VERSION_HEX >= 0x03050000) + #endif + #ifndef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1) + #endif +#endif +#if !defined(CYTHON_FAST_PYCCALL) +#define CYTHON_FAST_PYCCALL (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1) +#endif +#if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #undef SHIFT + #undef BASE + #undef MASK +#endif +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif +#ifndef __has_cpp_attribute + #define __has_cpp_attribute(x) 0 +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) 
+# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +#ifndef CYTHON_MAYBE_UNUSED_VAR +# if defined(__cplusplus) + template void CYTHON_MAYBE_UNUSED_VAR( const T& ) { } +# else +# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x) +# endif +#endif +#ifndef CYTHON_NCP_UNUSED +# if CYTHON_COMPILING_IN_CPYTHON +# define CYTHON_NCP_UNUSED +# else +# define CYTHON_NCP_UNUSED CYTHON_UNUSED +# endif +#endif +#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None) +#ifdef _MSC_VER + #ifndef _MSC_STDINT_H_ + #if _MSC_VER < 1300 + typedef unsigned char uint8_t; + typedef unsigned int uint32_t; + #else + typedef unsigned __int8 uint8_t; + typedef unsigned __int32 uint32_t; + #endif + #endif +#else + #include +#endif +#ifndef CYTHON_FALLTHROUGH + #if defined(__cplusplus) && __cplusplus >= 201103L + #if __has_cpp_attribute(fallthrough) + #define CYTHON_FALLTHROUGH [[fallthrough]] + #elif __has_cpp_attribute(clang::fallthrough) + #define CYTHON_FALLTHROUGH [[clang::fallthrough]] + #elif __has_cpp_attribute(gnu::fallthrough) + #define CYTHON_FALLTHROUGH [[gnu::fallthrough]] + #endif + #endif + #ifndef CYTHON_FALLTHROUGH + #if __has_attribute(fallthrough) + #define CYTHON_FALLTHROUGH __attribute__((fallthrough)) + #else + #define CYTHON_FALLTHROUGH + #endif + #endif + #if defined(__clang__ ) && defined(__apple_build_version__) + #if __apple_build_version__ < 7000000 + #undef CYTHON_FALLTHROUGH + #define CYTHON_FALLTHROUGH + #endif + #endif +#endif + +#ifndef CYTHON_INLINE + #if defined(__clang__) + #define CYTHON_INLINE __inline__ __attribute__ ((__unused__)) + #elif defined(__GNUC__) + #define CYTHON_INLINE __inline__ + #elif defined(_MSC_VER) + #define CYTHON_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_INLINE inline + #else + #define CYTHON_INLINE + #endif +#endif + +#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && 
!defined(Py_OptimizeFlag) + #define Py_OptimizeFlag 0 +#endif +#define __PYX_BUILD_PY_SSIZE_T "n" +#define CYTHON_FORMAT_SSIZE_T "z" +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyType_Type +#endif +#ifndef Py_TPFLAGS_CHECKTYPES + #define Py_TPFLAGS_CHECKTYPES 0 +#endif +#ifndef Py_TPFLAGS_HAVE_INDEX + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#ifndef Py_TPFLAGS_HAVE_NEWBUFFER + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#ifndef Py_TPFLAGS_HAVE_FINALIZE + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL) + #ifndef METH_FASTCALL + #define METH_FASTCALL 0x80 + #endif + typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs); + typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args, + Py_ssize_t nargs, PyObject *kwnames); +#else + #define __Pyx_PyCFunctionFast _PyCFunctionFast + #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords +#endif +#if CYTHON_FAST_PYCCALL +#define __Pyx_PyFastCFunction_Check(func)\ + ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS))))) +#else +#define __Pyx_PyFastCFunction_Check(func) 0 +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) + #define PyObject_Malloc(s) PyMem_Malloc(s) + #define PyObject_Free(p) PyMem_Free(p) + #define PyObject_Realloc(p) PyMem_Realloc(p) +#endif +#if 
CYTHON_COMPILING_IN_PYSTON + #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno) +#else + #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno) +#endif +#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#elif PY_VERSION_HEX >= 0x03060000 + #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet() +#elif PY_VERSION_HEX >= 0x03000000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#else + #define __Pyx_PyThreadState_Current _PyThreadState_Current +#endif +#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT) +#include "pythread.h" +#define Py_tss_NEEDS_INIT 0 +typedef int Py_tss_t; +static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) { + *key = PyThread_create_key(); + return 0; // PyThread_create_key reports success always +} +static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) { + Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t)); + *key = Py_tss_NEEDS_INIT; + return key; +} +static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) { + PyObject_Free(key); +} +static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) { + return *key != Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) { + PyThread_delete_key(*key); + *key = Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) { + return PyThread_set_key_value(*key, value); +} +static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { + return PyThread_get_key_value(*key); +} +#endif // TSS (Thread Specific Storage) API +#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized) +#define __Pyx_PyDict_NewPresized(n) ((n <= 8) ? 
PyDict_New() : _PyDict_NewPresized(n)) +#else +#define __Pyx_PyDict_NewPresized(n) PyDict_New() +#endif +#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS +#define __Pyx_PyDict_GetItemStr(dict, name) _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash) +#else +#define __Pyx_PyDict_GetItemStr(dict, name) PyDict_GetItem(dict, name) +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) PyUnicode_MAX_CHAR_VALUE(u) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) +#else + #define CYTHON_PEP393_ENABLED 0 + #define PyUnicode_1BYTE_KIND 1 + #define PyUnicode_2BYTE_KIND 2 + #define PyUnicode_4BYTE_KIND 4 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) ((sizeof(Py_UNICODE) == 2) ? 
65535 : 1114111) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) (((void)(k)), ((Py_UNICODE*)d)[i] = ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_SIZE(u)) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains) + #define PyUnicode_Contains(u, s) PySequence_Contains(u, s) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check) + #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format) + #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? 
PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII) + #define PyObject_ASCII(o) PyObject_Repr(o) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#if CYTHON_ASSUME_SAFE_MACROS + #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq) +#else + #define __Pyx_PySequence_SIZE(seq) PySequence_Size(seq) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject +#endif +#if PY_MAJOR_VERSION >= 3 && 
CYTHON_COMPILING_IN_PYPY + #ifndef PyUnicode_InternFromString + #define PyUnicode_InternFromString(s) PyUnicode_FromString(s) + #endif +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t PyInt_AsLong +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : (Py_INCREF(func), func)) +#else + #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass) +#endif +#if CYTHON_USE_ASYNC_SLOTS + #if PY_VERSION_HEX >= 0x030500B1 + #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods + #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async) + #else + #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved)) + #endif +#else + #define __Pyx_PyType_AsAsync(obj) NULL +#endif +#ifndef __Pyx_PyAsyncMethodsStruct + typedef struct { + unaryfunc am_await; + unaryfunc am_aiter; + unaryfunc am_anext; + } __Pyx_PyAsyncMethodsStruct; +#endif + +#if defined(WIN32) || defined(MS_WINDOWS) + #define _USE_MATH_DEFINES +#endif +#include +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif +#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL) +#define __Pyx_truncl trunc +#else +#define __Pyx_truncl truncl +#endif + + +#define __PYX_ERR(f_index, lineno, Ln_error) \ +{ \ + __pyx_filename = __pyx_f[f_index]; __pyx_lineno = lineno; __pyx_clineno = __LINE__; goto Ln_error; \ +} + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#define __PYX_HAVE__mesh_core_cython +#define __PYX_HAVE_API__mesh_core_cython +/* Early includes */ +#include +#include +#include "numpy/arrayobject.h" 
+#include "numpy/ufuncobject.h" +#include "ios" +#include "new" +#include "stdexcept" +#include "typeinfo" +#include +#include "mesh_core.h" +#ifdef _OPENMP +#include +#endif /* _OPENMP */ + +#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS) +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; + +#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0 +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_uchar_cast(c) ((unsigned char)c) +#define __Pyx_long_cast(x) ((long)x) +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) (\ + (sizeof(type) < sizeof(Py_ssize_t)) ||\ + (sizeof(type) > sizeof(Py_ssize_t) &&\ + likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX) &&\ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\ + v == (type)PY_SSIZE_T_MIN))) ||\ + (sizeof(type) == sizeof(Py_ssize_t) &&\ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX))) ) +#if defined (__cplusplus) && __cplusplus >= 201103L + #include + #define __Pyx_sst_abs(value) std::abs(value) +#elif SIZEOF_INT >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) abs(value) +#elif SIZEOF_LONG >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) labs(value) +#elif defined (_MSC_VER) + #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value)) +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define __Pyx_sst_abs(value) llabs(value) +#elif defined (__GNUC__) + #define __Pyx_sst_abs(value) __builtin_llabs(value) +#else + #define __Pyx_sst_abs(value) ((value<0) ? 
-value : value) +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyBytes_AsWritableString(s) ((char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableSString(s) ((signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableUString(s) ((unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsString(s) ((const char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsSString(s) ((const signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsUString(s) ((const unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyObject_AsWritableString(s) ((char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsSString(s) ((const signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((const unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromCString(s) __Pyx_PyObject_FromString((const char*)s) +#define __Pyx_PyBytes_FromCString(s) __Pyx_PyBytes_FromString((const char*)s) +#define 
__Pyx_PyByteArray_FromCString(s) __Pyx_PyByteArray_FromString((const char*)s) +#define __Pyx_PyStr_FromCString(s) __Pyx_PyStr_FromString((const char*)s) +#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s) +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) { + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return (size_t)(u_end - u - 1); +} +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj) +#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None) +#define __Pyx_PyBool_FromLong(b) ((b) ? __Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False)) +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x); +#define __Pyx_PySequence_Tuple(obj)\ + (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj)) +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +#if CYTHON_ASSUME_SAFE_MACROS +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION >= 3 +#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x)) +#else +#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x)) +#endif +#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? 
__Pyx_NewRef(x) : PyNumber_Float(x)) +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII +static int __Pyx_sys_getdefaultencoding_not_ascii; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + PyObject* ascii_chars_u = NULL; + PyObject* ascii_chars_b = NULL; + const char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + if (strcmp(default_encoding_c, "ascii") == 0) { + __Pyx_sys_getdefaultencoding_not_ascii = 0; + } else { + char ascii_chars[128]; + int c; + for (c = 0; c < 128; c++) { + ascii_chars[c] = c; + } + __Pyx_sys_getdefaultencoding_not_ascii = 1; + ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL); + if (!ascii_chars_u) goto bad; + ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); + if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { + PyErr_Format( + PyExc_ValueError, + "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.", + default_encoding_c); + goto bad; + } + Py_DECREF(ascii_chars_u); + Py_DECREF(ascii_chars_b); + } + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return -1; +} +#endif +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3 +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL) +#else +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL) +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +static char* 
__PYX_DEFAULT_STRING_ENCODING; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c)); + if (!__PYX_DEFAULT_STRING_ENCODING) goto bad; + strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c); + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + return -1; +} +#endif +#endif + + +/* Test for GCC > 2.95 */ +#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) +#else /* !__GNUC__ or GCC < 2.95 */ + #define likely(x) (x) + #define unlikely(x) (x) +#endif /* __GNUC__ */ +static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; } + +static PyObject *__pyx_m = NULL; +static PyObject *__pyx_d; +static PyObject *__pyx_b; +static PyObject *__pyx_cython_runtime; +static PyObject *__pyx_empty_tuple; +static PyObject *__pyx_empty_bytes; +static PyObject *__pyx_empty_unicode; +static int __pyx_lineno; +static int __pyx_clineno = 0; +static const char * __pyx_cfilenm= __FILE__; +static const char *__pyx_filename; + +/* Header.proto */ +#if !defined(CYTHON_CCOMPLEX) + #if defined(__cplusplus) + #define CYTHON_CCOMPLEX 1 + #elif defined(_Complex_I) + #define CYTHON_CCOMPLEX 1 + #else + #define CYTHON_CCOMPLEX 0 + #endif +#endif +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + #include + #else + #include + #endif +#endif +#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__) + #undef _Complex_I + #define _Complex_I 
1.0fj +#endif + + +static const char *__pyx_f[] = { + "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx", + "__init__.pxd", + "stringsource", + "type.pxd", +}; +/* BufferFormatStructs.proto */ +#define IS_UNSIGNED(type) (((type) -1) > 0) +struct __Pyx_StructField_; +#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0) +typedef struct { + const char* name; + struct __Pyx_StructField_* fields; + size_t size; + size_t arraysize[8]; + int ndim; + char typegroup; + char is_unsigned; + int flags; +} __Pyx_TypeInfo; +typedef struct __Pyx_StructField_ { + __Pyx_TypeInfo* type; + const char* name; + size_t offset; +} __Pyx_StructField; +typedef struct { + __Pyx_StructField* field; + size_t parent_offset; +} __Pyx_BufFmt_StackElem; +typedef struct { + __Pyx_StructField root; + __Pyx_BufFmt_StackElem* head; + size_t fmt_offset; + size_t new_count, enc_count; + size_t struct_alignment; + int is_complex; + char enc_type; + char new_packmode; + char enc_packmode; + char is_valid_array; +} __Pyx_BufFmt_Context; + + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":730 + * # in Cython to enable them only on the right systems. 
+ * + * ctypedef npy_int8 int8_t # <<<<<<<<<<<<<< + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + */ +typedef npy_int8 __pyx_t_5numpy_int8_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":731 + * + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t # <<<<<<<<<<<<<< + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t + */ +typedef npy_int16 __pyx_t_5numpy_int16_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":732 + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t # <<<<<<<<<<<<<< + * ctypedef npy_int64 int64_t + * #ctypedef npy_int96 int96_t + */ +typedef npy_int32 __pyx_t_5numpy_int32_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":733 + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t # <<<<<<<<<<<<<< + * #ctypedef npy_int96 int96_t + * #ctypedef npy_int128 int128_t + */ +typedef npy_int64 __pyx_t_5numpy_int64_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":737 + * #ctypedef npy_int128 int128_t + * + * ctypedef npy_uint8 uint8_t # <<<<<<<<<<<<<< + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + */ +typedef npy_uint8 __pyx_t_5numpy_uint8_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":738 + * + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t # <<<<<<<<<<<<<< + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t + */ +typedef npy_uint16 __pyx_t_5numpy_uint16_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":739 + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t # 
<<<<<<<<<<<<<< + * ctypedef npy_uint64 uint64_t + * #ctypedef npy_uint96 uint96_t + */ +typedef npy_uint32 __pyx_t_5numpy_uint32_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":740 + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t # <<<<<<<<<<<<<< + * #ctypedef npy_uint96 uint96_t + * #ctypedef npy_uint128 uint128_t + */ +typedef npy_uint64 __pyx_t_5numpy_uint64_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":744 + * #ctypedef npy_uint128 uint128_t + * + * ctypedef npy_float32 float32_t # <<<<<<<<<<<<<< + * ctypedef npy_float64 float64_t + * #ctypedef npy_float80 float80_t + */ +typedef npy_float32 __pyx_t_5numpy_float32_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":745 + * + * ctypedef npy_float32 float32_t + * ctypedef npy_float64 float64_t # <<<<<<<<<<<<<< + * #ctypedef npy_float80 float80_t + * #ctypedef npy_float128 float128_t + */ +typedef npy_float64 __pyx_t_5numpy_float64_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":754 + * # The int types are mapped a bit surprising -- + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong long_t + * ctypedef npy_longlong longlong_t + */ +typedef npy_long __pyx_t_5numpy_int_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":755 + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t + * ctypedef npy_longlong long_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong longlong_t + * + */ +typedef npy_longlong __pyx_t_5numpy_long_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":756 + * 
ctypedef npy_long int_t + * ctypedef npy_longlong long_t + * ctypedef npy_longlong longlong_t # <<<<<<<<<<<<<< + * + * ctypedef npy_ulong uint_t + */ +typedef npy_longlong __pyx_t_5numpy_longlong_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":758 + * ctypedef npy_longlong longlong_t + * + * ctypedef npy_ulong uint_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t + */ +typedef npy_ulong __pyx_t_5numpy_uint_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":759 + * + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulonglong_t + * + */ +typedef npy_ulonglong __pyx_t_5numpy_ulong_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":760 + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t # <<<<<<<<<<<<<< + * + * ctypedef npy_intp intp_t + */ +typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":762 + * ctypedef npy_ulonglong ulonglong_t + * + * ctypedef npy_intp intp_t # <<<<<<<<<<<<<< + * ctypedef npy_uintp uintp_t + * + */ +typedef npy_intp __pyx_t_5numpy_intp_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":763 + * + * ctypedef npy_intp intp_t + * ctypedef npy_uintp uintp_t # <<<<<<<<<<<<<< + * + * ctypedef npy_double float_t + */ +typedef npy_uintp __pyx_t_5numpy_uintp_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":765 + * ctypedef npy_uintp uintp_t + * + * ctypedef npy_double float_t # <<<<<<<<<<<<<< + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t + */ 
+typedef npy_double __pyx_t_5numpy_float_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":766 + * + * ctypedef npy_double float_t + * ctypedef npy_double double_t # <<<<<<<<<<<<<< + * ctypedef npy_longdouble longdouble_t + * + */ +typedef npy_double __pyx_t_5numpy_double_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":767 + * ctypedef npy_double float_t + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cfloat cfloat_t + */ +typedef npy_longdouble __pyx_t_5numpy_longdouble_t; +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< float > __pyx_t_float_complex; + #else + typedef float _Complex __pyx_t_float_complex; + #endif +#else + typedef struct { float real, imag; } __pyx_t_float_complex; +#endif +static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float); + +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< double > __pyx_t_double_complex; + #else + typedef double _Complex __pyx_t_double_complex; + #endif +#else + typedef struct { double real, imag; } __pyx_t_double_complex; +#endif +static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double); + + +/*--- Type declarations ---*/ + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":769 + * ctypedef npy_longdouble longdouble_t + * + * ctypedef npy_cfloat cfloat_t # <<<<<<<<<<<<<< + * ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t + */ +typedef npy_cfloat __pyx_t_5numpy_cfloat_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":770 + * + * ctypedef npy_cfloat cfloat_t + * ctypedef npy_cdouble cdouble_t # <<<<<<<<<<<<<< + * ctypedef 
npy_clongdouble clongdouble_t + * + */ +typedef npy_cdouble __pyx_t_5numpy_cdouble_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":771 + * ctypedef npy_cfloat cfloat_t + * ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cdouble complex_t + */ +typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t; + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":773 + * ctypedef npy_clongdouble clongdouble_t + * + * ctypedef npy_cdouble complex_t # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew1(a): + */ +typedef npy_cdouble __pyx_t_5numpy_complex_t; + +/* --- Runtime support code (head) --- */ +/* Refnanny.proto */ +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, PyObject*, int); + void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + if (acquire_gil) {\ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + PyGILState_Release(__pyx_gilstate_save);\ + } else {\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext()\ + 
__Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + #define __Pyx_XDECREF(r) Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define __Pyx_XGIVEREF(r) +#endif +#define __Pyx_XDECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_XDECREF(tmp);\ + } while (0) +#define __Pyx_DECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_DECREF(tmp);\ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +/* RaiseArgTupleInvalid.proto */ +static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact, + Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); + +/* RaiseDoubleKeywords.proto */ +static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name); + +/* ParseKeywords.proto */ +static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject 
**argnames[],\ + PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\ + const char* function_name); + +/* ArgTypeTest.proto */ +#define __Pyx_ArgTypeTest(obj, type, none_allowed, name, exact)\ + ((likely((Py_TYPE(obj) == type) | (none_allowed && (obj == Py_None)))) ? 1 :\ + __Pyx__ArgTypeTest(obj, type, name, exact)) +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact); + +/* IsLittleEndian.proto */ +static CYTHON_INLINE int __Pyx_Is_Little_Endian(void); + +/* BufferFormatCheck.proto */ +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts); +static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type); + +/* BufferGetAndValidate.proto */ +#define __Pyx_GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)\ + ((obj == Py_None || obj == NULL) ?\ + (__Pyx_ZeroBuffer(buf), 0) :\ + __Pyx__GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)) +static int __Pyx__GetBufferAndValidate(Py_buffer* buf, PyObject* obj, + __Pyx_TypeInfo* dtype, int flags, int nd, int cast, __Pyx_BufFmt_StackElem* stack); +static void __Pyx_ZeroBuffer(Py_buffer* buf); +static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info); +static Py_ssize_t __Pyx_minusones[] = { -1, -1, -1, -1, -1, -1, -1, -1 }; +static Py_ssize_t __Pyx_zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +/* PyThreadStateGet.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyThreadState_declare PyThreadState *__pyx_tstate; +#define __Pyx_PyThreadState_assign __pyx_tstate = __Pyx_PyThreadState_Current; +#define __Pyx_PyErr_Occurred() __pyx_tstate->curexc_type +#else +#define __Pyx_PyThreadState_declare +#define __Pyx_PyThreadState_assign +#define __Pyx_PyErr_Occurred() PyErr_Occurred() +#endif + +/* PyErrFetchRestore.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL) +#define __Pyx_ErrRestoreWithState(type, value, tb) 
__Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL)) +#else +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#endif +#else +#define __Pyx_PyErr_Clear() PyErr_Clear() +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#define __Pyx_ErrRestoreWithState(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestoreInState(tstate, type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchInState(tstate, type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) PyErr_Fetch(type, value, tb) +#endif + +/* PyObjectGetAttrStr.proto */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) +#endif + +/* GetBuiltinName.proto */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name); + +/* PyObjectCall.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); +#else +#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) +#endif + +/* RaiseException.proto */ +static void 
__Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause); + +/* PyCFunctionFastCall.proto */ +#if CYTHON_FAST_PYCCALL +static CYTHON_INLINE PyObject *__Pyx_PyCFunction_FastCall(PyObject *func, PyObject **args, Py_ssize_t nargs); +#else +#define __Pyx_PyCFunction_FastCall(func, args, nargs) (assert(0), NULL) +#endif + +/* PyFunctionFastCall.proto */ +#if CYTHON_FAST_PYCALL +#define __Pyx_PyFunction_FastCall(func, args, nargs)\ + __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL) +#if 1 || PY_VERSION_HEX < 0x030600B1 +static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs); +#else +#define __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs) _PyFunction_FastCallDict(func, args, nargs, kwargs) +#endif +#endif + +/* PyObjectCallMethO.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg); +#endif + +/* PyObjectCallOneArg.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg); + +/* DictGetItem.proto */ +#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY +static PyObject *__Pyx_PyDict_GetItem(PyObject *d, PyObject* key); +#define __Pyx_PyObject_Dict_GetItem(obj, name)\ + (likely(PyDict_CheckExact(obj)) ?\ + __Pyx_PyDict_GetItem(obj, name) : PyObject_GetItem(obj, name)) +#else +#define __Pyx_PyDict_GetItem(d, key) PyObject_GetItem(d, key) +#define __Pyx_PyObject_Dict_GetItem(obj, name) PyObject_GetItem(obj, name) +#endif + +/* RaiseTooManyValuesToUnpack.proto */ +static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected); + +/* RaiseNeedMoreValuesToUnpack.proto */ +static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index); + +/* RaiseNoneIterError.proto */ +static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void); + +/* ExtTypeTest.proto */ +static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type); + +/* 
SaveResetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_ExceptionSave(type, value, tb) __Pyx__ExceptionSave(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#define __Pyx_ExceptionReset(type, value, tb) __Pyx__ExceptionReset(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +#else +#define __Pyx_ExceptionSave(type, value, tb) PyErr_GetExcInfo(type, value, tb) +#define __Pyx_ExceptionReset(type, value, tb) PyErr_SetExcInfo(type, value, tb) +#endif + +/* PyErrExceptionMatches.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err) +static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err); +#else +#define __Pyx_PyErr_ExceptionMatches(err) PyErr_ExceptionMatches(err) +#endif + +/* GetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_GetException(type, value, tb) __Pyx__GetException(__pyx_tstate, type, value, tb) +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb); +#endif + +/* Import.proto */ +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); + +/* CLineInTraceback.proto */ +#ifdef CYTHON_CLINE_IN_TRACEBACK +#define __Pyx_CLineForTraceback(tstate, c_line) (((CYTHON_CLINE_IN_TRACEBACK)) ? 
c_line : 0) +#else +static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line); +#endif + +/* CodeObjectCache.proto */ +typedef struct { + PyCodeObject* code_object; + int code_line; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +/* AddTraceback.proto */ +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); + +/* BufferStructDeclare.proto */ +typedef struct { + Py_ssize_t shape, strides, suboffsets; +} __Pyx_Buf_DimInfo; +typedef struct { + size_t refcount; + Py_buffer pybuffer; +} __Pyx_Buffer; +typedef struct { + __Pyx_Buffer *rcbuffer; + char *data; + __Pyx_Buf_DimInfo diminfo[8]; +} __Pyx_LocalBuf_ND; + +#if PY_MAJOR_VERSION < 3 + static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags); + static void __Pyx_ReleaseBuffer(Py_buffer *view); +#else + #define __Pyx_GetBuffer PyObject_GetBuffer + #define __Pyx_ReleaseBuffer PyBuffer_Release +#endif + + +/* RealImag.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + #define __Pyx_CREAL(z) ((z).real()) + #define __Pyx_CIMAG(z) ((z).imag()) + #else + #define __Pyx_CREAL(z) (__real__(z)) + #define __Pyx_CIMAG(z) (__imag__(z)) + #endif +#else + #define __Pyx_CREAL(z) ((z).real) + #define __Pyx_CIMAG(z) ((z).imag) +#endif +#if defined(__cplusplus) && CYTHON_CCOMPLEX\ + && (defined(_WIN32) || defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )) || __cplusplus >= 201103) + #define __Pyx_SET_CREAL(z,x) ((z).real(x)) + #define __Pyx_SET_CIMAG(z,y) ((z).imag(y)) +#else + #define __Pyx_SET_CREAL(z,x) 
__Pyx_CREAL(z) = (x) + #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y) +#endif + +/* Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_float(a, b) ((a)==(b)) + #define __Pyx_c_sum_float(a, b) ((a)+(b)) + #define __Pyx_c_diff_float(a, b) ((a)-(b)) + #define __Pyx_c_prod_float(a, b) ((a)*(b)) + #define __Pyx_c_quot_float(a, b) ((a)/(b)) + #define __Pyx_c_neg_float(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_float(z) ((z)==(float)0) + #define __Pyx_c_conj_float(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_float(z) (::std::abs(z)) + #define __Pyx_c_pow_float(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_float(z) ((z)==0) + #define __Pyx_c_conj_float(z) (conjf(z)) + #if 1 + #define __Pyx_c_abs_float(z) (cabsf(z)) + #define __Pyx_c_pow_float(a, b) (cpowf(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex); + #if 1 + static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex, __pyx_t_float_complex); + #endif +#endif + +/* Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_double(a, b) ((a)==(b)) + #define __Pyx_c_sum_double(a, b) ((a)+(b)) + #define 
__Pyx_c_diff_double(a, b) ((a)-(b)) + #define __Pyx_c_prod_double(a, b) ((a)*(b)) + #define __Pyx_c_quot_double(a, b) ((a)/(b)) + #define __Pyx_c_neg_double(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_double(z) ((z)==(double)0) + #define __Pyx_c_conj_double(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (::std::abs(z)) + #define __Pyx_c_pow_double(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_double(z) ((z)==0) + #define __Pyx_c_conj_double(z) (conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (cabs(z)) + #define __Pyx_c_pow_double(a, b) (cpow(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex); + #if 1 + static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex, __pyx_t_double_complex); + #endif +#endif + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__NPY_TYPES(enum NPY_TYPES value); + +/* CIntFromPy.proto */ +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* 
__Pyx_PyInt_From_long(long value); + +/* CIntFromPy.proto */ +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +/* FastTypeChecks.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type) +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2); +#else +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type) +#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2)) +#endif +#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception) + +/* CheckBinaryVersion.proto */ +static int __Pyx_check_binary_version(void); + +/* PyIdentifierFromString.proto */ +#if !defined(__Pyx_PyIdentifier_FromString) +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s) +#else + #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s) +#endif +#endif + +/* ModuleImport.proto */ +static PyObject *__Pyx_ImportModule(const char *name); + +/* TypeImport.proto */ +static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name, size_t size, int strict); + +/* InitStrings.proto */ +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); + + +/* Module declarations from 'cpython.buffer' */ + +/* Module declarations from 'libc.string' */ + +/* Module declarations from 'libc.stdio' */ + +/* Module declarations from '__builtin__' */ + +/* Module declarations from 'cpython.type' */ +static PyTypeObject *__pyx_ptype_7cpython_4type_type = 0; + +/* Module declarations from 'cpython' */ + +/* Module declarations from 
'cpython.object' */ + +/* Module declarations from 'cpython.ref' */ + +/* Module declarations from 'cpython.mem' */ + +/* Module declarations from 'numpy' */ + +/* Module declarations from 'numpy' */ +static PyTypeObject *__pyx_ptype_5numpy_dtype = 0; +static PyTypeObject *__pyx_ptype_5numpy_flatiter = 0; +static PyTypeObject *__pyx_ptype_5numpy_broadcast = 0; +static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0; +static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0; +static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *, char *, char *, int *); /*proto*/ +static CYTHON_INLINE int __pyx_f_5numpy_import_array(void); /*proto*/ + +/* Module declarations from 'libcpp.string' */ + +/* Module declarations from 'mesh_core_cython' */ +static std::string __pyx_convert_string_from_py_std__in_string(PyObject *); /*proto*/ +static __Pyx_TypeInfo __Pyx_TypeInfo_float = { "float", NULL, sizeof(float), { 0 }, 0, 'R', 0, 0 }; +static __Pyx_TypeInfo __Pyx_TypeInfo_int = { "int", NULL, sizeof(int), { 0 }, 0, IS_UNSIGNED(int) ? 
'U' : 'I', IS_UNSIGNED(int), 0 }; +#define __Pyx_MODULE_NAME "mesh_core_cython" +extern int __pyx_module_is_main_mesh_core_cython; +int __pyx_module_is_main_mesh_core_cython = 0; + +/* Implementation of 'mesh_core_cython' */ +static PyObject *__pyx_builtin_ValueError; +static PyObject *__pyx_builtin_range; +static PyObject *__pyx_builtin_RuntimeError; +static PyObject *__pyx_builtin_ImportError; +static const char __pyx_k_c[] = "c"; +static const char __pyx_k_h[] = "h"; +static const char __pyx_k_w[] = "w"; +static const char __pyx_k_np[] = "np"; +static const char __pyx_k_main[] = "__main__"; +static const char __pyx_k_ntri[] = "ntri"; +static const char __pyx_k_nver[] = "nver"; +static const char __pyx_k_test[] = "__test__"; +static const char __pyx_k_image[] = "image"; +static const char __pyx_k_numpy[] = "numpy"; +static const char __pyx_k_range[] = "range"; +static const char __pyx_k_tex_c[] = "tex_c"; +static const char __pyx_k_tex_h[] = "tex_h"; +static const char __pyx_k_tex_w[] = "tex_w"; +static const char __pyx_k_colors[] = "colors"; +static const char __pyx_k_import[] = "__import__"; +static const char __pyx_k_normal[] = "normal"; +static const char __pyx_k_ntexver[] = "ntexver"; +static const char __pyx_k_texture[] = "texture"; +static const char __pyx_k_filename[] = "filename"; +static const char __pyx_k_mtl_name[] = "mtl_name"; +static const char __pyx_k_tex_nver[] = "tex_nver"; +static const char __pyx_k_vertices[] = "vertices"; +static const char __pyx_k_triangles[] = "triangles"; +static const char __pyx_k_uv_coords[] = "uv_coords"; +static const char __pyx_k_ValueError[] = "ValueError"; +static const char __pyx_k_tex_coords[] = "tex_coords"; +static const char __pyx_k_tri_normal[] = "tri_normal"; +static const char __pyx_k_ImportError[] = "ImportError"; +static const char __pyx_k_RuntimeError[] = "RuntimeError"; +static const char __pyx_k_depth_buffer[] = "depth_buffer"; +static const char __pyx_k_mapping_type[] = "mapping_type"; +static const 
char __pyx_k_tex_triangles[] = "tex_triangles"; +static const char __pyx_k_get_normal_core[] = "get_normal_core"; +static const char __pyx_k_triangle_buffer[] = "triangle_buffer"; +static const char __pyx_k_mesh_core_cython[] = "mesh_core_cython"; +static const char __pyx_k_barycentric_weight[] = "barycentric_weight"; +static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback"; +static const char __pyx_k_render_colors_core[] = "render_colors_core"; +static const char __pyx_k_render_texture_core[] = "render_texture_core"; +static const char __pyx_k_rasterize_triangles_core[] = "rasterize_triangles_core"; +static const char __pyx_k_ndarray_is_not_C_contiguous[] = "ndarray is not C contiguous"; +static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multiarray failed to import"; +static const char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)"; +static const char __pyx_k_Format_string_allocated_too_shor[] = "Format string allocated too short, see comment in numpy.pxd"; +static const char __pyx_k_Non_native_byte_order_not_suppor[] = "Non-native byte order not supported"; +static const char __pyx_k_insightface_thirdparty_face3d_me[] = "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx"; +static const char __pyx_k_ndarray_is_not_Fortran_contiguou[] = "ndarray is not Fortran contiguous"; +static const char __pyx_k_numpy_core_umath_failed_to_impor[] = "numpy.core.umath failed to import"; +static const char __pyx_k_write_obj_with_colors_texture_co[] = "write_obj_with_colors_texture_core"; +static const char __pyx_k_Format_string_allocated_too_shor_2[] = "Format string allocated too short."; +static PyObject *__pyx_kp_u_Format_string_allocated_too_shor; +static PyObject *__pyx_kp_u_Format_string_allocated_too_shor_2; +static PyObject *__pyx_n_s_ImportError; +static PyObject *__pyx_kp_u_Non_native_byte_order_not_suppor; +static PyObject *__pyx_n_s_RuntimeError; +static PyObject 
*__pyx_n_s_ValueError; +static PyObject *__pyx_n_s_barycentric_weight; +static PyObject *__pyx_n_s_c; +static PyObject *__pyx_n_s_cline_in_traceback; +static PyObject *__pyx_n_s_colors; +static PyObject *__pyx_n_s_depth_buffer; +static PyObject *__pyx_n_s_filename; +static PyObject *__pyx_n_s_get_normal_core; +static PyObject *__pyx_n_s_h; +static PyObject *__pyx_n_s_image; +static PyObject *__pyx_n_s_import; +static PyObject *__pyx_kp_s_insightface_thirdparty_face3d_me; +static PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_s_mapping_type; +static PyObject *__pyx_n_s_mesh_core_cython; +static PyObject *__pyx_n_s_mtl_name; +static PyObject *__pyx_kp_u_ndarray_is_not_C_contiguous; +static PyObject *__pyx_kp_u_ndarray_is_not_Fortran_contiguou; +static PyObject *__pyx_n_s_normal; +static PyObject *__pyx_n_s_np; +static PyObject *__pyx_n_s_ntexver; +static PyObject *__pyx_n_s_ntri; +static PyObject *__pyx_n_s_numpy; +static PyObject *__pyx_kp_s_numpy_core_multiarray_failed_to; +static PyObject *__pyx_kp_s_numpy_core_umath_failed_to_impor; +static PyObject *__pyx_n_s_nver; +static PyObject *__pyx_n_s_range; +static PyObject *__pyx_n_s_rasterize_triangles_core; +static PyObject *__pyx_n_s_render_colors_core; +static PyObject *__pyx_n_s_render_texture_core; +static PyObject *__pyx_n_s_test; +static PyObject *__pyx_n_s_tex_c; +static PyObject *__pyx_n_s_tex_coords; +static PyObject *__pyx_n_s_tex_h; +static PyObject *__pyx_n_s_tex_nver; +static PyObject *__pyx_n_s_tex_triangles; +static PyObject *__pyx_n_s_tex_w; +static PyObject *__pyx_n_s_texture; +static PyObject *__pyx_n_s_tri_normal; +static PyObject *__pyx_n_s_triangle_buffer; +static PyObject *__pyx_n_s_triangles; +static PyObject *__pyx_kp_u_unknown_dtype_code_in_numpy_pxd; +static PyObject *__pyx_n_s_uv_coords; +static PyObject *__pyx_n_s_vertices; +static PyObject *__pyx_n_s_w; +static PyObject *__pyx_n_s_write_obj_with_colors_texture_co; +static PyObject 
*__pyx_pf_16mesh_core_cython_get_normal_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_normal, PyArrayObject *__pyx_v_tri_normal, PyArrayObject *__pyx_v_triangles, int __pyx_v_ntri); /* proto */ +static PyObject *__pyx_pf_16mesh_core_cython_2rasterize_triangles_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_depth_buffer, PyArrayObject *__pyx_v_triangle_buffer, PyArrayObject *__pyx_v_barycentric_weight, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w); /* proto */ +static PyObject *__pyx_pf_16mesh_core_cython_4render_colors_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_image, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_colors, PyArrayObject *__pyx_v_depth_buffer, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w, int __pyx_v_c); /* proto */ +static PyObject *__pyx_pf_16mesh_core_cython_6render_texture_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_image, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_texture, PyArrayObject *__pyx_v_tex_coords, PyArrayObject *__pyx_v_tex_triangles, PyArrayObject *__pyx_v_depth_buffer, int __pyx_v_nver, int __pyx_v_tex_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w, int __pyx_v_c, int __pyx_v_tex_h, int __pyx_v_tex_w, int __pyx_v_tex_c, int __pyx_v_mapping_type); /* proto */ +static PyObject *__pyx_pf_16mesh_core_cython_8write_obj_with_colors_texture_core(CYTHON_UNUSED PyObject *__pyx_self, std::string __pyx_v_filename, std::string __pyx_v_mtl_name, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_colors, PyArrayObject *__pyx_v_uv_coords, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_ntexver); /* proto */ +static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* 
proto */ +static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info); /* proto */ +static PyObject *__pyx_tuple_; +static PyObject *__pyx_tuple__2; +static PyObject *__pyx_tuple__3; +static PyObject *__pyx_tuple__4; +static PyObject *__pyx_tuple__5; +static PyObject *__pyx_tuple__6; +static PyObject *__pyx_tuple__7; +static PyObject *__pyx_tuple__8; +static PyObject *__pyx_tuple__9; +static PyObject *__pyx_tuple__10; +static PyObject *__pyx_tuple__12; +static PyObject *__pyx_tuple__14; +static PyObject *__pyx_tuple__16; +static PyObject *__pyx_tuple__18; +static PyObject *__pyx_codeobj__11; +static PyObject *__pyx_codeobj__13; +static PyObject *__pyx_codeobj__15; +static PyObject *__pyx_codeobj__17; +static PyObject *__pyx_codeobj__19; +/* Late includes */ + +/* "mesh_core_cython.pyx":40 + * int nver, int ntri, int ntexver) + * + * def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_16mesh_core_cython_1get_normal_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_16mesh_core_cython_1get_normal_core = {"get_normal_core", (PyCFunction)__pyx_pw_16mesh_core_cython_1get_normal_core, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_16mesh_core_cython_1get_normal_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_normal = 0; + PyArrayObject *__pyx_v_tri_normal = 0; + PyArrayObject *__pyx_v_triangles = 0; + int __pyx_v_ntri; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("get_normal_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_normal,&__pyx_n_s_tri_normal,&__pyx_n_s_triangles,&__pyx_n_s_ntri,0}; + PyObject* values[4] = 
{0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_normal)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tri_normal)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("get_normal_core", 1, 4, 4, 1); __PYX_ERR(0, 40, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("get_normal_core", 1, 4, 4, 2); __PYX_ERR(0, 40, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("get_normal_core", 1, 4, 4, 3); __PYX_ERR(0, 40, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "get_normal_core") < 0)) __PYX_ERR(0, 40, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + } + __pyx_v_normal = ((PyArrayObject *)values[0]); + __pyx_v_tri_normal = 
((PyArrayObject *)values[1]); + __pyx_v_triangles = ((PyArrayObject *)values[2]); + __pyx_v_ntri = __Pyx_PyInt_As_int(values[3]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 43, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("get_normal_core", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 40, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("mesh_core_cython.get_normal_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_normal), __pyx_ptype_5numpy_ndarray, 0, "normal", 0))) __PYX_ERR(0, 40, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_tri_normal), __pyx_ptype_5numpy_ndarray, 0, "tri_normal", 0))) __PYX_ERR(0, 41, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 42, __pyx_L1_error) + __pyx_r = __pyx_pf_16mesh_core_cython_get_normal_core(__pyx_self, __pyx_v_normal, __pyx_v_tri_normal, __pyx_v_triangles, __pyx_v_ntri); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_16mesh_core_cython_get_normal_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_normal, PyArrayObject *__pyx_v_tri_normal, PyArrayObject *__pyx_v_triangles, int __pyx_v_ntri) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_normal; + __Pyx_Buffer __pyx_pybuffer_normal; + __Pyx_LocalBuf_ND __pyx_pybuffernd_tri_normal; + __Pyx_Buffer __pyx_pybuffer_tri_normal; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("get_normal_core", 0); + __pyx_pybuffer_normal.pybuffer.buf = NULL; + 
__pyx_pybuffer_normal.refcount = 0; + __pyx_pybuffernd_normal.data = NULL; + __pyx_pybuffernd_normal.rcbuffer = &__pyx_pybuffer_normal; + __pyx_pybuffer_tri_normal.pybuffer.buf = NULL; + __pyx_pybuffer_tri_normal.refcount = 0; + __pyx_pybuffernd_tri_normal.data = NULL; + __pyx_pybuffernd_tri_normal.rcbuffer = &__pyx_pybuffer_tri_normal; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_normal.rcbuffer->pybuffer, (PyObject*)__pyx_v_normal, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 40, __pyx_L1_error) + } + __pyx_pybuffernd_normal.diminfo[0].strides = __pyx_pybuffernd_normal.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_normal.diminfo[0].shape = __pyx_pybuffernd_normal.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_normal.diminfo[1].strides = __pyx_pybuffernd_normal.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_normal.diminfo[1].shape = __pyx_pybuffernd_normal.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_tri_normal.rcbuffer->pybuffer, (PyObject*)__pyx_v_tri_normal, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 40, __pyx_L1_error) + } + __pyx_pybuffernd_tri_normal.diminfo[0].strides = __pyx_pybuffernd_tri_normal.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_tri_normal.diminfo[0].shape = __pyx_pybuffernd_tri_normal.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_tri_normal.diminfo[1].strides = __pyx_pybuffernd_tri_normal.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_tri_normal.diminfo[1].shape = __pyx_pybuffernd_tri_normal.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if 
(unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 40, __pyx_L1_error) + } + __pyx_pybuffernd_triangles.diminfo[0].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + + /* "mesh_core_cython.pyx":45 + * int ntri + * ): + * _get_normal_core( # <<<<<<<<<<<<<< + * np.PyArray_DATA(normal), np.PyArray_DATA(tri_normal), np.PyArray_DATA(triangles), + * ntri) + */ + _get_normal_core(((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_normal))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_tri_normal))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangles))), __pyx_v_ntri); + + /* "mesh_core_cython.pyx":40 + * int nver, int ntri, int ntexver) + * + * def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_normal.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tri_normal.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("mesh_core_cython.get_normal_core", __pyx_clineno, __pyx_lineno, 
__pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_normal.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tri_normal.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "mesh_core_cython.pyx":49 + * ntri) + * + * def rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_16mesh_core_cython_3rasterize_triangles_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_16mesh_core_cython_3rasterize_triangles_core = {"rasterize_triangles_core", (PyCFunction)__pyx_pw_16mesh_core_cython_3rasterize_triangles_core, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_16mesh_core_cython_3rasterize_triangles_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_vertices = 0; + PyArrayObject *__pyx_v_triangles = 0; + PyArrayObject *__pyx_v_depth_buffer = 0; + PyArrayObject *__pyx_v_triangle_buffer = 0; + PyArrayObject *__pyx_v_barycentric_weight = 0; + int __pyx_v_nver; + int __pyx_v_ntri; + int __pyx_v_h; + int __pyx_v_w; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("rasterize_triangles_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_vertices,&__pyx_n_s_triangles,&__pyx_n_s_depth_buffer,&__pyx_n_s_triangle_buffer,&__pyx_n_s_barycentric_weight,&__pyx_n_s_nver,&__pyx_n_s_ntri,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[9] = {0,0,0,0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 
8); + CYTHON_FALLTHROUGH; + case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + CYTHON_FALLTHROUGH; + case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + CYTHON_FALLTHROUGH; + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_vertices)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 1); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_depth_buffer)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 2); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangle_buffer)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 3); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_barycentric_weight)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 4); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = 
__Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 5); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 6: + if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 6); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 7: + if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 7); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 8: + if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 8); __PYX_ERR(0, 49, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "rasterize_triangles_core") < 0)) __PYX_ERR(0, 49, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 9) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + } + __pyx_v_vertices = ((PyArrayObject *)values[0]); + __pyx_v_triangles = ((PyArrayObject *)values[1]); + __pyx_v_depth_buffer = ((PyArrayObject *)values[2]); + __pyx_v_triangle_buffer = ((PyArrayObject *)values[3]); + __pyx_v_barycentric_weight = ((PyArrayObject *)values[4]); + __pyx_v_nver = __Pyx_PyInt_As_int(values[5]); if (unlikely((__pyx_v_nver == (int)-1) 
&& PyErr_Occurred())) __PYX_ERR(0, 55, __pyx_L3_error) + __pyx_v_ntri = __Pyx_PyInt_As_int(values[6]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 55, __pyx_L3_error) + __pyx_v_h = __Pyx_PyInt_As_int(values[7]); if (unlikely((__pyx_v_h == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 56, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_int(values[8]); if (unlikely((__pyx_v_w == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 56, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 49, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("mesh_core_cython.rasterize_triangles_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_vertices), __pyx_ptype_5numpy_ndarray, 0, "vertices", 0))) __PYX_ERR(0, 50, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 51, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_depth_buffer), __pyx_ptype_5numpy_ndarray, 0, "depth_buffer", 0))) __PYX_ERR(0, 52, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangle_buffer), __pyx_ptype_5numpy_ndarray, 0, "triangle_buffer", 0))) __PYX_ERR(0, 53, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_barycentric_weight), __pyx_ptype_5numpy_ndarray, 0, "barycentric_weight", 0))) __PYX_ERR(0, 54, __pyx_L1_error) + __pyx_r = __pyx_pf_16mesh_core_cython_2rasterize_triangles_core(__pyx_self, __pyx_v_vertices, __pyx_v_triangles, __pyx_v_depth_buffer, __pyx_v_triangle_buffer, __pyx_v_barycentric_weight, __pyx_v_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + 
__pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_16mesh_core_cython_2rasterize_triangles_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_depth_buffer, PyArrayObject *__pyx_v_triangle_buffer, PyArrayObject *__pyx_v_barycentric_weight, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_barycentric_weight; + __Pyx_Buffer __pyx_pybuffer_barycentric_weight; + __Pyx_LocalBuf_ND __pyx_pybuffernd_depth_buffer; + __Pyx_Buffer __pyx_pybuffer_depth_buffer; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangle_buffer; + __Pyx_Buffer __pyx_pybuffer_triangle_buffer; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_vertices; + __Pyx_Buffer __pyx_pybuffer_vertices; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("rasterize_triangles_core", 0); + __pyx_pybuffer_vertices.pybuffer.buf = NULL; + __pyx_pybuffer_vertices.refcount = 0; + __pyx_pybuffernd_vertices.data = NULL; + __pyx_pybuffernd_vertices.rcbuffer = &__pyx_pybuffer_vertices; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + __pyx_pybuffer_depth_buffer.pybuffer.buf = NULL; + __pyx_pybuffer_depth_buffer.refcount = 0; + __pyx_pybuffernd_depth_buffer.data = NULL; + __pyx_pybuffernd_depth_buffer.rcbuffer = &__pyx_pybuffer_depth_buffer; + __pyx_pybuffer_triangle_buffer.pybuffer.buf = NULL; + __pyx_pybuffer_triangle_buffer.refcount = 0; + __pyx_pybuffernd_triangle_buffer.data = NULL; + __pyx_pybuffernd_triangle_buffer.rcbuffer = &__pyx_pybuffer_triangle_buffer; + __pyx_pybuffer_barycentric_weight.pybuffer.buf = NULL; + __pyx_pybuffer_barycentric_weight.refcount = 0; + 
__pyx_pybuffernd_barycentric_weight.data = NULL; + __pyx_pybuffernd_barycentric_weight.rcbuffer = &__pyx_pybuffer_barycentric_weight; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer, (PyObject*)__pyx_v_vertices, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) + } + __pyx_pybuffernd_vertices.diminfo[0].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_vertices.diminfo[0].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_vertices.diminfo[1].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_vertices.diminfo[1].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) + } + __pyx_pybuffernd_triangles.diminfo[0].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer, (PyObject*)__pyx_v_depth_buffer, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) + } + __pyx_pybuffernd_depth_buffer.diminfo[0].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_depth_buffer.diminfo[0].shape = 
__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_depth_buffer.diminfo[1].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_depth_buffer.diminfo[1].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangle_buffer, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) + } + __pyx_pybuffernd_triangle_buffer.diminfo[0].strides = __pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangle_buffer.diminfo[0].shape = __pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangle_buffer.diminfo[1].strides = __pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangle_buffer.diminfo[1].shape = __pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer, (PyObject*)__pyx_v_barycentric_weight, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) + } + __pyx_pybuffernd_barycentric_weight.diminfo[0].strides = __pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_barycentric_weight.diminfo[0].shape = __pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_barycentric_weight.diminfo[1].strides = __pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_barycentric_weight.diminfo[1].shape = __pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer.shape[1]; + + /* "mesh_core_cython.pyx":58 + * int h, int w + * ): + * _rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), 
+ * np.PyArray_DATA(depth_buffer), np.PyArray_DATA(triangle_buffer), np.PyArray_DATA(barycentric_weight), + */ + _rasterize_triangles_core(((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_vertices))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_depth_buffer))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangle_buffer))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_barycentric_weight))), __pyx_v_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w); + + /* "mesh_core_cython.pyx":49 + * ntri) + * + * def rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("mesh_core_cython.rasterize_triangles_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + 
__Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "mesh_core_cython.pyx":64 + * h, w) + * + * def render_colors_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_16mesh_core_cython_5render_colors_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_16mesh_core_cython_5render_colors_core = {"render_colors_core", (PyCFunction)__pyx_pw_16mesh_core_cython_5render_colors_core, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_16mesh_core_cython_5render_colors_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_image = 0; + PyArrayObject *__pyx_v_vertices = 0; + PyArrayObject *__pyx_v_triangles = 0; + PyArrayObject *__pyx_v_colors = 0; + PyArrayObject *__pyx_v_depth_buffer = 0; + int __pyx_v_nver; + int __pyx_v_ntri; + int __pyx_v_h; + int __pyx_v_w; + int __pyx_v_c; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("render_colors_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_image,&__pyx_n_s_vertices,&__pyx_n_s_triangles,&__pyx_n_s_colors,&__pyx_n_s_depth_buffer,&__pyx_n_s_nver,&__pyx_n_s_ntri,&__pyx_n_s_h,&__pyx_n_s_w,&__pyx_n_s_c,0}; + PyObject* values[10] = {0,0,0,0,0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 10: values[9] = PyTuple_GET_ITEM(__pyx_args, 9); + CYTHON_FALLTHROUGH; + case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + CYTHON_FALLTHROUGH; + case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + CYTHON_FALLTHROUGH; + case 7: values[6] = 
PyTuple_GET_ITEM(__pyx_args, 6); + CYTHON_FALLTHROUGH; + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_image)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_vertices)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 1); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 2); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_colors)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 3); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_depth_buffer)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 4); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 5); __PYX_ERR(0, 64, 
__pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 6: + if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 6); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 7: + if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 7); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 8: + if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 8); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 9: + if (likely((values[9] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_c)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 9); __PYX_ERR(0, 64, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "render_colors_core") < 0)) __PYX_ERR(0, 64, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 10) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + values[9] = PyTuple_GET_ITEM(__pyx_args, 9); + } + __pyx_v_image = ((PyArrayObject *)values[0]); + __pyx_v_vertices = ((PyArrayObject *)values[1]); + __pyx_v_triangles = ((PyArrayObject *)values[2]); + __pyx_v_colors = ((PyArrayObject *)values[3]); + __pyx_v_depth_buffer = ((PyArrayObject *)values[4]); + 
__pyx_v_nver = __Pyx_PyInt_As_int(values[5]); if (unlikely((__pyx_v_nver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 69, __pyx_L3_error) + __pyx_v_ntri = __Pyx_PyInt_As_int(values[6]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 69, __pyx_L3_error) + __pyx_v_h = __Pyx_PyInt_As_int(values[7]); if (unlikely((__pyx_v_h == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 70, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_int(values[8]); if (unlikely((__pyx_v_w == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 70, __pyx_L3_error) + __pyx_v_c = __Pyx_PyInt_As_int(values[9]); if (unlikely((__pyx_v_c == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 70, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 64, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("mesh_core_cython.render_colors_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_image), __pyx_ptype_5numpy_ndarray, 0, "image", 0))) __PYX_ERR(0, 64, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_vertices), __pyx_ptype_5numpy_ndarray, 0, "vertices", 0))) __PYX_ERR(0, 65, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 66, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_colors), __pyx_ptype_5numpy_ndarray, 0, "colors", 0))) __PYX_ERR(0, 67, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_depth_buffer), __pyx_ptype_5numpy_ndarray, 0, "depth_buffer", 0))) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_r = __pyx_pf_16mesh_core_cython_4render_colors_core(__pyx_self, __pyx_v_image, __pyx_v_vertices, __pyx_v_triangles, __pyx_v_colors, __pyx_v_depth_buffer, 
__pyx_v_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w, __pyx_v_c); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_16mesh_core_cython_4render_colors_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_image, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_colors, PyArrayObject *__pyx_v_depth_buffer, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w, int __pyx_v_c) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_colors; + __Pyx_Buffer __pyx_pybuffer_colors; + __Pyx_LocalBuf_ND __pyx_pybuffernd_depth_buffer; + __Pyx_Buffer __pyx_pybuffer_depth_buffer; + __Pyx_LocalBuf_ND __pyx_pybuffernd_image; + __Pyx_Buffer __pyx_pybuffer_image; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_vertices; + __Pyx_Buffer __pyx_pybuffer_vertices; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("render_colors_core", 0); + __pyx_pybuffer_image.pybuffer.buf = NULL; + __pyx_pybuffer_image.refcount = 0; + __pyx_pybuffernd_image.data = NULL; + __pyx_pybuffernd_image.rcbuffer = &__pyx_pybuffer_image; + __pyx_pybuffer_vertices.pybuffer.buf = NULL; + __pyx_pybuffer_vertices.refcount = 0; + __pyx_pybuffernd_vertices.data = NULL; + __pyx_pybuffernd_vertices.rcbuffer = &__pyx_pybuffer_vertices; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + __pyx_pybuffer_colors.pybuffer.buf = NULL; + __pyx_pybuffer_colors.refcount = 0; + __pyx_pybuffernd_colors.data = NULL; + __pyx_pybuffernd_colors.rcbuffer = &__pyx_pybuffer_colors; + __pyx_pybuffer_depth_buffer.pybuffer.buf = NULL; + __pyx_pybuffer_depth_buffer.refcount = 0; + __pyx_pybuffernd_depth_buffer.data = 
NULL; + __pyx_pybuffernd_depth_buffer.rcbuffer = &__pyx_pybuffer_depth_buffer; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_image.rcbuffer->pybuffer, (PyObject*)__pyx_v_image, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 3, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_image.diminfo[0].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_image.diminfo[0].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_image.diminfo[1].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_image.diminfo[1].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[1]; __pyx_pybuffernd_image.diminfo[2].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[2]; __pyx_pybuffernd_image.diminfo[2].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[2]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer, (PyObject*)__pyx_v_vertices, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_vertices.diminfo[0].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_vertices.diminfo[0].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_vertices.diminfo[1].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_vertices.diminfo[1].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_triangles.diminfo[0].strides = 
__pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_colors.rcbuffer->pybuffer, (PyObject*)__pyx_v_colors, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_colors.diminfo[0].strides = __pyx_pybuffernd_colors.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_colors.diminfo[0].shape = __pyx_pybuffernd_colors.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_colors.diminfo[1].strides = __pyx_pybuffernd_colors.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_colors.diminfo[1].shape = __pyx_pybuffernd_colors.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer, (PyObject*)__pyx_v_depth_buffer, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_depth_buffer.diminfo[0].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_depth_buffer.diminfo[0].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_depth_buffer.diminfo[1].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_depth_buffer.diminfo[1].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[1]; + + /* "mesh_core_cython.pyx":72 + * int h, int w, int c + * ): + * _render_colors_core( # <<<<<<<<<<<<<< + * np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), + * np.PyArray_DATA(colors), + */ 
+ _render_colors_core(((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_image))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_vertices))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_colors))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_depth_buffer))), __pyx_v_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w, __pyx_v_c); + + /* "mesh_core_cython.pyx":64 + * h, w) + * + * def render_colors_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_colors.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_image.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("mesh_core_cython.render_colors_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_colors.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_image.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* 
"mesh_core_cython.pyx":79 + * h, w, c) + * + * def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_16mesh_core_cython_7render_texture_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_16mesh_core_cython_7render_texture_core = {"render_texture_core", (PyCFunction)__pyx_pw_16mesh_core_cython_7render_texture_core, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_16mesh_core_cython_7render_texture_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_image = 0; + PyArrayObject *__pyx_v_vertices = 0; + PyArrayObject *__pyx_v_triangles = 0; + PyArrayObject *__pyx_v_texture = 0; + PyArrayObject *__pyx_v_tex_coords = 0; + PyArrayObject *__pyx_v_tex_triangles = 0; + PyArrayObject *__pyx_v_depth_buffer = 0; + int __pyx_v_nver; + int __pyx_v_tex_nver; + int __pyx_v_ntri; + int __pyx_v_h; + int __pyx_v_w; + int __pyx_v_c; + int __pyx_v_tex_h; + int __pyx_v_tex_w; + int __pyx_v_tex_c; + int __pyx_v_mapping_type; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("render_texture_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_image,&__pyx_n_s_vertices,&__pyx_n_s_triangles,&__pyx_n_s_texture,&__pyx_n_s_tex_coords,&__pyx_n_s_tex_triangles,&__pyx_n_s_depth_buffer,&__pyx_n_s_nver,&__pyx_n_s_tex_nver,&__pyx_n_s_ntri,&__pyx_n_s_h,&__pyx_n_s_w,&__pyx_n_s_c,&__pyx_n_s_tex_h,&__pyx_n_s_tex_w,&__pyx_n_s_tex_c,&__pyx_n_s_mapping_type,0}; + PyObject* values[17] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 17: values[16] = PyTuple_GET_ITEM(__pyx_args, 16); + 
CYTHON_FALLTHROUGH; + case 16: values[15] = PyTuple_GET_ITEM(__pyx_args, 15); + CYTHON_FALLTHROUGH; + case 15: values[14] = PyTuple_GET_ITEM(__pyx_args, 14); + CYTHON_FALLTHROUGH; + case 14: values[13] = PyTuple_GET_ITEM(__pyx_args, 13); + CYTHON_FALLTHROUGH; + case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12); + CYTHON_FALLTHROUGH; + case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11); + CYTHON_FALLTHROUGH; + case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10); + CYTHON_FALLTHROUGH; + case 10: values[9] = PyTuple_GET_ITEM(__pyx_args, 9); + CYTHON_FALLTHROUGH; + case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + CYTHON_FALLTHROUGH; + case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + CYTHON_FALLTHROUGH; + case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + CYTHON_FALLTHROUGH; + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_image)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_vertices)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 1); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 2); 
__PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_texture)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 3); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_coords)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 4); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 5); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 6: + if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_depth_buffer)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 6); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 7: + if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 7); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 8: + if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 8); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 9: + if (likely((values[9] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 9); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 10: + if (likely((values[10] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 10); __PYX_ERR(0, 79, __pyx_L3_error) 
+ } + CYTHON_FALLTHROUGH; + case 11: + if (likely((values[11] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 11); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 12: + if (likely((values[12] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_c)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 12); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 13: + if (likely((values[13] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 13); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 14: + if (likely((values[14] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 14); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 15: + if (likely((values[15] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_c)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 15); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 16: + if (likely((values[16] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_mapping_type)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 16); __PYX_ERR(0, 79, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "render_texture_core") < 0)) __PYX_ERR(0, 79, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 17) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 
4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + values[9] = PyTuple_GET_ITEM(__pyx_args, 9); + values[10] = PyTuple_GET_ITEM(__pyx_args, 10); + values[11] = PyTuple_GET_ITEM(__pyx_args, 11); + values[12] = PyTuple_GET_ITEM(__pyx_args, 12); + values[13] = PyTuple_GET_ITEM(__pyx_args, 13); + values[14] = PyTuple_GET_ITEM(__pyx_args, 14); + values[15] = PyTuple_GET_ITEM(__pyx_args, 15); + values[16] = PyTuple_GET_ITEM(__pyx_args, 16); + } + __pyx_v_image = ((PyArrayObject *)values[0]); + __pyx_v_vertices = ((PyArrayObject *)values[1]); + __pyx_v_triangles = ((PyArrayObject *)values[2]); + __pyx_v_texture = ((PyArrayObject *)values[3]); + __pyx_v_tex_coords = ((PyArrayObject *)values[4]); + __pyx_v_tex_triangles = ((PyArrayObject *)values[5]); + __pyx_v_depth_buffer = ((PyArrayObject *)values[6]); + __pyx_v_nver = __Pyx_PyInt_As_int(values[7]); if (unlikely((__pyx_v_nver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 86, __pyx_L3_error) + __pyx_v_tex_nver = __Pyx_PyInt_As_int(values[8]); if (unlikely((__pyx_v_tex_nver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 86, __pyx_L3_error) + __pyx_v_ntri = __Pyx_PyInt_As_int(values[9]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 86, __pyx_L3_error) + __pyx_v_h = __Pyx_PyInt_As_int(values[10]); if (unlikely((__pyx_v_h == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 87, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_int(values[11]); if (unlikely((__pyx_v_w == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 87, __pyx_L3_error) + __pyx_v_c = __Pyx_PyInt_As_int(values[12]); if (unlikely((__pyx_v_c == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 87, __pyx_L3_error) + __pyx_v_tex_h = __Pyx_PyInt_As_int(values[13]); if (unlikely((__pyx_v_tex_h == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 88, __pyx_L3_error) + __pyx_v_tex_w = __Pyx_PyInt_As_int(values[14]); if 
(unlikely((__pyx_v_tex_w == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 88, __pyx_L3_error) + __pyx_v_tex_c = __Pyx_PyInt_As_int(values[15]); if (unlikely((__pyx_v_tex_c == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 88, __pyx_L3_error) + __pyx_v_mapping_type = __Pyx_PyInt_As_int(values[16]); if (unlikely((__pyx_v_mapping_type == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 89, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 79, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("mesh_core_cython.render_texture_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_image), __pyx_ptype_5numpy_ndarray, 0, "image", 0))) __PYX_ERR(0, 79, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_vertices), __pyx_ptype_5numpy_ndarray, 0, "vertices", 0))) __PYX_ERR(0, 80, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 81, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_texture), __pyx_ptype_5numpy_ndarray, 0, "texture", 0))) __PYX_ERR(0, 82, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_tex_coords), __pyx_ptype_5numpy_ndarray, 0, "tex_coords", 0))) __PYX_ERR(0, 83, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_tex_triangles), __pyx_ptype_5numpy_ndarray, 0, "tex_triangles", 0))) __PYX_ERR(0, 84, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_depth_buffer), __pyx_ptype_5numpy_ndarray, 0, "depth_buffer", 0))) __PYX_ERR(0, 85, __pyx_L1_error) + __pyx_r = __pyx_pf_16mesh_core_cython_6render_texture_core(__pyx_self, __pyx_v_image, __pyx_v_vertices, __pyx_v_triangles, __pyx_v_texture, 
__pyx_v_tex_coords, __pyx_v_tex_triangles, __pyx_v_depth_buffer, __pyx_v_nver, __pyx_v_tex_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w, __pyx_v_c, __pyx_v_tex_h, __pyx_v_tex_w, __pyx_v_tex_c, __pyx_v_mapping_type); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_16mesh_core_cython_6render_texture_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_image, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_texture, PyArrayObject *__pyx_v_tex_coords, PyArrayObject *__pyx_v_tex_triangles, PyArrayObject *__pyx_v_depth_buffer, int __pyx_v_nver, int __pyx_v_tex_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w, int __pyx_v_c, int __pyx_v_tex_h, int __pyx_v_tex_w, int __pyx_v_tex_c, int __pyx_v_mapping_type) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_depth_buffer; + __Pyx_Buffer __pyx_pybuffer_depth_buffer; + __Pyx_LocalBuf_ND __pyx_pybuffernd_image; + __Pyx_Buffer __pyx_pybuffer_image; + __Pyx_LocalBuf_ND __pyx_pybuffernd_tex_coords; + __Pyx_Buffer __pyx_pybuffer_tex_coords; + __Pyx_LocalBuf_ND __pyx_pybuffernd_tex_triangles; + __Pyx_Buffer __pyx_pybuffer_tex_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_texture; + __Pyx_Buffer __pyx_pybuffer_texture; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_vertices; + __Pyx_Buffer __pyx_pybuffer_vertices; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("render_texture_core", 0); + __pyx_pybuffer_image.pybuffer.buf = NULL; + __pyx_pybuffer_image.refcount = 0; + __pyx_pybuffernd_image.data = NULL; + __pyx_pybuffernd_image.rcbuffer = &__pyx_pybuffer_image; + __pyx_pybuffer_vertices.pybuffer.buf = NULL; + __pyx_pybuffer_vertices.refcount = 0; + __pyx_pybuffernd_vertices.data = NULL; + __pyx_pybuffernd_vertices.rcbuffer = 
&__pyx_pybuffer_vertices; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + __pyx_pybuffer_texture.pybuffer.buf = NULL; + __pyx_pybuffer_texture.refcount = 0; + __pyx_pybuffernd_texture.data = NULL; + __pyx_pybuffernd_texture.rcbuffer = &__pyx_pybuffer_texture; + __pyx_pybuffer_tex_coords.pybuffer.buf = NULL; + __pyx_pybuffer_tex_coords.refcount = 0; + __pyx_pybuffernd_tex_coords.data = NULL; + __pyx_pybuffernd_tex_coords.rcbuffer = &__pyx_pybuffer_tex_coords; + __pyx_pybuffer_tex_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_tex_triangles.refcount = 0; + __pyx_pybuffernd_tex_triangles.data = NULL; + __pyx_pybuffernd_tex_triangles.rcbuffer = &__pyx_pybuffer_tex_triangles; + __pyx_pybuffer_depth_buffer.pybuffer.buf = NULL; + __pyx_pybuffer_depth_buffer.refcount = 0; + __pyx_pybuffernd_depth_buffer.data = NULL; + __pyx_pybuffernd_depth_buffer.rcbuffer = &__pyx_pybuffer_depth_buffer; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_image.rcbuffer->pybuffer, (PyObject*)__pyx_v_image, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 3, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_image.diminfo[0].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_image.diminfo[0].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_image.diminfo[1].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_image.diminfo[1].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[1]; __pyx_pybuffernd_image.diminfo[2].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[2]; __pyx_pybuffernd_image.diminfo[2].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[2]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if 
(unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer, (PyObject*)__pyx_v_vertices, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_vertices.diminfo[0].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_vertices.diminfo[0].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_vertices.diminfo[1].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_vertices.diminfo[1].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_triangles.diminfo[0].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_texture.rcbuffer->pybuffer, (PyObject*)__pyx_v_texture, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 3, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_texture.diminfo[0].strides = __pyx_pybuffernd_texture.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_texture.diminfo[0].shape = __pyx_pybuffernd_texture.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_texture.diminfo[1].strides = __pyx_pybuffernd_texture.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_texture.diminfo[1].shape = 
__pyx_pybuffernd_texture.rcbuffer->pybuffer.shape[1]; __pyx_pybuffernd_texture.diminfo[2].strides = __pyx_pybuffernd_texture.rcbuffer->pybuffer.strides[2]; __pyx_pybuffernd_texture.diminfo[2].shape = __pyx_pybuffernd_texture.rcbuffer->pybuffer.shape[2]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_tex_coords.rcbuffer->pybuffer, (PyObject*)__pyx_v_tex_coords, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_tex_coords.diminfo[0].strides = __pyx_pybuffernd_tex_coords.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_tex_coords.diminfo[0].shape = __pyx_pybuffernd_tex_coords.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_tex_coords.diminfo[1].strides = __pyx_pybuffernd_tex_coords.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_tex_coords.diminfo[1].shape = __pyx_pybuffernd_tex_coords.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_tex_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_tex_triangles.diminfo[0].strides = __pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_tex_triangles.diminfo[0].shape = __pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_tex_triangles.diminfo[1].strides = __pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_tex_triangles.diminfo[1].shape = __pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer, (PyObject*)__pyx_v_depth_buffer, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + 
} + __pyx_pybuffernd_depth_buffer.diminfo[0].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_depth_buffer.diminfo[0].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_depth_buffer.diminfo[1].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_depth_buffer.diminfo[1].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[1]; + + /* "mesh_core_cython.pyx":91 + * int mapping_type + * ): + * _render_texture_core( # <<<<<<<<<<<<<< + * np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), + * np.PyArray_DATA(texture), np.PyArray_DATA(tex_coords), np.PyArray_DATA(tex_triangles), + */ + _render_texture_core(((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_image))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_vertices))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_texture))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_tex_coords))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_tex_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_depth_buffer))), __pyx_v_nver, __pyx_v_tex_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w, __pyx_v_c, __pyx_v_tex_h, __pyx_v_tex_w, __pyx_v_tex_c, __pyx_v_mapping_type); + + /* "mesh_core_cython.pyx":79 + * h, w, c) + * + * def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + 
__Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_image.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tex_coords.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_texture.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("mesh_core_cython.render_texture_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_image.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tex_coords.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_texture.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "mesh_core_cython.pyx":100 + * mapping_type) + * + * def write_obj_with_colors_texture_core(string filename, string mtl_name, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_16mesh_core_cython_9write_obj_with_colors_texture_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_16mesh_core_cython_9write_obj_with_colors_texture_core = {"write_obj_with_colors_texture_core", (PyCFunction)__pyx_pw_16mesh_core_cython_9write_obj_with_colors_texture_core, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject 
*__pyx_pw_16mesh_core_cython_9write_obj_with_colors_texture_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + std::string __pyx_v_filename; + std::string __pyx_v_mtl_name; + PyArrayObject *__pyx_v_vertices = 0; + PyArrayObject *__pyx_v_triangles = 0; + PyArrayObject *__pyx_v_colors = 0; + PyArrayObject *__pyx_v_uv_coords = 0; + int __pyx_v_nver; + int __pyx_v_ntri; + int __pyx_v_ntexver; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("write_obj_with_colors_texture_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_filename,&__pyx_n_s_mtl_name,&__pyx_n_s_vertices,&__pyx_n_s_triangles,&__pyx_n_s_colors,&__pyx_n_s_uv_coords,&__pyx_n_s_nver,&__pyx_n_s_ntri,&__pyx_n_s_ntexver,0}; + PyObject* values[9] = {0,0,0,0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + CYTHON_FALLTHROUGH; + case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + CYTHON_FALLTHROUGH; + case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + CYTHON_FALLTHROUGH; + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_filename)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = 
__Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_mtl_name)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 1); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_vertices)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 2); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 3); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_colors)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 4); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_uv_coords)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 5); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 6: + if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 6); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 7: + if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 7); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 8: + if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntexver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 8); __PYX_ERR(0, 100, 
__pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "write_obj_with_colors_texture_core") < 0)) __PYX_ERR(0, 100, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 9) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + } + __pyx_v_filename = __pyx_convert_string_from_py_std__in_string(values[0]); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 100, __pyx_L3_error) + __pyx_v_mtl_name = __pyx_convert_string_from_py_std__in_string(values[1]); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 100, __pyx_L3_error) + __pyx_v_vertices = ((PyArrayObject *)values[2]); + __pyx_v_triangles = ((PyArrayObject *)values[3]); + __pyx_v_colors = ((PyArrayObject *)values[4]); + __pyx_v_uv_coords = ((PyArrayObject *)values[5]); + __pyx_v_nver = __Pyx_PyInt_As_int(values[6]); if (unlikely((__pyx_v_nver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 105, __pyx_L3_error) + __pyx_v_ntri = __Pyx_PyInt_As_int(values[7]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 105, __pyx_L3_error) + __pyx_v_ntexver = __Pyx_PyInt_As_int(values[8]); if (unlikely((__pyx_v_ntexver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 105, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 100, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("mesh_core_cython.write_obj_with_colors_texture_core", __pyx_clineno, 
__pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_vertices), __pyx_ptype_5numpy_ndarray, 0, "vertices", 0))) __PYX_ERR(0, 101, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 102, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_colors), __pyx_ptype_5numpy_ndarray, 0, "colors", 0))) __PYX_ERR(0, 103, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_uv_coords), __pyx_ptype_5numpy_ndarray, 0, "uv_coords", 0))) __PYX_ERR(0, 104, __pyx_L1_error) + __pyx_r = __pyx_pf_16mesh_core_cython_8write_obj_with_colors_texture_core(__pyx_self, __pyx_v_filename, __pyx_v_mtl_name, __pyx_v_vertices, __pyx_v_triangles, __pyx_v_colors, __pyx_v_uv_coords, __pyx_v_nver, __pyx_v_ntri, __pyx_v_ntexver); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_16mesh_core_cython_8write_obj_with_colors_texture_core(CYTHON_UNUSED PyObject *__pyx_self, std::string __pyx_v_filename, std::string __pyx_v_mtl_name, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_colors, PyArrayObject *__pyx_v_uv_coords, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_ntexver) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_colors; + __Pyx_Buffer __pyx_pybuffer_colors; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_uv_coords; + __Pyx_Buffer __pyx_pybuffer_uv_coords; + __Pyx_LocalBuf_ND __pyx_pybuffernd_vertices; + __Pyx_Buffer __pyx_pybuffer_vertices; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("write_obj_with_colors_texture_core", 0); + __pyx_pybuffer_vertices.pybuffer.buf = NULL; + 
__pyx_pybuffer_vertices.refcount = 0; + __pyx_pybuffernd_vertices.data = NULL; + __pyx_pybuffernd_vertices.rcbuffer = &__pyx_pybuffer_vertices; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + __pyx_pybuffer_colors.pybuffer.buf = NULL; + __pyx_pybuffer_colors.refcount = 0; + __pyx_pybuffernd_colors.data = NULL; + __pyx_pybuffernd_colors.rcbuffer = &__pyx_pybuffer_colors; + __pyx_pybuffer_uv_coords.pybuffer.buf = NULL; + __pyx_pybuffer_uv_coords.refcount = 0; + __pyx_pybuffernd_uv_coords.data = NULL; + __pyx_pybuffernd_uv_coords.rcbuffer = &__pyx_pybuffer_uv_coords; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer, (PyObject*)__pyx_v_vertices, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 100, __pyx_L1_error) + } + __pyx_pybuffernd_vertices.diminfo[0].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_vertices.diminfo[0].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_vertices.diminfo[1].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_vertices.diminfo[1].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 100, __pyx_L1_error) + } + __pyx_pybuffernd_triangles.diminfo[0].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = 
__pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_colors.rcbuffer->pybuffer, (PyObject*)__pyx_v_colors, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 100, __pyx_L1_error) + } + __pyx_pybuffernd_colors.diminfo[0].strides = __pyx_pybuffernd_colors.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_colors.diminfo[0].shape = __pyx_pybuffernd_colors.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_colors.diminfo[1].strides = __pyx_pybuffernd_colors.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_colors.diminfo[1].shape = __pyx_pybuffernd_colors.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_uv_coords.rcbuffer->pybuffer, (PyObject*)__pyx_v_uv_coords, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 100, __pyx_L1_error) + } + __pyx_pybuffernd_uv_coords.diminfo[0].strides = __pyx_pybuffernd_uv_coords.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_uv_coords.diminfo[0].shape = __pyx_pybuffernd_uv_coords.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_uv_coords.diminfo[1].strides = __pyx_pybuffernd_uv_coords.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_uv_coords.diminfo[1].shape = __pyx_pybuffernd_uv_coords.rcbuffer->pybuffer.shape[1]; + + /* "mesh_core_cython.pyx":107 + * int nver, int ntri, int ntexver + * ): + * _write_obj_with_colors_texture(filename, mtl_name, # <<<<<<<<<<<<<< + * np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), np.PyArray_DATA(colors), np.PyArray_DATA(uv_coords), + * nver, ntri, ntexver) + */ + _write_obj_with_colors_texture(__pyx_v_filename, __pyx_v_mtl_name, ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_vertices))), ((int *)PyArray_DATA(((PyArrayObject 
*)__pyx_v_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_colors))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_uv_coords))), __pyx_v_nver, __pyx_v_ntri, __pyx_v_ntexver); + + /* "mesh_core_cython.pyx":100 + * mapping_type) + * + * def write_obj_with_colors_texture_core(string filename, string mtl_name, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_colors.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_uv_coords.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("mesh_core_cython.write_obj_with_colors_texture_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_colors.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_uv_coords.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":215 + * # experimental exception made for __getbuffer__ and __releasebuffer__ + * # -- the details of this may change. 
+ * def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<< + * # This implementation of getbuffer is geared towards Cython + * # requirements, and does not yet fulfill the PEP. + */ + +/* Python wrapper */ +static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/ +static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0); + __pyx_r = __pyx_pf_5numpy_7ndarray___getbuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { + int __pyx_v_i; + int __pyx_v_ndim; + int __pyx_v_endian_detector; + int __pyx_v_little_endian; + int __pyx_v_t; + char *__pyx_v_f; + PyArray_Descr *__pyx_v_descr = 0; + int __pyx_v_offset; + int __pyx_r; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + int __pyx_t_5; + int __pyx_t_6; + PyObject *__pyx_t_7 = NULL; + char *__pyx_t_8; + if (__pyx_v_info == NULL) { + PyErr_SetString(PyExc_BufferError, "PyObject_GetBuffer: view==NULL argument is obsolete"); + return -1; + } + __Pyx_RefNannySetupContext("__getbuffer__", 0); + __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(__pyx_v_info->obj); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":222 + * + * cdef int i, ndim + * cdef int endian_detector = 1 # <<<<<<<<<<<<<< + * cdef bint little_endian = ((&endian_detector)[0] != 0) + * + */ + __pyx_v_endian_detector = 1; + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":223 + * cdef int i, ndim + * cdef int endian_detector = 1 + * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<< + * + * ndim = PyArray_NDIM(self) + */ + __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":225 + * cdef bint little_endian = ((&endian_detector)[0] != 0) + * + * ndim = PyArray_NDIM(self) # <<<<<<<<<<<<<< + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + */ + __pyx_v_ndim = PyArray_NDIM(__pyx_v_self); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":227 + * ndim = PyArray_NDIM(self) + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") + */ + __pyx_t_2 = (((__pyx_v_flags & PyBUF_C_CONTIGUOUS) == PyBUF_C_CONTIGUOUS) != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L4_bool_binop_done; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":228 + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): # <<<<<<<<<<<<<< + * raise ValueError(u"ndarray is not C contiguous") + * + */ + __pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_C_CONTIGUOUS) != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L4_bool_binop_done:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":227 + * ndim = PyArray_NDIM(self) + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise 
ValueError(u"ndarray is not C contiguous") + */ + if (unlikely(__pyx_t_1)) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":229 + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") # <<<<<<<<<<<<<< + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 229, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 229, __pyx_L1_error) + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":227 + * ndim = PyArray_NDIM(self) + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":231 + * raise ValueError(u"ndarray is not C contiguous") + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") + */ + __pyx_t_2 = (((__pyx_v_flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS) != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L7_bool_binop_done; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":232 + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): # <<<<<<<<<<<<<< + * raise ValueError(u"ndarray is not Fortran contiguous") + * + */ + 
__pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_F_CONTIGUOUS) != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L7_bool_binop_done:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":231 + * raise ValueError(u"ndarray is not C contiguous") + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") + */ + if (unlikely(__pyx_t_1)) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":233 + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") # <<<<<<<<<<<<<< + * + * info.buf = PyArray_DATA(self) + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 233, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 233, __pyx_L1_error) + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":231 + * raise ValueError(u"ndarray is not C contiguous") + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":235 + * raise ValueError(u"ndarray is not Fortran contiguous") + * + * info.buf = PyArray_DATA(self) # <<<<<<<<<<<<<< + * info.ndim = ndim + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + */ + __pyx_v_info->buf = PyArray_DATA(__pyx_v_self); + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":236 + * + * info.buf = PyArray_DATA(self) + * info.ndim = ndim # <<<<<<<<<<<<<< + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + * # Allocate new buffer for strides and shape info. + */ + __pyx_v_info->ndim = __pyx_v_ndim; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":237 + * info.buf = PyArray_DATA(self) + * info.ndim = ndim + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * # Allocate new buffer for strides and shape info. + * # This is allocated as one block, strides first. + */ + __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0); + if (__pyx_t_1) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":240 + * # Allocate new buffer for strides and shape info. + * # This is allocated as one block, strides first. + * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) # <<<<<<<<<<<<<< + * info.shape = info.strides + ndim + * for i in range(ndim): + */ + __pyx_v_info->strides = ((Py_ssize_t *)PyObject_Malloc((((sizeof(Py_ssize_t)) * 2) * ((size_t)__pyx_v_ndim)))); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":241 + * # This is allocated as one block, strides first. 
+ * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) + * info.shape = info.strides + ndim # <<<<<<<<<<<<<< + * for i in range(ndim): + * info.strides[i] = PyArray_STRIDES(self)[i] + */ + __pyx_v_info->shape = (__pyx_v_info->strides + __pyx_v_ndim); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":242 + * info.strides = PyObject_Malloc(sizeof(Py_ssize_t) * 2 * ndim) + * info.shape = info.strides + ndim + * for i in range(ndim): # <<<<<<<<<<<<<< + * info.strides[i] = PyArray_STRIDES(self)[i] + * info.shape[i] = PyArray_DIMS(self)[i] + */ + __pyx_t_4 = __pyx_v_ndim; + __pyx_t_5 = __pyx_t_4; + for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) { + __pyx_v_i = __pyx_t_6; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":243 + * info.shape = info.strides + ndim + * for i in range(ndim): + * info.strides[i] = PyArray_STRIDES(self)[i] # <<<<<<<<<<<<<< + * info.shape[i] = PyArray_DIMS(self)[i] + * else: + */ + (__pyx_v_info->strides[__pyx_v_i]) = (PyArray_STRIDES(__pyx_v_self)[__pyx_v_i]); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":244 + * for i in range(ndim): + * info.strides[i] = PyArray_STRIDES(self)[i] + * info.shape[i] = PyArray_DIMS(self)[i] # <<<<<<<<<<<<<< + * else: + * info.strides = PyArray_STRIDES(self) + */ + (__pyx_v_info->shape[__pyx_v_i]) = (PyArray_DIMS(__pyx_v_self)[__pyx_v_i]); + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":237 + * info.buf = PyArray_DATA(self) + * info.ndim = ndim + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * # Allocate new buffer for strides and shape info. + * # This is allocated as one block, strides first. 
+ */ + goto __pyx_L9; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":246 + * info.shape[i] = PyArray_DIMS(self)[i] + * else: + * info.strides = PyArray_STRIDES(self) # <<<<<<<<<<<<<< + * info.shape = PyArray_DIMS(self) + * info.suboffsets = NULL + */ + /*else*/ { + __pyx_v_info->strides = ((Py_ssize_t *)PyArray_STRIDES(__pyx_v_self)); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":247 + * else: + * info.strides = PyArray_STRIDES(self) + * info.shape = PyArray_DIMS(self) # <<<<<<<<<<<<<< + * info.suboffsets = NULL + * info.itemsize = PyArray_ITEMSIZE(self) + */ + __pyx_v_info->shape = ((Py_ssize_t *)PyArray_DIMS(__pyx_v_self)); + } + __pyx_L9:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":248 + * info.strides = PyArray_STRIDES(self) + * info.shape = PyArray_DIMS(self) + * info.suboffsets = NULL # <<<<<<<<<<<<<< + * info.itemsize = PyArray_ITEMSIZE(self) + * info.readonly = not PyArray_ISWRITEABLE(self) + */ + __pyx_v_info->suboffsets = NULL; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":249 + * info.shape = PyArray_DIMS(self) + * info.suboffsets = NULL + * info.itemsize = PyArray_ITEMSIZE(self) # <<<<<<<<<<<<<< + * info.readonly = not PyArray_ISWRITEABLE(self) + * + */ + __pyx_v_info->itemsize = PyArray_ITEMSIZE(__pyx_v_self); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":250 + * info.suboffsets = NULL + * info.itemsize = PyArray_ITEMSIZE(self) + * info.readonly = not PyArray_ISWRITEABLE(self) # <<<<<<<<<<<<<< + * + * cdef int t + */ + __pyx_v_info->readonly = (!(PyArray_ISWRITEABLE(__pyx_v_self) != 0)); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":253 + * + * cdef 
int t + * cdef char* f = NULL # <<<<<<<<<<<<<< + * cdef dtype descr = self.descr + * cdef int offset + */ + __pyx_v_f = NULL; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":254 + * cdef int t + * cdef char* f = NULL + * cdef dtype descr = self.descr # <<<<<<<<<<<<<< + * cdef int offset + * + */ + __pyx_t_3 = ((PyObject *)__pyx_v_self->descr); + __Pyx_INCREF(__pyx_t_3); + __pyx_v_descr = ((PyArray_Descr *)__pyx_t_3); + __pyx_t_3 = 0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":257 + * cdef int offset + * + * info.obj = self # <<<<<<<<<<<<<< + * + * if not PyDataType_HASFIELDS(descr): + */ + __Pyx_INCREF(((PyObject *)__pyx_v_self)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_self)); + __Pyx_GOTREF(__pyx_v_info->obj); + __Pyx_DECREF(__pyx_v_info->obj); + __pyx_v_info->obj = ((PyObject *)__pyx_v_self); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":259 + * info.obj = self + * + * if not PyDataType_HASFIELDS(descr): # <<<<<<<<<<<<<< + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or + */ + __pyx_t_1 = ((!(PyDataType_HASFIELDS(__pyx_v_descr) != 0)) != 0); + if (__pyx_t_1) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":260 + * + * if not PyDataType_HASFIELDS(descr): + * t = descr.type_num # <<<<<<<<<<<<<< + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): + */ + __pyx_t_4 = __pyx_v_descr->type_num; + __pyx_v_t = __pyx_t_4; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":261 + * if not PyDataType_HASFIELDS(descr): + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (descr.byteorder == c'<' and not little_endian)): + * 
raise ValueError(u"Non-native byte order not supported") + */ + __pyx_t_2 = ((__pyx_v_descr->byteorder == '>') != 0); + if (!__pyx_t_2) { + goto __pyx_L15_next_or; + } else { + } + __pyx_t_2 = (__pyx_v_little_endian != 0); + if (!__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L14_bool_binop_done; + } + __pyx_L15_next_or:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":262 + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<< + * raise ValueError(u"Non-native byte order not supported") + * if t == NPY_BYTE: f = "b" + */ + __pyx_t_2 = ((__pyx_v_descr->byteorder == '<') != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L14_bool_binop_done; + } + __pyx_t_2 = ((!(__pyx_v_little_endian != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L14_bool_binop_done:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":261 + * if not PyDataType_HASFIELDS(descr): + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + if (unlikely(__pyx_t_1)) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":263 + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 263, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 263, 
__pyx_L1_error) + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":261 + * if not PyDataType_HASFIELDS(descr): + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":264 + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + * if t == NPY_BYTE: f = "b" # <<<<<<<<<<<<<< + * elif t == NPY_UBYTE: f = "B" + * elif t == NPY_SHORT: f = "h" + */ + switch (__pyx_v_t) { + case NPY_BYTE: + __pyx_v_f = ((char *)"b"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":265 + * raise ValueError(u"Non-native byte order not supported") + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" # <<<<<<<<<<<<<< + * elif t == NPY_SHORT: f = "h" + * elif t == NPY_USHORT: f = "H" + */ + case NPY_UBYTE: + __pyx_v_f = ((char *)"B"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":266 + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" + * elif t == NPY_SHORT: f = "h" # <<<<<<<<<<<<<< + * elif t == NPY_USHORT: f = "H" + * elif t == NPY_INT: f = "i" + */ + case NPY_SHORT: + __pyx_v_f = ((char *)"h"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":267 + * elif t == NPY_UBYTE: f = "B" + * elif t == NPY_SHORT: f = "h" + * elif t == NPY_USHORT: f = "H" # <<<<<<<<<<<<<< + * elif t == NPY_INT: f = "i" + * elif t == NPY_UINT: f = "I" + */ + case NPY_USHORT: + __pyx_v_f = ((char *)"H"); + break; + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":268 + * elif t == NPY_SHORT: f = "h" + * elif t == NPY_USHORT: f = "H" + * elif t == NPY_INT: f = "i" # <<<<<<<<<<<<<< + * elif t == NPY_UINT: f = "I" + * elif t == NPY_LONG: f = "l" + */ + case NPY_INT: + __pyx_v_f = ((char *)"i"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":269 + * elif t == NPY_USHORT: f = "H" + * elif t == NPY_INT: f = "i" + * elif t == NPY_UINT: f = "I" # <<<<<<<<<<<<<< + * elif t == NPY_LONG: f = "l" + * elif t == NPY_ULONG: f = "L" + */ + case NPY_UINT: + __pyx_v_f = ((char *)"I"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":270 + * elif t == NPY_INT: f = "i" + * elif t == NPY_UINT: f = "I" + * elif t == NPY_LONG: f = "l" # <<<<<<<<<<<<<< + * elif t == NPY_ULONG: f = "L" + * elif t == NPY_LONGLONG: f = "q" + */ + case NPY_LONG: + __pyx_v_f = ((char *)"l"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":271 + * elif t == NPY_UINT: f = "I" + * elif t == NPY_LONG: f = "l" + * elif t == NPY_ULONG: f = "L" # <<<<<<<<<<<<<< + * elif t == NPY_LONGLONG: f = "q" + * elif t == NPY_ULONGLONG: f = "Q" + */ + case NPY_ULONG: + __pyx_v_f = ((char *)"L"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":272 + * elif t == NPY_LONG: f = "l" + * elif t == NPY_ULONG: f = "L" + * elif t == NPY_LONGLONG: f = "q" # <<<<<<<<<<<<<< + * elif t == NPY_ULONGLONG: f = "Q" + * elif t == NPY_FLOAT: f = "f" + */ + case NPY_LONGLONG: + __pyx_v_f = ((char *)"q"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":273 + * elif t == NPY_ULONG: f = "L" + * elif t == NPY_LONGLONG: f = "q" + * elif t == 
NPY_ULONGLONG: f = "Q" # <<<<<<<<<<<<<< + * elif t == NPY_FLOAT: f = "f" + * elif t == NPY_DOUBLE: f = "d" + */ + case NPY_ULONGLONG: + __pyx_v_f = ((char *)"Q"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":274 + * elif t == NPY_LONGLONG: f = "q" + * elif t == NPY_ULONGLONG: f = "Q" + * elif t == NPY_FLOAT: f = "f" # <<<<<<<<<<<<<< + * elif t == NPY_DOUBLE: f = "d" + * elif t == NPY_LONGDOUBLE: f = "g" + */ + case NPY_FLOAT: + __pyx_v_f = ((char *)"f"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":275 + * elif t == NPY_ULONGLONG: f = "Q" + * elif t == NPY_FLOAT: f = "f" + * elif t == NPY_DOUBLE: f = "d" # <<<<<<<<<<<<<< + * elif t == NPY_LONGDOUBLE: f = "g" + * elif t == NPY_CFLOAT: f = "Zf" + */ + case NPY_DOUBLE: + __pyx_v_f = ((char *)"d"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":276 + * elif t == NPY_FLOAT: f = "f" + * elif t == NPY_DOUBLE: f = "d" + * elif t == NPY_LONGDOUBLE: f = "g" # <<<<<<<<<<<<<< + * elif t == NPY_CFLOAT: f = "Zf" + * elif t == NPY_CDOUBLE: f = "Zd" + */ + case NPY_LONGDOUBLE: + __pyx_v_f = ((char *)"g"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":277 + * elif t == NPY_DOUBLE: f = "d" + * elif t == NPY_LONGDOUBLE: f = "g" + * elif t == NPY_CFLOAT: f = "Zf" # <<<<<<<<<<<<<< + * elif t == NPY_CDOUBLE: f = "Zd" + * elif t == NPY_CLONGDOUBLE: f = "Zg" + */ + case NPY_CFLOAT: + __pyx_v_f = ((char *)"Zf"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":278 + * elif t == NPY_LONGDOUBLE: f = "g" + * elif t == NPY_CFLOAT: f = "Zf" + * elif t == NPY_CDOUBLE: f = "Zd" # <<<<<<<<<<<<<< + * elif t == NPY_CLONGDOUBLE: f = "Zg" + * elif t == NPY_OBJECT: f = "O" + */ + 
case NPY_CDOUBLE: + __pyx_v_f = ((char *)"Zd"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":279 + * elif t == NPY_CFLOAT: f = "Zf" + * elif t == NPY_CDOUBLE: f = "Zd" + * elif t == NPY_CLONGDOUBLE: f = "Zg" # <<<<<<<<<<<<<< + * elif t == NPY_OBJECT: f = "O" + * else: + */ + case NPY_CLONGDOUBLE: + __pyx_v_f = ((char *)"Zg"); + break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":280 + * elif t == NPY_CDOUBLE: f = "Zd" + * elif t == NPY_CLONGDOUBLE: f = "Zg" + * elif t == NPY_OBJECT: f = "O" # <<<<<<<<<<<<<< + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + */ + case NPY_OBJECT: + __pyx_v_f = ((char *)"O"); + break; + default: + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":282 + * elif t == NPY_OBJECT: f = "O" + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<< + * info.format = f + * return + */ + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_t); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_7 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_t_3); if (unlikely(!__pyx_t_7)) __PYX_ERR(1, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_7); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 282, __pyx_L1_error) + break; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":283 + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + * info.format = f # <<<<<<<<<<<<<< 
+ * return + * else: + */ + __pyx_v_info->format = __pyx_v_f; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":284 + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + * info.format = f + * return # <<<<<<<<<<<<<< + * else: + * info.format = PyObject_Malloc(_buffer_format_string_len) + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":259 + * info.obj = self + * + * if not PyDataType_HASFIELDS(descr): # <<<<<<<<<<<<<< + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":286 + * return + * else: + * info.format = PyObject_Malloc(_buffer_format_string_len) # <<<<<<<<<<<<<< + * info.format[0] = c'^' # Native data types, manual alignment + * offset = 0 + */ + /*else*/ { + __pyx_v_info->format = ((char *)PyObject_Malloc(0xFF)); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":287 + * else: + * info.format = PyObject_Malloc(_buffer_format_string_len) + * info.format[0] = c'^' # Native data types, manual alignment # <<<<<<<<<<<<<< + * offset = 0 + * f = _util_dtypestring(descr, info.format + 1, + */ + (__pyx_v_info->format[0]) = '^'; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":288 + * info.format = PyObject_Malloc(_buffer_format_string_len) + * info.format[0] = c'^' # Native data types, manual alignment + * offset = 0 # <<<<<<<<<<<<<< + * f = _util_dtypestring(descr, info.format + 1, + * info.format + _buffer_format_string_len, + */ + __pyx_v_offset = 0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":289 + * info.format[0] = c'^' # Native data types, manual 
alignment + * offset = 0 + * f = _util_dtypestring(descr, info.format + 1, # <<<<<<<<<<<<<< + * info.format + _buffer_format_string_len, + * &offset) + */ + __pyx_t_8 = __pyx_f_5numpy__util_dtypestring(__pyx_v_descr, (__pyx_v_info->format + 1), (__pyx_v_info->format + 0xFF), (&__pyx_v_offset)); if (unlikely(__pyx_t_8 == ((char *)NULL))) __PYX_ERR(1, 289, __pyx_L1_error) + __pyx_v_f = __pyx_t_8; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":292 + * info.format + _buffer_format_string_len, + * &offset) + * f[0] = c'\0' # Terminate format string # <<<<<<<<<<<<<< + * + * def __releasebuffer__(ndarray self, Py_buffer* info): + */ + (__pyx_v_f[0]) = '\x00'; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":215 + * # experimental exception made for __getbuffer__ and __releasebuffer__ + * # -- the details of this may change. + * def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<< + * # This implementation of getbuffer is geared towards Cython + * # requirements, and does not yet fulfill the PEP. 
+ */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_AddTraceback("numpy.ndarray.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + if (__pyx_v_info->obj != NULL) { + __Pyx_GOTREF(__pyx_v_info->obj); + __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0; + } + goto __pyx_L2; + __pyx_L0:; + if (__pyx_v_info->obj == Py_None) { + __Pyx_GOTREF(__pyx_v_info->obj); + __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = 0; + } + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_descr); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":294 + * f[0] = c'\0' # Terminate format string + * + * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<< + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) + */ + +/* Python wrapper */ +static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info); /*proto*/ +static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__releasebuffer__ (wrapper)", 0); + __pyx_pf_5numpy_7ndarray_2__releasebuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info) { + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("__releasebuffer__", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":295 + * + * def __releasebuffer__(ndarray self, Py_buffer* info): + * if PyArray_HASFIELDS(self): # <<<<<<<<<<<<<< + * PyObject_Free(info.format) + * if 
sizeof(npy_intp) != sizeof(Py_ssize_t): + */ + __pyx_t_1 = (PyArray_HASFIELDS(__pyx_v_self) != 0); + if (__pyx_t_1) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":296 + * def __releasebuffer__(ndarray self, Py_buffer* info): + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) # <<<<<<<<<<<<<< + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + * PyObject_Free(info.strides) + */ + PyObject_Free(__pyx_v_info->format); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":295 + * + * def __releasebuffer__(ndarray self, Py_buffer* info): + * if PyArray_HASFIELDS(self): # <<<<<<<<<<<<<< + * PyObject_Free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":297 + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * PyObject_Free(info.strides) + * # info.shape was stored after info.strides in the same block + */ + __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0); + if (__pyx_t_1) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":298 + * PyObject_Free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + * PyObject_Free(info.strides) # <<<<<<<<<<<<<< + * # info.shape was stored after info.strides in the same block + * + */ + PyObject_Free(__pyx_v_info->strides); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":297 + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * PyObject_Free(info.strides) + * # info.shape was stored after info.strides in the same block + */ + } + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":294 + * f[0] = c'\0' # Terminate format string + * + * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<< + * if PyArray_HASFIELDS(self): + * PyObject_Free(info.format) + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":775 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":776 + * + * cdef inline object PyArray_MultiIterNew1(a): + * return PyArray_MultiIterNew(1, a) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew2(a, b): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 776, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":775 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":778 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":779 + * + * cdef inline object PyArray_MultiIterNew2(a, b): + * return PyArray_MultiIterNew(2, a, b) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 779, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":778 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":781 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + +static CYTHON_INLINE PyObject 
*__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":782 + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + * return PyArray_MultiIterNew(3, a, b, c) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 782, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":781 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":784 + * return PyArray_MultiIterNew(3, a, b, c) + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0); + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":785 + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + * return PyArray_MultiIterNew(4, a, b, c, d) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 785, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":784 + * return PyArray_MultiIterNew(3, a, b, c) + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":787 + * return PyArray_MultiIterNew(4, a, b, c, d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":788 + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + * return 
PyArray_MultiIterNew(5, a, b, c, d, e) # <<<<<<<<<<<<<< + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 788, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":787 + * return PyArray_MultiIterNew(4, a, b, c, d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":790 + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__pyx_v_d) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("PyDataType_SHAPE", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":791 + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< + * return d.subarray.shape + * else: + */ + __pyx_t_1 = (PyDataType_HASSUBARRAY(__pyx_v_d) != 0); + if (__pyx_t_1) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":792 + * 
cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape # <<<<<<<<<<<<<< + * else: + * return () + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject*)__pyx_v_d->subarray->shape)); + __pyx_r = ((PyObject*)__pyx_v_d->subarray->shape); + goto __pyx_L0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":791 + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< + * return d.subarray.shape + * else: + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":794 + * return d.subarray.shape + * else: + * return () # <<<<<<<<<<<<<< + * + * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: + */ + /*else*/ { + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_empty_tuple); + __pyx_r = __pyx_empty_tuple; + goto __pyx_L0; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":790 + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":796 + * return () + * + * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<< + * # Recursive utility function used in __getbuffer__ to get format + * # string. The new location in the format string is returned. 
+ */ + +static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx_v_descr, char *__pyx_v_f, char *__pyx_v_end, int *__pyx_v_offset) { + PyArray_Descr *__pyx_v_child = 0; + int __pyx_v_endian_detector; + int __pyx_v_little_endian; + PyObject *__pyx_v_fields = 0; + PyObject *__pyx_v_childname = NULL; + PyObject *__pyx_v_new_offset = NULL; + PyObject *__pyx_v_t = NULL; + char *__pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + Py_ssize_t __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + int __pyx_t_5; + int __pyx_t_6; + int __pyx_t_7; + long __pyx_t_8; + char *__pyx_t_9; + __Pyx_RefNannySetupContext("_util_dtypestring", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":801 + * + * cdef dtype child + * cdef int endian_detector = 1 # <<<<<<<<<<<<<< + * cdef bint little_endian = ((&endian_detector)[0] != 0) + * cdef tuple fields + */ + __pyx_v_endian_detector = 1; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":802 + * cdef dtype child + * cdef int endian_detector = 1 + * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<< + * cdef tuple fields + * + */ + __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":805 + * cdef tuple fields + * + * for childname in descr.names: # <<<<<<<<<<<<<< + * fields = descr.fields[childname] + * child, new_offset = fields + */ + if (unlikely(__pyx_v_descr->names == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); + __PYX_ERR(1, 805, __pyx_L1_error) + } + __pyx_t_1 = __pyx_v_descr->names; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0; + for (;;) { + if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + 
__pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_3); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(1, 805, __pyx_L1_error) + #else + __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 805, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + #endif + __Pyx_XDECREF_SET(__pyx_v_childname, __pyx_t_3); + __pyx_t_3 = 0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":806 + * + * for childname in descr.names: + * fields = descr.fields[childname] # <<<<<<<<<<<<<< + * child, new_offset = fields + * + */ + if (unlikely(__pyx_v_descr->fields == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(1, 806, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_descr->fields, __pyx_v_childname); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 806, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + if (!(likely(PyTuple_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "tuple", Py_TYPE(__pyx_t_3)->tp_name), 0))) __PYX_ERR(1, 806, __pyx_L1_error) + __Pyx_XDECREF_SET(__pyx_v_fields, ((PyObject*)__pyx_t_3)); + __pyx_t_3 = 0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":807 + * for childname in descr.names: + * fields = descr.fields[childname] + * child, new_offset = fields # <<<<<<<<<<<<<< + * + * if (end - f) - (new_offset - offset[0]) < 15: + */ + if (likely(__pyx_v_fields != Py_None)) { + PyObject* sequence = __pyx_v_fields; + Py_ssize_t size = __Pyx_PySequence_SIZE(sequence); + if (unlikely(size != 2)) { + if (size > 2) __Pyx_RaiseTooManyValuesError(2); + else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); + __PYX_ERR(1, 807, __pyx_L1_error) + } + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0); + __pyx_t_4 = 
PyTuple_GET_ITEM(sequence, 1); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_4); + #else + __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 807, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 807, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + #endif + } else { + __Pyx_RaiseNoneNotIterableError(); __PYX_ERR(1, 807, __pyx_L1_error) + } + if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_dtype))))) __PYX_ERR(1, 807, __pyx_L1_error) + __Pyx_XDECREF_SET(__pyx_v_child, ((PyArray_Descr *)__pyx_t_3)); + __pyx_t_3 = 0; + __Pyx_XDECREF_SET(__pyx_v_new_offset, __pyx_t_4); + __pyx_t_4 = 0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":809 + * child, new_offset = fields + * + * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + */ + __pyx_t_4 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 809, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyNumber_Subtract(__pyx_v_new_offset, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 809, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 809, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = ((((__pyx_v_end - __pyx_v_f) - ((int)__pyx_t_5)) < 15) != 0); + if (unlikely(__pyx_t_6)) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":810 + * + * if (end - f) - (new_offset - offset[0]) < 15: + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") # <<<<<<<<<<<<<< + * + * if ((child.byteorder == c'>' and 
little_endian) or + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 810, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 810, __pyx_L1_error) + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":809 + * child, new_offset = fields + * + * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":812 + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + __pyx_t_7 = ((__pyx_v_child->byteorder == '>') != 0); + if (!__pyx_t_7) { + goto __pyx_L8_next_or; + } else { + } + __pyx_t_7 = (__pyx_v_little_endian != 0); + if (!__pyx_t_7) { + } else { + __pyx_t_6 = __pyx_t_7; + goto __pyx_L7_bool_binop_done; + } + __pyx_L8_next_or:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":813 + * + * if ((child.byteorder == c'>' and little_endian) or + * (child.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<< + * raise ValueError(u"Non-native byte order not supported") + * # One could encode it in the format string and have Cython + */ + __pyx_t_7 = ((__pyx_v_child->byteorder == '<') != 0); + if (__pyx_t_7) { + } else { + __pyx_t_6 = __pyx_t_7; + goto __pyx_L7_bool_binop_done; + } + __pyx_t_7 = ((!(__pyx_v_little_endian != 0)) != 0); + __pyx_t_6 = __pyx_t_7; + __pyx_L7_bool_binop_done:; + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":812 + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + if (unlikely(__pyx_t_6)) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":814 + * if ((child.byteorder == c'>' and little_endian) or + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * # One could encode it in the format string and have Cython + * # complain instead, BUT: < and > in format strings also imply + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 814, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 814, __pyx_L1_error) + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":812 + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":824 + * + * # Output padding bytes + * while offset[0] < new_offset: # <<<<<<<<<<<<<< + * f[0] = 120 # "x"; pad byte + * f += 1 + */ + while (1) { + __pyx_t_3 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 824, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_t_3, 
__pyx_v_new_offset, Py_LT); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 824, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 824, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (!__pyx_t_6) break; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":825 + * # Output padding bytes + * while offset[0] < new_offset: + * f[0] = 120 # "x"; pad byte # <<<<<<<<<<<<<< + * f += 1 + * offset[0] += 1 + */ + (__pyx_v_f[0]) = 0x78; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":826 + * while offset[0] < new_offset: + * f[0] = 120 # "x"; pad byte + * f += 1 # <<<<<<<<<<<<<< + * offset[0] += 1 + * + */ + __pyx_v_f = (__pyx_v_f + 1); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":827 + * f[0] = 120 # "x"; pad byte + * f += 1 + * offset[0] += 1 # <<<<<<<<<<<<<< + * + * offset[0] += child.itemsize + */ + __pyx_t_8 = 0; + (__pyx_v_offset[__pyx_t_8]) = ((__pyx_v_offset[__pyx_t_8]) + 1); + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":829 + * offset[0] += 1 + * + * offset[0] += child.itemsize # <<<<<<<<<<<<<< + * + * if not PyDataType_HASFIELDS(child): + */ + __pyx_t_8 = 0; + (__pyx_v_offset[__pyx_t_8]) = ((__pyx_v_offset[__pyx_t_8]) + __pyx_v_child->elsize); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":831 + * offset[0] += child.itemsize + * + * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<< + * t = child.type_num + * if end - f < 5: + */ + __pyx_t_6 = ((!(PyDataType_HASFIELDS(__pyx_v_child) != 0)) != 0); + if (__pyx_t_6) { + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":832 + * + * if not PyDataType_HASFIELDS(child): + * t = child.type_num # <<<<<<<<<<<<<< + * if end - f < 5: + * raise RuntimeError(u"Format string allocated too short.") + */ + __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_child->type_num); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 832, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_XDECREF_SET(__pyx_v_t, __pyx_t_4); + __pyx_t_4 = 0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":833 + * if not PyDataType_HASFIELDS(child): + * t = child.type_num + * if end - f < 5: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short.") + * + */ + __pyx_t_6 = (((__pyx_v_end - __pyx_v_f) < 5) != 0); + if (unlikely(__pyx_t_6)) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":834 + * t = child.type_num + * if end - f < 5: + * raise RuntimeError(u"Format string allocated too short.") # <<<<<<<<<<<<<< + * + * # Until ticket #99 is fixed, use integers to avoid warnings + */ + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 834, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_Raise(__pyx_t_4, 0, 0, 0); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __PYX_ERR(1, 834, __pyx_L1_error) + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":833 + * if not PyDataType_HASFIELDS(child): + * t = child.type_num + * if end - f < 5: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short.") + * + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":837 + * + * # Until ticket #99 is fixed, use integers to avoid warnings + * if t == NPY_BYTE: f[0] = 98 #"b" # <<<<<<<<<<<<<< + * elif t 
== NPY_UBYTE: f[0] = 66 #"B" + * elif t == NPY_SHORT: f[0] = 104 #"h" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_BYTE); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 837, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 837, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 837, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 98; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":838 + * # Until ticket #99 is fixed, use integers to avoid warnings + * if t == NPY_BYTE: f[0] = 98 #"b" + * elif t == NPY_UBYTE: f[0] = 66 #"B" # <<<<<<<<<<<<<< + * elif t == NPY_SHORT: f[0] = 104 #"h" + * elif t == NPY_USHORT: f[0] = 72 #"H" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_UBYTE); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 838, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 838, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 838, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 66; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":839 + * if t == NPY_BYTE: f[0] = 98 #"b" + * elif t == NPY_UBYTE: f[0] = 66 #"B" + * elif t == NPY_SHORT: f[0] = 104 #"h" # <<<<<<<<<<<<<< + * elif t == NPY_USHORT: f[0] = 72 #"H" + * elif t == NPY_INT: f[0] = 105 #"i" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_SHORT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 839, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 839, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 839, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x68; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":840 + * elif t == NPY_UBYTE: f[0] = 66 #"B" + * elif t == NPY_SHORT: f[0] = 104 #"h" + * elif t == NPY_USHORT: f[0] = 72 #"H" # <<<<<<<<<<<<<< + * elif t == NPY_INT: f[0] = 105 #"i" + * elif t == NPY_UINT: f[0] = 73 #"I" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_USHORT); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 840, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 840, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 840, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 72; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":841 + * elif t == NPY_SHORT: f[0] = 104 #"h" + * elif t == NPY_USHORT: f[0] = 72 #"H" + * elif t == NPY_INT: f[0] = 105 #"i" # <<<<<<<<<<<<<< + * elif t == NPY_UINT: f[0] = 73 #"I" + * elif t == NPY_LONG: f[0] = 108 #"l" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_INT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 841, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 841, __pyx_L1_error) + 
__Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 841, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x69; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":842 + * elif t == NPY_USHORT: f[0] = 72 #"H" + * elif t == NPY_INT: f[0] = 105 #"i" + * elif t == NPY_UINT: f[0] = 73 #"I" # <<<<<<<<<<<<<< + * elif t == NPY_LONG: f[0] = 108 #"l" + * elif t == NPY_ULONG: f[0] = 76 #"L" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_UINT); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 842, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 842, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 842, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 73; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":843 + * elif t == NPY_INT: f[0] = 105 #"i" + * elif t == NPY_UINT: f[0] = 73 #"I" + * elif t == NPY_LONG: f[0] = 108 #"l" # <<<<<<<<<<<<<< + * elif t == NPY_ULONG: f[0] = 76 #"L" + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONG); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 843, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 843, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 843, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if 
(__pyx_t_6) { + (__pyx_v_f[0]) = 0x6C; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":844 + * elif t == NPY_UINT: f[0] = 73 #"I" + * elif t == NPY_LONG: f[0] = 108 #"l" + * elif t == NPY_ULONG: f[0] = 76 #"L" # <<<<<<<<<<<<<< + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_ULONG); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 844, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 844, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 844, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 76; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":845 + * elif t == NPY_LONG: f[0] = 108 #"l" + * elif t == NPY_ULONG: f[0] = 76 #"L" + * elif t == NPY_LONGLONG: f[0] = 113 #"q" # <<<<<<<<<<<<<< + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + * elif t == NPY_FLOAT: f[0] = 102 #"f" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONGLONG); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 845, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 845, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 845, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x71; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":846 + * elif t 
== NPY_ULONG: f[0] = 76 #"L" + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" # <<<<<<<<<<<<<< + * elif t == NPY_FLOAT: f[0] = 102 #"f" + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_ULONGLONG); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 846, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 846, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 846, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 81; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":847 + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + * elif t == NPY_FLOAT: f[0] = 102 #"f" # <<<<<<<<<<<<<< + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_FLOAT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 847, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 847, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 847, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x66; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":848 + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + * elif t == NPY_FLOAT: f[0] = 102 #"f" + * elif t == NPY_DOUBLE: f[0] = 100 #"d" # <<<<<<<<<<<<<< + * elif t == NPY_LONGDOUBLE: f[0] = 103 
#"g" + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_DOUBLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 848, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 848, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 848, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x64; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":849 + * elif t == NPY_FLOAT: f[0] = 102 #"f" + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" # <<<<<<<<<<<<<< + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONGDOUBLE); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 849, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 849, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 849, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x67; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":850 + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf # <<<<<<<<<<<<<< + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + */ 
+ __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CFLOAT); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 850, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 850, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 850, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 90; + (__pyx_v_f[1]) = 0x66; + __pyx_v_f = (__pyx_v_f + 1); + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":851 + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd # <<<<<<<<<<<<<< + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + * elif t == NPY_OBJECT: f[0] = 79 #"O" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CDOUBLE); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 851, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 851, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 851, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 90; + (__pyx_v_f[1]) = 0x64; + __pyx_v_f = (__pyx_v_f + 1); + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":852 + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg # <<<<<<<<<<<<<< + * elif t == NPY_OBJECT: f[0] = 
79 #"O" + * else: + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CLONGDOUBLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 852, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 852, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 852, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 90; + (__pyx_v_f[1]) = 0x67; + __pyx_v_f = (__pyx_v_f + 1); + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":853 + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + * elif t == NPY_OBJECT: f[0] = 79 #"O" # <<<<<<<<<<<<<< + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_OBJECT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 853, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 853, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 853, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (likely(__pyx_t_6)) { + (__pyx_v_f[0]) = 79; + goto __pyx_L15; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":855 + * elif t == NPY_OBJECT: f[0] = 79 #"O" + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<< + * f += 1 + * else: + */ + /*else*/ { + __pyx_t_3 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_v_t); if (unlikely(!__pyx_t_3)) 
__PYX_ERR(1, 855, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 855, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_Raise(__pyx_t_4, 0, 0, 0); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __PYX_ERR(1, 855, __pyx_L1_error) + } + __pyx_L15:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":856 + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + * f += 1 # <<<<<<<<<<<<<< + * else: + * # Cython ignores struct boundary information ("T{...}"), + */ + __pyx_v_f = (__pyx_v_f + 1); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":831 + * offset[0] += child.itemsize + * + * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<< + * t = child.type_num + * if end - f < 5: + */ + goto __pyx_L13; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":860 + * # Cython ignores struct boundary information ("T{...}"), + * # so don't output it + * f = _util_dtypestring(child, f, end, offset) # <<<<<<<<<<<<<< + * return f + * + */ + /*else*/ { + __pyx_t_9 = __pyx_f_5numpy__util_dtypestring(__pyx_v_child, __pyx_v_f, __pyx_v_end, __pyx_v_offset); if (unlikely(__pyx_t_9 == ((char *)NULL))) __PYX_ERR(1, 860, __pyx_L1_error) + __pyx_v_f = __pyx_t_9; + } + __pyx_L13:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":805 + * cdef tuple fields + * + * for childname in descr.names: # <<<<<<<<<<<<<< + * fields = descr.fields[childname] + * child, new_offset = fields + */ + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":861 + * # so don't output it + * f = 
_util_dtypestring(child, f, end, offset) + * return f # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = __pyx_v_f; + goto __pyx_L0; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":796 + * return () + * + * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<< + * # Recursive utility function used in __getbuffer__ to get format + * # string. The new location in the format string is returned. + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("numpy._util_dtypestring", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_child); + __Pyx_XDECREF(__pyx_v_fields); + __Pyx_XDECREF(__pyx_v_childname); + __Pyx_XDECREF(__pyx_v_new_offset); + __Pyx_XDECREF(__pyx_v_t); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":977 + * + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * cdef PyObject* baseptr + * if base is None: + */ + +static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) { + PyObject *__pyx_v_baseptr; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + __Pyx_RefNannySetupContext("set_array_base", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":979 + * cdef inline void set_array_base(ndarray arr, object base): + * cdef PyObject* baseptr + * if base is None: # <<<<<<<<<<<<<< + * baseptr = NULL + * else: + */ + __pyx_t_1 = (__pyx_v_base == Py_None); + __pyx_t_2 = (__pyx_t_1 != 0); + if (__pyx_t_2) { + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":980 + * cdef PyObject* baseptr + * if base is None: + * baseptr = NULL # <<<<<<<<<<<<<< + * else: + * Py_INCREF(base) # important to do this before decref below! + */ + __pyx_v_baseptr = NULL; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":979 + * cdef inline void set_array_base(ndarray arr, object base): + * cdef PyObject* baseptr + * if base is None: # <<<<<<<<<<<<<< + * baseptr = NULL + * else: + */ + goto __pyx_L3; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":982 + * baseptr = NULL + * else: + * Py_INCREF(base) # important to do this before decref below! # <<<<<<<<<<<<<< + * baseptr = base + * Py_XDECREF(arr.base) + */ + /*else*/ { + Py_INCREF(__pyx_v_base); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":983 + * else: + * Py_INCREF(base) # important to do this before decref below! + * baseptr = base # <<<<<<<<<<<<<< + * Py_XDECREF(arr.base) + * arr.base = baseptr + */ + __pyx_v_baseptr = ((PyObject *)__pyx_v_base); + } + __pyx_L3:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":984 + * Py_INCREF(base) # important to do this before decref below! 
+ * baseptr = base + * Py_XDECREF(arr.base) # <<<<<<<<<<<<<< + * arr.base = baseptr + * + */ + Py_XDECREF(__pyx_v_arr->base); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":985 + * baseptr = base + * Py_XDECREF(arr.base) + * arr.base = baseptr # <<<<<<<<<<<<<< + * + * cdef inline object get_array_base(ndarray arr): + */ + __pyx_v_arr->base = __pyx_v_baseptr; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":977 + * + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * cdef PyObject* baseptr + * if base is None: + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":987 + * arr.base = baseptr + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * if arr.base is NULL: + * return None + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("get_array_base", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":988 + * + * cdef inline object get_array_base(ndarray arr): + * if arr.base is NULL: # <<<<<<<<<<<<<< + * return None + * else: + */ + __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0); + if (__pyx_t_1) { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":989 + * cdef inline object get_array_base(ndarray arr): + * if arr.base is NULL: + * return None # <<<<<<<<<<<<<< + * else: + * return arr.base + */ + __Pyx_XDECREF(__pyx_r); + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":988 + * + * cdef inline object get_array_base(ndarray arr): + * if arr.base is NULL: # <<<<<<<<<<<<<< + * return None + * else: + */ + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":991 + * return None + * else: + * return arr.base # <<<<<<<<<<<<<< + * + * + */ + /*else*/ { + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject *)__pyx_v_arr->base)); + __pyx_r = ((PyObject *)__pyx_v_arr->base); + goto __pyx_L0; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":987 + * arr.base = baseptr + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * if arr.base is NULL: + * return None + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":996 + * # Versions of the import_* functions which are more suitable for + * # Cython code. + * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * _import_array() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("import_array", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":997 + * # Cython code. 
+ * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * _import_array() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":998 + * cdef inline int import_array() except -1: + * try: + * _import_array() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") + */ + __pyx_t_4 = _import_array(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 998, __pyx_L3_error) + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":997 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * _import_array() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":999 + * try: + * _import_array() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.multiarray failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 999, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1000 + * _import_array() + * except Exception: + * raise 
ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1000, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 1000, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":997 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * _import_array() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":996 + * # Versions of the import_* functions which are more suitable for + * # Cython code. 
+ * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * _import_array() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1002 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("import_umath", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1003 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1004 + * cdef inline int import_umath() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 1004, __pyx_L3_error) + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1003 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1005 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 1005, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1006 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_ufunc() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1006, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 1006, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1003 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + 
__Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1002 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1008 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("import_ufunc", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1009 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* 
"../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1010 + * cdef inline int import_ufunc() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 1010, __pyx_L3_error) + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1009 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1011 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 1011, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1012 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1012, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 1012, __pyx_L5_except_error) + 
} + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1009 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1008 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "string.from_py":13 + * + * @cname("__pyx_convert_string_from_py_std__in_string") + * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: # <<<<<<<<<<<<<< + * cdef Py_ssize_t length + * cdef const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + */ + +static std::string __pyx_convert_string_from_py_std__in_string(PyObject *__pyx_v_o) { + Py_ssize_t __pyx_v_length; + char const *__pyx_v_data; + std::string __pyx_r; + __Pyx_RefNannyDeclarations + char const *__pyx_t_1; + __Pyx_RefNannySetupContext("__pyx_convert_string_from_py_std__in_string", 0); + + /* "string.from_py":15 + * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: + * cdef Py_ssize_t length + * cdef const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) # <<<<<<<<<<<<<< + * return string(data, length) + * + */ + __pyx_t_1 
= __Pyx_PyObject_AsStringAndSize(__pyx_v_o, (&__pyx_v_length)); if (unlikely(__pyx_t_1 == ((char const *)NULL))) __PYX_ERR(2, 15, __pyx_L1_error) + __pyx_v_data = __pyx_t_1; + + /* "string.from_py":16 + * cdef Py_ssize_t length + * cdef const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + * return string(data, length) # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = std::string(__pyx_v_data, __pyx_v_length); + goto __pyx_L0; + + /* "string.from_py":13 + * + * @cname("__pyx_convert_string_from_py_std__in_string") + * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: # <<<<<<<<<<<<<< + * cdef Py_ssize_t length + * cdef const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_AddTraceback("string.from_py.__pyx_convert_string_from_py_std__in_string", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_pretend_to_initialize(&__pyx_r); + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyMethodDef __pyx_methods[] = { + {0, 0, 0, 0} +}; + +#if PY_MAJOR_VERSION >= 3 +#if CYTHON_PEP489_MULTI_PHASE_INIT +static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); /*proto*/ +static int __pyx_pymod_exec_mesh_core_cython(PyObject* module); /*proto*/ +static PyModuleDef_Slot __pyx_moduledef_slots[] = { + {Py_mod_create, (void*)__pyx_pymod_create}, + {Py_mod_exec, (void*)__pyx_pymod_exec_mesh_core_cython}, + {0, NULL} +}; +#endif + +static struct PyModuleDef __pyx_moduledef = { + PyModuleDef_HEAD_INIT, + "mesh_core_cython", + 0, /* m_doc */ + #if CYTHON_PEP489_MULTI_PHASE_INIT + 0, /* m_size */ + #else + -1, /* m_size */ + #endif + __pyx_methods /* m_methods */, + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_moduledef_slots, /* m_slots */ + #else + NULL, /* m_reload */ + #endif + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + 
{&__pyx_kp_u_Format_string_allocated_too_shor, __pyx_k_Format_string_allocated_too_shor, sizeof(__pyx_k_Format_string_allocated_too_shor), 0, 1, 0, 0}, + {&__pyx_kp_u_Format_string_allocated_too_shor_2, __pyx_k_Format_string_allocated_too_shor_2, sizeof(__pyx_k_Format_string_allocated_too_shor_2), 0, 1, 0, 0}, + {&__pyx_n_s_ImportError, __pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1}, + {&__pyx_kp_u_Non_native_byte_order_not_suppor, __pyx_k_Non_native_byte_order_not_suppor, sizeof(__pyx_k_Non_native_byte_order_not_suppor), 0, 1, 0, 0}, + {&__pyx_n_s_RuntimeError, __pyx_k_RuntimeError, sizeof(__pyx_k_RuntimeError), 0, 0, 1, 1}, + {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1}, + {&__pyx_n_s_barycentric_weight, __pyx_k_barycentric_weight, sizeof(__pyx_k_barycentric_weight), 0, 0, 1, 1}, + {&__pyx_n_s_c, __pyx_k_c, sizeof(__pyx_k_c), 0, 0, 1, 1}, + {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1}, + {&__pyx_n_s_colors, __pyx_k_colors, sizeof(__pyx_k_colors), 0, 0, 1, 1}, + {&__pyx_n_s_depth_buffer, __pyx_k_depth_buffer, sizeof(__pyx_k_depth_buffer), 0, 0, 1, 1}, + {&__pyx_n_s_filename, __pyx_k_filename, sizeof(__pyx_k_filename), 0, 0, 1, 1}, + {&__pyx_n_s_get_normal_core, __pyx_k_get_normal_core, sizeof(__pyx_k_get_normal_core), 0, 0, 1, 1}, + {&__pyx_n_s_h, __pyx_k_h, sizeof(__pyx_k_h), 0, 0, 1, 1}, + {&__pyx_n_s_image, __pyx_k_image, sizeof(__pyx_k_image), 0, 0, 1, 1}, + {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, + {&__pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_k_insightface_thirdparty_face3d_me, sizeof(__pyx_k_insightface_thirdparty_face3d_me), 0, 0, 1, 0}, + {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_s_mapping_type, __pyx_k_mapping_type, sizeof(__pyx_k_mapping_type), 0, 0, 1, 1}, + {&__pyx_n_s_mesh_core_cython, __pyx_k_mesh_core_cython, sizeof(__pyx_k_mesh_core_cython), 0, 0, 1, 1}, + 
{&__pyx_n_s_mtl_name, __pyx_k_mtl_name, sizeof(__pyx_k_mtl_name), 0, 0, 1, 1}, + {&__pyx_kp_u_ndarray_is_not_C_contiguous, __pyx_k_ndarray_is_not_C_contiguous, sizeof(__pyx_k_ndarray_is_not_C_contiguous), 0, 1, 0, 0}, + {&__pyx_kp_u_ndarray_is_not_Fortran_contiguou, __pyx_k_ndarray_is_not_Fortran_contiguou, sizeof(__pyx_k_ndarray_is_not_Fortran_contiguou), 0, 1, 0, 0}, + {&__pyx_n_s_normal, __pyx_k_normal, sizeof(__pyx_k_normal), 0, 0, 1, 1}, + {&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1}, + {&__pyx_n_s_ntexver, __pyx_k_ntexver, sizeof(__pyx_k_ntexver), 0, 0, 1, 1}, + {&__pyx_n_s_ntri, __pyx_k_ntri, sizeof(__pyx_k_ntri), 0, 0, 1, 1}, + {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1}, + {&__pyx_kp_s_numpy_core_multiarray_failed_to, __pyx_k_numpy_core_multiarray_failed_to, sizeof(__pyx_k_numpy_core_multiarray_failed_to), 0, 0, 1, 0}, + {&__pyx_kp_s_numpy_core_umath_failed_to_impor, __pyx_k_numpy_core_umath_failed_to_impor, sizeof(__pyx_k_numpy_core_umath_failed_to_impor), 0, 0, 1, 0}, + {&__pyx_n_s_nver, __pyx_k_nver, sizeof(__pyx_k_nver), 0, 0, 1, 1}, + {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1}, + {&__pyx_n_s_rasterize_triangles_core, __pyx_k_rasterize_triangles_core, sizeof(__pyx_k_rasterize_triangles_core), 0, 0, 1, 1}, + {&__pyx_n_s_render_colors_core, __pyx_k_render_colors_core, sizeof(__pyx_k_render_colors_core), 0, 0, 1, 1}, + {&__pyx_n_s_render_texture_core, __pyx_k_render_texture_core, sizeof(__pyx_k_render_texture_core), 0, 0, 1, 1}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {&__pyx_n_s_tex_c, __pyx_k_tex_c, sizeof(__pyx_k_tex_c), 0, 0, 1, 1}, + {&__pyx_n_s_tex_coords, __pyx_k_tex_coords, sizeof(__pyx_k_tex_coords), 0, 0, 1, 1}, + {&__pyx_n_s_tex_h, __pyx_k_tex_h, sizeof(__pyx_k_tex_h), 0, 0, 1, 1}, + {&__pyx_n_s_tex_nver, __pyx_k_tex_nver, sizeof(__pyx_k_tex_nver), 0, 0, 1, 1}, + {&__pyx_n_s_tex_triangles, __pyx_k_tex_triangles, sizeof(__pyx_k_tex_triangles), 0, 0, 
1, 1}, + {&__pyx_n_s_tex_w, __pyx_k_tex_w, sizeof(__pyx_k_tex_w), 0, 0, 1, 1}, + {&__pyx_n_s_texture, __pyx_k_texture, sizeof(__pyx_k_texture), 0, 0, 1, 1}, + {&__pyx_n_s_tri_normal, __pyx_k_tri_normal, sizeof(__pyx_k_tri_normal), 0, 0, 1, 1}, + {&__pyx_n_s_triangle_buffer, __pyx_k_triangle_buffer, sizeof(__pyx_k_triangle_buffer), 0, 0, 1, 1}, + {&__pyx_n_s_triangles, __pyx_k_triangles, sizeof(__pyx_k_triangles), 0, 0, 1, 1}, + {&__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_k_unknown_dtype_code_in_numpy_pxd, sizeof(__pyx_k_unknown_dtype_code_in_numpy_pxd), 0, 1, 0, 0}, + {&__pyx_n_s_uv_coords, __pyx_k_uv_coords, sizeof(__pyx_k_uv_coords), 0, 0, 1, 1}, + {&__pyx_n_s_vertices, __pyx_k_vertices, sizeof(__pyx_k_vertices), 0, 0, 1, 1}, + {&__pyx_n_s_w, __pyx_k_w, sizeof(__pyx_k_w), 0, 0, 1, 1}, + {&__pyx_n_s_write_obj_with_colors_texture_co, __pyx_k_write_obj_with_colors_texture_co, sizeof(__pyx_k_write_obj_with_colors_texture_co), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static int __Pyx_InitCachedBuiltins(void) { + __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(1, 229, __pyx_L1_error) + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(1, 242, __pyx_L1_error) + __pyx_builtin_RuntimeError = __Pyx_GetBuiltinName(__pyx_n_s_RuntimeError); if (!__pyx_builtin_RuntimeError) __PYX_ERR(1, 810, __pyx_L1_error) + __pyx_builtin_ImportError = __Pyx_GetBuiltinName(__pyx_n_s_ImportError); if (!__pyx_builtin_ImportError) __PYX_ERR(1, 1000, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +static int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":229 + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, 
NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") # <<<<<<<<<<<<<< + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + */ + __pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_C_contiguous); if (unlikely(!__pyx_tuple_)) __PYX_ERR(1, 229, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple_); + __Pyx_GIVEREF(__pyx_tuple_); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":233 + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") # <<<<<<<<<<<<<< + * + * info.buf = PyArray_DATA(self) + */ + __pyx_tuple__2 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_Fortran_contiguou); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(1, 233, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__2); + __Pyx_GIVEREF(__pyx_tuple__2); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":263 + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" + */ + __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(1, 263, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__3); + __Pyx_GIVEREF(__pyx_tuple__3); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":810 + * + * if (end - f) - (new_offset - offset[0]) < 15: + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") # <<<<<<<<<<<<<< + * + * if ((child.byteorder == c'>' and little_endian) or + */ + __pyx_tuple__4 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(1, 810, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__4); + __Pyx_GIVEREF(__pyx_tuple__4); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":814 + * if ((child.byteorder == c'>' and little_endian) or + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * # One could encode it in the format string and have Cython + * # complain instead, BUT: < and > in format strings also imply + */ + __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(1, 814, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__5); + __Pyx_GIVEREF(__pyx_tuple__5); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":834 + * t = child.type_num + * if end - f < 5: + * raise RuntimeError(u"Format string allocated too short.") # <<<<<<<<<<<<<< + * + * # Until ticket #99 is fixed, use integers to avoid warnings + */ + __pyx_tuple__6 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor_2); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(1, 834, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__6); + __Pyx_GIVEREF(__pyx_tuple__6); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1000 + * _import_array() + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_tuple__7 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_multiarray_failed_to); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(1, 1000, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__7); + __Pyx_GIVEREF(__pyx_tuple__7); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1006 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + 
* + * cdef inline int import_ufunc() except -1: + */ + __pyx_tuple__8 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__8)) __PYX_ERR(1, 1006, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__8); + __Pyx_GIVEREF(__pyx_tuple__8); + + /* "../../../../jack_ssd/home/jack/anaconda3/lib/python3.6/site-packages/Cython/Includes/numpy/__init__.pxd":1012 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + */ + __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(1, 1012, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__9); + __Pyx_GIVEREF(__pyx_tuple__9); + + /* "mesh_core_cython.pyx":40 + * int nver, int ntri, int ntexver) + * + * def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__10 = PyTuple_Pack(4, __pyx_n_s_normal, __pyx_n_s_tri_normal, __pyx_n_s_triangles, __pyx_n_s_ntri); if (unlikely(!__pyx_tuple__10)) __PYX_ERR(0, 40, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__10); + __Pyx_GIVEREF(__pyx_tuple__10); + __pyx_codeobj__11 = (PyObject*)__Pyx_PyCode_New(4, 0, 4, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__10, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_get_normal_core, 40, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__11)) __PYX_ERR(0, 40, __pyx_L1_error) + + /* "mesh_core_cython.pyx":49 + * ntri) + * + * def rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__12 = PyTuple_Pack(9, __pyx_n_s_vertices, __pyx_n_s_triangles, __pyx_n_s_depth_buffer, __pyx_n_s_triangle_buffer, 
__pyx_n_s_barycentric_weight, __pyx_n_s_nver, __pyx_n_s_ntri, __pyx_n_s_h, __pyx_n_s_w); if (unlikely(!__pyx_tuple__12)) __PYX_ERR(0, 49, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__12); + __Pyx_GIVEREF(__pyx_tuple__12); + __pyx_codeobj__13 = (PyObject*)__Pyx_PyCode_New(9, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__12, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_rasterize_triangles_core, 49, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__13)) __PYX_ERR(0, 49, __pyx_L1_error) + + /* "mesh_core_cython.pyx":64 + * h, w) + * + * def render_colors_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__14 = PyTuple_Pack(10, __pyx_n_s_image, __pyx_n_s_vertices, __pyx_n_s_triangles, __pyx_n_s_colors, __pyx_n_s_depth_buffer, __pyx_n_s_nver, __pyx_n_s_ntri, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_c); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(0, 64, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__14); + __Pyx_GIVEREF(__pyx_tuple__14); + __pyx_codeobj__15 = (PyObject*)__Pyx_PyCode_New(10, 0, 10, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__14, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_render_colors_core, 64, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__15)) __PYX_ERR(0, 64, __pyx_L1_error) + + /* "mesh_core_cython.pyx":79 + * h, w, c) + * + * def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__16 = PyTuple_Pack(17, __pyx_n_s_image, __pyx_n_s_vertices, __pyx_n_s_triangles, __pyx_n_s_texture, __pyx_n_s_tex_coords, 
__pyx_n_s_tex_triangles, __pyx_n_s_depth_buffer, __pyx_n_s_nver, __pyx_n_s_tex_nver, __pyx_n_s_ntri, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_c, __pyx_n_s_tex_h, __pyx_n_s_tex_w, __pyx_n_s_tex_c, __pyx_n_s_mapping_type); if (unlikely(!__pyx_tuple__16)) __PYX_ERR(0, 79, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__16); + __Pyx_GIVEREF(__pyx_tuple__16); + __pyx_codeobj__17 = (PyObject*)__Pyx_PyCode_New(17, 0, 17, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__16, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_render_texture_core, 79, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__17)) __PYX_ERR(0, 79, __pyx_L1_error) + + /* "mesh_core_cython.pyx":100 + * mapping_type) + * + * def write_obj_with_colors_texture_core(string filename, string mtl_name, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__18 = PyTuple_Pack(9, __pyx_n_s_filename, __pyx_n_s_mtl_name, __pyx_n_s_vertices, __pyx_n_s_triangles, __pyx_n_s_colors, __pyx_n_s_uv_coords, __pyx_n_s_nver, __pyx_n_s_ntri, __pyx_n_s_ntexver); if (unlikely(!__pyx_tuple__18)) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__18); + __Pyx_GIVEREF(__pyx_tuple__18); + __pyx_codeobj__19 = (PyObject*)__Pyx_PyCode_New(9, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__18, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_write_obj_with_colors_texture_co, 100, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__19)) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + return 0; + __pyx_L1_error:; + return -1; +} + 
+static int __Pyx_modinit_global_init_code(void); /*proto*/ +static int __Pyx_modinit_variable_export_code(void); /*proto*/ +static int __Pyx_modinit_function_export_code(void); /*proto*/ +static int __Pyx_modinit_type_init_code(void); /*proto*/ +static int __Pyx_modinit_type_import_code(void); /*proto*/ +static int __Pyx_modinit_variable_import_code(void); /*proto*/ +static int __Pyx_modinit_function_import_code(void); /*proto*/ + +static int __Pyx_modinit_global_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0); + /*--- Global init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_variable_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0); + /*--- Variable export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0); + /*--- Function export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0); + /*--- Type init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0); + /*--- Type import code ---*/ + __pyx_ptype_7cpython_4type_type = __Pyx_ImportType(__Pyx_BUILTIN_MODULE_NAME, "type", + #if defined(PYPY_VERSION_NUM) && PYPY_VERSION_NUM < 0x050B0000 + sizeof(PyTypeObject), + #else + sizeof(PyHeapTypeObject), + #endif + 0); if (unlikely(!__pyx_ptype_7cpython_4type_type)) __PYX_ERR(3, 9, __pyx_L1_error) + __pyx_ptype_5numpy_dtype = __Pyx_ImportType("numpy", "dtype", sizeof(PyArray_Descr), 0); if (unlikely(!__pyx_ptype_5numpy_dtype)) __PYX_ERR(1, 
164, __pyx_L1_error) + __pyx_ptype_5numpy_flatiter = __Pyx_ImportType("numpy", "flatiter", sizeof(PyArrayIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_flatiter)) __PYX_ERR(1, 186, __pyx_L1_error) + __pyx_ptype_5numpy_broadcast = __Pyx_ImportType("numpy", "broadcast", sizeof(PyArrayMultiIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_broadcast)) __PYX_ERR(1, 190, __pyx_L1_error) + __pyx_ptype_5numpy_ndarray = __Pyx_ImportType("numpy", "ndarray", sizeof(PyArrayObject), 0); if (unlikely(!__pyx_ptype_5numpy_ndarray)) __PYX_ERR(1, 199, __pyx_L1_error) + __pyx_ptype_5numpy_ufunc = __Pyx_ImportType("numpy", "ufunc", sizeof(PyUFuncObject), 0); if (unlikely(!__pyx_ptype_5numpy_ufunc)) __PYX_ERR(1, 872, __pyx_L1_error) + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_modinit_variable_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0); + /*--- Variable import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0); + /*--- Function import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + + +#if PY_MAJOR_VERSION < 3 +#ifdef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC void +#else +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#endif +#else +#ifdef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC PyObject * +#else +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#endif +#endif +#ifndef CYTHON_SMALL_CODE +#if defined(__clang__) + #define CYTHON_SMALL_CODE +#elif defined(__GNUC__) + #define CYTHON_SMALL_CODE __attribute__((optimize("Os"))) +#else + #define CYTHON_SMALL_CODE +#endif +#endif + + +#if PY_MAJOR_VERSION < 3 +__Pyx_PyMODINIT_FUNC initmesh_core_cython(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC initmesh_core_cython(void) 
+#else +__Pyx_PyMODINIT_FUNC PyInit_mesh_core_cython(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC PyInit_mesh_core_cython(void) +#if CYTHON_PEP489_MULTI_PHASE_INIT +{ + return PyModuleDef_Init(&__pyx_moduledef); +} +static int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name) { + PyObject *value = PyObject_GetAttrString(spec, from_name); + int result = 0; + if (likely(value)) { + result = PyDict_SetItemString(moddict, to_name, value); + Py_DECREF(value); + } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + } else { + result = -1; + } + return result; +} +static PyObject* __pyx_pymod_create(PyObject *spec, CYTHON_UNUSED PyModuleDef *def) { + PyObject *module = NULL, *moddict, *modname; + if (__pyx_m) + return __Pyx_NewRef(__pyx_m); + modname = PyObject_GetAttrString(spec, "name"); + if (unlikely(!modname)) goto bad; + module = PyModule_NewObject(modname); + Py_DECREF(modname); + if (unlikely(!module)) goto bad; + moddict = PyModule_GetDict(module); + if (unlikely(!moddict)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "loader", "__loader__") < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "origin", "__file__") < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "parent", "__package__") < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "submodule_search_locations", "__path__") < 0)) goto bad; + return module; +bad: + Py_XDECREF(module); + return NULL; +} + + +static int __pyx_pymod_exec_mesh_core_cython(PyObject *__pyx_pyinit_module) +#endif +#endif +{ + PyObject *__pyx_t_1 = NULL; + int __pyx_t_2; + __Pyx_RefNannyDeclarations + #if CYTHON_PEP489_MULTI_PHASE_INIT + if (__pyx_m && __pyx_m == __pyx_pyinit_module) return 0; + #elif PY_MAJOR_VERSION >= 3 + if (__pyx_m) return __Pyx_NewRef(__pyx_m); + #endif + #if CYTHON_REFNANNY +__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); +if 
(!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); +} +#endif + __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit_mesh_core_cython(void)", 0); + if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pyx_CyFunction_USED + if (__pyx_CyFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Coroutine_USED + if (__pyx_Coroutine_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_AsyncGen_USED + if (__pyx_AsyncGen_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_StopAsyncIteration_USED + if (__pyx_StopAsyncIteration_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + #ifdef WITH_THREAD /* Python build with threading support? 
*/ + PyEval_InitThreads(); + #endif + #endif + /*--- Module creation code ---*/ + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_m = __pyx_pyinit_module; + Py_INCREF(__pyx_m); + #else + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4("mesh_core_cython", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_cython_runtime = PyImport_AddModule((char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error) + #if CYTHON_COMPILING_IN_PYPY + Py_INCREF(__pyx_b); + #endif + if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + /*--- Initialize various global constants etc. 
---*/ + if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + if (__pyx_module_is_main_mesh_core_cython) { + if (PyObject_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error) + if (!PyDict_GetItemString(modules, "mesh_core_cython")) { + if (unlikely(PyDict_SetItemString(modules, "mesh_core_cython", __pyx_m) < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + } + } + #endif + /*--- Builtin init code ---*/ + if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Constants init code ---*/ + if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Global type/function init code ---*/ + (void)__Pyx_modinit_global_init_code(); + (void)__Pyx_modinit_variable_export_code(); + (void)__Pyx_modinit_function_export_code(); + (void)__Pyx_modinit_type_init_code(); + if (unlikely(__Pyx_modinit_type_import_code() != 0)) goto __pyx_L1_error; + (void)__Pyx_modinit_variable_import_code(); + (void)__Pyx_modinit_function_import_code(); + /*--- Execution code ---*/ + #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) + if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + + /* "mesh_core_cython.pyx":1 + * import numpy as np # <<<<<<<<<<<<<< + * cimport numpy as np + * from libcpp.string cimport string + */ + __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "mesh_core_cython.pyx":6 + * + * # use the Numpy-C-API from 
Cython + * np.import_array() # <<<<<<<<<<<<<< + * + * # cdefine the signature of our c function + */ + __pyx_t_2 = __pyx_f_5numpy_import_array(); if (unlikely(__pyx_t_2 == ((int)-1))) __PYX_ERR(0, 6, __pyx_L1_error) + + /* "mesh_core_cython.pyx":40 + * int nver, int ntri, int ntexver) + * + * def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_16mesh_core_cython_1get_normal_core, NULL, __pyx_n_s_mesh_core_cython); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 40, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_get_normal_core, __pyx_t_1) < 0) __PYX_ERR(0, 40, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "mesh_core_cython.pyx":49 + * ntri) + * + * def rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_16mesh_core_cython_3rasterize_triangles_core, NULL, __pyx_n_s_mesh_core_cython); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 49, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_rasterize_triangles_core, __pyx_t_1) < 0) __PYX_ERR(0, 49, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "mesh_core_cython.pyx":64 + * h, w) + * + * def render_colors_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_16mesh_core_cython_5render_colors_core, NULL, __pyx_n_s_mesh_core_cython); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 64, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_render_colors_core, __pyx_t_1) < 
0) __PYX_ERR(0, 64, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "mesh_core_cython.pyx":79 + * h, w, c) + * + * def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_16mesh_core_cython_7render_texture_core, NULL, __pyx_n_s_mesh_core_cython); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 79, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_render_texture_core, __pyx_t_1) < 0) __PYX_ERR(0, 79, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "mesh_core_cython.pyx":100 + * mapping_type) + * + * def write_obj_with_colors_texture_core(string filename, string mtl_name, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_16mesh_core_cython_9write_obj_with_colors_texture_core, NULL, __pyx_n_s_mesh_core_cython); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_write_obj_with_colors_texture_co, __pyx_t_1) < 0) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "mesh_core_cython.pyx":1 + * import numpy as np # <<<<<<<<<<<<<< + * cimport numpy as np + * from libcpp.string cimport string + */ + __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "string.from_py":13 + * + * @cname("__pyx_convert_string_from_py_std__in_string") + * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: # <<<<<<<<<<<<<< + * cdef Py_ssize_t length + * cdef 
const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + */ + + /*--- Wrapped vars code ---*/ + + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + if (__pyx_m) { + if (__pyx_d) { + __Pyx_AddTraceback("init mesh_core_cython", 0, __pyx_lineno, __pyx_filename); + } + Py_DECREF(__pyx_m); __pyx_m = 0; + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init mesh_core_cython"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if CYTHON_PEP489_MULTI_PHASE_INIT + return (__pyx_m != NULL) ? 0 : -1; + #elif PY_MAJOR_VERSION >= 3 + return __pyx_m; + #else + return; + #endif +} + +/* --- Runtime support code --- */ +/* Refnanny */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule((char *)modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, (char *)"RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif + +/* RaiseArgTupleInvalid */ +static void __Pyx_RaiseArgtupleInvalid( + const char* func_name, + int exact, + Py_ssize_t num_min, + Py_ssize_t num_max, + Py_ssize_t num_found) +{ + Py_ssize_t num_expected; + const char *more_or_less; + if (num_found < num_min) { + num_expected = num_min; + more_or_less = "at least"; + } else { + num_expected = num_max; + more_or_less = "at most"; + } + if (exact) { + more_or_less = "exactly"; + } + PyErr_Format(PyExc_TypeError, + "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)", + func_name, more_or_less, num_expected, + (num_expected == 1) ? 
"" : "s", num_found); +} + +/* RaiseDoubleKeywords */ +static void __Pyx_RaiseDoubleKeywordsError( + const char* func_name, + PyObject* kw_name) +{ + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION >= 3 + "%s() got multiple values for keyword argument '%U'", func_name, kw_name); + #else + "%s() got multiple values for keyword argument '%s'", func_name, + PyString_AsString(kw_name)); + #endif +} + +/* ParseKeywords */ +static int __Pyx_ParseOptionalKeywords( + PyObject *kwds, + PyObject **argnames[], + PyObject *kwds2, + PyObject *values[], + Py_ssize_t num_pos_args, + const char* function_name) +{ + PyObject *key = 0, *value = 0; + Py_ssize_t pos = 0; + PyObject*** name; + PyObject*** first_kw_arg = argnames + num_pos_args; + while (PyDict_Next(kwds, &pos, &key, &value)) { + name = first_kw_arg; + while (*name && (**name != key)) name++; + if (*name) { + values[name-argnames] = value; + continue; + } + name = first_kw_arg; + #if PY_MAJOR_VERSION < 3 + if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) { + while (*name) { + if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) + && _PyString_Eq(**name, key)) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + if ((**argname == key) || ( + (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key)) + && _PyString_Eq(**argname, key))) { + goto arg_passed_twice; + } + argname++; + } + } + } else + #endif + if (likely(PyUnicode_Check(key))) { + while (*name) { + int cmp = (**name == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 
1 : + #endif + PyUnicode_Compare(**name, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + int cmp = (**argname == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 : + #endif + PyUnicode_Compare(**argname, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) goto arg_passed_twice; + argname++; + } + } + } else + goto invalid_keyword_type; + if (kwds2) { + if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad; + } else { + goto invalid_keyword; + } + } + return 0; +arg_passed_twice: + __Pyx_RaiseDoubleKeywordsError(function_name, key); + goto bad; +invalid_keyword_type: + PyErr_Format(PyExc_TypeError, + "%.200s() keywords must be strings", function_name); + goto bad; +invalid_keyword: + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION < 3 + "%.200s() got an unexpected keyword argument '%.200s'", + function_name, PyString_AsString(key)); + #else + "%s() got an unexpected keyword argument '%U'", + function_name, key); + #endif +bad: + return -1; +} + +/* ArgTypeTest */ +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact) +{ + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + else if (exact) { + #if PY_MAJOR_VERSION == 2 + if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; + #endif + } + else { + if (likely(__Pyx_TypeCheck(obj, type))) return 1; + } + PyErr_Format(PyExc_TypeError, + "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", + name, type->tp_name, Py_TYPE(obj)->tp_name); + return 0; +} + +/* IsLittleEndian */ +static CYTHON_INLINE int __Pyx_Is_Little_Endian(void) +{ + union { + uint32_t u32; + uint8_t u8[4]; 
+ } S; + S.u32 = 0x01020304; + return S.u8[0] == 4; +} + +/* BufferFormatCheck */ +static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type) { + stack[0].field = &ctx->root; + stack[0].parent_offset = 0; + ctx->root.type = type; + ctx->root.name = "buffer dtype"; + ctx->root.offset = 0; + ctx->head = stack; + ctx->head->field = &ctx->root; + ctx->fmt_offset = 0; + ctx->head->parent_offset = 0; + ctx->new_packmode = '@'; + ctx->enc_packmode = '@'; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->is_complex = 0; + ctx->is_valid_array = 0; + ctx->struct_alignment = 0; + while (type->typegroup == 'S') { + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = 0; + type = type->fields->type; + } +} +static int __Pyx_BufFmt_ParseNumber(const char** ts) { + int count; + const char* t = *ts; + if (*t < '0' || *t > '9') { + return -1; + } else { + count = *t++ - '0'; + while (*t >= '0' && *t < '9') { + count *= 10; + count += *t++ - '0'; + } + } + *ts = t; + return count; +} +static int __Pyx_BufFmt_ExpectNumber(const char **ts) { + int number = __Pyx_BufFmt_ParseNumber(ts); + if (number == -1) + PyErr_Format(PyExc_ValueError,\ + "Does not understand character buffer dtype format string ('%c')", **ts); + return number; +} +static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) { + PyErr_Format(PyExc_ValueError, + "Unexpected format string character: '%c'", ch); +} +static const char* __Pyx_BufFmt_DescribeTypeChar(char ch, int is_complex) { + switch (ch) { + case 'c': return "'char'"; + case 'b': return "'signed char'"; + case 'B': return "'unsigned char'"; + case 'h': return "'short'"; + case 'H': return "'unsigned short'"; + case 'i': return "'int'"; + case 'I': return "'unsigned int'"; + case 'l': return "'long'"; + case 'L': return "'unsigned long'"; + case 'q': return "'long long'"; + case 'Q': return "'unsigned long long'"; + case 'f': return (is_complex ? 
"'complex float'" : "'float'"); + case 'd': return (is_complex ? "'complex double'" : "'double'"); + case 'g': return (is_complex ? "'complex long double'" : "'long double'"); + case 'T': return "a struct"; + case 'O': return "Python object"; + case 'P': return "a pointer"; + case 's': case 'p': return "a string"; + case 0: return "end"; + default: return "unparseable format string"; + } +} +static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return 2; + case 'i': case 'I': case 'l': case 'L': return 4; + case 'q': case 'Q': return 8; + case 'f': return (is_complex ? 8 : 4); + case 'd': return (is_complex ? 16 : 8); + case 'g': { + PyErr_SetString(PyExc_ValueError, "Python does not define a standard format string size for long double ('g').."); + return 0; + } + case 'O': case 'P': return sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) { + switch (ch) { + case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(short); + case 'i': case 'I': return sizeof(int); + case 'l': case 'L': return sizeof(long); + #ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(PY_LONG_LONG); + #endif + case 'f': return sizeof(float) * (is_complex ? 2 : 1); + case 'd': return sizeof(double) * (is_complex ? 2 : 1); + case 'g': return sizeof(long double) * (is_complex ? 
2 : 1); + case 'O': case 'P': return sizeof(void*); + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +typedef struct { char c; short x; } __Pyx_st_short; +typedef struct { char c; int x; } __Pyx_st_int; +typedef struct { char c; long x; } __Pyx_st_long; +typedef struct { char c; float x; } __Pyx_st_float; +typedef struct { char c; double x; } __Pyx_st_double; +typedef struct { char c; long double x; } __Pyx_st_longdouble; +typedef struct { char c; void *x; } __Pyx_st_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_st_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_st_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_st_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_st_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_st_float) - sizeof(float); + case 'd': return sizeof(__Pyx_st_double) - sizeof(double); + case 'g': return sizeof(__Pyx_st_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_st_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +/* These are for computing the padding at the end of the struct to align + on the first member of the struct. This will probably the same as above, + but we don't have any guarantees. 
+ */ +typedef struct { short x; char c; } __Pyx_pad_short; +typedef struct { int x; char c; } __Pyx_pad_int; +typedef struct { long x; char c; } __Pyx_pad_long; +typedef struct { float x; char c; } __Pyx_pad_float; +typedef struct { double x; char c; } __Pyx_pad_double; +typedef struct { long double x; char c; } __Pyx_pad_longdouble; +typedef struct { void *x; char c; } __Pyx_pad_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_pad_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_pad_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_pad_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_pad_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_pad_float) - sizeof(float); + case 'd': return sizeof(__Pyx_pad_double) - sizeof(double); + case 'g': return sizeof(__Pyx_pad_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_pad_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) { + switch (ch) { + case 'c': + return 'H'; + case 'b': case 'h': case 'i': + case 'l': case 'q': case 's': case 'p': + return 'I'; + case 'B': case 'H': case 'I': case 'L': case 'Q': + return 'U'; + case 'f': case 'd': case 'g': + return (is_complex ? 
'C' : 'R'); + case 'O': + return 'O'; + case 'P': + return 'P'; + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) { + if (ctx->head == NULL || ctx->head->field == &ctx->root) { + const char* expected; + const char* quote; + if (ctx->head == NULL) { + expected = "end"; + quote = ""; + } else { + expected = ctx->head->field->type->name; + quote = "'"; + } + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected %s%s%s but got %s", + quote, expected, quote, + __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex)); + } else { + __Pyx_StructField* field = ctx->head->field; + __Pyx_StructField* parent = (ctx->head - 1)->field; + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected '%s' but got %s in '%s.%s'", + field->type->name, __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex), + parent->type->name, field->name); + } +} +static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) { + char group; + size_t size, offset, arraysize = 1; + if (ctx->enc_type == 0) return 0; + if (ctx->head->field->type->arraysize[0]) { + int i, ndim = 0; + if (ctx->enc_type == 's' || ctx->enc_type == 'p') { + ctx->is_valid_array = ctx->head->field->type->ndim == 1; + ndim = 1; + if (ctx->enc_count != ctx->head->field->type->arraysize[0]) { + PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %zu", + ctx->head->field->type->arraysize[0], ctx->enc_count); + return -1; + } + } + if (!ctx->is_valid_array) { + PyErr_Format(PyExc_ValueError, "Expected %d dimensions, got %d", + ctx->head->field->type->ndim, ndim); + return -1; + } + for (i = 0; i < ctx->head->field->type->ndim; i++) { + arraysize *= ctx->head->field->type->arraysize[i]; + } + ctx->is_valid_array = 0; + ctx->enc_count = 1; + } + group = __Pyx_BufFmt_TypeCharToGroup(ctx->enc_type, ctx->is_complex); + do { + __Pyx_StructField* field = ctx->head->field; + __Pyx_TypeInfo* 
type = field->type; + if (ctx->enc_packmode == '@' || ctx->enc_packmode == '^') { + size = __Pyx_BufFmt_TypeCharToNativeSize(ctx->enc_type, ctx->is_complex); + } else { + size = __Pyx_BufFmt_TypeCharToStandardSize(ctx->enc_type, ctx->is_complex); + } + if (ctx->enc_packmode == '@') { + size_t align_at = __Pyx_BufFmt_TypeCharToAlignment(ctx->enc_type, ctx->is_complex); + size_t align_mod_offset; + if (align_at == 0) return -1; + align_mod_offset = ctx->fmt_offset % align_at; + if (align_mod_offset > 0) ctx->fmt_offset += align_at - align_mod_offset; + if (ctx->struct_alignment == 0) + ctx->struct_alignment = __Pyx_BufFmt_TypeCharToPadding(ctx->enc_type, + ctx->is_complex); + } + if (type->size != size || type->typegroup != group) { + if (type->typegroup == 'C' && type->fields != NULL) { + size_t parent_offset = ctx->head->parent_offset + field->offset; + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = parent_offset; + continue; + } + if ((type->typegroup == 'H' || group == 'H') && type->size == size) { + } else { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + } + offset = ctx->head->parent_offset + field->offset; + if (ctx->fmt_offset != offset) { + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch; next field is at offset %" CYTHON_FORMAT_SSIZE_T "d but %" CYTHON_FORMAT_SSIZE_T "d expected", + (Py_ssize_t)ctx->fmt_offset, (Py_ssize_t)offset); + return -1; + } + ctx->fmt_offset += size; + if (arraysize) + ctx->fmt_offset += (arraysize - 1) * size; + --ctx->enc_count; + while (1) { + if (field == &ctx->root) { + ctx->head = NULL; + if (ctx->enc_count != 0) { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + break; + } + ctx->head->field = ++field; + if (field->type == NULL) { + --ctx->head; + field = ctx->head->field; + continue; + } else if (field->type->typegroup == 'S') { + size_t parent_offset = ctx->head->parent_offset + field->offset; + if (field->type->fields->type == NULL) continue; + field = field->type->fields; 
+ ++ctx->head; + ctx->head->field = field; + ctx->head->parent_offset = parent_offset; + break; + } else { + break; + } + } + } while (ctx->enc_count); + ctx->enc_type = 0; + ctx->is_complex = 0; + return 0; +} +static PyObject * +__pyx_buffmt_parse_array(__Pyx_BufFmt_Context* ctx, const char** tsp) +{ + const char *ts = *tsp; + int i = 0, number; + int ndim = ctx->head->field->type->ndim; +; + ++ts; + if (ctx->new_count != 1) { + PyErr_SetString(PyExc_ValueError, + "Cannot handle repeated arrays in format string"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + while (*ts && *ts != ')') { + switch (*ts) { + case ' ': case '\f': case '\r': case '\n': case '\t': case '\v': continue; + default: break; + } + number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + if (i < ndim && (size_t) number != ctx->head->field->type->arraysize[i]) + return PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %d", + ctx->head->field->type->arraysize[i], number); + if (*ts != ',' && *ts != ')') + return PyErr_Format(PyExc_ValueError, + "Expected a comma in format string, got '%c'", *ts); + if (*ts == ',') ts++; + i++; + } + if (i != ndim) + return PyErr_Format(PyExc_ValueError, "Expected %d dimension(s), got %d", + ctx->head->field->type->ndim, i); + if (!*ts) { + PyErr_SetString(PyExc_ValueError, + "Unexpected end of format string, expected ')'"); + return NULL; + } + ctx->is_valid_array = 1; + ctx->new_count = 1; + *tsp = ++ts; + return Py_None; +} +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts) { + int got_Z = 0; + while (1) { + switch(*ts) { + case 0: + if (ctx->enc_type != 0 && ctx->head == NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + if (ctx->head != NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + return ts; + case ' ': + case '\r': + case '\n': + ++ts; + break; + case 
'<': + if (!__Pyx_Is_Little_Endian()) { + PyErr_SetString(PyExc_ValueError, "Little-endian buffer not supported on big-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '>': + case '!': + if (__Pyx_Is_Little_Endian()) { + PyErr_SetString(PyExc_ValueError, "Big-endian buffer not supported on little-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '=': + case '@': + case '^': + ctx->new_packmode = *ts++; + break; + case 'T': + { + const char* ts_after_sub; + size_t i, struct_count = ctx->new_count; + size_t struct_alignment = ctx->struct_alignment; + ctx->new_count = 1; + ++ts; + if (*ts != '{') { + PyErr_SetString(PyExc_ValueError, "Buffer acquisition: Expected '{' after 'T'"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + ctx->enc_count = 0; + ctx->struct_alignment = 0; + ++ts; + ts_after_sub = ts; + for (i = 0; i != struct_count; ++i) { + ts_after_sub = __Pyx_BufFmt_CheckString(ctx, ts); + if (!ts_after_sub) return NULL; + } + ts = ts_after_sub; + if (struct_alignment) ctx->struct_alignment = struct_alignment; + } + break; + case '}': + { + size_t alignment = ctx->struct_alignment; + ++ts; + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + if (alignment && ctx->fmt_offset % alignment) { + ctx->fmt_offset += alignment - (ctx->fmt_offset % alignment); + } + } + return ts; + case 'x': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->fmt_offset += ctx->new_count; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->enc_packmode = ctx->new_packmode; + ++ts; + break; + case 'Z': + got_Z = 1; + ++ts; + if (*ts != 'f' && *ts != 'd' && *ts != 'g') { + __Pyx_BufFmt_RaiseUnexpectedChar('Z'); + return NULL; + } + CYTHON_FALLTHROUGH; + case 'c': case 'b': case 'B': case 'h': case 'H': case 'i': case 'I': + case 'l': case 'L': case 'q': case 'Q': + case 'f': case 'd': case 
'g': + case 'O': case 'p': + if (ctx->enc_type == *ts && got_Z == ctx->is_complex && + ctx->enc_packmode == ctx->new_packmode) { + ctx->enc_count += ctx->new_count; + ctx->new_count = 1; + got_Z = 0; + ++ts; + break; + } + CYTHON_FALLTHROUGH; + case 's': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_count = ctx->new_count; + ctx->enc_packmode = ctx->new_packmode; + ctx->enc_type = *ts; + ctx->is_complex = got_Z; + ++ts; + ctx->new_count = 1; + got_Z = 0; + break; + case ':': + ++ts; + while(*ts != ':') ++ts; + ++ts; + break; + case '(': + if (!__pyx_buffmt_parse_array(ctx, &ts)) return NULL; + break; + default: + { + int number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + ctx->new_count = (size_t)number; + } + } + } +} + +/* BufferGetAndValidate */ + static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info) { + if (unlikely(info->buf == NULL)) return; + if (info->suboffsets == __Pyx_minusones) info->suboffsets = NULL; + __Pyx_ReleaseBuffer(info); +} +static void __Pyx_ZeroBuffer(Py_buffer* buf) { + buf->buf = NULL; + buf->obj = NULL; + buf->strides = __Pyx_zeros; + buf->shape = __Pyx_zeros; + buf->suboffsets = __Pyx_minusones; +} +static int __Pyx__GetBufferAndValidate( + Py_buffer* buf, PyObject* obj, __Pyx_TypeInfo* dtype, int flags, + int nd, int cast, __Pyx_BufFmt_StackElem* stack) +{ + buf->buf = NULL; + if (unlikely(__Pyx_GetBuffer(obj, buf, flags) == -1)) { + __Pyx_ZeroBuffer(buf); + return -1; + } + if (unlikely(buf->ndim != nd)) { + PyErr_Format(PyExc_ValueError, + "Buffer has wrong number of dimensions (expected %d, got %d)", + nd, buf->ndim); + goto fail; + } + if (!cast) { + __Pyx_BufFmt_Context ctx; + __Pyx_BufFmt_Init(&ctx, stack, dtype); + if (!__Pyx_BufFmt_CheckString(&ctx, buf->format)) goto fail; + } + if (unlikely((unsigned)buf->itemsize != dtype->size)) { + PyErr_Format(PyExc_ValueError, + "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "d byte%s) does not match size of '%s' (%" 
CYTHON_FORMAT_SSIZE_T "d byte%s)", + buf->itemsize, (buf->itemsize > 1) ? "s" : "", + dtype->name, (Py_ssize_t)dtype->size, (dtype->size > 1) ? "s" : ""); + goto fail; + } + if (buf->suboffsets == NULL) buf->suboffsets = __Pyx_minusones; + return 0; +fail:; + __Pyx_SafeReleaseBuffer(buf); + return -1; +} + +/* PyErrFetchRestore */ + #if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + tmp_type = tstate->curexc_type; + tmp_value = tstate->curexc_value; + tmp_tb = tstate->curexc_traceback; + tstate->curexc_type = type; + tstate->curexc_value = value; + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + *type = tstate->curexc_type; + *value = tstate->curexc_value; + *tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +} +#endif + +/* PyObjectGetAttrStr */ + #if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#endif + +/* GetBuiltinName */ + static PyObject *__Pyx_GetBuiltinName(PyObject *name) { + PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name); + if (unlikely(!result)) { + PyErr_Format(PyExc_NameError, +#if PY_MAJOR_VERSION >= 3 + "name '%U' is not defined", name); +#else + "name '%.200s' is not defined", PyString_AS_STRING(name)); +#endif + } + return result; +} + +/* PyObjectCall */ + #if CYTHON_COMPILING_IN_CPYTHON +static 
CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) { + PyObject *result; + ternaryfunc call = func->ob_type->tp_call; + if (unlikely(!call)) + return PyObject_Call(func, arg, kw); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = (*call)(func, arg, kw); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* RaiseException */ + #if PY_MAJOR_VERSION < 3 +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, + CYTHON_UNUSED PyObject *cause) { + __Pyx_PyThreadState_declare + Py_XINCREF(type); + if (!value || value == Py_None) + value = NULL; + else + Py_INCREF(value); + if (!tb || tb == Py_None) + tb = NULL; + else { + Py_INCREF(tb); + if (!PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto raise_error; + } + } + if (PyType_Check(type)) { +#if CYTHON_COMPILING_IN_PYPY + if (!value) { + Py_INCREF(Py_None); + value = Py_None; + } +#endif + PyErr_NormalizeException(&type, &value, &tb); + } else { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto raise_error; + } + value = type; + type = (PyObject*) Py_TYPE(type); + Py_INCREF(type); + if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto raise_error; + } + } + __Pyx_PyThreadState_assign + __Pyx_ErrRestore(type, value, tb); + return; +raise_error: + Py_XDECREF(value); + Py_XDECREF(type); + Py_XDECREF(tb); + return; +} +#else +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) { + PyObject* owned_instance = NULL; + if (tb == Py_None) { + tb = 0; + } else if (tb && 
!PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto bad; + } + if (value == Py_None) + value = 0; + if (PyExceptionInstance_Check(type)) { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto bad; + } + value = type; + type = (PyObject*) Py_TYPE(value); + } else if (PyExceptionClass_Check(type)) { + PyObject *instance_class = NULL; + if (value && PyExceptionInstance_Check(value)) { + instance_class = (PyObject*) Py_TYPE(value); + if (instance_class != type) { + int is_subclass = PyObject_IsSubclass(instance_class, type); + if (!is_subclass) { + instance_class = NULL; + } else if (unlikely(is_subclass == -1)) { + goto bad; + } else { + type = instance_class; + } + } + } + if (!instance_class) { + PyObject *args; + if (!value) + args = PyTuple_New(0); + else if (PyTuple_Check(value)) { + Py_INCREF(value); + args = value; + } else + args = PyTuple_Pack(1, value); + if (!args) + goto bad; + owned_instance = PyObject_Call(type, args, NULL); + Py_DECREF(args); + if (!owned_instance) + goto bad; + value = owned_instance; + if (!PyExceptionInstance_Check(value)) { + PyErr_Format(PyExc_TypeError, + "calling %R should have returned an instance of " + "BaseException, not %R", + type, Py_TYPE(value)); + goto bad; + } + } + } else { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto bad; + } + if (cause) { + PyObject *fixed_cause; + if (cause == Py_None) { + fixed_cause = NULL; + } else if (PyExceptionClass_Check(cause)) { + fixed_cause = PyObject_CallObject(cause, NULL); + if (fixed_cause == NULL) + goto bad; + } else if (PyExceptionInstance_Check(cause)) { + fixed_cause = cause; + Py_INCREF(fixed_cause); + } else { + PyErr_SetString(PyExc_TypeError, + "exception causes must derive from " + "BaseException"); + goto bad; + } + PyException_SetCause(value, fixed_cause); + } + PyErr_SetObject(type, value); 
+ if (tb) { +#if CYTHON_COMPILING_IN_PYPY + PyObject *tmp_type, *tmp_value, *tmp_tb; + PyErr_Fetch(&tmp_type, &tmp_value, &tmp_tb); + Py_INCREF(tb); + PyErr_Restore(tmp_type, tmp_value, tb); + Py_XDECREF(tmp_tb); +#else + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject* tmp_tb = tstate->curexc_traceback; + if (tb != tmp_tb) { + Py_INCREF(tb); + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_tb); + } +#endif + } +bad: + Py_XDECREF(owned_instance); + return; +} +#endif + +/* PyCFunctionFastCall */ + #if CYTHON_FAST_PYCCALL +static CYTHON_INLINE PyObject * __Pyx_PyCFunction_FastCall(PyObject *func_obj, PyObject **args, Py_ssize_t nargs) { + PyCFunctionObject *func = (PyCFunctionObject*)func_obj; + PyCFunction meth = PyCFunction_GET_FUNCTION(func); + PyObject *self = PyCFunction_GET_SELF(func); + int flags = PyCFunction_GET_FLAGS(func); + assert(PyCFunction_Check(func)); + assert(METH_FASTCALL == (flags & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS))); + assert(nargs >= 0); + assert(nargs == 0 || args != NULL); + /* _PyCFunction_FastCallDict() must not be called with an exception set, + because it may clear it (directly or indirectly) and so the + caller loses its exception */ + assert(!PyErr_Occurred()); + if ((PY_VERSION_HEX < 0x030700A0) || unlikely(flags & METH_KEYWORDS)) { + return (*((__Pyx_PyCFunctionFastWithKeywords)meth)) (self, args, nargs, NULL); + } else { + return (*((__Pyx_PyCFunctionFast)meth)) (self, args, nargs); + } +} +#endif + +/* PyFunctionFastCall */ + #if CYTHON_FAST_PYCALL +#include "frameobject.h" +static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObject **args, Py_ssize_t na, + PyObject *globals) { + PyFrameObject *f; + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject **fastlocals; + Py_ssize_t i; + PyObject *result; + assert(globals != NULL); + /* XXX Perhaps we should create a specialized + PyFrame_New() that doesn't take locals, but does + take builtins without sanity checking 
them. + */ + assert(tstate != NULL); + f = PyFrame_New(tstate, co, globals, NULL); + if (f == NULL) { + return NULL; + } + fastlocals = f->f_localsplus; + for (i = 0; i < na; i++) { + Py_INCREF(*args); + fastlocals[i] = *args++; + } + result = PyEval_EvalFrameEx(f,0); + ++tstate->recursion_depth; + Py_DECREF(f); + --tstate->recursion_depth; + return result; +} +#if 1 || PY_VERSION_HEX < 0x030600B1 +static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs) { + PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func); + PyObject *globals = PyFunction_GET_GLOBALS(func); + PyObject *argdefs = PyFunction_GET_DEFAULTS(func); + PyObject *closure; +#if PY_MAJOR_VERSION >= 3 + PyObject *kwdefs; +#endif + PyObject *kwtuple, **k; + PyObject **d; + Py_ssize_t nd; + Py_ssize_t nk; + PyObject *result; + assert(kwargs == NULL || PyDict_Check(kwargs)); + nk = kwargs ? PyDict_Size(kwargs) : 0; + if (Py_EnterRecursiveCall((char*)" while calling a Python object")) { + return NULL; + } + if ( +#if PY_MAJOR_VERSION >= 3 + co->co_kwonlyargcount == 0 && +#endif + likely(kwargs == NULL || nk == 0) && + co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) { + if (argdefs == NULL && co->co_argcount == nargs) { + result = __Pyx_PyFunction_FastCallNoKw(co, args, nargs, globals); + goto done; + } + else if (nargs == 0 && argdefs != NULL + && co->co_argcount == Py_SIZE(argdefs)) { + /* function called with no arguments, but all parameters have + a default value: use default values as arguments .*/ + args = &PyTuple_GET_ITEM(argdefs, 0); + result =__Pyx_PyFunction_FastCallNoKw(co, args, Py_SIZE(argdefs), globals); + goto done; + } + } + if (kwargs != NULL) { + Py_ssize_t pos, i; + kwtuple = PyTuple_New(2 * nk); + if (kwtuple == NULL) { + result = NULL; + goto done; + } + k = &PyTuple_GET_ITEM(kwtuple, 0); + pos = i = 0; + while (PyDict_Next(kwargs, &pos, &k[i], &k[i+1])) { + Py_INCREF(k[i]); + Py_INCREF(k[i+1]); + i += 2; + } + nk = i 
/ 2; + } + else { + kwtuple = NULL; + k = NULL; + } + closure = PyFunction_GET_CLOSURE(func); +#if PY_MAJOR_VERSION >= 3 + kwdefs = PyFunction_GET_KW_DEFAULTS(func); +#endif + if (argdefs != NULL) { + d = &PyTuple_GET_ITEM(argdefs, 0); + nd = Py_SIZE(argdefs); + } + else { + d = NULL; + nd = 0; + } +#if PY_MAJOR_VERSION >= 3 + result = PyEval_EvalCodeEx((PyObject*)co, globals, (PyObject *)NULL, + args, nargs, + k, (int)nk, + d, (int)nd, kwdefs, closure); +#else + result = PyEval_EvalCodeEx(co, globals, (PyObject *)NULL, + args, nargs, + k, (int)nk, + d, (int)nd, closure); +#endif + Py_XDECREF(kwtuple); +done: + Py_LeaveRecursiveCall(); + return result; +} +#endif +#endif + +/* PyObjectCallMethO */ + #if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg) { + PyObject *self, *result; + PyCFunction cfunc; + cfunc = PyCFunction_GET_FUNCTION(func); + self = PyCFunction_GET_SELF(func); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = cfunc(self, arg); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* PyObjectCallOneArg */ + #if CYTHON_COMPILING_IN_CPYTHON +static PyObject* __Pyx__PyObject_CallOneArg(PyObject *func, PyObject *arg) { + PyObject *result; + PyObject *args = PyTuple_New(1); + if (unlikely(!args)) return NULL; + Py_INCREF(arg); + PyTuple_SET_ITEM(args, 0, arg); + result = __Pyx_PyObject_Call(func, args, NULL); + Py_DECREF(args); + return result; +} +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) { +#if CYTHON_FAST_PYCALL + if (PyFunction_Check(func)) { + return __Pyx_PyFunction_FastCall(func, &arg, 1); + } +#endif + if (likely(PyCFunction_Check(func))) { + if (likely(PyCFunction_GET_FLAGS(func) & METH_O)) { + return 
__Pyx_PyObject_CallMethO(func, arg); +#if CYTHON_FAST_PYCCALL + } else if (PyCFunction_GET_FLAGS(func) & METH_FASTCALL) { + return __Pyx_PyCFunction_FastCall(func, &arg, 1); +#endif + } + } + return __Pyx__PyObject_CallOneArg(func, arg); +} +#else +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) { + PyObject *result; + PyObject *args = PyTuple_Pack(1, arg); + if (unlikely(!args)) return NULL; + result = __Pyx_PyObject_Call(func, args, NULL); + Py_DECREF(args); + return result; +} +#endif + +/* DictGetItem */ + #if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY +static PyObject *__Pyx_PyDict_GetItem(PyObject *d, PyObject* key) { + PyObject *value; + value = PyDict_GetItemWithError(d, key); + if (unlikely(!value)) { + if (!PyErr_Occurred()) { + PyObject* args = PyTuple_Pack(1, key); + if (likely(args)) + PyErr_SetObject(PyExc_KeyError, args); + Py_XDECREF(args); + } + return NULL; + } + Py_INCREF(value); + return value; +} +#endif + +/* RaiseTooManyValuesToUnpack */ + static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) { + PyErr_Format(PyExc_ValueError, + "too many values to unpack (expected %" CYTHON_FORMAT_SSIZE_T "d)", expected); +} + +/* RaiseNeedMoreValuesToUnpack */ + static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index) { + PyErr_Format(PyExc_ValueError, + "need more than %" CYTHON_FORMAT_SSIZE_T "d value%.1s to unpack", + index, (index == 1) ? 
"" : "s"); +} + +/* RaiseNoneIterError */ + static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); +} + +/* ExtTypeTest */ + static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type) { + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + if (likely(__Pyx_TypeCheck(obj, type))) + return 1; + PyErr_Format(PyExc_TypeError, "Cannot convert %.200s to %.200s", + Py_TYPE(obj)->tp_name, type->tp_name); + return 0; +} + +/* SaveResetException */ + #if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + #if PY_VERSION_HEX >= 0x030700A2 + *type = tstate->exc_state.exc_type; + *value = tstate->exc_state.exc_value; + *tb = tstate->exc_state.exc_traceback; + #else + *type = tstate->exc_type; + *value = tstate->exc_value; + *tb = tstate->exc_traceback; + #endif + Py_XINCREF(*type); + Py_XINCREF(*value); + Py_XINCREF(*tb); +} +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + #if PY_VERSION_HEX >= 0x030700A2 + tmp_type = tstate->exc_state.exc_type; + tmp_value = tstate->exc_state.exc_value; + tmp_tb = tstate->exc_state.exc_traceback; + tstate->exc_state.exc_type = type; + tstate->exc_state.exc_value = value; + tstate->exc_state.exc_traceback = tb; + #else + tmp_type = tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = type; + tstate->exc_value = value; + tstate->exc_traceback = tb; + #endif + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +#endif + +/* PyErrExceptionMatches */ + #if CYTHON_FAST_THREAD_STATE +static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(tuple); +#if 
PY_MAJOR_VERSION >= 3 + for (i=0; icurexc_type; + if (exc_type == err) return 1; + if (unlikely(!exc_type)) return 0; + if (unlikely(PyTuple_Check(err))) + return __Pyx_PyErr_ExceptionMatchesTuple(exc_type, err); + return __Pyx_PyErr_GivenExceptionMatches(exc_type, err); +} +#endif + +/* GetException */ + #if CYTHON_FAST_THREAD_STATE +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb) { +#endif + PyObject *local_type, *local_value, *local_tb; +#if CYTHON_FAST_THREAD_STATE + PyObject *tmp_type, *tmp_value, *tmp_tb; + local_type = tstate->curexc_type; + local_value = tstate->curexc_value; + local_tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +#else + PyErr_Fetch(&local_type, &local_value, &local_tb); +#endif + PyErr_NormalizeException(&local_type, &local_value, &local_tb); +#if CYTHON_FAST_THREAD_STATE + if (unlikely(tstate->curexc_type)) +#else + if (unlikely(PyErr_Occurred())) +#endif + goto bad; + #if PY_MAJOR_VERSION >= 3 + if (local_tb) { + if (unlikely(PyException_SetTraceback(local_value, local_tb) < 0)) + goto bad; + } + #endif + Py_XINCREF(local_tb); + Py_XINCREF(local_type); + Py_XINCREF(local_value); + *type = local_type; + *value = local_value; + *tb = local_tb; +#if CYTHON_FAST_THREAD_STATE + #if PY_VERSION_HEX >= 0x030700A2 + tmp_type = tstate->exc_state.exc_type; + tmp_value = tstate->exc_state.exc_value; + tmp_tb = tstate->exc_state.exc_traceback; + tstate->exc_state.exc_type = local_type; + tstate->exc_state.exc_value = local_value; + tstate->exc_state.exc_traceback = local_tb; + #else + tmp_type = tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = local_type; + tstate->exc_value = local_value; + tstate->exc_traceback = local_tb; + #endif + Py_XDECREF(tmp_type); + 
Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +#else + PyErr_SetExcInfo(local_type, local_value, local_tb); +#endif + return 0; +bad: + *type = 0; + *value = 0; + *tb = 0; + Py_XDECREF(local_type); + Py_XDECREF(local_value); + Py_XDECREF(local_tb); + return -1; +} + +/* Import */ + static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { + PyObject *empty_list = 0; + PyObject *module = 0; + PyObject *global_dict = 0; + PyObject *empty_dict = 0; + PyObject *list; + #if PY_MAJOR_VERSION < 3 + PyObject *py_import; + py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import); + if (!py_import) + goto bad; + #endif + if (from_list) + list = from_list; + else { + empty_list = PyList_New(0); + if (!empty_list) + goto bad; + list = empty_list; + } + global_dict = PyModule_GetDict(__pyx_m); + if (!global_dict) + goto bad; + empty_dict = PyDict_New(); + if (!empty_dict) + goto bad; + { + #if PY_MAJOR_VERSION >= 3 + if (level == -1) { + if (strchr(__Pyx_MODULE_NAME, '.')) { + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, 1); + if (!module) { + if (!PyErr_ExceptionMatches(PyExc_ImportError)) + goto bad; + PyErr_Clear(); + } + } + level = 0; + } + #endif + if (!module) { + #if PY_MAJOR_VERSION < 3 + PyObject *py_level = PyInt_FromLong(level); + if (!py_level) + goto bad; + module = PyObject_CallFunctionObjArgs(py_import, + name, global_dict, empty_dict, list, py_level, NULL); + Py_DECREF(py_level); + #else + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, level); + #endif + } + } +bad: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(py_import); + #endif + Py_XDECREF(empty_list); + Py_XDECREF(empty_dict); + return module; +} + +/* CLineInTraceback */ + #ifndef CYTHON_CLINE_IN_TRACEBACK +static int __Pyx_CLineForTraceback(CYTHON_UNUSED PyThreadState *tstate, int c_line) { + PyObject *use_cline; + PyObject *ptype, *pvalue, *ptraceback; +#if CYTHON_COMPILING_IN_CPYTHON + PyObject 
**cython_runtime_dict; +#endif + __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback); +#if CYTHON_COMPILING_IN_CPYTHON + cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime); + if (likely(cython_runtime_dict)) { + use_cline = __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback); + } else +#endif + { + PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback); + if (use_cline_obj) { + use_cline = PyObject_Not(use_cline_obj) ? Py_False : Py_True; + Py_DECREF(use_cline_obj); + } else { + PyErr_Clear(); + use_cline = NULL; + } + } + if (!use_cline) { + c_line = 0; + PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False); + } + else if (PyObject_Not(use_cline) != 0) { + c_line = 0; + } + __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback); + return c_line; +} +#endif + +/* CodeObjectCache */ + static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + int start = 0, mid = 0, end = count - 1; + if (end >= 0 && code_line > entries[end].code_line) { + return count; + } + while (start < end) { + mid = start + (end - start) / 2; + if (code_line < entries[mid].code_line) { + end = mid; + } else if (code_line > entries[mid].code_line) { + start = mid + 1; + } else { + return mid; + } + } + if (code_line <= entries[mid].code_line) { + return mid; + } else { + return mid + 1; + } +} +static PyCodeObject *__pyx_find_code_object(int code_line) { + PyCodeObject* code_object; + int pos; + if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) { + return NULL; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) { + return NULL; + } + code_object = __pyx_code_cache.entries[pos].code_object; + Py_INCREF(code_object); + return code_object; +} +static void 
__pyx_insert_code_object(int code_line, PyCodeObject* code_object) { + int pos, i; + __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries; + if (unlikely(!code_line)) { + return; + } + if (unlikely(!entries)) { + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry)); + if (likely(entries)) { + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = 64; + __pyx_code_cache.count = 1; + entries[0].code_line = code_line; + entries[0].code_object = code_object; + Py_INCREF(code_object); + } + return; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) { + PyCodeObject* tmp = entries[pos].code_object; + entries[pos].code_object = code_object; + Py_DECREF(tmp); + return; + } + if (__pyx_code_cache.count == __pyx_code_cache.max_count) { + int new_max = __pyx_code_cache.max_count + 64; + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( + __pyx_code_cache.entries, (size_t)new_max*sizeof(__Pyx_CodeObjectCacheEntry)); + if (unlikely(!entries)) { + return; + } + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = new_max; + } + for (i=__pyx_code_cache.count; i>pos; i--) { + entries[i] = entries[i-1]; + } + entries[pos].code_line = code_line; + entries[pos].code_object = code_object; + __pyx_code_cache.count++; + Py_INCREF(code_object); +} + +/* AddTraceback */ + #include "compile.h" +#include "frameobject.h" +#include "traceback.h" +static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( + const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyObject *py_srcfile = 0; + PyObject *py_funcname = 0; + #if PY_MAJOR_VERSION < 3 + py_srcfile = PyString_FromString(filename); + #else + py_srcfile = PyUnicode_FromString(filename); + #endif + if (!py_srcfile) goto bad; + if (c_line) { + #if PY_MAJOR_VERSION < 3 
+ py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #else + py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + #endif + } + else { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromString(funcname); + #else + py_funcname = PyUnicode_FromString(funcname); + #endif + } + if (!py_funcname) goto bad; + py_code = __Pyx_PyCode_New( + 0, + 0, + 0, + 0, + 0, + __pyx_empty_bytes, /*PyObject *code,*/ + __pyx_empty_tuple, /*PyObject *consts,*/ + __pyx_empty_tuple, /*PyObject *names,*/ + __pyx_empty_tuple, /*PyObject *varnames,*/ + __pyx_empty_tuple, /*PyObject *freevars,*/ + __pyx_empty_tuple, /*PyObject *cellvars,*/ + py_srcfile, /*PyObject *filename,*/ + py_funcname, /*PyObject *name,*/ + py_line, + __pyx_empty_bytes /*PyObject *lnotab*/ + ); + Py_DECREF(py_srcfile); + Py_DECREF(py_funcname); + return py_code; +bad: + Py_XDECREF(py_srcfile); + Py_XDECREF(py_funcname); + return NULL; +} +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyFrameObject *py_frame = 0; + PyThreadState *tstate = __Pyx_PyThreadState_Current; + if (c_line) { + c_line = __Pyx_CLineForTraceback(tstate, c_line); + } + py_code = __pyx_find_code_object(c_line ? -c_line : py_line); + if (!py_code) { + py_code = __Pyx_CreateCodeObjectForTraceback( + funcname, c_line, py_line, filename); + if (!py_code) goto bad; + __pyx_insert_code_object(c_line ? 
-c_line : py_line, py_code); + } + py_frame = PyFrame_New( + tstate, /*PyThreadState *tstate,*/ + py_code, /*PyCodeObject *code,*/ + __pyx_d, /*PyObject *globals,*/ + 0 /*PyObject *locals*/ + ); + if (!py_frame) goto bad; + __Pyx_PyFrame_SetLineNumber(py_frame, py_line); + PyTraceBack_Here(py_frame); +bad: + Py_XDECREF(py_code); + Py_XDECREF(py_frame); +} + +#if PY_MAJOR_VERSION < 3 +static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) { + if (PyObject_CheckBuffer(obj)) return PyObject_GetBuffer(obj, view, flags); + if (__Pyx_TypeCheck(obj, __pyx_ptype_5numpy_ndarray)) return __pyx_pw_5numpy_7ndarray_1__getbuffer__(obj, view, flags); + PyErr_Format(PyExc_TypeError, "'%.200s' does not have the buffer interface", Py_TYPE(obj)->tp_name); + return -1; +} +static void __Pyx_ReleaseBuffer(Py_buffer *view) { + PyObject *obj = view->obj; + if (!obj) return; + if (PyObject_CheckBuffer(obj)) { + PyBuffer_Release(view); + return; + } + if ((0)) {} + else if (__Pyx_TypeCheck(obj, __pyx_ptype_5numpy_ndarray)) __pyx_pw_5numpy_7ndarray_3__releasebuffer__(obj, view); + view->obj = NULL; + Py_DECREF(obj); +} +#endif + + + /* CIntFromPyVerify */ + #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0) +#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1) +#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\ + {\ + func_type value = func_value;\ + if (sizeof(target_type) < sizeof(func_type)) {\ + if (unlikely(value != (func_type) (target_type) value)) {\ + func_type zero = 0;\ + if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\ + return (target_type) -1;\ + if (is_unsigned && unlikely(value < zero))\ + goto raise_neg_overflow;\ + else\ + goto raise_overflow;\ + }\ + }\ + return (target_type) value;\ + } + +/* Declarations */ + #if CYTHON_CCOMPLEX + #ifdef __cplusplus 
+ static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + return ::std::complex< float >(x, y); + } + #else + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + return x + y*(__pyx_t_float_complex)_Complex_I; + } + #endif +#else + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + __pyx_t_float_complex z; + z.real = x; + z.imag = y; + return z; + } +#endif + +/* Arithmetic */ + #if CYTHON_CCOMPLEX +#else + static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + return (a.real == b.real) && (a.imag == b.imag); + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real + b.real; + z.imag = a.imag + b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real - b.real; + z.imag = a.imag - b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real * b.real - a.imag * b.imag; + z.imag = a.real * b.imag + a.imag * b.real; + return z; + } + #if 1 + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + if (b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real); + } else if (fabsf(b.real) >= fabsf(b.imag)) { + if (b.real == 0 && b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.imag); + } else { + float r = b.imag / b.real; + float s = 1.0 / (b.real + b.imag * r); + return __pyx_t_float_complex_from_parts( + (a.real + a.imag * r) * s, (a.imag - a.real * r) * s); + } + } else { + float r = b.real / b.imag; + float s = 1.0 / 
(b.imag + b.real * r); + return __pyx_t_float_complex_from_parts( + (a.real * r + a.imag) * s, (a.imag * r - a.real) * s); + } + } + #else + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + if (b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real); + } else { + float denom = b.real * b.real + b.imag * b.imag; + return __pyx_t_float_complex_from_parts( + (a.real * b.real + a.imag * b.imag) / denom, + (a.imag * b.real - a.real * b.imag) / denom); + } + } + #endif + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex a) { + __pyx_t_float_complex z; + z.real = -a.real; + z.imag = -a.imag; + return z; + } + static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex a) { + return (a.real == 0) && (a.imag == 0); + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex a) { + __pyx_t_float_complex z; + z.real = a.real; + z.imag = -a.imag; + return z; + } + #if 1 + static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex z) { + #if !defined(HAVE_HYPOT) || defined(_MSC_VER) + return sqrtf(z.real*z.real + z.imag*z.imag); + #else + return hypotf(z.real, z.imag); + #endif + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + float r, lnr, theta, z_r, z_theta; + if (b.imag == 0 && b.real == (int)b.real) { + if (b.real < 0) { + float denom = a.real * a.real + a.imag * a.imag; + a.real = a.real / denom; + a.imag = -a.imag / denom; + b.real = -b.real; + } + switch ((int)b.real) { + case 0: + z.real = 1; + z.imag = 0; + return z; + case 1: + return a; + case 2: + z = __Pyx_c_prod_float(a, a); + return __Pyx_c_prod_float(a, a); + case 3: + z = __Pyx_c_prod_float(a, a); + return __Pyx_c_prod_float(z, a); + case 4: + z = __Pyx_c_prod_float(a, a); + return __Pyx_c_prod_float(z, z); + } + } + if (a.imag == 0) { 
+ if (a.real == 0) { + return a; + } else if (b.imag == 0) { + z.real = powf(a.real, b.real); + z.imag = 0; + return z; + } else if (a.real > 0) { + r = a.real; + theta = 0; + } else { + r = -a.real; + theta = atan2f(0, -1); + } + } else { + r = __Pyx_c_abs_float(a); + theta = atan2f(a.imag, a.real); + } + lnr = logf(r); + z_r = expf(lnr * b.real - theta * b.imag); + z_theta = theta * b.real + lnr * b.imag; + z.real = z_r * cosf(z_theta); + z.imag = z_r * sinf(z_theta); + return z; + } + #endif +#endif + +/* Declarations */ + #if CYTHON_CCOMPLEX + #ifdef __cplusplus + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + return ::std::complex< double >(x, y); + } + #else + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + return x + y*(__pyx_t_double_complex)_Complex_I; + } + #endif +#else + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + __pyx_t_double_complex z; + z.real = x; + z.imag = y; + return z; + } +#endif + +/* Arithmetic */ + #if CYTHON_CCOMPLEX +#else + static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + return (a.real == b.real) && (a.imag == b.imag); + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real + b.real; + z.imag = a.imag + b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real - b.real; + z.imag = a.imag - b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real * b.real - a.imag * b.imag; + z.imag = a.real * b.imag + a.imag * b.real; + return z; + } + #if 1 + static 
CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + if (b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real); + } else if (fabs(b.real) >= fabs(b.imag)) { + if (b.real == 0 && b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.imag); + } else { + double r = b.imag / b.real; + double s = 1.0 / (b.real + b.imag * r); + return __pyx_t_double_complex_from_parts( + (a.real + a.imag * r) * s, (a.imag - a.real * r) * s); + } + } else { + double r = b.real / b.imag; + double s = 1.0 / (b.imag + b.real * r); + return __pyx_t_double_complex_from_parts( + (a.real * r + a.imag) * s, (a.imag * r - a.real) * s); + } + } + #else + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + if (b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real); + } else { + double denom = b.real * b.real + b.imag * b.imag; + return __pyx_t_double_complex_from_parts( + (a.real * b.real + a.imag * b.imag) / denom, + (a.imag * b.real - a.real * b.imag) / denom); + } + } + #endif + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex a) { + __pyx_t_double_complex z; + z.real = -a.real; + z.imag = -a.imag; + return z; + } + static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex a) { + return (a.real == 0) && (a.imag == 0); + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex a) { + __pyx_t_double_complex z; + z.real = a.real; + z.imag = -a.imag; + return z; + } + #if 1 + static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex z) { + #if !defined(HAVE_HYPOT) || defined(_MSC_VER) + return sqrt(z.real*z.real + z.imag*z.imag); + #else + return hypot(z.real, z.imag); + #endif + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex a, 
__pyx_t_double_complex b) { + __pyx_t_double_complex z; + double r, lnr, theta, z_r, z_theta; + if (b.imag == 0 && b.real == (int)b.real) { + if (b.real < 0) { + double denom = a.real * a.real + a.imag * a.imag; + a.real = a.real / denom; + a.imag = -a.imag / denom; + b.real = -b.real; + } + switch ((int)b.real) { + case 0: + z.real = 1; + z.imag = 0; + return z; + case 1: + return a; + case 2: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(a, a); + case 3: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(z, a); + case 4: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(z, z); + } + } + if (a.imag == 0) { + if (a.real == 0) { + return a; + } else if (b.imag == 0) { + z.real = pow(a.real, b.real); + z.imag = 0; + return z; + } else if (a.real > 0) { + r = a.real; + theta = 0; + } else { + r = -a.real; + theta = atan2(0, -1); + } + } else { + r = __Pyx_c_abs_double(a); + theta = atan2(a.imag, a.real); + } + lnr = log(r); + z_r = exp(lnr * b.real - theta * b.imag); + z_theta = theta * b.real + lnr * b.imag; + z.real = z_r * cos(z_theta); + z.imag = z_r * sin(z_theta); + return z; + } + #endif +#endif + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { + const int neg_one = (int) -1, const_zero = (int) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(int) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(int) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(int) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char 
*)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(int), + little, !is_unsigned); + } +} + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__NPY_TYPES(enum NPY_TYPES value) { + const enum NPY_TYPES neg_one = (enum NPY_TYPES) -1, const_zero = (enum NPY_TYPES) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(enum NPY_TYPES) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(enum NPY_TYPES) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum NPY_TYPES) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(enum NPY_TYPES) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(enum NPY_TYPES) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(enum NPY_TYPES), + little, !is_unsigned); + } +} + +/* CIntFromPy */ + static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { + const int neg_one = (int) -1, const_zero = (int) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case 1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0]) + case 2: + if (8 
* sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) { + return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) { + return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) { + return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (int) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + 
const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(int, digit, +digits[0]) + case -2: + if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) 
{ + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + } +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif 
+ return (int) -1; + } + } else { + int val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to int"); + return (int) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; +} + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { + const long neg_one = (long) -1, const_zero = (long) 0; + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +/* CIntFromPy */ + static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { + const long neg_one = (long) -1, const_zero = (long) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if 
CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case 1: __PYX_VERIFY_RETURN_INT(long, digit, digits[0]) + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 2 * PyLong_SHIFT) { + return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 3 * PyLong_SHIFT) { + return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 4 * PyLong_SHIFT) { + return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (long) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, 
PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case -1: __PYX_VERIFY_RETURN_INT(long, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(long, digit, +digits[0]) + case -2: + if (8 * sizeof(long) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << 
PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + } +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + 
Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to long"); + return (long) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; +} + +/* FastTypeChecks */ + #if CYTHON_COMPILING_IN_CPYTHON +static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) { + while (a) { + a = a->tp_base; + if (a == b) + return 1; + } + return b == &PyBaseObject_Type; +} +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) { + PyObject *mro; + if (a == b) return 1; + mro = a->tp_mro; + if (likely(mro)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(mro); + for (i = 0; i < n; i++) { + if (PyTuple_GET_ITEM(mro, i) == (PyObject *)b) + return 1; + } + return 0; + } + return __Pyx_InBases(a, b); +} +#if PY_MAJOR_VERSION == 2 +static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) { + PyObject *exception, *value, *tb; + int res; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&exception, &value, &tb); + res = exc_type1 ? 
PyObject_IsSubclass(err, exc_type1) : 0; + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + if (!res) { + res = PyObject_IsSubclass(err, exc_type2); + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + } + __Pyx_ErrRestore(exception, value, tb); + return res; +} +#else +static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) { + int res = exc_type1 ? __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type1) : 0; + if (!res) { + res = __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type2); + } + return res; +} +#endif +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject* exc_type) { + if (likely(err == exc_type)) return 1; + if (likely(PyExceptionClass_Check(err))) { + return __Pyx_inner_PyErr_GivenExceptionMatches2(err, NULL, exc_type); + } + return PyErr_GivenExceptionMatches(err, exc_type); +} +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *exc_type1, PyObject *exc_type2) { + if (likely(err == exc_type1 || err == exc_type2)) return 1; + if (likely(PyExceptionClass_Check(err))) { + return __Pyx_inner_PyErr_GivenExceptionMatches2(err, exc_type1, exc_type2); + } + return (PyErr_GivenExceptionMatches(err, exc_type1) || PyErr_GivenExceptionMatches(err, exc_type2)); +} +#endif + +/* CheckBinaryVersion */ + static int __Pyx_check_binary_version(void) { + char ctversion[4], rtversion[4]; + PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION); + PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion()); + if (ctversion[0] != rtversion[0] || ctversion[2] != rtversion[2]) { + char message[200]; + PyOS_snprintf(message, sizeof(message), + "compiletime version %s of module '%.100s' " + "does not match runtime version %s", + ctversion, __Pyx_MODULE_NAME, rtversion); + return PyErr_WarnEx(NULL, message, 1); + } + return 0; +} + +/* ModuleImport */ + #ifndef 
__PYX_HAVE_RT_ImportModule +#define __PYX_HAVE_RT_ImportModule +static PyObject *__Pyx_ImportModule(const char *name) { + PyObject *py_name = 0; + PyObject *py_module = 0; + py_name = __Pyx_PyIdentifier_FromString(name); + if (!py_name) + goto bad; + py_module = PyImport_Import(py_name); + Py_DECREF(py_name); + return py_module; +bad: + Py_XDECREF(py_name); + return 0; +} +#endif + +/* TypeImport */ + #ifndef __PYX_HAVE_RT_ImportType +#define __PYX_HAVE_RT_ImportType +static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name, + size_t size, int strict) +{ + PyObject *py_module = 0; + PyObject *result = 0; + PyObject *py_name = 0; + char warning[200]; + Py_ssize_t basicsize; +#ifdef Py_LIMITED_API + PyObject *py_basicsize; +#endif + py_module = __Pyx_ImportModule(module_name); + if (!py_module) + goto bad; + py_name = __Pyx_PyIdentifier_FromString(class_name); + if (!py_name) + goto bad; + result = PyObject_GetAttr(py_module, py_name); + Py_DECREF(py_name); + py_name = 0; + Py_DECREF(py_module); + py_module = 0; + if (!result) + goto bad; + if (!PyType_Check(result)) { + PyErr_Format(PyExc_TypeError, + "%.200s.%.200s is not a type object", + module_name, class_name); + goto bad; + } +#ifndef Py_LIMITED_API + basicsize = ((PyTypeObject *)result)->tp_basicsize; +#else + py_basicsize = PyObject_GetAttrString(result, "__basicsize__"); + if (!py_basicsize) + goto bad; + basicsize = PyLong_AsSsize_t(py_basicsize); + Py_DECREF(py_basicsize); + py_basicsize = 0; + if (basicsize == (Py_ssize_t)-1 && PyErr_Occurred()) + goto bad; +#endif + if (!strict && (size_t)basicsize > size) { + PyOS_snprintf(warning, sizeof(warning), + "%s.%s size changed, may indicate binary incompatibility. Expected %zd, got %zd", + module_name, class_name, basicsize, size); + if (PyErr_WarnEx(NULL, warning, 0) < 0) goto bad; + } + else if ((size_t)basicsize != size) { + PyErr_Format(PyExc_ValueError, + "%.200s.%.200s has the wrong size, try recompiling. 
Expected %zd, got %zd", + module_name, class_name, basicsize, size); + goto bad; + } + return (PyTypeObject *)result; +bad: + Py_XDECREF(py_module); + Py_XDECREF(result); + return NULL; +} +#endif + +/* InitStrings */ + static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + while (t->p) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + if (PyObject_Hash(*t->p) == -1) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, (Py_ssize_t)strlen(c_str)); +} +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +#if !CYTHON_PEP393_ENABLED +static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +} +#else +static CYTHON_INLINE const char* 
__Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (likely(PyUnicode_IS_ASCII(o))) { + *length = PyUnicode_GET_LENGTH(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else + return PyUnicode_AsUTF8AndSize(o, length); +#endif +} +#endif +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && +#endif + PyUnicode_Check(o)) { + return __Pyx_PyUnicode_AsStringAndSize(o, length); + } else +#endif +#if (!CYTHON_COMPILING_IN_PYPY) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE)) + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) { +#if PY_MAJOR_VERSION >= 3 + if (PyLong_Check(result)) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "__int__ returned non-int (type %.200s). 
" + "The ability to return an instance of a strict subclass of int " + "is deprecated, and may be removed in a future version of Python.", + Py_TYPE(result)->tp_name)) { + Py_DECREF(result); + return NULL; + } + return result; + } +#endif + PyErr_Format(PyExc_TypeError, + "__%.4s__ returned non-%.4s (type %.200s)", + type_name, type_name, Py_TYPE(result)->tp_name); + Py_DECREF(result); + return NULL; +} +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) { +#if CYTHON_USE_TYPE_SLOTS + PyNumberMethods *m; +#endif + const char *name = NULL; + PyObject *res = NULL; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x) || PyLong_Check(x))) +#else + if (likely(PyLong_Check(x))) +#endif + return __Pyx_NewRef(x); +#if CYTHON_USE_TYPE_SLOTS + m = Py_TYPE(x)->tp_as_number; + #if PY_MAJOR_VERSION < 3 + if (m && m->nb_int) { + name = "int"; + res = m->nb_int(x); + } + else if (m && m->nb_long) { + name = "long"; + res = m->nb_long(x); + } + #else + if (likely(m && m->nb_int)) { + name = "int"; + res = m->nb_int(x); + } + #endif +#else + if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) { + res = PyNumber_Int(x); + } +#endif + if (likely(res)) { +#if PY_MAJOR_VERSION < 3 + if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) { +#else + if (unlikely(!PyLong_CheckExact(res))) { +#endif + return __Pyx_PyNumber_IntOrLongWrongResultType(res, name); + } + } + else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "an integer is required"); + } + return res; +} +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { + Py_ssize_t ival; + PyObject *x; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(b))) { + if (sizeof(Py_ssize_t) >= sizeof(long)) + return PyInt_AS_LONG(b); + else + return PyInt_AsSsize_t(x); + } +#endif + if (likely(PyLong_CheckExact(b))) { + #if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)b)->ob_digit; + const Py_ssize_t size = Py_SIZE(b); + if (likely(__Pyx_sst_abs(size) <= 1)) { + 
ival = likely(size) ? digits[0] : 0; + if (size == -1) ival = -ival; + return ival; + } else { + switch (size) { + case 2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + } + } + #endif + return PyLong_AsSsize_t(b); + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); + return ival; +} +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { + return PyInt_FromSize_t(ival); +} + + +#endif /* Py_PYTHON_H */ diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.cpp b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c5b441fb77f7737ff2a816c946c5f2ec04775a95 --- /dev/null +++ 
b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.cpp @@ -0,0 +1,7672 @@ +/* Generated by Cython 0.29.32 */ + +/* BEGIN: Cython Metadata +{ + "distutils": { + "depends": [ + "insightface/thirdparty/face3d/mesh/cython/mesh_core.h" + ], + "include_dirs": [ + "insightface/thirdparty/face3d/mesh/cython" + ], + "language": "c++", + "name": "insightface.thirdparty.face3d.mesh.cython.mesh_core_cython", + "sources": [ + "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx", + "insightface/thirdparty/face3d/mesh/cython/mesh_core.cpp" + ] + }, + "module_name": "insightface.thirdparty.face3d.mesh.cython.mesh_core_cython" +} +END: Cython Metadata */ + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif /* PY_SSIZE_T_CLEAN */ +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000) + #error Cython requires Python 2.6+ or Python 3.3+. 
+#else +#define CYTHON_ABI "0_29_32" +#define CYTHON_HEX_VERSION 0x001D20F0 +#define CYTHON_FUTURE_DIVISION 0 +#include +#ifndef offsetof + #define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#define __PYX_COMMA , +#ifndef HAVE_LONG_LONG + #if PY_VERSION_HEX >= 0x02070000 + #define HAVE_LONG_LONG + #endif +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION + #define CYTHON_COMPILING_IN_PYPY 1 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #define CYTHON_COMPILING_IN_NOGIL 0 + #undef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 0 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #if PY_VERSION_HEX < 0x03050000 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #undef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #undef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 1 + #undef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 0 + #undef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 0 + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + 
#undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 + #undef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 + #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC + #define CYTHON_UPDATE_DESCRIPTOR_DOC (PYPY_VERSION_HEX >= 0x07030900) + #endif +#elif defined(PYSTON_VERSION) + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 1 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #define CYTHON_COMPILING_IN_NOGIL 0 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 + #undef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 + #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC + #define CYTHON_UPDATE_DESCRIPTOR_DOC 0 + #endif +#elif defined(PY_NOGIL) + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #define CYTHON_COMPILING_IN_NOGIL 1 + #ifndef 
CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #ifndef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #ifndef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 1 + #endif + #ifndef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 1 + #endif + #undef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 +#else + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 1 + #define CYTHON_COMPILING_IN_NOGIL 0 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #elif !defined(CYTHON_USE_PYTYPE_LOOKUP) + #define CYTHON_USE_PYTYPE_LOOKUP 1 + #endif + #if PY_MAJOR_VERSION < 3 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #elif !defined(CYTHON_USE_PYLONG_INTERNALS) + #define 
CYTHON_USE_PYLONG_INTERNALS 1 + #endif + #ifndef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 1 + #endif + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #if PY_VERSION_HEX < 0x030300F0 || PY_VERSION_HEX >= 0x030B00A2 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #elif !defined(CYTHON_USE_UNICODE_WRITER) + #define CYTHON_USE_UNICODE_WRITER 1 + #endif + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #if PY_VERSION_HEX >= 0x030B00A4 + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #elif !defined(CYTHON_FAST_THREAD_STATE) + #define CYTHON_FAST_THREAD_STATE 1 + #endif + #ifndef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL (PY_VERSION_HEX < 0x030A0000) + #endif + #ifndef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT (PY_VERSION_HEX >= 0x03050000) + #endif + #ifndef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1) + #endif + #ifndef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS (PY_VERSION_HEX >= 0x030600B1) + #endif + #if PY_VERSION_HEX >= 0x030B00A4 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 + #elif !defined(CYTHON_USE_EXC_INFO_STACK) + #define CYTHON_USE_EXC_INFO_STACK (PY_VERSION_HEX >= 0x030700A3) + #endif + #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC + #define CYTHON_UPDATE_DESCRIPTOR_DOC 1 + #endif +#endif +#if !defined(CYTHON_FAST_PYCCALL) +#define CYTHON_FAST_PYCCALL (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1) +#endif +#if CYTHON_USE_PYLONG_INTERNALS + #if PY_MAJOR_VERSION < 3 + #include "longintrepr.h" + #endif + #undef SHIFT + #undef BASE + #undef MASK + #ifdef SIZEOF_VOID_P + enum { __pyx_check_sizeof_voidp = 1 / (int)(SIZEOF_VOID_P == 
sizeof(void*)) }; + #endif +#endif +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif +#ifndef __has_cpp_attribute + #define __has_cpp_attribute(x) 0 +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +#ifndef CYTHON_MAYBE_UNUSED_VAR +# if defined(__cplusplus) + template void CYTHON_MAYBE_UNUSED_VAR( const T& ) { } +# else +# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x) +# endif +#endif +#ifndef CYTHON_NCP_UNUSED +# if CYTHON_COMPILING_IN_CPYTHON +# define CYTHON_NCP_UNUSED +# else +# define CYTHON_NCP_UNUSED CYTHON_UNUSED +# endif +#endif +#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None) +#ifdef _MSC_VER + #ifndef _MSC_STDINT_H_ + #if _MSC_VER < 1300 + typedef unsigned char uint8_t; + typedef unsigned int uint32_t; + #else + typedef unsigned __int8 uint8_t; + typedef unsigned __int32 uint32_t; + #endif + #endif +#else + #include +#endif +#ifndef CYTHON_FALLTHROUGH + #if defined(__cplusplus) && __cplusplus >= 201103L + #if __has_cpp_attribute(fallthrough) + #define CYTHON_FALLTHROUGH [[fallthrough]] + #elif __has_cpp_attribute(clang::fallthrough) + #define CYTHON_FALLTHROUGH [[clang::fallthrough]] + #elif __has_cpp_attribute(gnu::fallthrough) + #define CYTHON_FALLTHROUGH [[gnu::fallthrough]] + #endif + #endif + #ifndef 
CYTHON_FALLTHROUGH + #if __has_attribute(fallthrough) + #define CYTHON_FALLTHROUGH __attribute__((fallthrough)) + #else + #define CYTHON_FALLTHROUGH + #endif + #endif + #if defined(__clang__ ) && defined(__apple_build_version__) + #if __apple_build_version__ < 7000000 + #undef CYTHON_FALLTHROUGH + #define CYTHON_FALLTHROUGH + #endif + #endif +#endif + +#ifndef __cplusplus + #error "Cython files generated with the C++ option must be compiled with a C++ compiler." +#endif +#ifndef CYTHON_INLINE + #if defined(__clang__) + #define CYTHON_INLINE __inline__ __attribute__ ((__unused__)) + #else + #define CYTHON_INLINE inline + #endif +#endif +template +void __Pyx_call_destructor(T& x) { + x.~T(); +} +template +class __Pyx_FakeReference { + public: + __Pyx_FakeReference() : ptr(NULL) { } + __Pyx_FakeReference(const T& ref) : ptr(const_cast(&ref)) { } + T *operator->() { return ptr; } + T *operator&() { return ptr; } + operator T&() { return *ptr; } + template bool operator ==(U other) { return *ptr == other; } + template bool operator !=(U other) { return *ptr != other; } + private: + T *ptr; +}; + +#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag) + #define Py_OptimizeFlag 0 +#endif +#define __PYX_BUILD_PY_SSIZE_T "n" +#define CYTHON_FORMAT_SSIZE_T "z" +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" + #define __Pyx_DefaultClassType PyType_Type +#if PY_VERSION_HEX >= 0x030B00A1 + static CYTHON_INLINE PyCodeObject* __Pyx_PyCode_New(int a, int k, int l, int s, int f, + PyObject *code, PyObject *c, PyObject* n, PyObject *v, + PyObject *fv, PyObject *cell, PyObject* fn, + PyObject *name, int fline, PyObject *lnos) { + PyObject *kwds=NULL, 
*argcount=NULL, *posonlyargcount=NULL, *kwonlyargcount=NULL; + PyObject *nlocals=NULL, *stacksize=NULL, *flags=NULL, *replace=NULL, *call_result=NULL, *empty=NULL; + const char *fn_cstr=NULL; + const char *name_cstr=NULL; + PyCodeObject* co=NULL; + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + if (!(kwds=PyDict_New())) goto end; + if (!(argcount=PyLong_FromLong(a))) goto end; + if (PyDict_SetItemString(kwds, "co_argcount", argcount) != 0) goto end; + if (!(posonlyargcount=PyLong_FromLong(0))) goto end; + if (PyDict_SetItemString(kwds, "co_posonlyargcount", posonlyargcount) != 0) goto end; + if (!(kwonlyargcount=PyLong_FromLong(k))) goto end; + if (PyDict_SetItemString(kwds, "co_kwonlyargcount", kwonlyargcount) != 0) goto end; + if (!(nlocals=PyLong_FromLong(l))) goto end; + if (PyDict_SetItemString(kwds, "co_nlocals", nlocals) != 0) goto end; + if (!(stacksize=PyLong_FromLong(s))) goto end; + if (PyDict_SetItemString(kwds, "co_stacksize", stacksize) != 0) goto end; + if (!(flags=PyLong_FromLong(f))) goto end; + if (PyDict_SetItemString(kwds, "co_flags", flags) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_code", code) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_consts", c) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_names", n) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_varnames", v) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_freevars", fv) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_cellvars", cell) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_linetable", lnos) != 0) goto end; + if (!(fn_cstr=PyUnicode_AsUTF8AndSize(fn, NULL))) goto end; + if (!(name_cstr=PyUnicode_AsUTF8AndSize(name, NULL))) goto end; + if (!(co = PyCode_NewEmpty(fn_cstr, name_cstr, fline))) goto end; + if (!(replace = PyObject_GetAttrString((PyObject*)co, "replace"))) goto cleanup_code_too; + if (!(empty = PyTuple_New(0))) goto cleanup_code_too; // unfortunately __pyx_empty_tuple isn't available here + 
if (!(call_result = PyObject_Call(replace, empty, kwds))) goto cleanup_code_too; + Py_XDECREF((PyObject*)co); + co = (PyCodeObject*)call_result; + call_result = NULL; + if (0) { + cleanup_code_too: + Py_XDECREF((PyObject*)co); + co = NULL; + } + end: + Py_XDECREF(kwds); + Py_XDECREF(argcount); + Py_XDECREF(posonlyargcount); + Py_XDECREF(kwonlyargcount); + Py_XDECREF(nlocals); + Py_XDECREF(stacksize); + Py_XDECREF(replace); + Py_XDECREF(call_result); + Py_XDECREF(empty); + if (type) { + PyErr_Restore(type, value, traceback); + } + return co; + } +#else + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) +#endif + #define __Pyx_DefaultClassType PyType_Type +#endif +#ifndef Py_TPFLAGS_CHECKTYPES + #define Py_TPFLAGS_CHECKTYPES 0 +#endif +#ifndef Py_TPFLAGS_HAVE_INDEX + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#ifndef Py_TPFLAGS_HAVE_NEWBUFFER + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#ifndef Py_TPFLAGS_HAVE_FINALIZE + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#ifndef METH_STACKLESS + #define METH_STACKLESS 0 +#endif +#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL) + #ifndef METH_FASTCALL + #define METH_FASTCALL 0x80 + #endif + typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs); + typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args, + Py_ssize_t nargs, PyObject *kwnames); +#else + #define __Pyx_PyCFunctionFast _PyCFunctionFast + #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords +#endif +#if CYTHON_FAST_PYCCALL +#define __Pyx_PyFastCFunction_Check(func)\ + ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS | METH_STACKLESS))))) +#else +#define __Pyx_PyFastCFunction_Check(func) 0 +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) 
+ #define PyObject_Malloc(s) PyMem_Malloc(s) + #define PyObject_Free(p) PyMem_Free(p) + #define PyObject_Realloc(p) PyMem_Realloc(p) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030400A1 + #define PyMem_RawMalloc(n) PyMem_Malloc(n) + #define PyMem_RawRealloc(p, n) PyMem_Realloc(p, n) + #define PyMem_RawFree(p) PyMem_Free(p) +#endif +#if CYTHON_COMPILING_IN_PYSTON + #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno) +#else + #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno) +#endif +#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#elif PY_VERSION_HEX >= 0x03060000 + #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet() +#elif PY_VERSION_HEX >= 0x03000000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#else + #define __Pyx_PyThreadState_Current _PyThreadState_Current +#endif +#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT) +#include "pythread.h" +#define Py_tss_NEEDS_INIT 0 +typedef int Py_tss_t; +static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) { + *key = PyThread_create_key(); + return 0; +} +static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) { + Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t)); + *key = Py_tss_NEEDS_INIT; + return key; +} +static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) { + PyObject_Free(key); +} +static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) { + return *key != Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) { + PyThread_delete_key(*key); + *key = Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) { + return PyThread_set_key_value(*key, value); +} +static 
CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { + return PyThread_get_key_value(*key); +} +#endif +#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized) +#define __Pyx_PyDict_NewPresized(n) ((n <= 8) ? PyDict_New() : _PyDict_NewPresized(n)) +#else +#define __Pyx_PyDict_NewPresized(n) PyDict_New() +#endif +#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS +#define __Pyx_PyDict_GetItemStr(dict, name) _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash) +#else +#define __Pyx_PyDict_GetItemStr(dict, name) PyDict_GetItem(dict, name) +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #if defined(PyUnicode_IS_READY) + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #else + #define __Pyx_PyUnicode_READY(op) (0) + #endif + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) PyUnicode_MAX_CHAR_VALUE(u) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) + #if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE) + #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x03090000 + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? 
PyUnicode_GET_LENGTH(u) : ((PyCompactUnicodeObject *)(u))->wstr_length)) + #else + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) + #endif + #else + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_LENGTH(u)) + #endif +#else + #define CYTHON_PEP393_ENABLED 0 + #define PyUnicode_1BYTE_KIND 1 + #define PyUnicode_2BYTE_KIND 2 + #define PyUnicode_4BYTE_KIND 4 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) ((sizeof(Py_UNICODE) == 2) ? 65535 : 1114111) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) (((void)(k)), ((Py_UNICODE*)d)[i] = ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_SIZE(u)) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains) + #define PyUnicode_Contains(u, s) PySequence_Contains(u, s) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check) + #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format) + #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyString_Check(b) && 
!PyString_CheckExact(b)))) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyUnicode_Check(b) && !PyUnicode_CheckExact(b)))) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII) + #define PyObject_ASCII(o) PyObject_Repr(o) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact +#ifndef PyObject_Unicode + #define PyObject_Unicode PyObject_Str +#endif +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#if PY_VERSION_HEX >= 0x030900A4 + #define __Pyx_SET_REFCNT(obj, refcnt) Py_SET_REFCNT(obj, refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SET_SIZE(obj, size) +#else + #define __Pyx_SET_REFCNT(obj, refcnt) Py_REFCNT(obj) = (refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SIZE(obj) = (size) +#endif +#if CYTHON_ASSUME_SAFE_MACROS + #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq) +#else + #define __Pyx_PySequence_SIZE(seq) PySequence_Size(seq) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString 
+ #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject +#endif +#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY + #ifndef PyUnicode_InternFromString + #define PyUnicode_InternFromString(s) PyUnicode_FromString(s) + #endif +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t __Pyx_PyIndex_AsHash_t +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t __Pyx_PyIndex_AsSsize_t +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyMethod_New(func, self, klass) ((self) ? 
((void)(klass), PyMethod_New(func, self)) : __Pyx_NewRef(func)) +#else + #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass) +#endif +#if CYTHON_USE_ASYNC_SLOTS + #if PY_VERSION_HEX >= 0x030500B1 + #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods + #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async) + #else + #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved)) + #endif +#else + #define __Pyx_PyType_AsAsync(obj) NULL +#endif +#ifndef __Pyx_PyAsyncMethodsStruct + typedef struct { + unaryfunc am_await; + unaryfunc am_aiter; + unaryfunc am_anext; + } __Pyx_PyAsyncMethodsStruct; +#endif + +#if defined(_WIN32) || defined(WIN32) || defined(MS_WINDOWS) + #if !defined(_USE_MATH_DEFINES) + #define _USE_MATH_DEFINES + #endif +#endif +#include +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif +#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL) +#define __Pyx_truncl trunc +#else +#define __Pyx_truncl truncl +#endif + +#define __PYX_MARK_ERR_POS(f_index, lineno) \ + { __pyx_filename = __pyx_f[f_index]; (void)__pyx_filename; __pyx_lineno = lineno; (void)__pyx_lineno; __pyx_clineno = __LINE__; (void)__pyx_clineno; } +#define __PYX_ERR(f_index, lineno, Ln_error) \ + { __PYX_MARK_ERR_POS(f_index, lineno) goto Ln_error; } + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#define __PYX_HAVE__insightface__thirdparty__face3d__mesh__cython__mesh_core_cython +#define __PYX_HAVE_API__insightface__thirdparty__face3d__mesh__cython__mesh_core_cython +/* Early includes */ +#include +#include +#include "numpy/arrayobject.h" +#include "numpy/ndarrayobject.h" +#include "numpy/ndarraytypes.h" +#include "numpy/arrayscalars.h" +#include "numpy/ufuncobject.h" + + /* NumPy API declarations from 
"numpy/__init__.pxd" */ + +#include +#include "ios" +#include "new" +#include "stdexcept" +#include "typeinfo" +#include "mesh_core.h" +#ifdef _OPENMP +#include +#endif /* _OPENMP */ + +#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS) +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; + +#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_UTF8 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT (PY_MAJOR_VERSION >= 3 && __PYX_DEFAULT_STRING_ENCODING_IS_UTF8) +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_uchar_cast(c) ((unsigned char)c) +#define __Pyx_long_cast(x) ((long)x) +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) (\ + (sizeof(type) < sizeof(Py_ssize_t)) ||\ + (sizeof(type) > sizeof(Py_ssize_t) &&\ + likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX) &&\ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\ + v == (type)PY_SSIZE_T_MIN))) ||\ + (sizeof(type) == sizeof(Py_ssize_t) &&\ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX))) ) +static CYTHON_INLINE int __Pyx_is_valid_index(Py_ssize_t i, Py_ssize_t limit) { + return (size_t) i < (size_t) limit; +} +#if defined (__cplusplus) && __cplusplus >= 201103L + #include + #define __Pyx_sst_abs(value) std::abs(value) +#elif SIZEOF_INT >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) abs(value) +#elif SIZEOF_LONG >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) labs(value) +#elif defined (_MSC_VER) + #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value)) +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define __Pyx_sst_abs(value) llabs(value) +#elif defined (__GNUC__) + 
#define __Pyx_sst_abs(value) __builtin_llabs(value) +#else + #define __Pyx_sst_abs(value) ((value<0) ? -value : value) +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyBytes_AsWritableString(s) ((char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableSString(s) ((signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableUString(s) ((unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsString(s) ((const char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsSString(s) ((const signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsUString(s) ((const unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyObject_AsWritableString(s) ((char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsSString(s) ((const signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((const unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromCString(s) __Pyx_PyObject_FromString((const char*)s) 
+#define __Pyx_PyBytes_FromCString(s) __Pyx_PyBytes_FromString((const char*)s) +#define __Pyx_PyByteArray_FromCString(s) __Pyx_PyByteArray_FromString((const char*)s) +#define __Pyx_PyStr_FromCString(s) __Pyx_PyStr_FromString((const char*)s) +#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s) +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) { + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return (size_t)(u_end - u - 1); +} +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj) +#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None) +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b); +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x); +#define __Pyx_PySequence_Tuple(obj)\ + (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj)) +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject*); +#if CYTHON_ASSUME_SAFE_MACROS +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION >= 3 +#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x)) +#else +#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x)) +#endif +#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? 
__Pyx_NewRef(x) : PyNumber_Float(x)) +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII +static int __Pyx_sys_getdefaultencoding_not_ascii; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + PyObject* ascii_chars_u = NULL; + PyObject* ascii_chars_b = NULL; + const char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + if (strcmp(default_encoding_c, "ascii") == 0) { + __Pyx_sys_getdefaultencoding_not_ascii = 0; + } else { + char ascii_chars[128]; + int c; + for (c = 0; c < 128; c++) { + ascii_chars[c] = c; + } + __Pyx_sys_getdefaultencoding_not_ascii = 1; + ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL); + if (!ascii_chars_u) goto bad; + ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); + if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { + PyErr_Format( + PyExc_ValueError, + "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.", + default_encoding_c); + goto bad; + } + Py_DECREF(ascii_chars_u); + Py_DECREF(ascii_chars_b); + } + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return -1; +} +#endif +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3 +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL) +#else +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL) +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +static char* 
__PYX_DEFAULT_STRING_ENCODING; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1); + if (!__PYX_DEFAULT_STRING_ENCODING) goto bad; + strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c); + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + return -1; +} +#endif +#endif + + +/* Test for GCC > 2.95 */ +#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) +#else /* !__GNUC__ or GCC < 2.95 */ + #define likely(x) (x) + #define unlikely(x) (x) +#endif /* __GNUC__ */ +static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; } + +static PyObject *__pyx_m = NULL; +static PyObject *__pyx_d; +static PyObject *__pyx_b; +static PyObject *__pyx_cython_runtime = NULL; +static PyObject *__pyx_empty_tuple; +static PyObject *__pyx_empty_bytes; +static PyObject *__pyx_empty_unicode; +static int __pyx_lineno; +static int __pyx_clineno = 0; +static const char * __pyx_cfilenm= __FILE__; +static const char *__pyx_filename; + +/* Header.proto */ +#if !defined(CYTHON_CCOMPLEX) + #if defined(__cplusplus) + #define CYTHON_CCOMPLEX 1 + #elif defined(_Complex_I) + #define CYTHON_CCOMPLEX 1 + #else + #define CYTHON_CCOMPLEX 0 + #endif +#endif +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + #include + #else + #include + #endif +#endif +#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__) + #undef _Complex_I + #define 
_Complex_I 1.0fj +#endif + + +static const char *__pyx_f[] = { + "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx", + "__init__.pxd", + "stringsource", + "type.pxd", +}; +/* BufferFormatStructs.proto */ +#define IS_UNSIGNED(type) (((type) -1) > 0) +struct __Pyx_StructField_; +#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0) +typedef struct { + const char* name; + struct __Pyx_StructField_* fields; + size_t size; + size_t arraysize[8]; + int ndim; + char typegroup; + char is_unsigned; + int flags; +} __Pyx_TypeInfo; +typedef struct __Pyx_StructField_ { + __Pyx_TypeInfo* type; + const char* name; + size_t offset; +} __Pyx_StructField; +typedef struct { + __Pyx_StructField* field; + size_t parent_offset; +} __Pyx_BufFmt_StackElem; +typedef struct { + __Pyx_StructField root; + __Pyx_BufFmt_StackElem* head; + size_t fmt_offset; + size_t new_count, enc_count; + size_t struct_alignment; + int is_complex; + char enc_type; + char new_packmode; + char enc_packmode; + char is_valid_array; +} __Pyx_BufFmt_Context; + + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":689 + * # in Cython to enable them only on the right systems. 
+ * + * ctypedef npy_int8 int8_t # <<<<<<<<<<<<<< + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + */ +typedef npy_int8 __pyx_t_5numpy_int8_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":690 + * + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t # <<<<<<<<<<<<<< + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t + */ +typedef npy_int16 __pyx_t_5numpy_int16_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":691 + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t # <<<<<<<<<<<<<< + * ctypedef npy_int64 int64_t + * #ctypedef npy_int96 int96_t + */ +typedef npy_int32 __pyx_t_5numpy_int32_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":692 + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t # <<<<<<<<<<<<<< + * #ctypedef npy_int96 int96_t + * #ctypedef npy_int128 int128_t + */ +typedef npy_int64 __pyx_t_5numpy_int64_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":696 + * #ctypedef npy_int128 int128_t + * + * ctypedef npy_uint8 uint8_t # <<<<<<<<<<<<<< + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + */ +typedef npy_uint8 __pyx_t_5numpy_uint8_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":697 + * + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t # <<<<<<<<<<<<<< + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t + */ +typedef npy_uint16 __pyx_t_5numpy_uint16_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":698 + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t # <<<<<<<<<<<<<< + * ctypedef npy_uint64 uint64_t + * #ctypedef 
npy_uint96 uint96_t + */ +typedef npy_uint32 __pyx_t_5numpy_uint32_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":699 + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t # <<<<<<<<<<<<<< + * #ctypedef npy_uint96 uint96_t + * #ctypedef npy_uint128 uint128_t + */ +typedef npy_uint64 __pyx_t_5numpy_uint64_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":703 + * #ctypedef npy_uint128 uint128_t + * + * ctypedef npy_float32 float32_t # <<<<<<<<<<<<<< + * ctypedef npy_float64 float64_t + * #ctypedef npy_float80 float80_t + */ +typedef npy_float32 __pyx_t_5numpy_float32_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":704 + * + * ctypedef npy_float32 float32_t + * ctypedef npy_float64 float64_t # <<<<<<<<<<<<<< + * #ctypedef npy_float80 float80_t + * #ctypedef npy_float128 float128_t + */ +typedef npy_float64 __pyx_t_5numpy_float64_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":713 + * # The int types are mapped a bit surprising -- + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong long_t + * ctypedef npy_longlong longlong_t + */ +typedef npy_long __pyx_t_5numpy_int_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":714 + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t + * ctypedef npy_longlong long_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong longlong_t + * + */ +typedef npy_longlong __pyx_t_5numpy_long_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":715 + * ctypedef npy_long int_t + * ctypedef npy_longlong long_t + * ctypedef npy_longlong longlong_t # <<<<<<<<<<<<<< + * + * ctypedef 
npy_ulong uint_t + */ +typedef npy_longlong __pyx_t_5numpy_longlong_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":717 + * ctypedef npy_longlong longlong_t + * + * ctypedef npy_ulong uint_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t + */ +typedef npy_ulong __pyx_t_5numpy_uint_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":718 + * + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulonglong_t + * + */ +typedef npy_ulonglong __pyx_t_5numpy_ulong_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":719 + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t # <<<<<<<<<<<<<< + * + * ctypedef npy_intp intp_t + */ +typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":721 + * ctypedef npy_ulonglong ulonglong_t + * + * ctypedef npy_intp intp_t # <<<<<<<<<<<<<< + * ctypedef npy_uintp uintp_t + * + */ +typedef npy_intp __pyx_t_5numpy_intp_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":722 + * + * ctypedef npy_intp intp_t + * ctypedef npy_uintp uintp_t # <<<<<<<<<<<<<< + * + * ctypedef npy_double float_t + */ +typedef npy_uintp __pyx_t_5numpy_uintp_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":724 + * ctypedef npy_uintp uintp_t + * + * ctypedef npy_double float_t # <<<<<<<<<<<<<< + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t + */ +typedef npy_double __pyx_t_5numpy_float_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":725 + * + * ctypedef npy_double float_t + * 
ctypedef npy_double double_t # <<<<<<<<<<<<<< + * ctypedef npy_longdouble longdouble_t + * + */ +typedef npy_double __pyx_t_5numpy_double_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":726 + * ctypedef npy_double float_t + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cfloat cfloat_t + */ +typedef npy_longdouble __pyx_t_5numpy_longdouble_t; +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< float > __pyx_t_float_complex; + #else + typedef float _Complex __pyx_t_float_complex; + #endif +#else + typedef struct { float real, imag; } __pyx_t_float_complex; +#endif +static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float); + +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< double > __pyx_t_double_complex; + #else + typedef double _Complex __pyx_t_double_complex; + #endif +#else + typedef struct { double real, imag; } __pyx_t_double_complex; +#endif +static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double); + + +/*--- Type declarations ---*/ + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":728 + * ctypedef npy_longdouble longdouble_t + * + * ctypedef npy_cfloat cfloat_t # <<<<<<<<<<<<<< + * ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t + */ +typedef npy_cfloat __pyx_t_5numpy_cfloat_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":729 + * + * ctypedef npy_cfloat cfloat_t + * ctypedef npy_cdouble cdouble_t # <<<<<<<<<<<<<< + * ctypedef npy_clongdouble clongdouble_t + * + */ +typedef npy_cdouble __pyx_t_5numpy_cdouble_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":730 + * ctypedef npy_cfloat cfloat_t + * 
ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cdouble complex_t + */ +typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t; + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":732 + * ctypedef npy_clongdouble clongdouble_t + * + * ctypedef npy_cdouble complex_t # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew1(a): + */ +typedef npy_cdouble __pyx_t_5numpy_complex_t; + +/* --- Runtime support code (head) --- */ +/* Refnanny.proto */ +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, PyObject*, int); + void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + if (acquire_gil) {\ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + PyGILState_Release(__pyx_gilstate_save);\ + } else {\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext()\ + __Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) 
__Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + #define __Pyx_XDECREF(r) Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define __Pyx_XGIVEREF(r) +#endif +#define __Pyx_XDECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_XDECREF(tmp);\ + } while (0) +#define __Pyx_DECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_DECREF(tmp);\ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +/* RaiseArgTupleInvalid.proto */ +static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact, + Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); + +/* RaiseDoubleKeywords.proto */ +static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name); + +/* ParseKeywords.proto */ +static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\ + PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\ + const char* function_name); + +/* ArgTypeTest.proto */ +#define __Pyx_ArgTypeTest(obj, type, none_allowed, name, exact)\ + ((likely((Py_TYPE(obj) == type) | (none_allowed && (obj 
== Py_None)))) ? 1 :\ + __Pyx__ArgTypeTest(obj, type, name, exact)) +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact); + +/* IsLittleEndian.proto */ +static CYTHON_INLINE int __Pyx_Is_Little_Endian(void); + +/* BufferFormatCheck.proto */ +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts); +static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type); + +/* BufferGetAndValidate.proto */ +#define __Pyx_GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)\ + ((obj == Py_None || obj == NULL) ?\ + (__Pyx_ZeroBuffer(buf), 0) :\ + __Pyx__GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)) +static int __Pyx__GetBufferAndValidate(Py_buffer* buf, PyObject* obj, + __Pyx_TypeInfo* dtype, int flags, int nd, int cast, __Pyx_BufFmt_StackElem* stack); +static void __Pyx_ZeroBuffer(Py_buffer* buf); +static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info); +static Py_ssize_t __Pyx_minusones[] = { -1, -1, -1, -1, -1, -1, -1, -1 }; +static Py_ssize_t __Pyx_zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +/* PyThreadStateGet.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyThreadState_declare PyThreadState *__pyx_tstate; +#define __Pyx_PyThreadState_assign __pyx_tstate = __Pyx_PyThreadState_Current; +#define __Pyx_PyErr_Occurred() __pyx_tstate->curexc_type +#else +#define __Pyx_PyThreadState_declare +#define __Pyx_PyThreadState_assign +#define __Pyx_PyErr_Occurred() PyErr_Occurred() +#endif + +/* PyErrFetchRestore.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL) +#define __Pyx_ErrRestoreWithState(type, value, tb) __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) __Pyx_ErrRestoreInState(__pyx_tstate, type, 
value, tb) +#define __Pyx_ErrFetch(type, value, tb) __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL)) +#else +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#endif +#else +#define __Pyx_PyErr_Clear() PyErr_Clear() +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#define __Pyx_ErrRestoreWithState(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestoreInState(tstate, type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchInState(tstate, type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) PyErr_Fetch(type, value, tb) +#endif + +/* PyObjectGetAttrStr.proto */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) +#endif + +/* GetBuiltinName.proto */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name); + +/* GetTopmostException.proto */ +#if CYTHON_USE_EXC_INFO_STACK +static _PyErr_StackItem * __Pyx_PyErr_GetTopmostException(PyThreadState *tstate); +#endif + +/* SaveResetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_ExceptionSave(type, value, tb) __Pyx__ExceptionSave(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#define __Pyx_ExceptionReset(type, value, tb) __Pyx__ExceptionReset(__pyx_tstate, type, value, tb) +static 
CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +#else +#define __Pyx_ExceptionSave(type, value, tb) PyErr_GetExcInfo(type, value, tb) +#define __Pyx_ExceptionReset(type, value, tb) PyErr_SetExcInfo(type, value, tb) +#endif + +/* PyErrExceptionMatches.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err) +static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err); +#else +#define __Pyx_PyErr_ExceptionMatches(err) PyErr_ExceptionMatches(err) +#endif + +/* GetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_GetException(type, value, tb) __Pyx__GetException(__pyx_tstate, type, value, tb) +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb); +#endif + +/* PyObjectCall.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); +#else +#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) +#endif + +/* RaiseException.proto */ +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause); + +/* TypeImport.proto */ +#ifndef __PYX_HAVE_RT_ImportType_proto +#define __PYX_HAVE_RT_ImportType_proto +enum __Pyx_ImportType_CheckSize { + __Pyx_ImportType_CheckSize_Error = 0, + __Pyx_ImportType_CheckSize_Warn = 1, + __Pyx_ImportType_CheckSize_Ignore = 2 +}; +static PyTypeObject *__Pyx_ImportType(PyObject* module, const char *module_name, const char *class_name, size_t size, enum __Pyx_ImportType_CheckSize check_size); +#endif + +/* Import.proto */ +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); + +/* PyDictVersioning.proto */ +#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS +#define 
__PYX_DICT_VERSION_INIT ((PY_UINT64_T) -1) +#define __PYX_GET_DICT_VERSION(dict) (((PyDictObject*)(dict))->ma_version_tag) +#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)\ + (version_var) = __PYX_GET_DICT_VERSION(dict);\ + (cache_var) = (value); +#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) {\ + static PY_UINT64_T __pyx_dict_version = 0;\ + static PyObject *__pyx_dict_cached_value = NULL;\ + if (likely(__PYX_GET_DICT_VERSION(DICT) == __pyx_dict_version)) {\ + (VAR) = __pyx_dict_cached_value;\ + } else {\ + (VAR) = __pyx_dict_cached_value = (LOOKUP);\ + __pyx_dict_version = __PYX_GET_DICT_VERSION(DICT);\ + }\ +} +static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj); +static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj); +static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version); +#else +#define __PYX_GET_DICT_VERSION(dict) (0) +#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var) +#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) (VAR) = (LOOKUP); +#endif + +/* CLineInTraceback.proto */ +#ifdef CYTHON_CLINE_IN_TRACEBACK +#define __Pyx_CLineForTraceback(tstate, c_line) (((CYTHON_CLINE_IN_TRACEBACK)) ? 
c_line : 0) +#else +static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line); +#endif + +/* CodeObjectCache.proto */ +typedef struct { + PyCodeObject* code_object; + int code_line; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +/* AddTraceback.proto */ +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); + +/* BufferStructDeclare.proto */ +typedef struct { + Py_ssize_t shape, strides, suboffsets; +} __Pyx_Buf_DimInfo; +typedef struct { + size_t refcount; + Py_buffer pybuffer; +} __Pyx_Buffer; +typedef struct { + __Pyx_Buffer *rcbuffer; + char *data; + __Pyx_Buf_DimInfo diminfo[8]; +} __Pyx_LocalBuf_ND; + +#if PY_MAJOR_VERSION < 3 + static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags); + static void __Pyx_ReleaseBuffer(Py_buffer *view); +#else + #define __Pyx_GetBuffer PyObject_GetBuffer + #define __Pyx_ReleaseBuffer PyBuffer_Release +#endif + + +/* GCCDiagnostics.proto */ +#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) +#define __Pyx_HAS_GCC_DIAGNOSTIC +#endif + +/* RealImag.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + #define __Pyx_CREAL(z) ((z).real()) + #define __Pyx_CIMAG(z) ((z).imag()) + #else + #define __Pyx_CREAL(z) (__real__(z)) + #define __Pyx_CIMAG(z) (__imag__(z)) + #endif +#else + #define __Pyx_CREAL(z) ((z).real) + #define __Pyx_CIMAG(z) ((z).imag) +#endif +#if defined(__cplusplus) && CYTHON_CCOMPLEX\ + && (defined(_WIN32) || defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )) || 
__cplusplus >= 201103) + #define __Pyx_SET_CREAL(z,x) ((z).real(x)) + #define __Pyx_SET_CIMAG(z,y) ((z).imag(y)) +#else + #define __Pyx_SET_CREAL(z,x) __Pyx_CREAL(z) = (x) + #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y) +#endif + +/* Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_float(a, b) ((a)==(b)) + #define __Pyx_c_sum_float(a, b) ((a)+(b)) + #define __Pyx_c_diff_float(a, b) ((a)-(b)) + #define __Pyx_c_prod_float(a, b) ((a)*(b)) + #define __Pyx_c_quot_float(a, b) ((a)/(b)) + #define __Pyx_c_neg_float(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_float(z) ((z)==(float)0) + #define __Pyx_c_conj_float(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_float(z) (::std::abs(z)) + #define __Pyx_c_pow_float(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_float(z) ((z)==0) + #define __Pyx_c_conj_float(z) (conjf(z)) + #if 1 + #define __Pyx_c_abs_float(z) (cabsf(z)) + #define __Pyx_c_pow_float(a, b) (cpowf(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex); + #if 1 + static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex, __pyx_t_float_complex); + #endif +#endif + +/* 
Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_double(a, b) ((a)==(b)) + #define __Pyx_c_sum_double(a, b) ((a)+(b)) + #define __Pyx_c_diff_double(a, b) ((a)-(b)) + #define __Pyx_c_prod_double(a, b) ((a)*(b)) + #define __Pyx_c_quot_double(a, b) ((a)/(b)) + #define __Pyx_c_neg_double(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_double(z) ((z)==(double)0) + #define __Pyx_c_conj_double(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (::std::abs(z)) + #define __Pyx_c_pow_double(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_double(z) ((z)==0) + #define __Pyx_c_conj_double(z) (conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (cabs(z)) + #define __Pyx_c_pow_double(a, b) (cpow(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex); + #if 1 + static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex, __pyx_t_double_complex); + #endif +#endif + +/* CIntFromPy.proto */ +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +/* CIntFromPy.proto */ 
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +/* FastTypeChecks.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type) +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2); +#else +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type) +#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2)) +#endif +#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception) + +/* CheckBinaryVersion.proto */ +static int __Pyx_check_binary_version(void); + +/* InitStrings.proto */ +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); + + +/* Module declarations from 'cpython.buffer' */ + +/* Module declarations from 'libc.string' */ + +/* Module declarations from 'libc.stdio' */ + +/* Module declarations from '__builtin__' */ + +/* Module declarations from 'cpython.type' */ +static PyTypeObject *__pyx_ptype_7cpython_4type_type = 0; + +/* Module declarations from 'cpython' */ + +/* Module declarations from 'cpython.object' */ + +/* Module declarations from 'cpython.ref' */ + +/* Module declarations from 'cpython.mem' */ + +/* Module declarations from 'numpy' */ + +/* Module declarations from 'numpy' */ +static PyTypeObject *__pyx_ptype_5numpy_dtype = 0; +static PyTypeObject *__pyx_ptype_5numpy_flatiter = 0; +static PyTypeObject *__pyx_ptype_5numpy_broadcast = 0; +static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0; +static PyTypeObject *__pyx_ptype_5numpy_generic = 0; +static PyTypeObject *__pyx_ptype_5numpy_number = 0; +static PyTypeObject 
*__pyx_ptype_5numpy_integer = 0; +static PyTypeObject *__pyx_ptype_5numpy_signedinteger = 0; +static PyTypeObject *__pyx_ptype_5numpy_unsignedinteger = 0; +static PyTypeObject *__pyx_ptype_5numpy_inexact = 0; +static PyTypeObject *__pyx_ptype_5numpy_floating = 0; +static PyTypeObject *__pyx_ptype_5numpy_complexfloating = 0; +static PyTypeObject *__pyx_ptype_5numpy_flexible = 0; +static PyTypeObject *__pyx_ptype_5numpy_character = 0; +static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0; +static CYTHON_INLINE int __pyx_f_5numpy_import_array(void); /*proto*/ + +/* Module declarations from 'libcpp.string' */ + +/* Module declarations from 'insightface.thirdparty.face3d.mesh.cython.mesh_core_cython' */ +static std::string __pyx_convert_string_from_py_std__in_string(PyObject *); /*proto*/ +static __Pyx_TypeInfo __Pyx_TypeInfo_float = { "float", NULL, sizeof(float), { 0 }, 0, 'R', 0, 0 }; +static __Pyx_TypeInfo __Pyx_TypeInfo_int = { "int", NULL, sizeof(int), { 0 }, 0, IS_UNSIGNED(int) ? 'U' : 'I', IS_UNSIGNED(int), 0 }; +#define __Pyx_MODULE_NAME "insightface.thirdparty.face3d.mesh.cython.mesh_core_cython" +extern int __pyx_module_is_main_insightface__thirdparty__face3d__mesh__cython__mesh_core_cython; +int __pyx_module_is_main_insightface__thirdparty__face3d__mesh__cython__mesh_core_cython = 0; + +/* Implementation of 'insightface.thirdparty.face3d.mesh.cython.mesh_core_cython' */ +static PyObject *__pyx_builtin_ImportError; +static const char __pyx_k_c[] = "c"; +static const char __pyx_k_h[] = "h"; +static const char __pyx_k_w[] = "w"; +static const char __pyx_k_np[] = "np"; +static const char __pyx_k_main[] = "__main__"; +static const char __pyx_k_name[] = "__name__"; +static const char __pyx_k_ntri[] = "ntri"; +static const char __pyx_k_nver[] = "nver"; +static const char __pyx_k_test[] = "__test__"; +static const char __pyx_k_image[] = "image"; +static const char __pyx_k_numpy[] = "numpy"; +static const char __pyx_k_tex_c[] = "tex_c"; +static const char 
__pyx_k_tex_h[] = "tex_h"; +static const char __pyx_k_tex_w[] = "tex_w"; +static const char __pyx_k_colors[] = "colors"; +static const char __pyx_k_import[] = "__import__"; +static const char __pyx_k_normal[] = "normal"; +static const char __pyx_k_ntexver[] = "ntexver"; +static const char __pyx_k_texture[] = "texture"; +static const char __pyx_k_filename[] = "filename"; +static const char __pyx_k_mtl_name[] = "mtl_name"; +static const char __pyx_k_tex_nver[] = "tex_nver"; +static const char __pyx_k_vertices[] = "vertices"; +static const char __pyx_k_triangles[] = "triangles"; +static const char __pyx_k_uv_coords[] = "uv_coords"; +static const char __pyx_k_tex_coords[] = "tex_coords"; +static const char __pyx_k_tri_normal[] = "tri_normal"; +static const char __pyx_k_ImportError[] = "ImportError"; +static const char __pyx_k_depth_buffer[] = "depth_buffer"; +static const char __pyx_k_mapping_type[] = "mapping_type"; +static const char __pyx_k_tex_triangles[] = "tex_triangles"; +static const char __pyx_k_get_normal_core[] = "get_normal_core"; +static const char __pyx_k_triangle_buffer[] = "triangle_buffer"; +static const char __pyx_k_barycentric_weight[] = "barycentric_weight"; +static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback"; +static const char __pyx_k_render_colors_core[] = "render_colors_core"; +static const char __pyx_k_render_texture_core[] = "render_texture_core"; +static const char __pyx_k_rasterize_triangles_core[] = "rasterize_triangles_core"; +static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multiarray failed to import"; +static const char __pyx_k_insightface_thirdparty_face3d_me[] = "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx"; +static const char __pyx_k_numpy_core_umath_failed_to_impor[] = "numpy.core.umath failed to import"; +static const char __pyx_k_write_obj_with_colors_texture_co[] = "write_obj_with_colors_texture_core"; +static const char __pyx_k_insightface_thirdparty_face3d_me_2[] = 
"insightface.thirdparty.face3d.mesh.cython.mesh_core_cython"; +static PyObject *__pyx_n_s_ImportError; +static PyObject *__pyx_n_s_barycentric_weight; +static PyObject *__pyx_n_s_c; +static PyObject *__pyx_n_s_cline_in_traceback; +static PyObject *__pyx_n_s_colors; +static PyObject *__pyx_n_s_depth_buffer; +static PyObject *__pyx_n_s_filename; +static PyObject *__pyx_n_s_get_normal_core; +static PyObject *__pyx_n_s_h; +static PyObject *__pyx_n_s_image; +static PyObject *__pyx_n_s_import; +static PyObject *__pyx_kp_s_insightface_thirdparty_face3d_me; +static PyObject *__pyx_n_s_insightface_thirdparty_face3d_me_2; +static PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_s_mapping_type; +static PyObject *__pyx_n_s_mtl_name; +static PyObject *__pyx_n_s_name; +static PyObject *__pyx_n_s_normal; +static PyObject *__pyx_n_s_np; +static PyObject *__pyx_n_s_ntexver; +static PyObject *__pyx_n_s_ntri; +static PyObject *__pyx_n_s_numpy; +static PyObject *__pyx_kp_s_numpy_core_multiarray_failed_to; +static PyObject *__pyx_kp_s_numpy_core_umath_failed_to_impor; +static PyObject *__pyx_n_s_nver; +static PyObject *__pyx_n_s_rasterize_triangles_core; +static PyObject *__pyx_n_s_render_colors_core; +static PyObject *__pyx_n_s_render_texture_core; +static PyObject *__pyx_n_s_test; +static PyObject *__pyx_n_s_tex_c; +static PyObject *__pyx_n_s_tex_coords; +static PyObject *__pyx_n_s_tex_h; +static PyObject *__pyx_n_s_tex_nver; +static PyObject *__pyx_n_s_tex_triangles; +static PyObject *__pyx_n_s_tex_w; +static PyObject *__pyx_n_s_texture; +static PyObject *__pyx_n_s_tri_normal; +static PyObject *__pyx_n_s_triangle_buffer; +static PyObject *__pyx_n_s_triangles; +static PyObject *__pyx_n_s_uv_coords; +static PyObject *__pyx_n_s_vertices; +static PyObject *__pyx_n_s_w; +static PyObject *__pyx_n_s_write_obj_with_colors_texture_co; +static PyObject *__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_get_normal_core(CYTHON_UNUSED PyObject *__pyx_self, 
PyArrayObject *__pyx_v_normal, PyArrayObject *__pyx_v_tri_normal, PyArrayObject *__pyx_v_triangles, int __pyx_v_ntri); /* proto */ +static PyObject *__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_2rasterize_triangles_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_depth_buffer, PyArrayObject *__pyx_v_triangle_buffer, PyArrayObject *__pyx_v_barycentric_weight, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w); /* proto */ +static PyObject *__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_4render_colors_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_image, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_colors, PyArrayObject *__pyx_v_depth_buffer, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w, int __pyx_v_c); /* proto */ +static PyObject *__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_6render_texture_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_image, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_texture, PyArrayObject *__pyx_v_tex_coords, PyArrayObject *__pyx_v_tex_triangles, PyArrayObject *__pyx_v_depth_buffer, int __pyx_v_nver, int __pyx_v_tex_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w, int __pyx_v_c, int __pyx_v_tex_h, int __pyx_v_tex_w, int __pyx_v_tex_c, int __pyx_v_mapping_type); /* proto */ +static PyObject *__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_8write_obj_with_colors_texture_core(CYTHON_UNUSED PyObject *__pyx_self, std::string __pyx_v_filename, std::string __pyx_v_mtl_name, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_colors, PyArrayObject *__pyx_v_uv_coords, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_ntexver); /* proto */ +static PyObject 
*__pyx_tuple_; +static PyObject *__pyx_tuple__2; +static PyObject *__pyx_tuple__3; +static PyObject *__pyx_tuple__5; +static PyObject *__pyx_tuple__7; +static PyObject *__pyx_tuple__9; +static PyObject *__pyx_tuple__11; +static PyObject *__pyx_codeobj__4; +static PyObject *__pyx_codeobj__6; +static PyObject *__pyx_codeobj__8; +static PyObject *__pyx_codeobj__10; +static PyObject *__pyx_codeobj__12; +/* Late includes */ + +/* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":40 + * int nver, int ntri, int ntexver) + * + * def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_1get_normal_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_1get_normal_core = {"get_normal_core", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_1get_normal_core, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_1get_normal_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_normal = 0; + PyArrayObject *__pyx_v_tri_normal = 0; + PyArrayObject *__pyx_v_triangles = 0; + int __pyx_v_ntri; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("get_normal_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_normal,&__pyx_n_s_tri_normal,&__pyx_n_s_triangles,&__pyx_n_s_ntri,0}; + PyObject* values[4] = {0,0,0,0}; + if (unlikely(__pyx_kwds)) { + 
Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_normal)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tri_normal)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("get_normal_core", 1, 4, 4, 1); __PYX_ERR(0, 40, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("get_normal_core", 1, 4, 4, 2); __PYX_ERR(0, 40, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("get_normal_core", 1, 4, 4, 3); __PYX_ERR(0, 40, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "get_normal_core") < 0)) __PYX_ERR(0, 40, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + } + __pyx_v_normal = ((PyArrayObject *)values[0]); + __pyx_v_tri_normal = ((PyArrayObject *)values[1]); + __pyx_v_triangles = 
((PyArrayObject *)values[2]); + __pyx_v_ntri = __Pyx_PyInt_As_int(values[3]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 43, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("get_normal_core", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 40, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.get_normal_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_normal), __pyx_ptype_5numpy_ndarray, 0, "normal", 0))) __PYX_ERR(0, 40, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_tri_normal), __pyx_ptype_5numpy_ndarray, 0, "tri_normal", 0))) __PYX_ERR(0, 41, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 42, __pyx_L1_error) + __pyx_r = __pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_get_normal_core(__pyx_self, __pyx_v_normal, __pyx_v_tri_normal, __pyx_v_triangles, __pyx_v_ntri); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_get_normal_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_normal, PyArrayObject *__pyx_v_tri_normal, PyArrayObject *__pyx_v_triangles, int __pyx_v_ntri) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_normal; + __Pyx_Buffer __pyx_pybuffer_normal; + __Pyx_LocalBuf_ND __pyx_pybuffernd_tri_normal; + __Pyx_Buffer __pyx_pybuffer_tri_normal; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int 
__pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("get_normal_core", 0); + __pyx_pybuffer_normal.pybuffer.buf = NULL; + __pyx_pybuffer_normal.refcount = 0; + __pyx_pybuffernd_normal.data = NULL; + __pyx_pybuffernd_normal.rcbuffer = &__pyx_pybuffer_normal; + __pyx_pybuffer_tri_normal.pybuffer.buf = NULL; + __pyx_pybuffer_tri_normal.refcount = 0; + __pyx_pybuffernd_tri_normal.data = NULL; + __pyx_pybuffernd_tri_normal.rcbuffer = &__pyx_pybuffer_tri_normal; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_normal.rcbuffer->pybuffer, (PyObject*)__pyx_v_normal, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 40, __pyx_L1_error) + } + __pyx_pybuffernd_normal.diminfo[0].strides = __pyx_pybuffernd_normal.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_normal.diminfo[0].shape = __pyx_pybuffernd_normal.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_normal.diminfo[1].strides = __pyx_pybuffernd_normal.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_normal.diminfo[1].shape = __pyx_pybuffernd_normal.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_tri_normal.rcbuffer->pybuffer, (PyObject*)__pyx_v_tri_normal, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 40, __pyx_L1_error) + } + __pyx_pybuffernd_tri_normal.diminfo[0].strides = __pyx_pybuffernd_tri_normal.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_tri_normal.diminfo[0].shape = __pyx_pybuffernd_tri_normal.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_tri_normal.diminfo[1].strides = __pyx_pybuffernd_tri_normal.rcbuffer->pybuffer.strides[1]; 
__pyx_pybuffernd_tri_normal.diminfo[1].shape = __pyx_pybuffernd_tri_normal.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 40, __pyx_L1_error) + } + __pyx_pybuffernd_triangles.diminfo[0].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":45 + * int ntri + * ): + * _get_normal_core( # <<<<<<<<<<<<<< + * np.PyArray_DATA(normal), np.PyArray_DATA(tri_normal), np.PyArray_DATA(triangles), + * ntri) + */ + _get_normal_core(((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_normal))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_tri_normal))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangles))), __pyx_v_ntri); + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":40 + * int nver, int ntri, int ntexver) + * + * def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_normal.rcbuffer->pybuffer); + 
__Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tri_normal.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.get_normal_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_normal.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tri_normal.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":49 + * ntri) + * + * def rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_3rasterize_triangles_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_3rasterize_triangles_core = {"rasterize_triangles_core", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_3rasterize_triangles_core, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_3rasterize_triangles_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_vertices = 0; + PyArrayObject *__pyx_v_triangles = 0; + PyArrayObject *__pyx_v_depth_buffer = 0; + PyArrayObject *__pyx_v_triangle_buffer = 0; + PyArrayObject *__pyx_v_barycentric_weight = 0; + int __pyx_v_nver; + int __pyx_v_ntri; + int __pyx_v_h; 
+ int __pyx_v_w; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("rasterize_triangles_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_vertices,&__pyx_n_s_triangles,&__pyx_n_s_depth_buffer,&__pyx_n_s_triangle_buffer,&__pyx_n_s_barycentric_weight,&__pyx_n_s_nver,&__pyx_n_s_ntri,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[9] = {0,0,0,0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + CYTHON_FALLTHROUGH; + case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + CYTHON_FALLTHROUGH; + case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + CYTHON_FALLTHROUGH; + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_vertices)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 1); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_depth_buffer)) != 0)) kw_args--; + else { + 
__Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 2); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangle_buffer)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 3); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_barycentric_weight)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 4); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 5); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 6: + if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 6); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 7: + if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 7); __PYX_ERR(0, 49, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 8: + if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, 8); __PYX_ERR(0, 49, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "rasterize_triangles_core") < 0)) __PYX_ERR(0, 49, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 9) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = 
PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + } + __pyx_v_vertices = ((PyArrayObject *)values[0]); + __pyx_v_triangles = ((PyArrayObject *)values[1]); + __pyx_v_depth_buffer = ((PyArrayObject *)values[2]); + __pyx_v_triangle_buffer = ((PyArrayObject *)values[3]); + __pyx_v_barycentric_weight = ((PyArrayObject *)values[4]); + __pyx_v_nver = __Pyx_PyInt_As_int(values[5]); if (unlikely((__pyx_v_nver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 55, __pyx_L3_error) + __pyx_v_ntri = __Pyx_PyInt_As_int(values[6]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 55, __pyx_L3_error) + __pyx_v_h = __Pyx_PyInt_As_int(values[7]); if (unlikely((__pyx_v_h == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 56, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_int(values[8]); if (unlikely((__pyx_v_w == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 56, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("rasterize_triangles_core", 1, 9, 9, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 49, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.rasterize_triangles_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_vertices), __pyx_ptype_5numpy_ndarray, 0, "vertices", 0))) __PYX_ERR(0, 50, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 51, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_depth_buffer), 
__pyx_ptype_5numpy_ndarray, 0, "depth_buffer", 0))) __PYX_ERR(0, 52, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangle_buffer), __pyx_ptype_5numpy_ndarray, 0, "triangle_buffer", 0))) __PYX_ERR(0, 53, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_barycentric_weight), __pyx_ptype_5numpy_ndarray, 0, "barycentric_weight", 0))) __PYX_ERR(0, 54, __pyx_L1_error) + __pyx_r = __pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_2rasterize_triangles_core(__pyx_self, __pyx_v_vertices, __pyx_v_triangles, __pyx_v_depth_buffer, __pyx_v_triangle_buffer, __pyx_v_barycentric_weight, __pyx_v_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_2rasterize_triangles_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_depth_buffer, PyArrayObject *__pyx_v_triangle_buffer, PyArrayObject *__pyx_v_barycentric_weight, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_barycentric_weight; + __Pyx_Buffer __pyx_pybuffer_barycentric_weight; + __Pyx_LocalBuf_ND __pyx_pybuffernd_depth_buffer; + __Pyx_Buffer __pyx_pybuffer_depth_buffer; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangle_buffer; + __Pyx_Buffer __pyx_pybuffer_triangle_buffer; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_vertices; + __Pyx_Buffer __pyx_pybuffer_vertices; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("rasterize_triangles_core", 0); + __pyx_pybuffer_vertices.pybuffer.buf = NULL; + 
__pyx_pybuffer_vertices.refcount = 0; + __pyx_pybuffernd_vertices.data = NULL; + __pyx_pybuffernd_vertices.rcbuffer = &__pyx_pybuffer_vertices; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + __pyx_pybuffer_depth_buffer.pybuffer.buf = NULL; + __pyx_pybuffer_depth_buffer.refcount = 0; + __pyx_pybuffernd_depth_buffer.data = NULL; + __pyx_pybuffernd_depth_buffer.rcbuffer = &__pyx_pybuffer_depth_buffer; + __pyx_pybuffer_triangle_buffer.pybuffer.buf = NULL; + __pyx_pybuffer_triangle_buffer.refcount = 0; + __pyx_pybuffernd_triangle_buffer.data = NULL; + __pyx_pybuffernd_triangle_buffer.rcbuffer = &__pyx_pybuffer_triangle_buffer; + __pyx_pybuffer_barycentric_weight.pybuffer.buf = NULL; + __pyx_pybuffer_barycentric_weight.refcount = 0; + __pyx_pybuffernd_barycentric_weight.data = NULL; + __pyx_pybuffernd_barycentric_weight.rcbuffer = &__pyx_pybuffer_barycentric_weight; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer, (PyObject*)__pyx_v_vertices, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) + } + __pyx_pybuffernd_vertices.diminfo[0].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_vertices.diminfo[0].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_vertices.diminfo[1].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_vertices.diminfo[1].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) 
+ } + __pyx_pybuffernd_triangles.diminfo[0].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer, (PyObject*)__pyx_v_depth_buffer, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) + } + __pyx_pybuffernd_depth_buffer.diminfo[0].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_depth_buffer.diminfo[0].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_depth_buffer.diminfo[1].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_depth_buffer.diminfo[1].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangle_buffer, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) + } + __pyx_pybuffernd_triangle_buffer.diminfo[0].strides = __pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangle_buffer.diminfo[0].shape = __pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangle_buffer.diminfo[1].strides = __pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangle_buffer.diminfo[1].shape = __pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if 
(unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer, (PyObject*)__pyx_v_barycentric_weight, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 49, __pyx_L1_error) + } + __pyx_pybuffernd_barycentric_weight.diminfo[0].strides = __pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_barycentric_weight.diminfo[0].shape = __pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_barycentric_weight.diminfo[1].strides = __pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_barycentric_weight.diminfo[1].shape = __pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer.shape[1]; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":58 + * int h, int w + * ): + * _rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), + * np.PyArray_DATA(depth_buffer), np.PyArray_DATA(triangle_buffer), np.PyArray_DATA(barycentric_weight), + */ + _rasterize_triangles_core(((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_vertices))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_depth_buffer))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangle_buffer))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_barycentric_weight))), __pyx_v_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w); + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":49 + * ntri) + * + * def rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, 
&__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.rasterize_triangles_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_barycentric_weight.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangle_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":64 + * h, w) + * + * def render_colors_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_5render_colors_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_5render_colors_core = {"render_colors_core", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_5render_colors_core, METH_VARARGS|METH_KEYWORDS, 0}; 
+static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_5render_colors_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_image = 0; + PyArrayObject *__pyx_v_vertices = 0; + PyArrayObject *__pyx_v_triangles = 0; + PyArrayObject *__pyx_v_colors = 0; + PyArrayObject *__pyx_v_depth_buffer = 0; + int __pyx_v_nver; + int __pyx_v_ntri; + int __pyx_v_h; + int __pyx_v_w; + int __pyx_v_c; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("render_colors_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_image,&__pyx_n_s_vertices,&__pyx_n_s_triangles,&__pyx_n_s_colors,&__pyx_n_s_depth_buffer,&__pyx_n_s_nver,&__pyx_n_s_ntri,&__pyx_n_s_h,&__pyx_n_s_w,&__pyx_n_s_c,0}; + PyObject* values[10] = {0,0,0,0,0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 10: values[9] = PyTuple_GET_ITEM(__pyx_args, 9); + CYTHON_FALLTHROUGH; + case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + CYTHON_FALLTHROUGH; + case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + CYTHON_FALLTHROUGH; + case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + CYTHON_FALLTHROUGH; + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if 
(likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_image)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_vertices)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 1); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 2); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_colors)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 3); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_depth_buffer)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 4); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 5); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 6: + if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 6); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 7: + if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 7); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 8: + if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + 
__Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 8); __PYX_ERR(0, 64, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 9: + if (likely((values[9] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_c)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, 9); __PYX_ERR(0, 64, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "render_colors_core") < 0)) __PYX_ERR(0, 64, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 10) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + values[9] = PyTuple_GET_ITEM(__pyx_args, 9); + } + __pyx_v_image = ((PyArrayObject *)values[0]); + __pyx_v_vertices = ((PyArrayObject *)values[1]); + __pyx_v_triangles = ((PyArrayObject *)values[2]); + __pyx_v_colors = ((PyArrayObject *)values[3]); + __pyx_v_depth_buffer = ((PyArrayObject *)values[4]); + __pyx_v_nver = __Pyx_PyInt_As_int(values[5]); if (unlikely((__pyx_v_nver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 69, __pyx_L3_error) + __pyx_v_ntri = __Pyx_PyInt_As_int(values[6]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 69, __pyx_L3_error) + __pyx_v_h = __Pyx_PyInt_As_int(values[7]); if (unlikely((__pyx_v_h == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 70, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_int(values[8]); if (unlikely((__pyx_v_w == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 70, __pyx_L3_error) + __pyx_v_c = __Pyx_PyInt_As_int(values[9]); if (unlikely((__pyx_v_c == (int)-1) && 
PyErr_Occurred())) __PYX_ERR(0, 70, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("render_colors_core", 1, 10, 10, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 64, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.render_colors_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_image), __pyx_ptype_5numpy_ndarray, 0, "image", 0))) __PYX_ERR(0, 64, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_vertices), __pyx_ptype_5numpy_ndarray, 0, "vertices", 0))) __PYX_ERR(0, 65, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 66, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_colors), __pyx_ptype_5numpy_ndarray, 0, "colors", 0))) __PYX_ERR(0, 67, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_depth_buffer), __pyx_ptype_5numpy_ndarray, 0, "depth_buffer", 0))) __PYX_ERR(0, 68, __pyx_L1_error) + __pyx_r = __pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_4render_colors_core(__pyx_self, __pyx_v_image, __pyx_v_vertices, __pyx_v_triangles, __pyx_v_colors, __pyx_v_depth_buffer, __pyx_v_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w, __pyx_v_c); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_4render_colors_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_image, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_colors, PyArrayObject *__pyx_v_depth_buffer, int __pyx_v_nver, 
int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w, int __pyx_v_c) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_colors; + __Pyx_Buffer __pyx_pybuffer_colors; + __Pyx_LocalBuf_ND __pyx_pybuffernd_depth_buffer; + __Pyx_Buffer __pyx_pybuffer_depth_buffer; + __Pyx_LocalBuf_ND __pyx_pybuffernd_image; + __Pyx_Buffer __pyx_pybuffer_image; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_vertices; + __Pyx_Buffer __pyx_pybuffer_vertices; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("render_colors_core", 0); + __pyx_pybuffer_image.pybuffer.buf = NULL; + __pyx_pybuffer_image.refcount = 0; + __pyx_pybuffernd_image.data = NULL; + __pyx_pybuffernd_image.rcbuffer = &__pyx_pybuffer_image; + __pyx_pybuffer_vertices.pybuffer.buf = NULL; + __pyx_pybuffer_vertices.refcount = 0; + __pyx_pybuffernd_vertices.data = NULL; + __pyx_pybuffernd_vertices.rcbuffer = &__pyx_pybuffer_vertices; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + __pyx_pybuffer_colors.pybuffer.buf = NULL; + __pyx_pybuffer_colors.refcount = 0; + __pyx_pybuffernd_colors.data = NULL; + __pyx_pybuffernd_colors.rcbuffer = &__pyx_pybuffer_colors; + __pyx_pybuffer_depth_buffer.pybuffer.buf = NULL; + __pyx_pybuffer_depth_buffer.refcount = 0; + __pyx_pybuffernd_depth_buffer.data = NULL; + __pyx_pybuffernd_depth_buffer.rcbuffer = &__pyx_pybuffer_depth_buffer; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_image.rcbuffer->pybuffer, (PyObject*)__pyx_v_image, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 3, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_image.diminfo[0].strides = 
__pyx_pybuffernd_image.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_image.diminfo[0].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_image.diminfo[1].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_image.diminfo[1].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[1]; __pyx_pybuffernd_image.diminfo[2].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[2]; __pyx_pybuffernd_image.diminfo[2].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[2]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer, (PyObject*)__pyx_v_vertices, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_vertices.diminfo[0].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_vertices.diminfo[0].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_vertices.diminfo[1].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_vertices.diminfo[1].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_triangles.diminfo[0].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if 
(unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_colors.rcbuffer->pybuffer, (PyObject*)__pyx_v_colors, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_colors.diminfo[0].strides = __pyx_pybuffernd_colors.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_colors.diminfo[0].shape = __pyx_pybuffernd_colors.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_colors.diminfo[1].strides = __pyx_pybuffernd_colors.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_colors.diminfo[1].shape = __pyx_pybuffernd_colors.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer, (PyObject*)__pyx_v_depth_buffer, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 64, __pyx_L1_error) + } + __pyx_pybuffernd_depth_buffer.diminfo[0].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_depth_buffer.diminfo[0].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_depth_buffer.diminfo[1].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_depth_buffer.diminfo[1].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[1]; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":72 + * int h, int w, int c + * ): + * _render_colors_core( # <<<<<<<<<<<<<< + * np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), + * np.PyArray_DATA(colors), + */ + _render_colors_core(((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_image))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_vertices))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_colors))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_depth_buffer))), __pyx_v_nver, __pyx_v_ntri, __pyx_v_h, 
__pyx_v_w, __pyx_v_c); + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":64 + * h, w) + * + * def render_colors_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_colors.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_image.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.render_colors_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_colors.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_image.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":79 + * h, w, c) + * + * def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not 
None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_7render_texture_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_7render_texture_core = {"render_texture_core", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_7render_texture_core, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_7render_texture_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_image = 0; + PyArrayObject *__pyx_v_vertices = 0; + PyArrayObject *__pyx_v_triangles = 0; + PyArrayObject *__pyx_v_texture = 0; + PyArrayObject *__pyx_v_tex_coords = 0; + PyArrayObject *__pyx_v_tex_triangles = 0; + PyArrayObject *__pyx_v_depth_buffer = 0; + int __pyx_v_nver; + int __pyx_v_tex_nver; + int __pyx_v_ntri; + int __pyx_v_h; + int __pyx_v_w; + int __pyx_v_c; + int __pyx_v_tex_h; + int __pyx_v_tex_w; + int __pyx_v_tex_c; + int __pyx_v_mapping_type; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("render_texture_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_image,&__pyx_n_s_vertices,&__pyx_n_s_triangles,&__pyx_n_s_texture,&__pyx_n_s_tex_coords,&__pyx_n_s_tex_triangles,&__pyx_n_s_depth_buffer,&__pyx_n_s_nver,&__pyx_n_s_tex_nver,&__pyx_n_s_ntri,&__pyx_n_s_h,&__pyx_n_s_w,&__pyx_n_s_c,&__pyx_n_s_tex_h,&__pyx_n_s_tex_w,&__pyx_n_s_tex_c,&__pyx_n_s_mapping_type,0}; + PyObject* values[17] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 17: 
values[16] = PyTuple_GET_ITEM(__pyx_args, 16); + CYTHON_FALLTHROUGH; + case 16: values[15] = PyTuple_GET_ITEM(__pyx_args, 15); + CYTHON_FALLTHROUGH; + case 15: values[14] = PyTuple_GET_ITEM(__pyx_args, 14); + CYTHON_FALLTHROUGH; + case 14: values[13] = PyTuple_GET_ITEM(__pyx_args, 13); + CYTHON_FALLTHROUGH; + case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12); + CYTHON_FALLTHROUGH; + case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11); + CYTHON_FALLTHROUGH; + case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10); + CYTHON_FALLTHROUGH; + case 10: values[9] = PyTuple_GET_ITEM(__pyx_args, 9); + CYTHON_FALLTHROUGH; + case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + CYTHON_FALLTHROUGH; + case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + CYTHON_FALLTHROUGH; + case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + CYTHON_FALLTHROUGH; + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_image)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_vertices)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 1); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + 
__Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 2); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_texture)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 3); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_coords)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 4); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 5); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 6: + if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_depth_buffer)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 6); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 7: + if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 7); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 8: + if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 8); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 9: + if (likely((values[9] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 9); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 10: + if (likely((values[10] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + 
__Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 10); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 11: + if (likely((values[11] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 11); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 12: + if (likely((values[12] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_c)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 12); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 13: + if (likely((values[13] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 13); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 14: + if (likely((values[14] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 14); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 15: + if (likely((values[15] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_tex_c)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 15); __PYX_ERR(0, 79, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 16: + if (likely((values[16] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_mapping_type)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, 16); __PYX_ERR(0, 79, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "render_texture_core") < 0)) __PYX_ERR(0, 79, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 17) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = 
PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + values[9] = PyTuple_GET_ITEM(__pyx_args, 9); + values[10] = PyTuple_GET_ITEM(__pyx_args, 10); + values[11] = PyTuple_GET_ITEM(__pyx_args, 11); + values[12] = PyTuple_GET_ITEM(__pyx_args, 12); + values[13] = PyTuple_GET_ITEM(__pyx_args, 13); + values[14] = PyTuple_GET_ITEM(__pyx_args, 14); + values[15] = PyTuple_GET_ITEM(__pyx_args, 15); + values[16] = PyTuple_GET_ITEM(__pyx_args, 16); + } + __pyx_v_image = ((PyArrayObject *)values[0]); + __pyx_v_vertices = ((PyArrayObject *)values[1]); + __pyx_v_triangles = ((PyArrayObject *)values[2]); + __pyx_v_texture = ((PyArrayObject *)values[3]); + __pyx_v_tex_coords = ((PyArrayObject *)values[4]); + __pyx_v_tex_triangles = ((PyArrayObject *)values[5]); + __pyx_v_depth_buffer = ((PyArrayObject *)values[6]); + __pyx_v_nver = __Pyx_PyInt_As_int(values[7]); if (unlikely((__pyx_v_nver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 86, __pyx_L3_error) + __pyx_v_tex_nver = __Pyx_PyInt_As_int(values[8]); if (unlikely((__pyx_v_tex_nver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 86, __pyx_L3_error) + __pyx_v_ntri = __Pyx_PyInt_As_int(values[9]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 86, __pyx_L3_error) + __pyx_v_h = __Pyx_PyInt_As_int(values[10]); if (unlikely((__pyx_v_h == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 87, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_int(values[11]); if (unlikely((__pyx_v_w == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 87, __pyx_L3_error) + __pyx_v_c = __Pyx_PyInt_As_int(values[12]); if (unlikely((__pyx_v_c == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 87, __pyx_L3_error) + __pyx_v_tex_h = __Pyx_PyInt_As_int(values[13]); if (unlikely((__pyx_v_tex_h == 
(int)-1) && PyErr_Occurred())) __PYX_ERR(0, 88, __pyx_L3_error) + __pyx_v_tex_w = __Pyx_PyInt_As_int(values[14]); if (unlikely((__pyx_v_tex_w == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 88, __pyx_L3_error) + __pyx_v_tex_c = __Pyx_PyInt_As_int(values[15]); if (unlikely((__pyx_v_tex_c == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 88, __pyx_L3_error) + __pyx_v_mapping_type = __Pyx_PyInt_As_int(values[16]); if (unlikely((__pyx_v_mapping_type == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 89, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("render_texture_core", 1, 17, 17, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 79, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.render_texture_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_image), __pyx_ptype_5numpy_ndarray, 0, "image", 0))) __PYX_ERR(0, 79, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_vertices), __pyx_ptype_5numpy_ndarray, 0, "vertices", 0))) __PYX_ERR(0, 80, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 81, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_texture), __pyx_ptype_5numpy_ndarray, 0, "texture", 0))) __PYX_ERR(0, 82, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_tex_coords), __pyx_ptype_5numpy_ndarray, 0, "tex_coords", 0))) __PYX_ERR(0, 83, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_tex_triangles), __pyx_ptype_5numpy_ndarray, 0, "tex_triangles", 0))) __PYX_ERR(0, 84, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_depth_buffer), __pyx_ptype_5numpy_ndarray, 0, "depth_buffer", 0))) __PYX_ERR(0, 85, 
__pyx_L1_error) + __pyx_r = __pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_6render_texture_core(__pyx_self, __pyx_v_image, __pyx_v_vertices, __pyx_v_triangles, __pyx_v_texture, __pyx_v_tex_coords, __pyx_v_tex_triangles, __pyx_v_depth_buffer, __pyx_v_nver, __pyx_v_tex_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w, __pyx_v_c, __pyx_v_tex_h, __pyx_v_tex_w, __pyx_v_tex_c, __pyx_v_mapping_type); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_6render_texture_core(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_image, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_texture, PyArrayObject *__pyx_v_tex_coords, PyArrayObject *__pyx_v_tex_triangles, PyArrayObject *__pyx_v_depth_buffer, int __pyx_v_nver, int __pyx_v_tex_nver, int __pyx_v_ntri, int __pyx_v_h, int __pyx_v_w, int __pyx_v_c, int __pyx_v_tex_h, int __pyx_v_tex_w, int __pyx_v_tex_c, int __pyx_v_mapping_type) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_depth_buffer; + __Pyx_Buffer __pyx_pybuffer_depth_buffer; + __Pyx_LocalBuf_ND __pyx_pybuffernd_image; + __Pyx_Buffer __pyx_pybuffer_image; + __Pyx_LocalBuf_ND __pyx_pybuffernd_tex_coords; + __Pyx_Buffer __pyx_pybuffer_tex_coords; + __Pyx_LocalBuf_ND __pyx_pybuffernd_tex_triangles; + __Pyx_Buffer __pyx_pybuffer_tex_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_texture; + __Pyx_Buffer __pyx_pybuffer_texture; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_vertices; + __Pyx_Buffer __pyx_pybuffer_vertices; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("render_texture_core", 0); + 
__pyx_pybuffer_image.pybuffer.buf = NULL; + __pyx_pybuffer_image.refcount = 0; + __pyx_pybuffernd_image.data = NULL; + __pyx_pybuffernd_image.rcbuffer = &__pyx_pybuffer_image; + __pyx_pybuffer_vertices.pybuffer.buf = NULL; + __pyx_pybuffer_vertices.refcount = 0; + __pyx_pybuffernd_vertices.data = NULL; + __pyx_pybuffernd_vertices.rcbuffer = &__pyx_pybuffer_vertices; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + __pyx_pybuffer_texture.pybuffer.buf = NULL; + __pyx_pybuffer_texture.refcount = 0; + __pyx_pybuffernd_texture.data = NULL; + __pyx_pybuffernd_texture.rcbuffer = &__pyx_pybuffer_texture; + __pyx_pybuffer_tex_coords.pybuffer.buf = NULL; + __pyx_pybuffer_tex_coords.refcount = 0; + __pyx_pybuffernd_tex_coords.data = NULL; + __pyx_pybuffernd_tex_coords.rcbuffer = &__pyx_pybuffer_tex_coords; + __pyx_pybuffer_tex_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_tex_triangles.refcount = 0; + __pyx_pybuffernd_tex_triangles.data = NULL; + __pyx_pybuffernd_tex_triangles.rcbuffer = &__pyx_pybuffer_tex_triangles; + __pyx_pybuffer_depth_buffer.pybuffer.buf = NULL; + __pyx_pybuffer_depth_buffer.refcount = 0; + __pyx_pybuffernd_depth_buffer.data = NULL; + __pyx_pybuffernd_depth_buffer.rcbuffer = &__pyx_pybuffer_depth_buffer; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_image.rcbuffer->pybuffer, (PyObject*)__pyx_v_image, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 3, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_image.diminfo[0].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_image.diminfo[0].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_image.diminfo[1].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_image.diminfo[1].shape 
= __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[1]; __pyx_pybuffernd_image.diminfo[2].strides = __pyx_pybuffernd_image.rcbuffer->pybuffer.strides[2]; __pyx_pybuffernd_image.diminfo[2].shape = __pyx_pybuffernd_image.rcbuffer->pybuffer.shape[2]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer, (PyObject*)__pyx_v_vertices, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_vertices.diminfo[0].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_vertices.diminfo[0].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_vertices.diminfo[1].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_vertices.diminfo[1].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_triangles.diminfo[0].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_texture.rcbuffer->pybuffer, (PyObject*)__pyx_v_texture, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 3, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_texture.diminfo[0].strides = 
__pyx_pybuffernd_texture.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_texture.diminfo[0].shape = __pyx_pybuffernd_texture.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_texture.diminfo[1].strides = __pyx_pybuffernd_texture.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_texture.diminfo[1].shape = __pyx_pybuffernd_texture.rcbuffer->pybuffer.shape[1]; __pyx_pybuffernd_texture.diminfo[2].strides = __pyx_pybuffernd_texture.rcbuffer->pybuffer.strides[2]; __pyx_pybuffernd_texture.diminfo[2].shape = __pyx_pybuffernd_texture.rcbuffer->pybuffer.shape[2]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_tex_coords.rcbuffer->pybuffer, (PyObject*)__pyx_v_tex_coords, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_tex_coords.diminfo[0].strides = __pyx_pybuffernd_tex_coords.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_tex_coords.diminfo[0].shape = __pyx_pybuffernd_tex_coords.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_tex_coords.diminfo[1].strides = __pyx_pybuffernd_tex_coords.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_tex_coords.diminfo[1].shape = __pyx_pybuffernd_tex_coords.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_tex_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_tex_triangles.diminfo[0].strides = __pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_tex_triangles.diminfo[0].shape = __pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_tex_triangles.diminfo[1].strides = __pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_tex_triangles.diminfo[1].shape = 
__pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer, (PyObject*)__pyx_v_depth_buffer, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 79, __pyx_L1_error) + } + __pyx_pybuffernd_depth_buffer.diminfo[0].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_depth_buffer.diminfo[0].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_depth_buffer.diminfo[1].strides = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_depth_buffer.diminfo[1].shape = __pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer.shape[1]; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":91 + * int mapping_type + * ): + * _render_texture_core( # <<<<<<<<<<<<<< + * np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), + * np.PyArray_DATA(texture), np.PyArray_DATA(tex_coords), np.PyArray_DATA(tex_triangles), + */ + _render_texture_core(((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_image))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_vertices))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_texture))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_tex_coords))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_tex_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_depth_buffer))), __pyx_v_nver, __pyx_v_tex_nver, __pyx_v_ntri, __pyx_v_h, __pyx_v_w, __pyx_v_c, __pyx_v_tex_h, __pyx_v_tex_w, __pyx_v_tex_c, __pyx_v_mapping_type); + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":79 + * h, w, c) + * + * def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, 
ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_image.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tex_coords.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_texture.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.render_texture_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_depth_buffer.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_image.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tex_coords.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_tex_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_texture.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":100 + * mapping_type) + * + * def write_obj_with_colors_texture_core(string filename, string mtl_name, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, 
mode="c"] triangles not None, + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_9write_obj_with_colors_texture_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_9write_obj_with_colors_texture_core = {"write_obj_with_colors_texture_core", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_9write_obj_with_colors_texture_core, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_9write_obj_with_colors_texture_core(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + std::string __pyx_v_filename; + std::string __pyx_v_mtl_name; + PyArrayObject *__pyx_v_vertices = 0; + PyArrayObject *__pyx_v_triangles = 0; + PyArrayObject *__pyx_v_colors = 0; + PyArrayObject *__pyx_v_uv_coords = 0; + int __pyx_v_nver; + int __pyx_v_ntri; + int __pyx_v_ntexver; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("write_obj_with_colors_texture_core (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_filename,&__pyx_n_s_mtl_name,&__pyx_n_s_vertices,&__pyx_n_s_triangles,&__pyx_n_s_colors,&__pyx_n_s_uv_coords,&__pyx_n_s_nver,&__pyx_n_s_ntri,&__pyx_n_s_ntexver,0}; + PyObject* values[9] = {0,0,0,0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + CYTHON_FALLTHROUGH; + case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + CYTHON_FALLTHROUGH; + case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + CYTHON_FALLTHROUGH; + case 6: values[5] = 
PyTuple_GET_ITEM(__pyx_args, 5); + CYTHON_FALLTHROUGH; + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + CYTHON_FALLTHROUGH; + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + CYTHON_FALLTHROUGH; + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + CYTHON_FALLTHROUGH; + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_filename)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_mtl_name)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 1); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 2: + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_vertices)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 2); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 3: + if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_triangles)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 3); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 4: + if (likely((values[4] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_colors)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 4); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 5: + if (likely((values[5] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_uv_coords)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 5); __PYX_ERR(0, 100, 
__pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 6: + if (likely((values[6] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_nver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 6); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 7: + if (likely((values[7] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntri)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 7); __PYX_ERR(0, 100, __pyx_L3_error) + } + CYTHON_FALLTHROUGH; + case 8: + if (likely((values[8] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ntexver)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, 8); __PYX_ERR(0, 100, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "write_obj_with_colors_texture_core") < 0)) __PYX_ERR(0, 100, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 9) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + values[6] = PyTuple_GET_ITEM(__pyx_args, 6); + values[7] = PyTuple_GET_ITEM(__pyx_args, 7); + values[8] = PyTuple_GET_ITEM(__pyx_args, 8); + } + __pyx_v_filename = __pyx_convert_string_from_py_std__in_string(values[0]); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 100, __pyx_L3_error) + __pyx_v_mtl_name = __pyx_convert_string_from_py_std__in_string(values[1]); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 100, __pyx_L3_error) + __pyx_v_vertices = ((PyArrayObject *)values[2]); + __pyx_v_triangles = ((PyArrayObject *)values[3]); + __pyx_v_colors = ((PyArrayObject *)values[4]); + __pyx_v_uv_coords = ((PyArrayObject 
*)values[5]); + __pyx_v_nver = __Pyx_PyInt_As_int(values[6]); if (unlikely((__pyx_v_nver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 105, __pyx_L3_error) + __pyx_v_ntri = __Pyx_PyInt_As_int(values[7]); if (unlikely((__pyx_v_ntri == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 105, __pyx_L3_error) + __pyx_v_ntexver = __Pyx_PyInt_As_int(values[8]); if (unlikely((__pyx_v_ntexver == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 105, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("write_obj_with_colors_texture_core", 1, 9, 9, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 100, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.write_obj_with_colors_texture_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_vertices), __pyx_ptype_5numpy_ndarray, 0, "vertices", 0))) __PYX_ERR(0, 101, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_triangles), __pyx_ptype_5numpy_ndarray, 0, "triangles", 0))) __PYX_ERR(0, 102, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_colors), __pyx_ptype_5numpy_ndarray, 0, "colors", 0))) __PYX_ERR(0, 103, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_uv_coords), __pyx_ptype_5numpy_ndarray, 0, "uv_coords", 0))) __PYX_ERR(0, 104, __pyx_L1_error) + __pyx_r = __pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_8write_obj_with_colors_texture_core(__pyx_self, __pyx_v_filename, __pyx_v_mtl_name, __pyx_v_vertices, __pyx_v_triangles, __pyx_v_colors, __pyx_v_uv_coords, __pyx_v_nver, __pyx_v_ntri, __pyx_v_ntexver); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject 
*__pyx_pf_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_8write_obj_with_colors_texture_core(CYTHON_UNUSED PyObject *__pyx_self, std::string __pyx_v_filename, std::string __pyx_v_mtl_name, PyArrayObject *__pyx_v_vertices, PyArrayObject *__pyx_v_triangles, PyArrayObject *__pyx_v_colors, PyArrayObject *__pyx_v_uv_coords, int __pyx_v_nver, int __pyx_v_ntri, int __pyx_v_ntexver) { + __Pyx_LocalBuf_ND __pyx_pybuffernd_colors; + __Pyx_Buffer __pyx_pybuffer_colors; + __Pyx_LocalBuf_ND __pyx_pybuffernd_triangles; + __Pyx_Buffer __pyx_pybuffer_triangles; + __Pyx_LocalBuf_ND __pyx_pybuffernd_uv_coords; + __Pyx_Buffer __pyx_pybuffer_uv_coords; + __Pyx_LocalBuf_ND __pyx_pybuffernd_vertices; + __Pyx_Buffer __pyx_pybuffer_vertices; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("write_obj_with_colors_texture_core", 0); + __pyx_pybuffer_vertices.pybuffer.buf = NULL; + __pyx_pybuffer_vertices.refcount = 0; + __pyx_pybuffernd_vertices.data = NULL; + __pyx_pybuffernd_vertices.rcbuffer = &__pyx_pybuffer_vertices; + __pyx_pybuffer_triangles.pybuffer.buf = NULL; + __pyx_pybuffer_triangles.refcount = 0; + __pyx_pybuffernd_triangles.data = NULL; + __pyx_pybuffernd_triangles.rcbuffer = &__pyx_pybuffer_triangles; + __pyx_pybuffer_colors.pybuffer.buf = NULL; + __pyx_pybuffer_colors.refcount = 0; + __pyx_pybuffernd_colors.data = NULL; + __pyx_pybuffernd_colors.rcbuffer = &__pyx_pybuffer_colors; + __pyx_pybuffer_uv_coords.pybuffer.buf = NULL; + __pyx_pybuffer_uv_coords.refcount = 0; + __pyx_pybuffernd_uv_coords.data = NULL; + __pyx_pybuffernd_uv_coords.rcbuffer = &__pyx_pybuffer_uv_coords; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer, (PyObject*)__pyx_v_vertices, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 100, 
__pyx_L1_error) + } + __pyx_pybuffernd_vertices.diminfo[0].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_vertices.diminfo[0].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_vertices.diminfo[1].strides = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_vertices.diminfo[1].shape = __pyx_pybuffernd_vertices.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer, (PyObject*)__pyx_v_triangles, &__Pyx_TypeInfo_int, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 100, __pyx_L1_error) + } + __pyx_pybuffernd_triangles.diminfo[0].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_triangles.diminfo[0].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_triangles.diminfo[1].strides = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_triangles.diminfo[1].shape = __pyx_pybuffernd_triangles.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_colors.rcbuffer->pybuffer, (PyObject*)__pyx_v_colors, &__Pyx_TypeInfo_float, PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 100, __pyx_L1_error) + } + __pyx_pybuffernd_colors.diminfo[0].strides = __pyx_pybuffernd_colors.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_colors.diminfo[0].shape = __pyx_pybuffernd_colors.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_colors.diminfo[1].strides = __pyx_pybuffernd_colors.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_colors.diminfo[1].shape = __pyx_pybuffernd_colors.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_uv_coords.rcbuffer->pybuffer, (PyObject*)__pyx_v_uv_coords, &__Pyx_TypeInfo_float, 
PyBUF_FORMAT| PyBUF_C_CONTIGUOUS, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 100, __pyx_L1_error) + } + __pyx_pybuffernd_uv_coords.diminfo[0].strides = __pyx_pybuffernd_uv_coords.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_uv_coords.diminfo[0].shape = __pyx_pybuffernd_uv_coords.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_uv_coords.diminfo[1].strides = __pyx_pybuffernd_uv_coords.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_uv_coords.diminfo[1].shape = __pyx_pybuffernd_uv_coords.rcbuffer->pybuffer.shape[1]; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":107 + * int nver, int ntri, int ntexver + * ): + * _write_obj_with_colors_texture(filename, mtl_name, # <<<<<<<<<<<<<< + * np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), np.PyArray_DATA(colors), np.PyArray_DATA(uv_coords), + * nver, ntri, ntexver) + */ + _write_obj_with_colors_texture(__pyx_v_filename, __pyx_v_mtl_name, ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_vertices))), ((int *)PyArray_DATA(((PyArrayObject *)__pyx_v_triangles))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_colors))), ((float *)PyArray_DATA(((PyArrayObject *)__pyx_v_uv_coords))), __pyx_v_nver, __pyx_v_ntri, __pyx_v_ntexver); + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":100 + * mapping_type) + * + * def write_obj_with_colors_texture_core(string filename, string mtl_name, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_colors.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + 
__Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_uv_coords.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython.write_obj_with_colors_texture_core", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_colors.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_triangles.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_uv_coords.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_vertices.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":734 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":735 + * + * cdef inline object PyArray_MultiIterNew1(a): + * return PyArray_MultiIterNew(1, a) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew2(a, b): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 735, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* 
"../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":734 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":737 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":738 + * + * cdef inline object PyArray_MultiIterNew2(a, b): + * return PyArray_MultiIterNew(2, a, b) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 738, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":737 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + + /* function exit code */ + __pyx_L1_error:; 
+ __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":740 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":741 + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + * return PyArray_MultiIterNew(3, a, b, c) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 741, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":740 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return 
__pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":743 + * return PyArray_MultiIterNew(3, a, b, c) + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":744 + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + * return PyArray_MultiIterNew(4, a, b, c, d) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 744, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":743 + * return PyArray_MultiIterNew(3, a, b, c) + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":746 + * return PyArray_MultiIterNew(4, a, b, c, 
d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":747 + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + * return PyArray_MultiIterNew(5, a, b, c, d, e) # <<<<<<<<<<<<<< + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 747, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":746 + * return PyArray_MultiIterNew(4, a, b, c, d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":749 + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< + * if 
PyDataType_HASSUBARRAY(d): + * return d.subarray.shape + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__pyx_v_d) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("PyDataType_SHAPE", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":750 + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< + * return d.subarray.shape + * else: + */ + __pyx_t_1 = (PyDataType_HASSUBARRAY(__pyx_v_d) != 0); + if (__pyx_t_1) { + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":751 + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape # <<<<<<<<<<<<<< + * else: + * return () + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject*)__pyx_v_d->subarray->shape)); + __pyx_r = ((PyObject*)__pyx_v_d->subarray->shape); + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":750 + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< + * return d.subarray.shape + * else: + */ + } + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":753 + * return d.subarray.shape + * else: + * return () # <<<<<<<<<<<<<< + * + * + */ + /*else*/ { + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_empty_tuple); + __pyx_r = __pyx_empty_tuple; + goto __pyx_L0; + } + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":749 + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + 
return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":928 + * int _import_umath() except -1 + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * Py_INCREF(base) # important to do this before stealing the reference below! + * PyArray_SetBaseObject(arr, base) + */ + +static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("set_array_base", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":929 + * + * cdef inline void set_array_base(ndarray arr, object base): + * Py_INCREF(base) # important to do this before stealing the reference below! # <<<<<<<<<<<<<< + * PyArray_SetBaseObject(arr, base) + * + */ + Py_INCREF(__pyx_v_base); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":930 + * cdef inline void set_array_base(ndarray arr, object base): + * Py_INCREF(base) # important to do this before stealing the reference below! + * PyArray_SetBaseObject(arr, base) # <<<<<<<<<<<<<< + * + * cdef inline object get_array_base(ndarray arr): + */ + (void)(PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_base)); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":928 + * int _import_umath() except -1 + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * Py_INCREF(base) # important to do this before stealing the reference below! 
+ * PyArray_SetBaseObject(arr, base) + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":932 + * PyArray_SetBaseObject(arr, base) + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * base = PyArray_BASE(arr) + * if base is NULL: + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) { + PyObject *__pyx_v_base; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("get_array_base", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":933 + * + * cdef inline object get_array_base(ndarray arr): + * base = PyArray_BASE(arr) # <<<<<<<<<<<<<< + * if base is NULL: + * return None + */ + __pyx_v_base = PyArray_BASE(__pyx_v_arr); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":934 + * cdef inline object get_array_base(ndarray arr): + * base = PyArray_BASE(arr) + * if base is NULL: # <<<<<<<<<<<<<< + * return None + * return base + */ + __pyx_t_1 = ((__pyx_v_base == NULL) != 0); + if (__pyx_t_1) { + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":935 + * base = PyArray_BASE(arr) + * if base is NULL: + * return None # <<<<<<<<<<<<<< + * return base + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":934 + * cdef inline object get_array_base(ndarray arr): + * base = PyArray_BASE(arr) + * if base is NULL: # <<<<<<<<<<<<<< + * return None + * return base + */ + } + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":936 + * if base is NULL: + * return None + * return base # <<<<<<<<<<<<<< + * + * 
# Versions of the import_* functions which are more suitable for + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject *)__pyx_v_base)); + __pyx_r = ((PyObject *)__pyx_v_base); + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":932 + * PyArray_SetBaseObject(arr, base) + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * base = PyArray_BASE(arr) + * if base is NULL: + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":940 + * # Versions of the import_* functions which are more suitable for + * # Cython code. + * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * __pyx_import_array() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("import_array", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":941 + * # Cython code. 
+ * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * __pyx_import_array() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":942 + * cdef inline int import_array() except -1: + * try: + * __pyx_import_array() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") + */ + __pyx_t_4 = _import_array(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 942, __pyx_L3_error) + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":941 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * __pyx_import_array() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":943 + * try: + * __pyx_import_array() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.multiarray failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 943, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":944 + * __pyx_import_array() + * except Exception: + * raise 
ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 944, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 944, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":941 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * __pyx_import_array() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":940 + * # Versions of the import_* functions which are more suitable for + * # Cython code. 
+ * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * __pyx_import_array() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":946 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("import_umath", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":947 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":948 + * cdef inline int import_umath() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == 
((int)-1))) __PYX_ERR(1, 948, __pyx_L3_error) + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":947 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":949 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 949, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":950 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_ufunc() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 950, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 950, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":947 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + 
__Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":946 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":952 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("import_ufunc", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":953 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* 
"../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":954 + * cdef inline int import_ufunc() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 954, __pyx_L3_error) + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":953 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":955 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 955, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":956 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef extern from *: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 956, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 956, __pyx_L5_except_error) + } + goto 
__pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":953 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":952 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":966 + * + * + * cdef inline bint is_timedelta64_object(object obj): # <<<<<<<<<<<<<< + * """ + * Cython equivalent of `isinstance(obj, np.timedelta64)` + */ + +static CYTHON_INLINE int __pyx_f_5numpy_is_timedelta64_object(PyObject *__pyx_v_obj) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("is_timedelta64_object", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":978 + * bool + * """ + * return PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type) # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = PyObject_TypeCheck(__pyx_v_obj, (&PyTimedeltaArrType_Type)); + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":966 + * + * + * cdef inline bint 
is_timedelta64_object(object obj): # <<<<<<<<<<<<<< + * """ + * Cython equivalent of `isinstance(obj, np.timedelta64)` + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":981 + * + * + * cdef inline bint is_datetime64_object(object obj): # <<<<<<<<<<<<<< + * """ + * Cython equivalent of `isinstance(obj, np.datetime64)` + */ + +static CYTHON_INLINE int __pyx_f_5numpy_is_datetime64_object(PyObject *__pyx_v_obj) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("is_datetime64_object", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":993 + * bool + * """ + * return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type) # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = PyObject_TypeCheck(__pyx_v_obj, (&PyDatetimeArrType_Type)); + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":981 + * + * + * cdef inline bint is_datetime64_object(object obj): # <<<<<<<<<<<<<< + * """ + * Cython equivalent of `isinstance(obj, np.datetime64)` + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":996 + * + * + * cdef inline npy_datetime get_datetime64_value(object obj) nogil: # <<<<<<<<<<<<<< + * """ + * returns the int64 value underlying scalar numpy datetime64 object + */ + +static CYTHON_INLINE npy_datetime __pyx_f_5numpy_get_datetime64_value(PyObject *__pyx_v_obj) { + npy_datetime __pyx_r; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":1003 + * also needed. That can be found using `get_datetime64_unit`. 
+ * """ + * return (obj).obval # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((PyDatetimeScalarObject *)__pyx_v_obj)->obval; + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":996 + * + * + * cdef inline npy_datetime get_datetime64_value(object obj) nogil: # <<<<<<<<<<<<<< + * """ + * returns the int64 value underlying scalar numpy datetime64 object + */ + + /* function exit code */ + __pyx_L0:; + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":1006 + * + * + * cdef inline npy_timedelta get_timedelta64_value(object obj) nogil: # <<<<<<<<<<<<<< + * """ + * returns the int64 value underlying scalar numpy timedelta64 object + */ + +static CYTHON_INLINE npy_timedelta __pyx_f_5numpy_get_timedelta64_value(PyObject *__pyx_v_obj) { + npy_timedelta __pyx_r; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":1010 + * returns the int64 value underlying scalar numpy timedelta64 object + * """ + * return (obj).obval # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = ((PyTimedeltaScalarObject *)__pyx_v_obj)->obval; + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":1006 + * + * + * cdef inline npy_timedelta get_timedelta64_value(object obj) nogil: # <<<<<<<<<<<<<< + * """ + * returns the int64 value underlying scalar numpy timedelta64 object + */ + + /* function exit code */ + __pyx_L0:; + return __pyx_r; +} + +/* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":1013 + * + * + * cdef inline NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil: # <<<<<<<<<<<<<< + * """ + * returns the unit part of the dtype for a numpy datetime64 object. 
+ */ + +static CYTHON_INLINE NPY_DATETIMEUNIT __pyx_f_5numpy_get_datetime64_unit(PyObject *__pyx_v_obj) { + NPY_DATETIMEUNIT __pyx_r; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":1017 + * returns the unit part of the dtype for a numpy datetime64 object. + * """ + * return (obj).obmeta.base # <<<<<<<<<<<<<< + */ + __pyx_r = ((NPY_DATETIMEUNIT)((PyDatetimeScalarObject *)__pyx_v_obj)->obmeta.base); + goto __pyx_L0; + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":1013 + * + * + * cdef inline NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil: # <<<<<<<<<<<<<< + * """ + * returns the unit part of the dtype for a numpy datetime64 object. + */ + + /* function exit code */ + __pyx_L0:; + return __pyx_r; +} + +/* "string.from_py":13 + * + * @cname("__pyx_convert_string_from_py_std__in_string") + * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: # <<<<<<<<<<<<<< + * cdef Py_ssize_t length = 0 + * cdef const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + */ + +static std::string __pyx_convert_string_from_py_std__in_string(PyObject *__pyx_v_o) { + Py_ssize_t __pyx_v_length; + char const *__pyx_v_data; + std::string __pyx_r; + __Pyx_RefNannyDeclarations + char const *__pyx_t_1; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__pyx_convert_string_from_py_std__in_string", 0); + + /* "string.from_py":14 + * @cname("__pyx_convert_string_from_py_std__in_string") + * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: + * cdef Py_ssize_t length = 0 # <<<<<<<<<<<<<< + * cdef const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + * return string(data, length) + */ + __pyx_v_length = 0; + + /* "string.from_py":15 + * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: + * cdef Py_ssize_t length = 0 + * cdef 
const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) # <<<<<<<<<<<<<< + * return string(data, length) + * + */ + __pyx_t_1 = __Pyx_PyObject_AsStringAndSize(__pyx_v_o, (&__pyx_v_length)); if (unlikely(__pyx_t_1 == ((char const *)NULL))) __PYX_ERR(2, 15, __pyx_L1_error) + __pyx_v_data = __pyx_t_1; + + /* "string.from_py":16 + * cdef Py_ssize_t length = 0 + * cdef const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + * return string(data, length) # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = std::string(__pyx_v_data, __pyx_v_length); + goto __pyx_L0; + + /* "string.from_py":13 + * + * @cname("__pyx_convert_string_from_py_std__in_string") + * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: # <<<<<<<<<<<<<< + * cdef Py_ssize_t length = 0 + * cdef const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_AddTraceback("string.from_py.__pyx_convert_string_from_py_std__in_string", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_pretend_to_initialize(&__pyx_r); + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyMethodDef __pyx_methods[] = { + {0, 0, 0, 0} +}; + +#if PY_MAJOR_VERSION >= 3 +#if CYTHON_PEP489_MULTI_PHASE_INIT +static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); /*proto*/ +static int __pyx_pymod_exec_mesh_core_cython(PyObject* module); /*proto*/ +static PyModuleDef_Slot __pyx_moduledef_slots[] = { + {Py_mod_create, (void*)__pyx_pymod_create}, + {Py_mod_exec, (void*)__pyx_pymod_exec_mesh_core_cython}, + {0, NULL} +}; +#endif + +static struct PyModuleDef __pyx_moduledef = { + PyModuleDef_HEAD_INIT, + "mesh_core_cython", + 0, /* m_doc */ + #if CYTHON_PEP489_MULTI_PHASE_INIT + 0, /* m_size */ + #else + -1, /* m_size */ + #endif + __pyx_methods /* m_methods */, + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_moduledef_slots, /* m_slots */ + #else + NULL, /* m_reload */ + #endif + NULL, /* m_traverse */ + NULL, 
/* m_clear */ + NULL /* m_free */ +}; +#endif +#ifndef CYTHON_SMALL_CODE +#if defined(__clang__) + #define CYTHON_SMALL_CODE +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define CYTHON_SMALL_CODE __attribute__((cold)) +#else + #define CYTHON_SMALL_CODE +#endif +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_n_s_ImportError, __pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1}, + {&__pyx_n_s_barycentric_weight, __pyx_k_barycentric_weight, sizeof(__pyx_k_barycentric_weight), 0, 0, 1, 1}, + {&__pyx_n_s_c, __pyx_k_c, sizeof(__pyx_k_c), 0, 0, 1, 1}, + {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1}, + {&__pyx_n_s_colors, __pyx_k_colors, sizeof(__pyx_k_colors), 0, 0, 1, 1}, + {&__pyx_n_s_depth_buffer, __pyx_k_depth_buffer, sizeof(__pyx_k_depth_buffer), 0, 0, 1, 1}, + {&__pyx_n_s_filename, __pyx_k_filename, sizeof(__pyx_k_filename), 0, 0, 1, 1}, + {&__pyx_n_s_get_normal_core, __pyx_k_get_normal_core, sizeof(__pyx_k_get_normal_core), 0, 0, 1, 1}, + {&__pyx_n_s_h, __pyx_k_h, sizeof(__pyx_k_h), 0, 0, 1, 1}, + {&__pyx_n_s_image, __pyx_k_image, sizeof(__pyx_k_image), 0, 0, 1, 1}, + {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, + {&__pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_k_insightface_thirdparty_face3d_me, sizeof(__pyx_k_insightface_thirdparty_face3d_me), 0, 0, 1, 0}, + {&__pyx_n_s_insightface_thirdparty_face3d_me_2, __pyx_k_insightface_thirdparty_face3d_me_2, sizeof(__pyx_k_insightface_thirdparty_face3d_me_2), 0, 0, 1, 1}, + {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_s_mapping_type, __pyx_k_mapping_type, sizeof(__pyx_k_mapping_type), 0, 0, 1, 1}, + {&__pyx_n_s_mtl_name, __pyx_k_mtl_name, sizeof(__pyx_k_mtl_name), 0, 0, 1, 1}, + {&__pyx_n_s_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 1, 1}, + {&__pyx_n_s_normal, __pyx_k_normal, sizeof(__pyx_k_normal), 0, 0, 1, 1}, + 
{&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1}, + {&__pyx_n_s_ntexver, __pyx_k_ntexver, sizeof(__pyx_k_ntexver), 0, 0, 1, 1}, + {&__pyx_n_s_ntri, __pyx_k_ntri, sizeof(__pyx_k_ntri), 0, 0, 1, 1}, + {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1}, + {&__pyx_kp_s_numpy_core_multiarray_failed_to, __pyx_k_numpy_core_multiarray_failed_to, sizeof(__pyx_k_numpy_core_multiarray_failed_to), 0, 0, 1, 0}, + {&__pyx_kp_s_numpy_core_umath_failed_to_impor, __pyx_k_numpy_core_umath_failed_to_impor, sizeof(__pyx_k_numpy_core_umath_failed_to_impor), 0, 0, 1, 0}, + {&__pyx_n_s_nver, __pyx_k_nver, sizeof(__pyx_k_nver), 0, 0, 1, 1}, + {&__pyx_n_s_rasterize_triangles_core, __pyx_k_rasterize_triangles_core, sizeof(__pyx_k_rasterize_triangles_core), 0, 0, 1, 1}, + {&__pyx_n_s_render_colors_core, __pyx_k_render_colors_core, sizeof(__pyx_k_render_colors_core), 0, 0, 1, 1}, + {&__pyx_n_s_render_texture_core, __pyx_k_render_texture_core, sizeof(__pyx_k_render_texture_core), 0, 0, 1, 1}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {&__pyx_n_s_tex_c, __pyx_k_tex_c, sizeof(__pyx_k_tex_c), 0, 0, 1, 1}, + {&__pyx_n_s_tex_coords, __pyx_k_tex_coords, sizeof(__pyx_k_tex_coords), 0, 0, 1, 1}, + {&__pyx_n_s_tex_h, __pyx_k_tex_h, sizeof(__pyx_k_tex_h), 0, 0, 1, 1}, + {&__pyx_n_s_tex_nver, __pyx_k_tex_nver, sizeof(__pyx_k_tex_nver), 0, 0, 1, 1}, + {&__pyx_n_s_tex_triangles, __pyx_k_tex_triangles, sizeof(__pyx_k_tex_triangles), 0, 0, 1, 1}, + {&__pyx_n_s_tex_w, __pyx_k_tex_w, sizeof(__pyx_k_tex_w), 0, 0, 1, 1}, + {&__pyx_n_s_texture, __pyx_k_texture, sizeof(__pyx_k_texture), 0, 0, 1, 1}, + {&__pyx_n_s_tri_normal, __pyx_k_tri_normal, sizeof(__pyx_k_tri_normal), 0, 0, 1, 1}, + {&__pyx_n_s_triangle_buffer, __pyx_k_triangle_buffer, sizeof(__pyx_k_triangle_buffer), 0, 0, 1, 1}, + {&__pyx_n_s_triangles, __pyx_k_triangles, sizeof(__pyx_k_triangles), 0, 0, 1, 1}, + {&__pyx_n_s_uv_coords, __pyx_k_uv_coords, sizeof(__pyx_k_uv_coords), 0, 0, 1, 1}, + 
{&__pyx_n_s_vertices, __pyx_k_vertices, sizeof(__pyx_k_vertices), 0, 0, 1, 1}, + {&__pyx_n_s_w, __pyx_k_w, sizeof(__pyx_k_w), 0, 0, 1, 1}, + {&__pyx_n_s_write_obj_with_colors_texture_co, __pyx_k_write_obj_with_colors_texture_co, sizeof(__pyx_k_write_obj_with_colors_texture_co), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) { + __pyx_builtin_ImportError = __Pyx_GetBuiltinName(__pyx_n_s_ImportError); if (!__pyx_builtin_ImportError) __PYX_ERR(1, 944, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":944 + * __pyx_import_array() + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_s_numpy_core_multiarray_failed_to); if (unlikely(!__pyx_tuple_)) __PYX_ERR(1, 944, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple_); + __Pyx_GIVEREF(__pyx_tuple_); + + /* "../../../../tmp/pip-build-env-cbwohp4_/overlay/lib/python3.8/site-packages/numpy/__init__.pxd":950 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_ufunc() except -1: + */ + __pyx_tuple__2 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(1, 950, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__2); + __Pyx_GIVEREF(__pyx_tuple__2); + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":40 + * int nver, int ntri, int ntexver) + * + * def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, + * 
np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__3 = PyTuple_Pack(4, __pyx_n_s_normal, __pyx_n_s_tri_normal, __pyx_n_s_triangles, __pyx_n_s_ntri); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 40, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__3); + __Pyx_GIVEREF(__pyx_tuple__3); + __pyx_codeobj__4 = (PyObject*)__Pyx_PyCode_New(4, 0, 4, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__3, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_get_normal_core, 40, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__4)) __PYX_ERR(0, 40, __pyx_L1_error) + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":49 + * ntri) + * + * def rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__5 = PyTuple_Pack(9, __pyx_n_s_vertices, __pyx_n_s_triangles, __pyx_n_s_depth_buffer, __pyx_n_s_triangle_buffer, __pyx_n_s_barycentric_weight, __pyx_n_s_nver, __pyx_n_s_ntri, __pyx_n_s_h, __pyx_n_s_w); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 49, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__5); + __Pyx_GIVEREF(__pyx_tuple__5); + __pyx_codeobj__6 = (PyObject*)__Pyx_PyCode_New(9, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__5, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_rasterize_triangles_core, 49, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__6)) __PYX_ERR(0, 49, __pyx_L1_error) + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":64 + * h, w) + * + * def render_colors_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__7 = PyTuple_Pack(10, 
__pyx_n_s_image, __pyx_n_s_vertices, __pyx_n_s_triangles, __pyx_n_s_colors, __pyx_n_s_depth_buffer, __pyx_n_s_nver, __pyx_n_s_ntri, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_c); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(0, 64, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__7); + __Pyx_GIVEREF(__pyx_tuple__7); + __pyx_codeobj__8 = (PyObject*)__Pyx_PyCode_New(10, 0, 10, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__7, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_render_colors_core, 64, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__8)) __PYX_ERR(0, 64, __pyx_L1_error) + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":79 + * h, w, c) + * + * def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__9 = PyTuple_Pack(17, __pyx_n_s_image, __pyx_n_s_vertices, __pyx_n_s_triangles, __pyx_n_s_texture, __pyx_n_s_tex_coords, __pyx_n_s_tex_triangles, __pyx_n_s_depth_buffer, __pyx_n_s_nver, __pyx_n_s_tex_nver, __pyx_n_s_ntri, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_c, __pyx_n_s_tex_h, __pyx_n_s_tex_w, __pyx_n_s_tex_c, __pyx_n_s_mapping_type); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(0, 79, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__9); + __Pyx_GIVEREF(__pyx_tuple__9); + __pyx_codeobj__10 = (PyObject*)__Pyx_PyCode_New(17, 0, 17, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__9, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_render_texture_core, 79, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__10)) __PYX_ERR(0, 79, __pyx_L1_error) + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":100 + * mapping_type) + * + * def write_obj_with_colors_texture_core(string filename, string 
mtl_name, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_tuple__11 = PyTuple_Pack(9, __pyx_n_s_filename, __pyx_n_s_mtl_name, __pyx_n_s_vertices, __pyx_n_s_triangles, __pyx_n_s_colors, __pyx_n_s_uv_coords, __pyx_n_s_nver, __pyx_n_s_ntri, __pyx_n_s_ntexver); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__11); + __Pyx_GIVEREF(__pyx_tuple__11); + __pyx_codeobj__12 = (PyObject*)__Pyx_PyCode_New(9, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__11, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_insightface_thirdparty_face3d_me, __pyx_n_s_write_obj_with_colors_texture_co, 100, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__12)) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + return 0; + __pyx_L1_error:; + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_modinit_global_init_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_variable_export_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_function_export_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_type_init_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_type_import_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_variable_import_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_function_import_code(void); /*proto*/ + +static int __Pyx_modinit_global_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0); + /*--- Global init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int 
__Pyx_modinit_variable_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0); + /*--- Variable export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0); + /*--- Function export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0); + /*--- Type init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_import_code(void) { + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0); + /*--- Type import code ---*/ + __pyx_t_1 = PyImport_ImportModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_t_1)) __PYX_ERR(3, 9, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_ptype_7cpython_4type_type = __Pyx_ImportType(__pyx_t_1, __Pyx_BUILTIN_MODULE_NAME, "type", + #if defined(PYPY_VERSION_NUM) && PYPY_VERSION_NUM < 0x050B0000 + sizeof(PyTypeObject), + #else + sizeof(PyHeapTypeObject), + #endif + __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_7cpython_4type_type) __PYX_ERR(3, 9, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = PyImport_ImportModule("numpy"); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 199, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_ptype_5numpy_dtype = __Pyx_ImportType(__pyx_t_1, "numpy", "dtype", sizeof(PyArray_Descr), __Pyx_ImportType_CheckSize_Ignore); + if (!__pyx_ptype_5numpy_dtype) __PYX_ERR(1, 199, __pyx_L1_error) + __pyx_ptype_5numpy_flatiter = __Pyx_ImportType(__pyx_t_1, "numpy", "flatiter", sizeof(PyArrayIterObject), __Pyx_ImportType_CheckSize_Ignore); + if 
(!__pyx_ptype_5numpy_flatiter) __PYX_ERR(1, 222, __pyx_L1_error) + __pyx_ptype_5numpy_broadcast = __Pyx_ImportType(__pyx_t_1, "numpy", "broadcast", sizeof(PyArrayMultiIterObject), __Pyx_ImportType_CheckSize_Ignore); + if (!__pyx_ptype_5numpy_broadcast) __PYX_ERR(1, 226, __pyx_L1_error) + __pyx_ptype_5numpy_ndarray = __Pyx_ImportType(__pyx_t_1, "numpy", "ndarray", sizeof(PyArrayObject), __Pyx_ImportType_CheckSize_Ignore); + if (!__pyx_ptype_5numpy_ndarray) __PYX_ERR(1, 238, __pyx_L1_error) + __pyx_ptype_5numpy_generic = __Pyx_ImportType(__pyx_t_1, "numpy", "generic", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_generic) __PYX_ERR(1, 770, __pyx_L1_error) + __pyx_ptype_5numpy_number = __Pyx_ImportType(__pyx_t_1, "numpy", "number", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_number) __PYX_ERR(1, 772, __pyx_L1_error) + __pyx_ptype_5numpy_integer = __Pyx_ImportType(__pyx_t_1, "numpy", "integer", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_integer) __PYX_ERR(1, 774, __pyx_L1_error) + __pyx_ptype_5numpy_signedinteger = __Pyx_ImportType(__pyx_t_1, "numpy", "signedinteger", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_signedinteger) __PYX_ERR(1, 776, __pyx_L1_error) + __pyx_ptype_5numpy_unsignedinteger = __Pyx_ImportType(__pyx_t_1, "numpy", "unsignedinteger", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_unsignedinteger) __PYX_ERR(1, 778, __pyx_L1_error) + __pyx_ptype_5numpy_inexact = __Pyx_ImportType(__pyx_t_1, "numpy", "inexact", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_inexact) __PYX_ERR(1, 780, __pyx_L1_error) + __pyx_ptype_5numpy_floating = __Pyx_ImportType(__pyx_t_1, "numpy", "floating", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_floating) __PYX_ERR(1, 782, __pyx_L1_error) + __pyx_ptype_5numpy_complexfloating = 
__Pyx_ImportType(__pyx_t_1, "numpy", "complexfloating", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_complexfloating) __PYX_ERR(1, 784, __pyx_L1_error) + __pyx_ptype_5numpy_flexible = __Pyx_ImportType(__pyx_t_1, "numpy", "flexible", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_flexible) __PYX_ERR(1, 786, __pyx_L1_error) + __pyx_ptype_5numpy_character = __Pyx_ImportType(__pyx_t_1, "numpy", "character", sizeof(PyObject), __Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_5numpy_character) __PYX_ERR(1, 788, __pyx_L1_error) + __pyx_ptype_5numpy_ufunc = __Pyx_ImportType(__pyx_t_1, "numpy", "ufunc", sizeof(PyUFuncObject), __Pyx_ImportType_CheckSize_Ignore); + if (!__pyx_ptype_5numpy_ufunc) __PYX_ERR(1, 826, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_modinit_variable_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0); + /*--- Variable import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0); + /*--- Function import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + + +#ifndef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#elif PY_MAJOR_VERSION < 3 +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" void +#else +#define __Pyx_PyMODINIT_FUNC void +#endif +#else +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" PyObject * +#else +#define __Pyx_PyMODINIT_FUNC PyObject * +#endif +#endif + + +#if PY_MAJOR_VERSION < 3 +__Pyx_PyMODINIT_FUNC initmesh_core_cython(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC initmesh_core_cython(void) +#else 
+__Pyx_PyMODINIT_FUNC PyInit_mesh_core_cython(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC PyInit_mesh_core_cython(void) +#if CYTHON_PEP489_MULTI_PHASE_INIT +{ + return PyModuleDef_Init(&__pyx_moduledef); +} +static CYTHON_SMALL_CODE int __Pyx_check_single_interpreter(void) { + #if PY_VERSION_HEX >= 0x030700A1 + static PY_INT64_T main_interpreter_id = -1; + PY_INT64_T current_id = PyInterpreterState_GetID(PyThreadState_Get()->interp); + if (main_interpreter_id == -1) { + main_interpreter_id = current_id; + return (unlikely(current_id == -1)) ? -1 : 0; + } else if (unlikely(main_interpreter_id != current_id)) + #else + static PyInterpreterState *main_interpreter = NULL; + PyInterpreterState *current_interpreter = PyThreadState_Get()->interp; + if (!main_interpreter) { + main_interpreter = current_interpreter; + } else if (unlikely(main_interpreter != current_interpreter)) + #endif + { + PyErr_SetString( + PyExc_ImportError, + "Interpreter change detected - this module can only be loaded into one interpreter per process."); + return -1; + } + return 0; +} +static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name, int allow_none) { + PyObject *value = PyObject_GetAttrString(spec, from_name); + int result = 0; + if (likely(value)) { + if (allow_none || value != Py_None) { + result = PyDict_SetItemString(moddict, to_name, value); + } + Py_DECREF(value); + } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + } else { + result = -1; + } + return result; +} +static CYTHON_SMALL_CODE PyObject* __pyx_pymod_create(PyObject *spec, CYTHON_UNUSED PyModuleDef *def) { + PyObject *module = NULL, *moddict, *modname; + if (__Pyx_check_single_interpreter()) + return NULL; + if (__pyx_m) + return __Pyx_NewRef(__pyx_m); + modname = PyObject_GetAttrString(spec, "name"); + if (unlikely(!modname)) goto bad; + module = PyModule_NewObject(modname); + Py_DECREF(modname); + if 
(unlikely(!module)) goto bad; + moddict = PyModule_GetDict(module); + if (unlikely(!moddict)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "loader", "__loader__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "origin", "__file__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "parent", "__package__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "submodule_search_locations", "__path__", 0) < 0)) goto bad; + return module; +bad: + Py_XDECREF(module); + return NULL; +} + + +static CYTHON_SMALL_CODE int __pyx_pymod_exec_mesh_core_cython(PyObject *__pyx_pyinit_module) +#endif +#endif +{ + PyObject *__pyx_t_1 = NULL; + int __pyx_t_2; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannyDeclarations + #if CYTHON_PEP489_MULTI_PHASE_INIT + if (__pyx_m) { + if (__pyx_m == __pyx_pyinit_module) return 0; + PyErr_SetString(PyExc_RuntimeError, "Module 'mesh_core_cython' has already been imported. 
Re-initialisation is not supported."); + return -1; + } + #elif PY_MAJOR_VERSION >= 3 + if (__pyx_m) return __Pyx_NewRef(__pyx_m); + #endif + #if CYTHON_REFNANNY +__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); +if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); +} +#endif + __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit_mesh_core_cython(void)", 0); + if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pxy_PyFrame_Initialize_Offsets + __Pxy_PyFrame_Initialize_Offsets(); + #endif + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pyx_CyFunction_USED + if (__pyx_CyFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Coroutine_USED + if (__pyx_Coroutine_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_AsyncGen_USED + if (__pyx_AsyncGen_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_StopAsyncIteration_USED + if (__pyx_StopAsyncIteration_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(WITH_THREAD) && PY_VERSION_HEX < 0x030700F0 && defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + PyEval_InitThreads(); + #endif + /*--- Module creation code ---*/ + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_m 
= __pyx_pyinit_module; + Py_INCREF(__pyx_m); + #else + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4("mesh_core_cython", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_b); + __pyx_cython_runtime = PyImport_AddModule((char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_cython_runtime); + if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + /*--- Initialize various global constants etc. ---*/ + if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + if (__pyx_module_is_main_insightface__thirdparty__face3d__mesh__cython__mesh_core_cython) { + if (PyObject_SetAttr(__pyx_m, __pyx_n_s_name, __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error) + if (!PyDict_GetItemString(modules, "insightface.thirdparty.face3d.mesh.cython.mesh_core_cython")) { + if (unlikely(PyDict_SetItemString(modules, "insightface.thirdparty.face3d.mesh.cython.mesh_core_cython", __pyx_m) < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + } + } + #endif + /*--- Builtin init code ---*/ + if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Constants init code ---*/ + if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + 
/*--- Global type/function init code ---*/ + (void)__Pyx_modinit_global_init_code(); + (void)__Pyx_modinit_variable_export_code(); + (void)__Pyx_modinit_function_export_code(); + (void)__Pyx_modinit_type_init_code(); + if (unlikely(__Pyx_modinit_type_import_code() < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + (void)__Pyx_modinit_variable_import_code(); + (void)__Pyx_modinit_function_import_code(); + /*--- Execution code ---*/ + #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) + if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":1 + * import numpy as np # <<<<<<<<<<<<<< + * cimport numpy as np + * from libcpp.string cimport string + */ + __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":6 + * + * # use the Numpy-C-API from Cython + * np.import_array() # <<<<<<<<<<<<<< + * + * # cdefine the signature of our c function + */ + __pyx_t_2 = __pyx_f_5numpy_import_array(); if (unlikely(__pyx_t_2 == ((int)-1))) __PYX_ERR(0, 6, __pyx_L1_error) + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":40 + * int nver, int ntri, int ntexver) + * + * def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_1get_normal_core, NULL, __pyx_n_s_insightface_thirdparty_face3d_me_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 40, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, 
__pyx_n_s_get_normal_core, __pyx_t_1) < 0) __PYX_ERR(0, 40, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":49 + * ntri) + * + * def rasterize_triangles_core( # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_3rasterize_triangles_core, NULL, __pyx_n_s_insightface_thirdparty_face3d_me_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 49, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_rasterize_triangles_core, __pyx_t_1) < 0) __PYX_ERR(0, 49, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":64 + * h, w) + * + * def render_colors_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_5render_colors_core, NULL, __pyx_n_s_insightface_thirdparty_face3d_me_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 64, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_render_colors_core, __pyx_t_1) < 0) __PYX_ERR(0, 64, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":79 + * h, w, c) + * + * def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_7render_texture_core, 
NULL, __pyx_n_s_insightface_thirdparty_face3d_me_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 79, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_render_texture_core, __pyx_t_1) < 0) __PYX_ERR(0, 79, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":100 + * mapping_type) + * + * def write_obj_with_colors_texture_core(string filename, string mtl_name, # <<<<<<<<<<<<<< + * np.ndarray[float, ndim=2, mode = "c"] vertices not None, + * np.ndarray[int, ndim=2, mode="c"] triangles not None, + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11insightface_10thirdparty_6face3d_4mesh_6cython_16mesh_core_cython_9write_obj_with_colors_texture_core, NULL, __pyx_n_s_insightface_thirdparty_face3d_me_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_write_obj_with_colors_texture_co, __pyx_t_1) < 0) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx":1 + * import numpy as np # <<<<<<<<<<<<<< + * cimport numpy as np + * from libcpp.string cimport string + */ + __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "string.from_py":13 + * + * @cname("__pyx_convert_string_from_py_std__in_string") + * cdef string __pyx_convert_string_from_py_std__in_string(object o) except *: # <<<<<<<<<<<<<< + * cdef Py_ssize_t length = 0 + * cdef const char* data = __Pyx_PyObject_AsStringAndSize(o, &length) + */ + + /*--- Wrapped vars code ---*/ + + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + if (__pyx_m) { + if (__pyx_d) { + __Pyx_AddTraceback("init 
insightface.thirdparty.face3d.mesh.cython.mesh_core_cython", __pyx_clineno, __pyx_lineno, __pyx_filename); + } + Py_CLEAR(__pyx_m); + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init insightface.thirdparty.face3d.mesh.cython.mesh_core_cython"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if CYTHON_PEP489_MULTI_PHASE_INIT + return (__pyx_m != NULL) ? 0 : -1; + #elif PY_MAJOR_VERSION >= 3 + return __pyx_m; + #else + return; + #endif +} + +/* --- Runtime support code --- */ +/* Refnanny */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule(modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, "RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif + +/* RaiseArgTupleInvalid */ +static void __Pyx_RaiseArgtupleInvalid( + const char* func_name, + int exact, + Py_ssize_t num_min, + Py_ssize_t num_max, + Py_ssize_t num_found) +{ + Py_ssize_t num_expected; + const char *more_or_less; + if (num_found < num_min) { + num_expected = num_min; + more_or_less = "at least"; + } else { + num_expected = num_max; + more_or_less = "at most"; + } + if (exact) { + more_or_less = "exactly"; + } + PyErr_Format(PyExc_TypeError, + "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)", + func_name, more_or_less, num_expected, + (num_expected == 1) ? 
"" : "s", num_found); +} + +/* RaiseDoubleKeywords */ +static void __Pyx_RaiseDoubleKeywordsError( + const char* func_name, + PyObject* kw_name) +{ + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION >= 3 + "%s() got multiple values for keyword argument '%U'", func_name, kw_name); + #else + "%s() got multiple values for keyword argument '%s'", func_name, + PyString_AsString(kw_name)); + #endif +} + +/* ParseKeywords */ +static int __Pyx_ParseOptionalKeywords( + PyObject *kwds, + PyObject **argnames[], + PyObject *kwds2, + PyObject *values[], + Py_ssize_t num_pos_args, + const char* function_name) +{ + PyObject *key = 0, *value = 0; + Py_ssize_t pos = 0; + PyObject*** name; + PyObject*** first_kw_arg = argnames + num_pos_args; + while (PyDict_Next(kwds, &pos, &key, &value)) { + name = first_kw_arg; + while (*name && (**name != key)) name++; + if (*name) { + values[name-argnames] = value; + continue; + } + name = first_kw_arg; + #if PY_MAJOR_VERSION < 3 + if (likely(PyString_Check(key))) { + while (*name) { + if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) + && _PyString_Eq(**name, key)) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + if ((**argname == key) || ( + (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key)) + && _PyString_Eq(**argname, key))) { + goto arg_passed_twice; + } + argname++; + } + } + } else + #endif + if (likely(PyUnicode_Check(key))) { + while (*name) { + int cmp = (**name == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (__Pyx_PyUnicode_GET_LENGTH(**name) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 
1 : + #endif + PyUnicode_Compare(**name, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + int cmp = (**argname == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (__Pyx_PyUnicode_GET_LENGTH(**argname) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 : + #endif + PyUnicode_Compare(**argname, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) goto arg_passed_twice; + argname++; + } + } + } else + goto invalid_keyword_type; + if (kwds2) { + if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad; + } else { + goto invalid_keyword; + } + } + return 0; +arg_passed_twice: + __Pyx_RaiseDoubleKeywordsError(function_name, key); + goto bad; +invalid_keyword_type: + PyErr_Format(PyExc_TypeError, + "%.200s() keywords must be strings", function_name); + goto bad; +invalid_keyword: + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION < 3 + "%.200s() got an unexpected keyword argument '%.200s'", + function_name, PyString_AsString(key)); + #else + "%s() got an unexpected keyword argument '%U'", + function_name, key); + #endif +bad: + return -1; +} + +/* ArgTypeTest */ +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact) +{ + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + else if (exact) { + #if PY_MAJOR_VERSION == 2 + if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; + #endif + } + else { + if (likely(__Pyx_TypeCheck(obj, type))) return 1; + } + PyErr_Format(PyExc_TypeError, + "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", + name, type->tp_name, Py_TYPE(obj)->tp_name); + return 0; +} + +/* IsLittleEndian */ +static CYTHON_INLINE int __Pyx_Is_Little_Endian(void) +{ + union { + uint32_t u32; + 
uint8_t u8[4]; + } S; + S.u32 = 0x01020304; + return S.u8[0] == 4; +} + +/* BufferFormatCheck */ +static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type) { + stack[0].field = &ctx->root; + stack[0].parent_offset = 0; + ctx->root.type = type; + ctx->root.name = "buffer dtype"; + ctx->root.offset = 0; + ctx->head = stack; + ctx->head->field = &ctx->root; + ctx->fmt_offset = 0; + ctx->head->parent_offset = 0; + ctx->new_packmode = '@'; + ctx->enc_packmode = '@'; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->is_complex = 0; + ctx->is_valid_array = 0; + ctx->struct_alignment = 0; + while (type->typegroup == 'S') { + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = 0; + type = type->fields->type; + } +} +static int __Pyx_BufFmt_ParseNumber(const char** ts) { + int count; + const char* t = *ts; + if (*t < '0' || *t > '9') { + return -1; + } else { + count = *t++ - '0'; + while (*t >= '0' && *t <= '9') { + count *= 10; + count += *t++ - '0'; + } + } + *ts = t; + return count; +} +static int __Pyx_BufFmt_ExpectNumber(const char **ts) { + int number = __Pyx_BufFmt_ParseNumber(ts); + if (number == -1) + PyErr_Format(PyExc_ValueError,\ + "Does not understand character buffer dtype format string ('%c')", **ts); + return number; +} +static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) { + PyErr_Format(PyExc_ValueError, + "Unexpected format string character: '%c'", ch); +} +static const char* __Pyx_BufFmt_DescribeTypeChar(char ch, int is_complex) { + switch (ch) { + case '?': return "'bool'"; + case 'c': return "'char'"; + case 'b': return "'signed char'"; + case 'B': return "'unsigned char'"; + case 'h': return "'short'"; + case 'H': return "'unsigned short'"; + case 'i': return "'int'"; + case 'I': return "'unsigned int'"; + case 'l': return "'long'"; + case 'L': return "'unsigned long'"; + case 'q': return "'long long'"; + case 'Q': return "'unsigned long long'"; + 
case 'f': return (is_complex ? "'complex float'" : "'float'"); + case 'd': return (is_complex ? "'complex double'" : "'double'"); + case 'g': return (is_complex ? "'complex long double'" : "'long double'"); + case 'T': return "a struct"; + case 'O': return "Python object"; + case 'P': return "a pointer"; + case 's': case 'p': return "a string"; + case 0: return "end"; + default: return "unparseable format string"; + } +} +static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return 2; + case 'i': case 'I': case 'l': case 'L': return 4; + case 'q': case 'Q': return 8; + case 'f': return (is_complex ? 8 : 4); + case 'd': return (is_complex ? 16 : 8); + case 'g': { + PyErr_SetString(PyExc_ValueError, "Python does not define a standard format string size for long double ('g').."); + return 0; + } + case 'O': case 'P': return sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(short); + case 'i': case 'I': return sizeof(int); + case 'l': case 'L': return sizeof(long); + #ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(PY_LONG_LONG); + #endif + case 'f': return sizeof(float) * (is_complex ? 2 : 1); + case 'd': return sizeof(double) * (is_complex ? 2 : 1); + case 'g': return sizeof(long double) * (is_complex ? 
2 : 1); + case 'O': case 'P': return sizeof(void*); + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +typedef struct { char c; short x; } __Pyx_st_short; +typedef struct { char c; int x; } __Pyx_st_int; +typedef struct { char c; long x; } __Pyx_st_long; +typedef struct { char c; float x; } __Pyx_st_float; +typedef struct { char c; double x; } __Pyx_st_double; +typedef struct { char c; long double x; } __Pyx_st_longdouble; +typedef struct { char c; void *x; } __Pyx_st_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_st_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_st_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_st_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_st_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_st_float) - sizeof(float); + case 'd': return sizeof(__Pyx_st_double) - sizeof(double); + case 'g': return sizeof(__Pyx_st_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_st_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +/* These are for computing the padding at the end of the struct to align + on the first member of the struct. This will probably the same as above, + but we don't have any guarantees. 
+ */ +typedef struct { short x; char c; } __Pyx_pad_short; +typedef struct { int x; char c; } __Pyx_pad_int; +typedef struct { long x; char c; } __Pyx_pad_long; +typedef struct { float x; char c; } __Pyx_pad_float; +typedef struct { double x; char c; } __Pyx_pad_double; +typedef struct { long double x; char c; } __Pyx_pad_longdouble; +typedef struct { void *x; char c; } __Pyx_pad_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_pad_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_pad_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_pad_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_pad_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_pad_float) - sizeof(float); + case 'd': return sizeof(__Pyx_pad_double) - sizeof(double); + case 'g': return sizeof(__Pyx_pad_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_pad_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) { + switch (ch) { + case 'c': + return 'H'; + case 'b': case 'h': case 'i': + case 'l': case 'q': case 's': case 'p': + return 'I'; + case '?': case 'B': case 'H': case 'I': case 'L': case 'Q': + return 'U'; + case 'f': case 'd': case 'g': + return (is_complex ? 
'C' : 'R'); + case 'O': + return 'O'; + case 'P': + return 'P'; + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) { + if (ctx->head == NULL || ctx->head->field == &ctx->root) { + const char* expected; + const char* quote; + if (ctx->head == NULL) { + expected = "end"; + quote = ""; + } else { + expected = ctx->head->field->type->name; + quote = "'"; + } + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected %s%s%s but got %s", + quote, expected, quote, + __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex)); + } else { + __Pyx_StructField* field = ctx->head->field; + __Pyx_StructField* parent = (ctx->head - 1)->field; + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected '%s' but got %s in '%s.%s'", + field->type->name, __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex), + parent->type->name, field->name); + } +} +static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) { + char group; + size_t size, offset, arraysize = 1; + if (ctx->enc_type == 0) return 0; + if (ctx->head->field->type->arraysize[0]) { + int i, ndim = 0; + if (ctx->enc_type == 's' || ctx->enc_type == 'p') { + ctx->is_valid_array = ctx->head->field->type->ndim == 1; + ndim = 1; + if (ctx->enc_count != ctx->head->field->type->arraysize[0]) { + PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %zu", + ctx->head->field->type->arraysize[0], ctx->enc_count); + return -1; + } + } + if (!ctx->is_valid_array) { + PyErr_Format(PyExc_ValueError, "Expected %d dimensions, got %d", + ctx->head->field->type->ndim, ndim); + return -1; + } + for (i = 0; i < ctx->head->field->type->ndim; i++) { + arraysize *= ctx->head->field->type->arraysize[i]; + } + ctx->is_valid_array = 0; + ctx->enc_count = 1; + } + group = __Pyx_BufFmt_TypeCharToGroup(ctx->enc_type, ctx->is_complex); + do { + __Pyx_StructField* field = ctx->head->field; + __Pyx_TypeInfo* 
type = field->type; + if (ctx->enc_packmode == '@' || ctx->enc_packmode == '^') { + size = __Pyx_BufFmt_TypeCharToNativeSize(ctx->enc_type, ctx->is_complex); + } else { + size = __Pyx_BufFmt_TypeCharToStandardSize(ctx->enc_type, ctx->is_complex); + } + if (ctx->enc_packmode == '@') { + size_t align_at = __Pyx_BufFmt_TypeCharToAlignment(ctx->enc_type, ctx->is_complex); + size_t align_mod_offset; + if (align_at == 0) return -1; + align_mod_offset = ctx->fmt_offset % align_at; + if (align_mod_offset > 0) ctx->fmt_offset += align_at - align_mod_offset; + if (ctx->struct_alignment == 0) + ctx->struct_alignment = __Pyx_BufFmt_TypeCharToPadding(ctx->enc_type, + ctx->is_complex); + } + if (type->size != size || type->typegroup != group) { + if (type->typegroup == 'C' && type->fields != NULL) { + size_t parent_offset = ctx->head->parent_offset + field->offset; + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = parent_offset; + continue; + } + if ((type->typegroup == 'H' || group == 'H') && type->size == size) { + } else { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + } + offset = ctx->head->parent_offset + field->offset; + if (ctx->fmt_offset != offset) { + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch; next field is at offset %" CYTHON_FORMAT_SSIZE_T "d but %" CYTHON_FORMAT_SSIZE_T "d expected", + (Py_ssize_t)ctx->fmt_offset, (Py_ssize_t)offset); + return -1; + } + ctx->fmt_offset += size; + if (arraysize) + ctx->fmt_offset += (arraysize - 1) * size; + --ctx->enc_count; + while (1) { + if (field == &ctx->root) { + ctx->head = NULL; + if (ctx->enc_count != 0) { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + break; + } + ctx->head->field = ++field; + if (field->type == NULL) { + --ctx->head; + field = ctx->head->field; + continue; + } else if (field->type->typegroup == 'S') { + size_t parent_offset = ctx->head->parent_offset + field->offset; + if (field->type->fields->type == NULL) continue; + field = field->type->fields; 
+ ++ctx->head; + ctx->head->field = field; + ctx->head->parent_offset = parent_offset; + break; + } else { + break; + } + } + } while (ctx->enc_count); + ctx->enc_type = 0; + ctx->is_complex = 0; + return 0; +} +static PyObject * +__pyx_buffmt_parse_array(__Pyx_BufFmt_Context* ctx, const char** tsp) +{ + const char *ts = *tsp; + int i = 0, number, ndim; + ++ts; + if (ctx->new_count != 1) { + PyErr_SetString(PyExc_ValueError, + "Cannot handle repeated arrays in format string"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ndim = ctx->head->field->type->ndim; + while (*ts && *ts != ')') { + switch (*ts) { + case ' ': case '\f': case '\r': case '\n': case '\t': case '\v': continue; + default: break; + } + number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + if (i < ndim && (size_t) number != ctx->head->field->type->arraysize[i]) + return PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %d", + ctx->head->field->type->arraysize[i], number); + if (*ts != ',' && *ts != ')') + return PyErr_Format(PyExc_ValueError, + "Expected a comma in format string, got '%c'", *ts); + if (*ts == ',') ts++; + i++; + } + if (i != ndim) + return PyErr_Format(PyExc_ValueError, "Expected %d dimension(s), got %d", + ctx->head->field->type->ndim, i); + if (!*ts) { + PyErr_SetString(PyExc_ValueError, + "Unexpected end of format string, expected ')'"); + return NULL; + } + ctx->is_valid_array = 1; + ctx->new_count = 1; + *tsp = ++ts; + return Py_None; +} +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts) { + int got_Z = 0; + while (1) { + switch(*ts) { + case 0: + if (ctx->enc_type != 0 && ctx->head == NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + if (ctx->head != NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + return ts; + case ' ': + case '\r': + case '\n': + ++ts; + break; + case 
'<': + if (!__Pyx_Is_Little_Endian()) { + PyErr_SetString(PyExc_ValueError, "Little-endian buffer not supported on big-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '>': + case '!': + if (__Pyx_Is_Little_Endian()) { + PyErr_SetString(PyExc_ValueError, "Big-endian buffer not supported on little-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '=': + case '@': + case '^': + ctx->new_packmode = *ts++; + break; + case 'T': + { + const char* ts_after_sub; + size_t i, struct_count = ctx->new_count; + size_t struct_alignment = ctx->struct_alignment; + ctx->new_count = 1; + ++ts; + if (*ts != '{') { + PyErr_SetString(PyExc_ValueError, "Buffer acquisition: Expected '{' after 'T'"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + ctx->enc_count = 0; + ctx->struct_alignment = 0; + ++ts; + ts_after_sub = ts; + for (i = 0; i != struct_count; ++i) { + ts_after_sub = __Pyx_BufFmt_CheckString(ctx, ts); + if (!ts_after_sub) return NULL; + } + ts = ts_after_sub; + if (struct_alignment) ctx->struct_alignment = struct_alignment; + } + break; + case '}': + { + size_t alignment = ctx->struct_alignment; + ++ts; + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + if (alignment && ctx->fmt_offset % alignment) { + ctx->fmt_offset += alignment - (ctx->fmt_offset % alignment); + } + } + return ts; + case 'x': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->fmt_offset += ctx->new_count; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->enc_packmode = ctx->new_packmode; + ++ts; + break; + case 'Z': + got_Z = 1; + ++ts; + if (*ts != 'f' && *ts != 'd' && *ts != 'g') { + __Pyx_BufFmt_RaiseUnexpectedChar('Z'); + return NULL; + } + CYTHON_FALLTHROUGH; + case '?': case 'c': case 'b': case 'B': case 'h': case 'H': case 'i': case 'I': + case 'l': case 'L': case 'q': case 'Q': + case 'f': case 
'd': case 'g': + case 'O': case 'p': + if ((ctx->enc_type == *ts) && (got_Z == ctx->is_complex) && + (ctx->enc_packmode == ctx->new_packmode) && (!ctx->is_valid_array)) { + ctx->enc_count += ctx->new_count; + ctx->new_count = 1; + got_Z = 0; + ++ts; + break; + } + CYTHON_FALLTHROUGH; + case 's': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_count = ctx->new_count; + ctx->enc_packmode = ctx->new_packmode; + ctx->enc_type = *ts; + ctx->is_complex = got_Z; + ++ts; + ctx->new_count = 1; + got_Z = 0; + break; + case ':': + ++ts; + while(*ts != ':') ++ts; + ++ts; + break; + case '(': + if (!__pyx_buffmt_parse_array(ctx, &ts)) return NULL; + break; + default: + { + int number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + ctx->new_count = (size_t)number; + } + } + } +} + +/* BufferGetAndValidate */ + static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info) { + if (unlikely(info->buf == NULL)) return; + if (info->suboffsets == __Pyx_minusones) info->suboffsets = NULL; + __Pyx_ReleaseBuffer(info); +} +static void __Pyx_ZeroBuffer(Py_buffer* buf) { + buf->buf = NULL; + buf->obj = NULL; + buf->strides = __Pyx_zeros; + buf->shape = __Pyx_zeros; + buf->suboffsets = __Pyx_minusones; +} +static int __Pyx__GetBufferAndValidate( + Py_buffer* buf, PyObject* obj, __Pyx_TypeInfo* dtype, int flags, + int nd, int cast, __Pyx_BufFmt_StackElem* stack) +{ + buf->buf = NULL; + if (unlikely(__Pyx_GetBuffer(obj, buf, flags) == -1)) { + __Pyx_ZeroBuffer(buf); + return -1; + } + if (unlikely(buf->ndim != nd)) { + PyErr_Format(PyExc_ValueError, + "Buffer has wrong number of dimensions (expected %d, got %d)", + nd, buf->ndim); + goto fail; + } + if (!cast) { + __Pyx_BufFmt_Context ctx; + __Pyx_BufFmt_Init(&ctx, stack, dtype); + if (!__Pyx_BufFmt_CheckString(&ctx, buf->format)) goto fail; + } + if (unlikely((size_t)buf->itemsize != dtype->size)) { + PyErr_Format(PyExc_ValueError, + "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "d byte%s) 
does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "d byte%s)", + buf->itemsize, (buf->itemsize > 1) ? "s" : "", + dtype->name, (Py_ssize_t)dtype->size, (dtype->size > 1) ? "s" : ""); + goto fail; + } + if (buf->suboffsets == NULL) buf->suboffsets = __Pyx_minusones; + return 0; +fail:; + __Pyx_SafeReleaseBuffer(buf); + return -1; +} + +/* PyErrFetchRestore */ + #if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + tmp_type = tstate->curexc_type; + tmp_value = tstate->curexc_value; + tmp_tb = tstate->curexc_traceback; + tstate->curexc_type = type; + tstate->curexc_value = value; + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + *type = tstate->curexc_type; + *value = tstate->curexc_value; + *tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +} +#endif + +/* PyObjectGetAttrStr */ + #if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#endif + +/* GetBuiltinName */ + static PyObject *__Pyx_GetBuiltinName(PyObject *name) { + PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name); + if (unlikely(!result)) { + PyErr_Format(PyExc_NameError, +#if PY_MAJOR_VERSION >= 3 + "name '%U' is not defined", name); +#else + "name '%.200s' is not defined", PyString_AS_STRING(name)); +#endif + } + return result; +} + +/* GetTopmostException */ + #if 
CYTHON_USE_EXC_INFO_STACK +static _PyErr_StackItem * +__Pyx_PyErr_GetTopmostException(PyThreadState *tstate) +{ + _PyErr_StackItem *exc_info = tstate->exc_info; + while ((exc_info->exc_type == NULL || exc_info->exc_type == Py_None) && + exc_info->previous_item != NULL) + { + exc_info = exc_info->previous_item; + } + return exc_info; +} +#endif + +/* SaveResetException */ + #if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + #if CYTHON_USE_EXC_INFO_STACK + _PyErr_StackItem *exc_info = __Pyx_PyErr_GetTopmostException(tstate); + *type = exc_info->exc_type; + *value = exc_info->exc_value; + *tb = exc_info->exc_traceback; + #else + *type = tstate->exc_type; + *value = tstate->exc_value; + *tb = tstate->exc_traceback; + #endif + Py_XINCREF(*type); + Py_XINCREF(*value); + Py_XINCREF(*tb); +} +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + #if CYTHON_USE_EXC_INFO_STACK + _PyErr_StackItem *exc_info = tstate->exc_info; + tmp_type = exc_info->exc_type; + tmp_value = exc_info->exc_value; + tmp_tb = exc_info->exc_traceback; + exc_info->exc_type = type; + exc_info->exc_value = value; + exc_info->exc_traceback = tb; + #else + tmp_type = tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = type; + tstate->exc_value = value; + tstate->exc_traceback = tb; + #endif + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +#endif + +/* PyErrExceptionMatches */ + #if CYTHON_FAST_THREAD_STATE +static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(tuple); +#if PY_MAJOR_VERSION >= 3 + for (i=0; icurexc_type; + if (exc_type == err) return 1; + if (unlikely(!exc_type)) return 0; + if (unlikely(PyTuple_Check(err))) + return 
__Pyx_PyErr_ExceptionMatchesTuple(exc_type, err); + return __Pyx_PyErr_GivenExceptionMatches(exc_type, err); +} +#endif + +/* GetException */ + #if CYTHON_FAST_THREAD_STATE +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb) +#endif +{ + PyObject *local_type, *local_value, *local_tb; +#if CYTHON_FAST_THREAD_STATE + PyObject *tmp_type, *tmp_value, *tmp_tb; + local_type = tstate->curexc_type; + local_value = tstate->curexc_value; + local_tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +#else + PyErr_Fetch(&local_type, &local_value, &local_tb); +#endif + PyErr_NormalizeException(&local_type, &local_value, &local_tb); +#if CYTHON_FAST_THREAD_STATE + if (unlikely(tstate->curexc_type)) +#else + if (unlikely(PyErr_Occurred())) +#endif + goto bad; + #if PY_MAJOR_VERSION >= 3 + if (local_tb) { + if (unlikely(PyException_SetTraceback(local_value, local_tb) < 0)) + goto bad; + } + #endif + Py_XINCREF(local_tb); + Py_XINCREF(local_type); + Py_XINCREF(local_value); + *type = local_type; + *value = local_value; + *tb = local_tb; +#if CYTHON_FAST_THREAD_STATE + #if CYTHON_USE_EXC_INFO_STACK + { + _PyErr_StackItem *exc_info = tstate->exc_info; + tmp_type = exc_info->exc_type; + tmp_value = exc_info->exc_value; + tmp_tb = exc_info->exc_traceback; + exc_info->exc_type = local_type; + exc_info->exc_value = local_value; + exc_info->exc_traceback = local_tb; + } + #else + tmp_type = tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = local_type; + tstate->exc_value = local_value; + tstate->exc_traceback = local_tb; + #endif + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +#else + PyErr_SetExcInfo(local_type, local_value, local_tb); +#endif + return 0; +bad: + *type = 0; + *value = 0; + *tb = 0; + 
Py_XDECREF(local_type); + Py_XDECREF(local_value); + Py_XDECREF(local_tb); + return -1; +} + +/* PyObjectCall */ + #if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) { + PyObject *result; + ternaryfunc call = Py_TYPE(func)->tp_call; + if (unlikely(!call)) + return PyObject_Call(func, arg, kw); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = (*call)(func, arg, kw); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* RaiseException */ + #if PY_MAJOR_VERSION < 3 +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, + CYTHON_UNUSED PyObject *cause) { + __Pyx_PyThreadState_declare + Py_XINCREF(type); + if (!value || value == Py_None) + value = NULL; + else + Py_INCREF(value); + if (!tb || tb == Py_None) + tb = NULL; + else { + Py_INCREF(tb); + if (!PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto raise_error; + } + } + if (PyType_Check(type)) { +#if CYTHON_COMPILING_IN_PYPY + if (!value) { + Py_INCREF(Py_None); + value = Py_None; + } +#endif + PyErr_NormalizeException(&type, &value, &tb); + } else { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto raise_error; + } + value = type; + type = (PyObject*) Py_TYPE(type); + Py_INCREF(type); + if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto raise_error; + } + } + __Pyx_PyThreadState_assign + __Pyx_ErrRestore(type, value, tb); + return; +raise_error: + Py_XDECREF(value); + Py_XDECREF(type); + Py_XDECREF(tb); + return; +} +#else +static void 
__Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) { + PyObject* owned_instance = NULL; + if (tb == Py_None) { + tb = 0; + } else if (tb && !PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto bad; + } + if (value == Py_None) + value = 0; + if (PyExceptionInstance_Check(type)) { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto bad; + } + value = type; + type = (PyObject*) Py_TYPE(value); + } else if (PyExceptionClass_Check(type)) { + PyObject *instance_class = NULL; + if (value && PyExceptionInstance_Check(value)) { + instance_class = (PyObject*) Py_TYPE(value); + if (instance_class != type) { + int is_subclass = PyObject_IsSubclass(instance_class, type); + if (!is_subclass) { + instance_class = NULL; + } else if (unlikely(is_subclass == -1)) { + goto bad; + } else { + type = instance_class; + } + } + } + if (!instance_class) { + PyObject *args; + if (!value) + args = PyTuple_New(0); + else if (PyTuple_Check(value)) { + Py_INCREF(value); + args = value; + } else + args = PyTuple_Pack(1, value); + if (!args) + goto bad; + owned_instance = PyObject_Call(type, args, NULL); + Py_DECREF(args); + if (!owned_instance) + goto bad; + value = owned_instance; + if (!PyExceptionInstance_Check(value)) { + PyErr_Format(PyExc_TypeError, + "calling %R should have returned an instance of " + "BaseException, not %R", + type, Py_TYPE(value)); + goto bad; + } + } + } else { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto bad; + } + if (cause) { + PyObject *fixed_cause; + if (cause == Py_None) { + fixed_cause = NULL; + } else if (PyExceptionClass_Check(cause)) { + fixed_cause = PyObject_CallObject(cause, NULL); + if (fixed_cause == NULL) + goto bad; + } else if (PyExceptionInstance_Check(cause)) { + fixed_cause = cause; + Py_INCREF(fixed_cause); + } else { + 
PyErr_SetString(PyExc_TypeError, + "exception causes must derive from " + "BaseException"); + goto bad; + } + PyException_SetCause(value, fixed_cause); + } + PyErr_SetObject(type, value); + if (tb) { +#if CYTHON_COMPILING_IN_PYPY + PyObject *tmp_type, *tmp_value, *tmp_tb; + PyErr_Fetch(&tmp_type, &tmp_value, &tmp_tb); + Py_INCREF(tb); + PyErr_Restore(tmp_type, tmp_value, tb); + Py_XDECREF(tmp_tb); +#else + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject* tmp_tb = tstate->curexc_traceback; + if (tb != tmp_tb) { + Py_INCREF(tb); + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_tb); + } +#endif + } +bad: + Py_XDECREF(owned_instance); + return; +} +#endif + +/* TypeImport */ + #ifndef __PYX_HAVE_RT_ImportType +#define __PYX_HAVE_RT_ImportType +static PyTypeObject *__Pyx_ImportType(PyObject *module, const char *module_name, const char *class_name, + size_t size, enum __Pyx_ImportType_CheckSize check_size) +{ + PyObject *result = 0; + char warning[200]; + Py_ssize_t basicsize; +#ifdef Py_LIMITED_API + PyObject *py_basicsize; +#endif + result = PyObject_GetAttrString(module, class_name); + if (!result) + goto bad; + if (!PyType_Check(result)) { + PyErr_Format(PyExc_TypeError, + "%.200s.%.200s is not a type object", + module_name, class_name); + goto bad; + } +#ifndef Py_LIMITED_API + basicsize = ((PyTypeObject *)result)->tp_basicsize; +#else + py_basicsize = PyObject_GetAttrString(result, "__basicsize__"); + if (!py_basicsize) + goto bad; + basicsize = PyLong_AsSsize_t(py_basicsize); + Py_DECREF(py_basicsize); + py_basicsize = 0; + if (basicsize == (Py_ssize_t)-1 && PyErr_Occurred()) + goto bad; +#endif + if ((size_t)basicsize < size) { + PyErr_Format(PyExc_ValueError, + "%.200s.%.200s size changed, may indicate binary incompatibility. 
" + "Expected %zd from C header, got %zd from PyObject", + module_name, class_name, size, basicsize); + goto bad; + } + if (check_size == __Pyx_ImportType_CheckSize_Error && (size_t)basicsize != size) { + PyErr_Format(PyExc_ValueError, + "%.200s.%.200s size changed, may indicate binary incompatibility. " + "Expected %zd from C header, got %zd from PyObject", + module_name, class_name, size, basicsize); + goto bad; + } + else if (check_size == __Pyx_ImportType_CheckSize_Warn && (size_t)basicsize > size) { + PyOS_snprintf(warning, sizeof(warning), + "%s.%s size changed, may indicate binary incompatibility. " + "Expected %zd from C header, got %zd from PyObject", + module_name, class_name, size, basicsize); + if (PyErr_WarnEx(NULL, warning, 0) < 0) goto bad; + } + return (PyTypeObject *)result; +bad: + Py_XDECREF(result); + return NULL; +} +#endif + +/* Import */ + static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { + PyObject *empty_list = 0; + PyObject *module = 0; + PyObject *global_dict = 0; + PyObject *empty_dict = 0; + PyObject *list; + #if PY_MAJOR_VERSION < 3 + PyObject *py_import; + py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import); + if (!py_import) + goto bad; + #endif + if (from_list) + list = from_list; + else { + empty_list = PyList_New(0); + if (!empty_list) + goto bad; + list = empty_list; + } + global_dict = PyModule_GetDict(__pyx_m); + if (!global_dict) + goto bad; + empty_dict = PyDict_New(); + if (!empty_dict) + goto bad; + { + #if PY_MAJOR_VERSION >= 3 + if (level == -1) { + if ((1) && (strchr(__Pyx_MODULE_NAME, '.'))) { + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, 1); + if (!module) { + if (!PyErr_ExceptionMatches(PyExc_ImportError)) + goto bad; + PyErr_Clear(); + } + } + level = 0; + } + #endif + if (!module) { + #if PY_MAJOR_VERSION < 3 + PyObject *py_level = PyInt_FromLong(level); + if (!py_level) + goto bad; + module = PyObject_CallFunctionObjArgs(py_import, + 
name, global_dict, empty_dict, list, py_level, (PyObject *)NULL); + Py_DECREF(py_level); + #else + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, level); + #endif + } + } +bad: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(py_import); + #endif + Py_XDECREF(empty_list); + Py_XDECREF(empty_dict); + return module; +} + +/* PyDictVersioning */ + #if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj) { + PyObject *dict = Py_TYPE(obj)->tp_dict; + return likely(dict) ? __PYX_GET_DICT_VERSION(dict) : 0; +} +static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj) { + PyObject **dictptr = NULL; + Py_ssize_t offset = Py_TYPE(obj)->tp_dictoffset; + if (offset) { +#if CYTHON_COMPILING_IN_CPYTHON + dictptr = (likely(offset > 0)) ? (PyObject **) ((char *)obj + offset) : _PyObject_GetDictPtr(obj); +#else + dictptr = _PyObject_GetDictPtr(obj); +#endif + } + return (dictptr && *dictptr) ? 
__PYX_GET_DICT_VERSION(*dictptr) : 0; +} +static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version) { + PyObject *dict = Py_TYPE(obj)->tp_dict; + if (unlikely(!dict) || unlikely(tp_dict_version != __PYX_GET_DICT_VERSION(dict))) + return 0; + return obj_dict_version == __Pyx_get_object_dict_version(obj); +} +#endif + +/* CLineInTraceback */ + #ifndef CYTHON_CLINE_IN_TRACEBACK +static int __Pyx_CLineForTraceback(CYTHON_NCP_UNUSED PyThreadState *tstate, int c_line) { + PyObject *use_cline; + PyObject *ptype, *pvalue, *ptraceback; +#if CYTHON_COMPILING_IN_CPYTHON + PyObject **cython_runtime_dict; +#endif + if (unlikely(!__pyx_cython_runtime)) { + return c_line; + } + __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback); +#if CYTHON_COMPILING_IN_CPYTHON + cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime); + if (likely(cython_runtime_dict)) { + __PYX_PY_DICT_LOOKUP_IF_MODIFIED( + use_cline, *cython_runtime_dict, + __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback)) + } else +#endif + { + PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback); + if (use_cline_obj) { + use_cline = PyObject_Not(use_cline_obj) ? 
Py_False : Py_True; + Py_DECREF(use_cline_obj); + } else { + PyErr_Clear(); + use_cline = NULL; + } + } + if (!use_cline) { + c_line = 0; + (void) PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False); + } + else if (use_cline == Py_False || (use_cline != Py_True && PyObject_Not(use_cline) != 0)) { + c_line = 0; + } + __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback); + return c_line; +} +#endif + +/* CodeObjectCache */ + static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + int start = 0, mid = 0, end = count - 1; + if (end >= 0 && code_line > entries[end].code_line) { + return count; + } + while (start < end) { + mid = start + (end - start) / 2; + if (code_line < entries[mid].code_line) { + end = mid; + } else if (code_line > entries[mid].code_line) { + start = mid + 1; + } else { + return mid; + } + } + if (code_line <= entries[mid].code_line) { + return mid; + } else { + return mid + 1; + } +} +static PyCodeObject *__pyx_find_code_object(int code_line) { + PyCodeObject* code_object; + int pos; + if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) { + return NULL; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) { + return NULL; + } + code_object = __pyx_code_cache.entries[pos].code_object; + Py_INCREF(code_object); + return code_object; +} +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { + int pos, i; + __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries; + if (unlikely(!code_line)) { + return; + } + if (unlikely(!entries)) { + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry)); + if (likely(entries)) { + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = 64; + __pyx_code_cache.count = 1; + 
entries[0].code_line = code_line; + entries[0].code_object = code_object; + Py_INCREF(code_object); + } + return; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) { + PyCodeObject* tmp = entries[pos].code_object; + entries[pos].code_object = code_object; + Py_DECREF(tmp); + return; + } + if (__pyx_code_cache.count == __pyx_code_cache.max_count) { + int new_max = __pyx_code_cache.max_count + 64; + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( + __pyx_code_cache.entries, ((size_t)new_max) * sizeof(__Pyx_CodeObjectCacheEntry)); + if (unlikely(!entries)) { + return; + } + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = new_max; + } + for (i=__pyx_code_cache.count; i>pos; i--) { + entries[i] = entries[i-1]; + } + entries[pos].code_line = code_line; + entries[pos].code_object = code_object; + __pyx_code_cache.count++; + Py_INCREF(code_object); +} + +/* AddTraceback */ + #include "compile.h" +#include "frameobject.h" +#include "traceback.h" +#if PY_VERSION_HEX >= 0x030b00a6 + #ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 + #endif + #include "internal/pycore_frame.h" +#endif +static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( + const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = NULL; + PyObject *py_funcname = NULL; + #if PY_MAJOR_VERSION < 3 + PyObject *py_srcfile = NULL; + py_srcfile = PyString_FromString(filename); + if (!py_srcfile) goto bad; + #endif + if (c_line) { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + if (!py_funcname) goto bad; + #else + py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + if (!py_funcname) goto bad; + funcname = PyUnicode_AsUTF8(py_funcname); + if (!funcname) goto bad; + #endif + } + else { + #if 
PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromString(funcname); + if (!py_funcname) goto bad; + #endif + } + #if PY_MAJOR_VERSION < 3 + py_code = __Pyx_PyCode_New( + 0, + 0, + 0, + 0, + 0, + __pyx_empty_bytes, /*PyObject *code,*/ + __pyx_empty_tuple, /*PyObject *consts,*/ + __pyx_empty_tuple, /*PyObject *names,*/ + __pyx_empty_tuple, /*PyObject *varnames,*/ + __pyx_empty_tuple, /*PyObject *freevars,*/ + __pyx_empty_tuple, /*PyObject *cellvars,*/ + py_srcfile, /*PyObject *filename,*/ + py_funcname, /*PyObject *name,*/ + py_line, + __pyx_empty_bytes /*PyObject *lnotab*/ + ); + Py_DECREF(py_srcfile); + #else + py_code = PyCode_NewEmpty(filename, funcname, py_line); + #endif + Py_XDECREF(py_funcname); // XDECREF since it's only set on Py3 if cline + return py_code; +bad: + Py_XDECREF(py_funcname); + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(py_srcfile); + #endif + return NULL; +} +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyFrameObject *py_frame = 0; + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject *ptype, *pvalue, *ptraceback; + if (c_line) { + c_line = __Pyx_CLineForTraceback(tstate, c_line); + } + py_code = __pyx_find_code_object(c_line ? -c_line : py_line); + if (!py_code) { + __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback); + py_code = __Pyx_CreateCodeObjectForTraceback( + funcname, c_line, py_line, filename); + if (!py_code) { + /* If the code object creation fails, then we should clear the + fetched exception references and propagate the new exception */ + Py_XDECREF(ptype); + Py_XDECREF(pvalue); + Py_XDECREF(ptraceback); + goto bad; + } + __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback); + __pyx_insert_code_object(c_line ? 
-c_line : py_line, py_code); + } + py_frame = PyFrame_New( + tstate, /*PyThreadState *tstate,*/ + py_code, /*PyCodeObject *code,*/ + __pyx_d, /*PyObject *globals,*/ + 0 /*PyObject *locals*/ + ); + if (!py_frame) goto bad; + __Pyx_PyFrame_SetLineNumber(py_frame, py_line); + PyTraceBack_Here(py_frame); +bad: + Py_XDECREF(py_code); + Py_XDECREF(py_frame); +} + +#if PY_MAJOR_VERSION < 3 +static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) { + if (PyObject_CheckBuffer(obj)) return PyObject_GetBuffer(obj, view, flags); + PyErr_Format(PyExc_TypeError, "'%.200s' does not have the buffer interface", Py_TYPE(obj)->tp_name); + return -1; +} +static void __Pyx_ReleaseBuffer(Py_buffer *view) { + PyObject *obj = view->obj; + if (!obj) return; + if (PyObject_CheckBuffer(obj)) { + PyBuffer_Release(view); + return; + } + if ((0)) {} + view->obj = NULL; + Py_DECREF(obj); +} +#endif + + + /* CIntFromPyVerify */ + #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0) +#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1) +#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\ + {\ + func_type value = func_value;\ + if (sizeof(target_type) < sizeof(func_type)) {\ + if (unlikely(value != (func_type) (target_type) value)) {\ + func_type zero = 0;\ + if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\ + return (target_type) -1;\ + if (is_unsigned && unlikely(value < zero))\ + goto raise_neg_overflow;\ + else\ + goto raise_overflow;\ + }\ + }\ + return (target_type) value;\ + } + +/* Declarations */ + #if CYTHON_CCOMPLEX + #ifdef __cplusplus + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + return ::std::complex< float >(x, y); + } + #else + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, 
float y) { + return x + y*(__pyx_t_float_complex)_Complex_I; + } + #endif +#else + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + __pyx_t_float_complex z; + z.real = x; + z.imag = y; + return z; + } +#endif + +/* Arithmetic */ + #if CYTHON_CCOMPLEX +#else + static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + return (a.real == b.real) && (a.imag == b.imag); + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real + b.real; + z.imag = a.imag + b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real - b.real; + z.imag = a.imag - b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real * b.real - a.imag * b.imag; + z.imag = a.real * b.imag + a.imag * b.real; + return z; + } + #if 1 + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + if (b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real); + } else if (fabsf(b.real) >= fabsf(b.imag)) { + if (b.real == 0 && b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.imag); + } else { + float r = b.imag / b.real; + float s = (float)(1.0) / (b.real + b.imag * r); + return __pyx_t_float_complex_from_parts( + (a.real + a.imag * r) * s, (a.imag - a.real * r) * s); + } + } else { + float r = b.real / b.imag; + float s = (float)(1.0) / (b.imag + b.real * r); + return __pyx_t_float_complex_from_parts( + (a.real * r + a.imag) * s, (a.imag * r - a.real) * s); + } + } + #else + static CYTHON_INLINE __pyx_t_float_complex 
__Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + if (b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real); + } else { + float denom = b.real * b.real + b.imag * b.imag; + return __pyx_t_float_complex_from_parts( + (a.real * b.real + a.imag * b.imag) / denom, + (a.imag * b.real - a.real * b.imag) / denom); + } + } + #endif + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex a) { + __pyx_t_float_complex z; + z.real = -a.real; + z.imag = -a.imag; + return z; + } + static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex a) { + return (a.real == 0) && (a.imag == 0); + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex a) { + __pyx_t_float_complex z; + z.real = a.real; + z.imag = -a.imag; + return z; + } + #if 1 + static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex z) { + #if !defined(HAVE_HYPOT) || defined(_MSC_VER) + return sqrtf(z.real*z.real + z.imag*z.imag); + #else + return hypotf(z.real, z.imag); + #endif + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + float r, lnr, theta, z_r, z_theta; + if (b.imag == 0 && b.real == (int)b.real) { + if (b.real < 0) { + float denom = a.real * a.real + a.imag * a.imag; + a.real = a.real / denom; + a.imag = -a.imag / denom; + b.real = -b.real; + } + switch ((int)b.real) { + case 0: + z.real = 1; + z.imag = 0; + return z; + case 1: + return a; + case 2: + return __Pyx_c_prod_float(a, a); + case 3: + z = __Pyx_c_prod_float(a, a); + return __Pyx_c_prod_float(z, a); + case 4: + z = __Pyx_c_prod_float(a, a); + return __Pyx_c_prod_float(z, z); + } + } + if (a.imag == 0) { + if (a.real == 0) { + return a; + } else if (b.imag == 0) { + z.real = powf(a.real, b.real); + z.imag = 0; + return z; + } else if (a.real > 0) { + r = a.real; + theta = 0; + } else { + r = -a.real; + theta = 
atan2f(0.0, -1.0); + } + } else { + r = __Pyx_c_abs_float(a); + theta = atan2f(a.imag, a.real); + } + lnr = logf(r); + z_r = expf(lnr * b.real - theta * b.imag); + z_theta = theta * b.real + lnr * b.imag; + z.real = z_r * cosf(z_theta); + z.imag = z_r * sinf(z_theta); + return z; + } + #endif +#endif + +/* Declarations */ + #if CYTHON_CCOMPLEX + #ifdef __cplusplus + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + return ::std::complex< double >(x, y); + } + #else + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + return x + y*(__pyx_t_double_complex)_Complex_I; + } + #endif +#else + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + __pyx_t_double_complex z; + z.real = x; + z.imag = y; + return z; + } +#endif + +/* Arithmetic */ + #if CYTHON_CCOMPLEX +#else + static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + return (a.real == b.real) && (a.imag == b.imag); + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real + b.real; + z.imag = a.imag + b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real - b.real; + z.imag = a.imag - b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real * b.real - a.imag * b.imag; + z.imag = a.real * b.imag + a.imag * b.real; + return z; + } + #if 1 + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + if (b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / 
b.real); + } else if (fabs(b.real) >= fabs(b.imag)) { + if (b.real == 0 && b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.imag); + } else { + double r = b.imag / b.real; + double s = (double)(1.0) / (b.real + b.imag * r); + return __pyx_t_double_complex_from_parts( + (a.real + a.imag * r) * s, (a.imag - a.real * r) * s); + } + } else { + double r = b.real / b.imag; + double s = (double)(1.0) / (b.imag + b.real * r); + return __pyx_t_double_complex_from_parts( + (a.real * r + a.imag) * s, (a.imag * r - a.real) * s); + } + } + #else + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + if (b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real); + } else { + double denom = b.real * b.real + b.imag * b.imag; + return __pyx_t_double_complex_from_parts( + (a.real * b.real + a.imag * b.imag) / denom, + (a.imag * b.real - a.real * b.imag) / denom); + } + } + #endif + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex a) { + __pyx_t_double_complex z; + z.real = -a.real; + z.imag = -a.imag; + return z; + } + static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex a) { + return (a.real == 0) && (a.imag == 0); + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex a) { + __pyx_t_double_complex z; + z.real = a.real; + z.imag = -a.imag; + return z; + } + #if 1 + static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex z) { + #if !defined(HAVE_HYPOT) || defined(_MSC_VER) + return sqrt(z.real*z.real + z.imag*z.imag); + #else + return hypot(z.real, z.imag); + #endif + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + double r, lnr, theta, z_r, z_theta; + if (b.imag == 0 && b.real == (int)b.real) { + if (b.real < 0) { + double denom = a.real * 
a.real + a.imag * a.imag; + a.real = a.real / denom; + a.imag = -a.imag / denom; + b.real = -b.real; + } + switch ((int)b.real) { + case 0: + z.real = 1; + z.imag = 0; + return z; + case 1: + return a; + case 2: + return __Pyx_c_prod_double(a, a); + case 3: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(z, a); + case 4: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(z, z); + } + } + if (a.imag == 0) { + if (a.real == 0) { + return a; + } else if (b.imag == 0) { + z.real = pow(a.real, b.real); + z.imag = 0; + return z; + } else if (a.real > 0) { + r = a.real; + theta = 0; + } else { + r = -a.real; + theta = atan2(0.0, -1.0); + } + } else { + r = __Pyx_c_abs_double(a); + theta = atan2(a.imag, a.real); + } + lnr = log(r); + z_r = exp(lnr * b.real - theta * b.imag); + z_theta = theta * b.real + lnr * b.imag; + z.real = z_r * cos(z_theta); + z.imag = z_r * sin(z_theta); + return z; + } + #endif +#endif + +/* CIntFromPy */ + static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const int neg_one = (int) -1, const_zero = (int) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case 1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0]) + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + 
__PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) { + return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) { + return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) { + return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (int) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return 
(int) 0; + case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(int, digit, +digits[0]) + case -2: + if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, 
-(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + } +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (int) -1; + } + } else { + int val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + 
if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to int"); + return (int) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; +} + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const long neg_one = (long) -1, const_zero = (long) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +/* CIntFromPy */ + static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const long neg_one = (long) -1, const_zero = (long) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + 
__PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case 1: __PYX_VERIFY_RETURN_INT(long, digit, digits[0]) + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 2 * PyLong_SHIFT) { + return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 3 * PyLong_SHIFT) { + return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 4 * PyLong_SHIFT) { + return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = 
PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (long) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case -1: __PYX_VERIFY_RETURN_INT(long, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(long, digit, +digits[0]) + case -2: + if (8 * sizeof(long) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | 
(long)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + } +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + 
PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to long"); + return (long) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; +} + +/* FastTypeChecks */ + #if CYTHON_COMPILING_IN_CPYTHON +static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) { + while (a) { + a = a->tp_base; + if (a == b) + return 1; + } + return b == &PyBaseObject_Type; +} +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) { + PyObject *mro; + if (a == b) return 1; + mro = a->tp_mro; + if (likely(mro)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(mro); + for (i = 0; i < n; i++) { + if (PyTuple_GET_ITEM(mro, i) == (PyObject *)b) + return 1; + } + return 0; + } + return __Pyx_InBases(a, b); +} +#if PY_MAJOR_VERSION == 2 +static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) { + PyObject *exception, *value, *tb; + int res; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&exception, &value, &tb); + res = exc_type1 ? 
PyObject_IsSubclass(err, exc_type1) : 0; + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + if (!res) { + res = PyObject_IsSubclass(err, exc_type2); + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + } + __Pyx_ErrRestore(exception, value, tb); + return res; +} +#else +static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) { + int res = exc_type1 ? __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type1) : 0; + if (!res) { + res = __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type2); + } + return res; +} +#endif +static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + assert(PyExceptionClass_Check(exc_type)); + n = PyTuple_GET_SIZE(tuple); +#if PY_MAJOR_VERSION >= 3 + for (i=0; i '9'); + break; + } + if (rt_from_call[i] != ctversion[i]) { + same = 0; + break; + } + } + if (!same) { + char rtversion[5] = {'\0'}; + char message[200]; + for (i=0; i<4; ++i) { + if (rt_from_call[i] == '.') { + if (found_dot) break; + found_dot = 1; + } else if (rt_from_call[i] < '0' || rt_from_call[i] > '9') { + break; + } + rtversion[i] = rt_from_call[i]; + } + PyOS_snprintf(message, sizeof(message), + "compiletime version %s of module '%.100s' " + "does not match runtime version %s", + ctversion, __Pyx_MODULE_NAME, rtversion); + return PyErr_WarnEx(NULL, message, 1); + } + return 0; +} + +/* InitStrings */ + static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + while (t->p) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, 
NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + if (PyObject_Hash(*t->p) == -1) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, (Py_ssize_t)strlen(c_str)); +} +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +#if !CYTHON_PEP393_ENABLED +static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +} +#else +static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (likely(PyUnicode_IS_ASCII(o))) { + *length = PyUnicode_GET_LENGTH(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else + return PyUnicode_AsUTF8AndSize(o, length); +#endif +} +#endif +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && 
+#endif + PyUnicode_Check(o)) { + return __Pyx_PyUnicode_AsStringAndSize(o, length); + } else +#endif +#if (!CYTHON_COMPILING_IN_PYPY) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE)) + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject* x) { + int retval; + if (unlikely(!x)) return -1; + retval = __Pyx_PyObject_IsTrue(x); + Py_DECREF(x); + return retval; +} +static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) { +#if PY_MAJOR_VERSION >= 3 + if (PyLong_Check(result)) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "__int__ returned non-int (type %.200s). 
" + "The ability to return an instance of a strict subclass of int " + "is deprecated, and may be removed in a future version of Python.", + Py_TYPE(result)->tp_name)) { + Py_DECREF(result); + return NULL; + } + return result; + } +#endif + PyErr_Format(PyExc_TypeError, + "__%.4s__ returned non-%.4s (type %.200s)", + type_name, type_name, Py_TYPE(result)->tp_name); + Py_DECREF(result); + return NULL; +} +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) { +#if CYTHON_USE_TYPE_SLOTS + PyNumberMethods *m; +#endif + const char *name = NULL; + PyObject *res = NULL; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x) || PyLong_Check(x))) +#else + if (likely(PyLong_Check(x))) +#endif + return __Pyx_NewRef(x); +#if CYTHON_USE_TYPE_SLOTS + m = Py_TYPE(x)->tp_as_number; + #if PY_MAJOR_VERSION < 3 + if (m && m->nb_int) { + name = "int"; + res = m->nb_int(x); + } + else if (m && m->nb_long) { + name = "long"; + res = m->nb_long(x); + } + #else + if (likely(m && m->nb_int)) { + name = "int"; + res = m->nb_int(x); + } + #endif +#else + if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) { + res = PyNumber_Int(x); + } +#endif + if (likely(res)) { +#if PY_MAJOR_VERSION < 3 + if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) { +#else + if (unlikely(!PyLong_CheckExact(res))) { +#endif + return __Pyx_PyNumber_IntOrLongWrongResultType(res, name); + } + } + else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "an integer is required"); + } + return res; +} +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { + Py_ssize_t ival; + PyObject *x; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(b))) { + if (sizeof(Py_ssize_t) >= sizeof(long)) + return PyInt_AS_LONG(b); + else + return PyInt_AsSsize_t(b); + } +#endif + if (likely(PyLong_CheckExact(b))) { + #if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)b)->ob_digit; + const Py_ssize_t size = Py_SIZE(b); + if (likely(__Pyx_sst_abs(size) <= 1)) { + 
ival = likely(size) ? digits[0] : 0; + if (size == -1) ival = -ival; + return ival; + } else { + switch (size) { + case 2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + } + } + #endif + return PyLong_AsSsize_t(b); + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); + return ival; +} +static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject* o) { + if (sizeof(Py_hash_t) == sizeof(Py_ssize_t)) { + return (Py_hash_t) __Pyx_PyIndex_AsSsize_t(o); +#if PY_MAJOR_VERSION < 3 + } else if (likely(PyInt_CheckExact(o))) { + return PyInt_AS_LONG(o); +#endif + } else { + Py_ssize_t ival; + PyObject *x; + x = PyNumber_Index(o); + if (!x) return -1; + ival = PyInt_AsLong(x); + Py_DECREF(x); + return ival; + } +} +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) { + return b ? 
__Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False); +} +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { + return PyInt_FromSize_t(ival); +} + + +#endif /* Py_PYTHON_H */ diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx new file mode 100644 index 0000000000000000000000000000000000000000..40f589a9b6ddfe5d42d62c242348380c0fe2242a --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx @@ -0,0 +1,109 @@ +import numpy as np +cimport numpy as np +from libcpp.string cimport string + +# use the Numpy-C-API from Cython +np.import_array() + +# cdefine the signature of our c function +cdef extern from "mesh_core.h": + void _rasterize_triangles_core( + float* vertices, int* triangles, + float* depth_buffer, int* triangle_buffer, float* barycentric_weight, + int nver, int ntri, + int h, int w) + + void _render_colors_core( + float* image, float* vertices, int* triangles, + float* colors, + float* depth_buffer, + int nver, int ntri, + int h, int w, int c) + + void _render_texture_core( + float* image, float* vertices, int* triangles, + float* texture, float* tex_coords, int* tex_triangles, + float* depth_buffer, + int nver, int tex_nver, int ntri, + int h, int w, int c, + int tex_h, int tex_w, int tex_c, + int mapping_type) + + void _get_normal_core( + float* normal, float* tri_normal, int* triangles, + int ntri) + + void _write_obj_with_colors_texture(string filename, string mtl_name, + float* vertices, int* triangles, float* colors, float* uv_coords, + int nver, int ntri, int ntexver) + +def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, + np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, + np.ndarray[int, ndim=2, mode="c"] triangles not None, + int ntri + ): + _get_normal_core( + np.PyArray_DATA(normal), 
np.PyArray_DATA(tri_normal), np.PyArray_DATA(triangles), + ntri) + +def rasterize_triangles_core( + np.ndarray[float, ndim=2, mode = "c"] vertices not None, + np.ndarray[int, ndim=2, mode="c"] triangles not None, + np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None, + np.ndarray[int, ndim=2, mode = "c"] triangle_buffer not None, + np.ndarray[float, ndim=2, mode = "c"] barycentric_weight not None, + int nver, int ntri, + int h, int w + ): + _rasterize_triangles_core( + np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), + np.PyArray_DATA(depth_buffer), np.PyArray_DATA(triangle_buffer), np.PyArray_DATA(barycentric_weight), + nver, ntri, + h, w) + +def render_colors_core(np.ndarray[float, ndim=3, mode = "c"] image not None, + np.ndarray[float, ndim=2, mode = "c"] vertices not None, + np.ndarray[int, ndim=2, mode="c"] triangles not None, + np.ndarray[float, ndim=2, mode = "c"] colors not None, + np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None, + int nver, int ntri, + int h, int w, int c + ): + _render_colors_core( + np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), + np.PyArray_DATA(colors), + np.PyArray_DATA(depth_buffer), + nver, ntri, + h, w, c) + +def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, + np.ndarray[float, ndim=2, mode = "c"] vertices not None, + np.ndarray[int, ndim=2, mode="c"] triangles not None, + np.ndarray[float, ndim=3, mode = "c"] texture not None, + np.ndarray[float, ndim=2, mode = "c"] tex_coords not None, + np.ndarray[int, ndim=2, mode="c"] tex_triangles not None, + np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None, + int nver, int tex_nver, int ntri, + int h, int w, int c, + int tex_h, int tex_w, int tex_c, + int mapping_type + ): + _render_texture_core( + np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), + np.PyArray_DATA(texture), np.PyArray_DATA(tex_coords), np.PyArray_DATA(tex_triangles), + np.PyArray_DATA(depth_buffer), 
+ nver, tex_nver, ntri, + h, w, c, + tex_h, tex_w, tex_c, + mapping_type) + +def write_obj_with_colors_texture_core(string filename, string mtl_name, + np.ndarray[float, ndim=2, mode = "c"] vertices not None, + np.ndarray[int, ndim=2, mode="c"] triangles not None, + np.ndarray[float, ndim=2, mode = "c"] colors not None, + np.ndarray[float, ndim=2, mode = "c"] uv_coords not None, + int nver, int ntri, int ntexver + ): + _write_obj_with_colors_texture(filename, mtl_name, + np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), np.PyArray_DATA(colors), np.PyArray_DATA(uv_coords), + nver, ntri, ntexver) diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/setup.py b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..fc1ede1e777b369b1bbd9c94f6fa85ce037de3f9 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/cython/setup.py @@ -0,0 +1,20 @@ +''' +python setup.py build_ext -i +to compile +''' + +# setup.py +from distutils.core import setup, Extension +from Cython.Build import cythonize +from Cython.Distutils import build_ext +import numpy + +setup( + name = 'mesh_core_cython', + cmdclass={'build_ext': build_ext}, + ext_modules=[Extension("mesh_core_cython", + sources=["mesh_core_cython.pyx", "mesh_core.cpp"], + language='c++', + include_dirs=[numpy.get_include()])], +) + diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/io.cpp b/insightface/python-package/insightface/thirdparty/face3d/mesh/io.cpp new file mode 100644 index 0000000000000000000000000000000000000000..06f2230dd7ee03c887290f5a40d14c053ceb556d --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/io.cpp @@ -0,0 +1 @@ +#error Do not use this file, it is the result of a failed Cython compilation. 
diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/io.py b/insightface/python-package/insightface/thirdparty/face3d/mesh/io.py new file mode 100644 index 0000000000000000000000000000000000000000..4bb146012ecd8f78b7485153bafab00af092fe16 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/io.py @@ -0,0 +1,142 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import os +from skimage import io +from time import time + +from .cython import mesh_core_cython + +## TODO +## TODO: c++ version +def read_obj(obj_name): + ''' read mesh + ''' + return 0 + +# ------------------------- write +def write_asc(path, vertices): + ''' + Args: + vertices: shape = (nver, 3) + ''' + if path.split('.')[-1] == 'asc': + np.savetxt(path, vertices) + else: + np.savetxt(path + '.asc', vertices) + +def write_obj_with_colors(obj_name, vertices, triangles, colors): + ''' Save 3D face model with texture represented by colors. 
+ Args: + obj_name: str + vertices: shape = (nver, 3) + triangles: shape = (ntri, 3) + colors: shape = (nver, 3) + ''' + triangles = triangles.copy() + triangles += 1 # meshlab start with 1 + + if obj_name.split('.')[-1] != 'obj': + obj_name = obj_name + '.obj' + + # write obj + with open(obj_name, 'w') as f: + + # write vertices & colors + for i in range(vertices.shape[0]): + # s = 'v {} {} {} \n'.format(vertices[0,i], vertices[1,i], vertices[2,i]) + s = 'v {} {} {} {} {} {}\n'.format(vertices[i, 0], vertices[i, 1], vertices[i, 2], colors[i, 0], colors[i, 1], colors[i, 2]) + f.write(s) + + # write f: ver ind/ uv ind + [k, ntri] = triangles.shape + for i in range(triangles.shape[0]): + # s = 'f {} {} {}\n'.format(triangles[i, 0], triangles[i, 1], triangles[i, 2]) + s = 'f {} {} {}\n'.format(triangles[i, 2], triangles[i, 1], triangles[i, 0]) + f.write(s) + +## TODO: c++ version +def write_obj_with_texture(obj_name, vertices, triangles, texture, uv_coords): + ''' Save 3D face model with texture represented by texture map. 
+ Ref: https://github.com/patrikhuber/eos/blob/bd00155ebae4b1a13b08bf5a991694d682abbada/include/eos/core/Mesh.hpp + Args: + obj_name: str + vertices: shape = (nver, 3) + triangles: shape = (ntri, 3) + texture: shape = (256,256,3) + uv_coords: shape = (nver, 3) max value<=1 + ''' + if obj_name.split('.')[-1] != 'obj': + obj_name = obj_name + '.obj' + mtl_name = obj_name.replace('.obj', '.mtl') + texture_name = obj_name.replace('.obj', '_texture.png') + + triangles = triangles.copy() + triangles += 1 # mesh lab start with 1 + + # write obj + with open(obj_name, 'w') as f: + # first line: write mtlib(material library) + s = "mtllib {}\n".format(os.path.abspath(mtl_name)) + f.write(s) + + # write vertices + for i in range(vertices.shape[0]): + s = 'v {} {} {}\n'.format(vertices[i, 0], vertices[i, 1], vertices[i, 2]) + f.write(s) + + # write uv coords + for i in range(uv_coords.shape[0]): + s = 'vt {} {}\n'.format(uv_coords[i,0], 1 - uv_coords[i,1]) + f.write(s) + + f.write("usemtl FaceTexture\n") + + # write f: ver ind/ uv ind + for i in range(triangles.shape[0]): + s = 'f {}/{} {}/{} {}/{}\n'.format(triangles[i,2], triangles[i,2], triangles[i,1], triangles[i,1], triangles[i,0], triangles[i,0]) + f.write(s) + + # write mtl + with open(mtl_name, 'w') as f: + f.write("newmtl FaceTexture\n") + s = 'map_Kd {}\n'.format(os.path.abspath(texture_name)) # map to image + f.write(s) + + # write texture as png + imsave(texture_name, texture) + +# c++ version +def write_obj_with_colors_texture(obj_name, vertices, triangles, colors, texture, uv_coords): + ''' Save 3D face model with texture. 
+ Ref: https://github.com/patrikhuber/eos/blob/bd00155ebae4b1a13b08bf5a991694d682abbada/include/eos/core/Mesh.hpp + Args: + obj_name: str + vertices: shape = (nver, 3) + triangles: shape = (ntri, 3) + colors: shape = (nver, 3) + texture: shape = (256,256,3) + uv_coords: shape = (nver, 3) max value<=1 + ''' + if obj_name.split('.')[-1] != 'obj': + obj_name = obj_name + '.obj' + mtl_name = obj_name.replace('.obj', '.mtl') + texture_name = obj_name.replace('.obj', '_texture.png') + + triangles = triangles.copy() + triangles += 1 # mesh lab start with 1 + + # write obj + vertices, colors, uv_coords = vertices.astype(np.float32).copy(), colors.astype(np.float32).copy(), uv_coords.astype(np.float32).copy() + mesh_core_cython.write_obj_with_colors_texture_core(str.encode(obj_name), str.encode(os.path.abspath(mtl_name)), vertices, triangles, colors, uv_coords, vertices.shape[0], triangles.shape[0], uv_coords.shape[0]) + + # write mtl + with open(mtl_name, 'w') as f: + f.write("newmtl FaceTexture\n") + s = 'map_Kd {}\n'.format(os.path.abspath(texture_name)) # map to image + f.write(s) + + # write texture as png + io.imsave(texture_name, texture) \ No newline at end of file diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/light.py b/insightface/python-package/insightface/thirdparty/face3d/mesh/light.py new file mode 100644 index 0000000000000000000000000000000000000000..5f0da63ed8ec6bbe8a12600e8d0f50cd75860129 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/light.py @@ -0,0 +1,213 @@ +''' +Functions about lighting mesh(changing colors/texture of mesh). +1. add light to colors/texture (shade each vertex) +2. fit light according to colors/texture & image. 
+''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from .cython import mesh_core_cython + +def get_normal(vertices, triangles): + ''' calculate normal direction in each vertex + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + Returns: + normal: [nver, 3] + ''' + pt0 = vertices[triangles[:, 0], :] # [ntri, 3] + pt1 = vertices[triangles[:, 1], :] # [ntri, 3] + pt2 = vertices[triangles[:, 2], :] # [ntri, 3] + tri_normal = np.cross(pt0 - pt1, pt0 - pt2) # [ntri, 3]. normal of each triangle + + normal = np.zeros_like(vertices, dtype = np.float32).copy() # [nver, 3] + # for i in range(triangles.shape[0]): + # normal[triangles[i, 0], :] = normal[triangles[i, 0], :] + tri_normal[i, :] + # normal[triangles[i, 1], :] = normal[triangles[i, 1], :] + tri_normal[i, :] + # normal[triangles[i, 2], :] = normal[triangles[i, 2], :] + tri_normal[i, :] + mesh_core_cython.get_normal_core(normal, tri_normal.astype(np.float32).copy(), triangles.copy(), triangles.shape[0]) + + # normalize to unit length + mag = np.sum(normal**2, 1) # [nver] + zero_ind = (mag == 0) + mag[zero_ind] = 1; + normal[zero_ind, 0] = np.ones((np.sum(zero_ind))) + + normal = normal/np.sqrt(mag[:,np.newaxis]) + + return normal + +# TODO: test +def add_light_sh(vertices, triangles, colors, sh_coeff): + ''' + In 3d face, usually assume: + 1. The surface of face is Lambertian(reflect only the low frequencies of lighting) + 2. 
Lighting can be an arbitrary combination of point sources + --> can be expressed in terms of spherical harmonics(omit the lighting coefficients) + I = albedo * (sh(n) x sh_coeff) + + albedo: n x 1 + sh_coeff: 9 x 1 + Y(n) = (1, n_x, n_y, n_z, n_xn_y, n_xn_z, n_yn_z, n_x^2 - n_y^2, 3n_z^2 - 1)': n x 9 + # Y(n) = (1, n_x, n_y, n_z)': n x 4 + + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + colors: [nver, 3] albedo + sh_coeff: [9, 1] spherical harmonics coefficients + + Returns: + lit_colors: [nver, 3] + ''' + assert vertices.shape[0] == colors.shape[0] + nver = vertices.shape[0] + normal = get_normal(vertices, triangles) # [nver, 3] + sh = np.array((np.ones(nver), n[:,0], n[:,1], n[:,2], n[:,0]*n[:,1], n[:,0]*n[:,2], n[:,1]*n[:,2], n[:,0]**2 - n[:,1]**2, 3*(n[:,2]**2) - 1)) # [nver, 9] + ref = sh.dot(sh_coeff) #[nver, 1] + lit_colors = colors*ref + return lit_colors + + +def add_light(vertices, triangles, colors, light_positions = 0, light_intensities = 0): + ''' Gouraud shading. add point lights. + In 3d face, usually assume: + 1. The surface of face is Lambertian(reflect only the low frequencies of lighting) + 2. Lighting can be an arbitrary combination of point sources + 3. 
No specular (unless skin is oil, 23333) + + Ref: https://cs184.eecs.berkeley.edu/lecture/pipeline + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + light_positions: [nlight, 3] + light_intensities: [nlight, 3] + Returns: + lit_colors: [nver, 3] + ''' + nver = vertices.shape[0] + normals = get_normal(vertices, triangles) # [nver, 3] + + # ambient + # La = ka*Ia + + # diffuse + # Ld = kd*(I/r^2)max(0, nxl) + direction_to_lights = vertices[np.newaxis, :, :] - light_positions[:, np.newaxis, :] # [nlight, nver, 3] + direction_to_lights_n = np.sqrt(np.sum(direction_to_lights**2, axis = 2)) # [nlight, nver] + direction_to_lights = direction_to_lights/direction_to_lights_n[:, :, np.newaxis] + normals_dot_lights = normals[np.newaxis, :, :]*direction_to_lights # [nlight, nver, 3] + normals_dot_lights = np.sum(normals_dot_lights, axis = 2) # [nlight, nver] + diffuse_output = colors[np.newaxis, :, :]*normals_dot_lights[:, :, np.newaxis]*light_intensities[:, np.newaxis, :] + diffuse_output = np.sum(diffuse_output, axis = 0) # [nver, 3] + + # specular + # h = (v + l)/(|v + l|) bisector + # Ls = ks*(I/r^2)max(0, nxh)^p + # increasing p narrows the reflectionlob + + lit_colors = diffuse_output # only diffuse part here. + lit_colors = np.minimum(np.maximum(lit_colors, 0), 1) + return lit_colors + + + +## TODO. estimate light(sh coeff) +## -------------------------------- estimate. can not use now. 
+def fit_light(image, vertices, colors, triangles, vis_ind, lamb = 10, max_iter = 3): + [h, w, c] = image.shape + + # surface normal + norm = get_normal(vertices, triangles) + + nver = vertices.shape[1] + + # vertices --> corresponding image pixel + pt2d = vertices[:2, :] + + pt2d[0,:] = np.minimum(np.maximum(pt2d[0,:], 0), w - 1) + pt2d[1,:] = np.minimum(np.maximum(pt2d[1,:], 0), h - 1) + pt2d = np.round(pt2d).astype(np.int32) # 2 x nver + + image_pixel = image[pt2d[1,:], pt2d[0,:], :] # nver x 3 + image_pixel = image_pixel.T # 3 x nver + + # vertices --> corresponding mean texture pixel with illumination + # Spherical Harmonic Basis + harmonic_dim = 9 + nx = norm[0,:]; + ny = norm[1,:]; + nz = norm[2,:]; + harmonic = np.zeros((nver, harmonic_dim)) + + pi = np.pi + harmonic[:,0] = np.sqrt(1/(4*pi)) * np.ones((nver,)); + harmonic[:,1] = np.sqrt(3/(4*pi)) * nx; + harmonic[:,2] = np.sqrt(3/(4*pi)) * ny; + harmonic[:,3] = np.sqrt(3/(4*pi)) * nz; + harmonic[:,4] = 1/2. * np.sqrt(3/(4*pi)) * (2*nz**2 - nx**2 - ny**2); + harmonic[:,5] = 3 * np.sqrt(5/(12*pi)) * (ny*nz); + harmonic[:,6] = 3 * np.sqrt(5/(12*pi)) * (nx*nz); + harmonic[:,7] = 3 * np.sqrt(5/(12*pi)) * (nx*ny); + harmonic[:,8] = 3/2. 
* np.sqrt(5/(12*pi)) * (nx*nx - ny*ny); + + ''' + I' = sum(albedo * lj * hj) j = 0:9 (albedo = tex) + set A = albedo*h (n x 9) + alpha = lj (9 x 1) + Y = I (n x 1) + Y' = A.dot(alpha) + + opt function: + ||Y - A*alpha|| + lambda*(alpha'*alpha) + result: + A'*(Y - A*alpha) + lambda*alpha = 0 + ==> + (A'*A*alpha - lambda)*alpha = A'*Y + left: 9 x 9 + right: 9 x 1 + ''' + n_vis_ind = len(vis_ind) + n = n_vis_ind*c + + Y = np.zeros((n, 1)) + A = np.zeros((n, 9)) + light = np.zeros((3, 1)) + + for k in range(c): + Y[k*n_vis_ind:(k+1)*n_vis_ind, :] = image_pixel[k, vis_ind][:, np.newaxis] + A[k*n_vis_ind:(k+1)*n_vis_ind, :] = texture[k, vis_ind][:, np.newaxis] * harmonic[vis_ind, :] + Ac = texture[k, vis_ind][:, np.newaxis] + Yc = image_pixel[k, vis_ind][:, np.newaxis] + light[k] = (Ac.T.dot(Yc))/(Ac.T.dot(Ac)) + + for i in range(max_iter): + + Yc = Y.copy() + for k in range(c): + Yc[k*n_vis_ind:(k+1)*n_vis_ind, :] /= light[k] + + # update alpha + equation_left = np.dot(A.T, A) + lamb*np.eye(harmonic_dim); # why + ? + equation_right = np.dot(A.T, Yc) + alpha = np.dot(np.linalg.inv(equation_left), equation_right) + + # update light + for k in range(c): + Ac = A[k*n_vis_ind:(k+1)*n_vis_ind, :].dot(alpha) + Yc = Y[k*n_vis_ind:(k+1)*n_vis_ind, :] + light[k] = (Ac.T.dot(Yc))/(Ac.T.dot(Ac)) + + appearance = np.zeros_like(texture) + for k in range(c): + tmp = np.dot(harmonic*texture[k, :][:, np.newaxis], alpha*light[k]) + appearance[k,:] = tmp.T + + appearance = np.minimum(np.maximum(appearance, 0), 1) + + return appearance + diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/render.py b/insightface/python-package/insightface/thirdparty/face3d/mesh/render.py new file mode 100644 index 0000000000000000000000000000000000000000..19957222e7fd3841cf11cc3bdd06630f28509381 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/render.py @@ -0,0 +1,135 @@ +''' +functions about rendering mesh(from 3d obj to 2d image). 
+only use rasterization render here. +Note that: +1. Generally, render func includes camera, light, raterize. Here no camera and light(I write these in other files) +2. Generally, the input vertices are normalized to [-1,1] and cetered on [0, 0]. (in world space) + Here, the vertices are using image coords, which centers on [w/2, h/2] with the y-axis pointing to oppisite direction. + Means: render here only conducts interpolation.(I just want to make the input flexible) + +Author: Yao Feng +Mail: yaofeng1995@gmail.com +''' +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from time import time + +from .cython import mesh_core_cython + +def rasterize_triangles(vertices, triangles, h, w): + ''' + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + h: height + w: width + Returns: + depth_buffer: [h, w] saves the depth, here, the bigger the z, the fronter the point. + triangle_buffer: [h, w] saves the tri id(-1 for no triangle). + barycentric_weight: [h, w, 3] saves corresponding barycentric weight. + + # Each triangle has 3 vertices & Each vertex has 3 coordinates x, y, z. + # h, w is the size of rendering + ''' + + # initial + depth_buffer = np.zeros([h, w]) - 999999. 
#set the initial z to the farest position + triangle_buffer = np.zeros([h, w], dtype = np.int32) - 1 # if tri id = -1, the pixel has no triangle correspondance + barycentric_weight = np.zeros([h, w, 3], dtype = np.float32) # + + vertices = vertices.astype(np.float32).copy() + triangles = triangles.astype(np.int32).copy() + + mesh_core_cython.rasterize_triangles_core( + vertices, triangles, + depth_buffer, triangle_buffer, barycentric_weight, + vertices.shape[0], triangles.shape[0], + h, w) + +def render_colors(vertices, triangles, colors, h, w, c = 3, BG = None): + ''' render mesh with colors + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + colors: [nver, 3] + h: height + w: width + c: channel + BG: background image + Returns: + image: [h, w, c]. rendered image./rendering. + ''' + + # initial + if BG is None: + image = np.zeros((h, w, c), dtype = np.float32) + else: + assert BG.shape[0] == h and BG.shape[1] == w and BG.shape[2] == c + image = BG + depth_buffer = np.zeros([h, w], dtype = np.float32, order = 'C') - 999999. + + # change orders. 
--> C-contiguous order(column major) + vertices = vertices.astype(np.float32).copy() + triangles = triangles.astype(np.int32).copy() + colors = colors.astype(np.float32).copy() + ### + st = time() + mesh_core_cython.render_colors_core( + image, vertices, triangles, + colors, + depth_buffer, + vertices.shape[0], triangles.shape[0], + h, w, c) + return image + + +def render_texture(vertices, triangles, texture, tex_coords, tex_triangles, h, w, c = 3, mapping_type = 'nearest', BG = None): + ''' render mesh with texture map + Args: + vertices: [3, nver] + triangles: [3, ntri] + texture: [tex_h, tex_w, 3] + tex_coords: [ntexcoords, 3] + tex_triangles: [ntri, 3] + h: height of rendering + w: width of rendering + c: channel + mapping_type: 'bilinear' or 'nearest' + ''' + # initial + if BG is None: + image = np.zeros((h, w, c), dtype = np.float32) + else: + assert BG.shape[0] == h and BG.shape[1] == w and BG.shape[2] == c + image = BG + + depth_buffer = np.zeros([h, w], dtype = np.float32, order = 'C') - 999999. 
+ + tex_h, tex_w, tex_c = texture.shape + if mapping_type == 'nearest': + mt = int(0) + elif mapping_type == 'bilinear': + mt = int(1) + else: + mt = int(0) + + # -> C order + vertices = vertices.astype(np.float32).copy() + triangles = triangles.astype(np.int32).copy() + texture = texture.astype(np.float32).copy() + tex_coords = tex_coords.astype(np.float32).copy() + tex_triangles = tex_triangles.astype(np.int32).copy() + + mesh_core_cython.render_texture_core( + image, vertices, triangles, + texture, tex_coords, tex_triangles, + depth_buffer, + vertices.shape[0], tex_coords.shape[0], triangles.shape[0], + h, w, c, + tex_h, tex_w, tex_c, + mt) + return image + diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/transform.py b/insightface/python-package/insightface/thirdparty/face3d/mesh/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..d91b09fae2bcead1ae2f29d8d43831e556494a2f --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/transform.py @@ -0,0 +1,383 @@ +''' +Functions about transforming mesh(changing the position: modify vertices). +1. forward: transform(transform, camera, project). +2. backward: estimate transform matrix from correspondences. + +Author: Yao Feng +Mail: yaofeng1995@gmail.com +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import math +from math import cos, sin + +def angle2matrix(angles): + ''' get rotation matrix from three rotation angles(degree). right-handed. + Args: + angles: [3,]. x, y, z angles + x: pitch. positive for looking down. + y: yaw. positive for looking left. + z: roll. positive for tilting head right. + Returns: + R: [3, 3]. rotation matrix. 
+ ''' + x, y, z = np.deg2rad(angles[0]), np.deg2rad(angles[1]), np.deg2rad(angles[2]) + # x + Rx=np.array([[1, 0, 0], + [0, cos(x), -sin(x)], + [0, sin(x), cos(x)]]) + # y + Ry=np.array([[ cos(y), 0, sin(y)], + [ 0, 1, 0], + [-sin(y), 0, cos(y)]]) + # z + Rz=np.array([[cos(z), -sin(z), 0], + [sin(z), cos(z), 0], + [ 0, 0, 1]]) + + R=Rz.dot(Ry.dot(Rx)) + return R.astype(np.float32) + +def angle2matrix_3ddfa(angles): + ''' get rotation matrix from three rotation angles(radian). The same as in 3DDFA. + Args: + angles: [3,]. x, y, z angles + x: pitch. + y: yaw. + z: roll. + Returns: + R: 3x3. rotation matrix. + ''' + # x, y, z = np.deg2rad(angles[0]), np.deg2rad(angles[1]), np.deg2rad(angles[2]) + x, y, z = angles[0], angles[1], angles[2] + + # x + Rx=np.array([[1, 0, 0], + [0, cos(x), sin(x)], + [0, -sin(x), cos(x)]]) + # y + Ry=np.array([[ cos(y), 0, -sin(y)], + [ 0, 1, 0], + [sin(y), 0, cos(y)]]) + # z + Rz=np.array([[cos(z), sin(z), 0], + [-sin(z), cos(z), 0], + [ 0, 0, 1]]) + R = Rx.dot(Ry).dot(Rz) + return R.astype(np.float32) + + +## ------------------------------------------ 1. transform(transform, project, camera). +## ---------- 3d-3d transform. Transform obj in world space +def rotate(vertices, angles): + ''' rotate vertices. + X_new = R.dot(X). X: 3 x 1 + Args: + vertices: [nver, 3]. + rx, ry, rz: degree angles + rx: pitch. positive for looking down + ry: yaw. positive for looking left + rz: roll. positive for tilting head right + Returns: + rotated vertices: [nver, 3] + ''' + R = angle2matrix(angles) + rotated_vertices = vertices.dot(R.T) + + return rotated_vertices + +def similarity_transform(vertices, s, R, t3d): + ''' similarity transform. dof = 7. + 3D: s*R.dot(X) + t + Homo: M = [[sR, t],[0^T, 1]]. M.dot(X) + Args:(float32) + vertices: [nver, 3]. + s: [1,]. scale factor. + R: [3,3]. rotation matrix. + t3d: [3,]. 3d translation vector. 
+ Returns: + transformed vertices: [nver, 3] + ''' + t3d = np.squeeze(np.array(t3d, dtype = np.float32)) + transformed_vertices = s * vertices.dot(R.T) + t3d[np.newaxis, :] + + return transformed_vertices + + +## -------------- Camera. from world space to camera space +# Ref: https://cs184.eecs.berkeley.edu/lecture/transforms-2 +def normalize(x): + epsilon = 1e-12 + norm = np.sqrt(np.sum(x**2, axis = 0)) + norm = np.maximum(norm, epsilon) + return x/norm + +def lookat_camera(vertices, eye, at = None, up = None): + """ 'look at' transformation: from world space to camera space + standard camera space: + camera located at the origin. + looking down negative z-axis. + vertical vector is y-axis. + Xcam = R(X - C) + Homo: [[R, -RC], [0, 1]] + Args: + vertices: [nver, 3] + eye: [3,] the XYZ world space position of the camera. + at: [3,] a position along the center of the camera's gaze. + up: [3,] up direction + Returns: + transformed_vertices: [nver, 3] + """ + if at is None: + at = np.array([0, 0, 0], np.float32) + if up is None: + up = np.array([0, 1, 0], np.float32) + + eye = np.array(eye).astype(np.float32) + at = np.array(at).astype(np.float32) + z_aixs = -normalize(at - eye) # look forward + x_aixs = normalize(np.cross(up, z_aixs)) # look right + y_axis = np.cross(z_aixs, x_aixs) # look up + + R = np.stack((x_aixs, y_axis, z_aixs))#, axis = 0) # 3 x 3 + transformed_vertices = vertices - eye # translation + transformed_vertices = transformed_vertices.dot(R.T) # rotation + return transformed_vertices + +## --------- 3d-2d project. from camera space to image plane +# generally, image plane only keeps x,y channels, here reserve z channel for calculating z-buffer. +def orthographic_project(vertices): + ''' scaled orthographic projection(just delete z) + assumes: variations in depth over the object is small relative to the mean distance from camera to object + x -> x*f/z, y -> x*f/z, z -> f. + for point i,j. zi~=zj. 
so just delete z + ** often used in face + Homo: P = [[1,0,0,0], [0,1,0,0], [0,0,1,0]] + Args: + vertices: [nver, 3] + Returns: + projected_vertices: [nver, 3] if isKeepZ=True. [nver, 2] if isKeepZ=False. + ''' + return vertices.copy() + +def perspective_project(vertices, fovy, aspect_ratio = 1., near = 0.1, far = 1000.): + ''' perspective projection. + Args: + vertices: [nver, 3] + fovy: vertical angular field of view. degree. + aspect_ratio : width / height of field of view + near : depth of near clipping plane + far : depth of far clipping plane + Returns: + projected_vertices: [nver, 3] + ''' + fovy = np.deg2rad(fovy) + top = near*np.tan(fovy) + bottom = -top + right = top*aspect_ratio + left = -right + + #-- homo + P = np.array([[near/right, 0, 0, 0], + [0, near/top, 0, 0], + [0, 0, -(far+near)/(far-near), -2*far*near/(far-near)], + [0, 0, -1, 0]]) + vertices_homo = np.hstack((vertices, np.ones((vertices.shape[0], 1)))) # [nver, 4] + projected_vertices = vertices_homo.dot(P.T) + projected_vertices = projected_vertices/projected_vertices[:,3:] + projected_vertices = projected_vertices[:,:3] + projected_vertices[:,2] = -projected_vertices[:,2] + + #-- non homo. only fovy + # projected_vertices = vertices.copy() + # projected_vertices[:,0] = -(near/right)*vertices[:,0]/vertices[:,2] + # projected_vertices[:,1] = -(near/top)*vertices[:,1]/vertices[:,2] + return projected_vertices + + +def to_image(vertices, h, w, is_perspective = False): + ''' change vertices to image coord system + 3d system: XYZ, center(0, 0, 0) + 2d image: x(u), y(v). center(w/2, h/2), flip y-axis. + Args: + vertices: [nver, 3] + h: height of the rendering + w : width of the rendering + Returns: + projected_vertices: [nver, 3] + ''' + image_vertices = vertices.copy() + if is_perspective: + # if perspective, the projected vertices are normalized to [-1, 1]. so change it to image size first. 
+ image_vertices[:,0] = image_vertices[:,0]*w/2 + image_vertices[:,1] = image_vertices[:,1]*h/2 + # move to center of image + image_vertices[:,0] = image_vertices[:,0] + w/2 + image_vertices[:,1] = image_vertices[:,1] + h/2 + # flip vertices along y-axis. + image_vertices[:,1] = h - image_vertices[:,1] - 1 + return image_vertices + + +#### -------------------------------------------2. estimate transform matrix from correspondences. +def estimate_affine_matrix_3d23d(X, Y): + ''' Using least-squares solution + Args: + X: [n, 3]. 3d points(fixed) + Y: [n, 3]. corresponding 3d points(moving). Y = PX + Returns: + P_Affine: (3, 4). Affine camera matrix (the third row is [0, 0, 0, 1]). + ''' + X_homo = np.hstack((X, np.ones([X.shape[1],1]))) #n x 4 + P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4 + return P + +def estimate_affine_matrix_3d22d(X, x): + ''' Using Golden Standard Algorithm for estimating an affine camera + matrix P from world to image correspondences. + See Alg.7.2. in MVGCV + Code Ref: https://github.com/patrikhuber/eos/blob/master/include/eos/fitting/affine_camera_estimation.hpp + x_homo = X_homo.dot(P_Affine) + Args: + X: [n, 3]. corresponding 3d points(fixed) + x: [n, 2]. n>=4. 2d points(moving). x = PX + Returns: + P_Affine: [3, 4]. Affine camera matrix + ''' + X = X.T; x = x.T + assert(x.shape[1] == X.shape[1]) + n = x.shape[1] + assert(n >= 4) + + #--- 1. 
normalization + # 2d points + mean = np.mean(x, 1) # (2,) + x = x - np.tile(mean[:, np.newaxis], [1, n]) + average_norm = np.mean(np.sqrt(np.sum(x**2, 0))) + scale = np.sqrt(2) / average_norm + x = scale * x + + T = np.zeros((3,3), dtype = np.float32) + T[0, 0] = T[1, 1] = scale + T[:2, 2] = -mean*scale + T[2, 2] = 1 + + # 3d points + X_homo = np.vstack((X, np.ones((1, n)))) + mean = np.mean(X, 1) # (3,) + X = X - np.tile(mean[:, np.newaxis], [1, n]) + m = X_homo[:3,:] - X + average_norm = np.mean(np.sqrt(np.sum(X**2, 0))) + scale = np.sqrt(3) / average_norm + X = scale * X + + U = np.zeros((4,4), dtype = np.float32) + U[0, 0] = U[1, 1] = U[2, 2] = scale + U[:3, 3] = -mean*scale + U[3, 3] = 1 + + # --- 2. equations + A = np.zeros((n*2, 8), dtype = np.float32); + X_homo = np.vstack((X, np.ones((1, n)))).T + A[:n, :4] = X_homo + A[n:, 4:] = X_homo + b = np.reshape(x, [-1, 1]) + + # --- 3. solution + p_8 = np.linalg.pinv(A).dot(b) + P = np.zeros((3, 4), dtype = np.float32) + P[0, :] = p_8[:4, 0] + P[1, :] = p_8[4:, 0] + P[-1, -1] = 1 + + # --- 4. denormalization + P_Affine = np.linalg.inv(T).dot(P.dot(U)) + return P_Affine + +def P2sRt(P): + ''' decompositing camera matrix P + Args: + P: (3, 4). Affine Camera Matrix. + Returns: + s: scale factor. + R: (3, 3). rotation matrix. + t: (3,). translation. + ''' + t = P[:, 3] + R1 = P[0:1, :3] + R2 = P[1:2, :3] + s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2.0 + r1 = R1/np.linalg.norm(R1) + r2 = R2/np.linalg.norm(R2) + r3 = np.cross(r1, r2) + + R = np.concatenate((r1, r2, r3), 0) + return s, R, t + +#Ref: https://www.learnopencv.com/rotation-matrix-to-euler-angles/ +def isRotationMatrix(R): + ''' checks if a matrix is a valid rotation matrix(whether orthogonal or not) + ''' + Rt = np.transpose(R) + shouldBeIdentity = np.dot(Rt, R) + I = np.identity(3, dtype = R.dtype) + n = np.linalg.norm(I - shouldBeIdentity) + return n < 1e-6 + +def matrix2angle(R): + ''' get three Euler angles from Rotation Matrix + Args: + R: (3,3). 
rotation matrix + Returns: + x: pitch + y: yaw + z: roll + ''' + assert(isRotationMatrix) + sy = math.sqrt(R[0,0] * R[0,0] + R[1,0] * R[1,0]) + + singular = sy < 1e-6 + + if not singular : + x = math.atan2(R[2,1] , R[2,2]) + y = math.atan2(-R[2,0], sy) + z = math.atan2(R[1,0], R[0,0]) + else : + x = math.atan2(-R[1,2], R[1,1]) + y = math.atan2(-R[2,0], sy) + z = 0 + + # rx, ry, rz = np.rad2deg(x), np.rad2deg(y), np.rad2deg(z) + rx, ry, rz = x*180/np.pi, y*180/np.pi, z*180/np.pi + return rx, ry, rz + +# def matrix2angle(R): +# ''' compute three Euler angles from a Rotation Matrix. Ref: http://www.gregslabaugh.net/publications/euler.pdf +# Args: +# R: (3,3). rotation matrix +# Returns: +# x: yaw +# y: pitch +# z: roll +# ''' +# # assert(isRotationMatrix(R)) + +# if R[2,0] !=1 or R[2,0] != -1: +# x = math.asin(R[2,0]) +# y = math.atan2(R[2,1]/cos(x), R[2,2]/cos(x)) +# z = math.atan2(R[1,0]/cos(x), R[0,0]/cos(x)) + +# else:# Gimbal lock +# z = 0 #can be anything +# if R[2,0] == -1: +# x = np.pi/2 +# y = z + math.atan2(R[0,1], R[0,2]) +# else: +# x = -np.pi/2 +# y = -z + math.atan2(-R[0,1], -R[0,2]) + +# return x, y, z \ No newline at end of file diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh/vis.py b/insightface/python-package/insightface/thirdparty/face3d/mesh/vis.py new file mode 100644 index 0000000000000000000000000000000000000000..9db972f2c463a310fa4abde16d6a7baf02797c90 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh/vis.py @@ -0,0 +1,24 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import matplotlib.pyplot as plt +from skimage import measure +from mpl_toolkits.mplot3d import Axes3D + +def plot_mesh(vertices, triangles, subplot = [1,1,1], title = 'mesh', el = 90, az = -90, lwdt=.1, dist = 6, color = "grey"): + ''' + plot the mesh + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + ''' + ax = 
plt.subplot(subplot[0], subplot[1], subplot[2], projection = '3d') + ax.plot_trisurf(vertices[:, 0], vertices[:, 1], vertices[:, 2], triangles = triangles, lw = lwdt, color = color, alpha = 1) + ax.axis("off") + ax.view_init(elev = el, azim = az) + ax.dist = dist + plt.title(title) + +### -------------- Todo: use vtk to visualize mesh? or visvis? or VisPy? diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/__init__.py b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..65d503903b8bec3ab20bd1915618dbd467710207 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/__init__.py @@ -0,0 +1,10 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from . import io +from . import vis +from . import transform +from . import light +from . import render + diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/io.py b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/io.py new file mode 100644 index 0000000000000000000000000000000000000000..0e48a76d21689c6f3b85cb1aa9ad6123058b3816 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/io.py @@ -0,0 +1,170 @@ +''' io: read&write mesh +1. read obj as array(TODO) +2. write arrays to obj + +Preparation knowledge: +representations of 3d face: mesh, point cloud... +storage format: obj, ply, bin, asc, mat... 
+''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import os +from skimage import io + +## TODO +## TODO: c++ version +def read_obj(obj_name): + ''' read mesh + ''' + return 0 + +# ------------------------- write +def write_asc(path, vertices): + ''' + Args: + vertices: shape = (nver, 3) + ''' + if path.split('.')[-1] == 'asc': + np.savetxt(path, vertices) + else: + np.savetxt(path + '.asc', vertices) + +def write_obj_with_colors(obj_name, vertices, triangles, colors): + ''' Save 3D face model with texture represented by colors. + Args: + obj_name: str + vertices: shape = (nver, 3) + triangles: shape = (ntri, 3) + colors: shape = (nver, 3) + ''' + triangles = triangles.copy() + triangles += 1 # meshlab start with 1 + + if obj_name.split('.')[-1] != 'obj': + obj_name = obj_name + '.obj' + + # write obj + with open(obj_name, 'w') as f: + + # write vertices & colors + for i in range(vertices.shape[0]): + # s = 'v {} {} {} \n'.format(vertices[0,i], vertices[1,i], vertices[2,i]) + s = 'v {} {} {} {} {} {}\n'.format(vertices[i, 0], vertices[i, 1], vertices[i, 2], colors[i, 0], colors[i, 1], colors[i, 2]) + f.write(s) + + # write f: ver ind/ uv ind + [k, ntri] = triangles.shape + for i in range(triangles.shape[0]): + # s = 'f {} {} {}\n'.format(triangles[i, 0], triangles[i, 1], triangles[i, 2]) + s = 'f {} {} {}\n'.format(triangles[i, 2], triangles[i, 1], triangles[i, 0]) + f.write(s) + +## TODO: c++ version +def write_obj_with_texture(obj_name, vertices, triangles, texture, uv_coords): + ''' Save 3D face model with texture represented by texture map. 
+ Ref: https://github.com/patrikhuber/eos/blob/bd00155ebae4b1a13b08bf5a991694d682abbada/include/eos/core/Mesh.hpp + Args: + obj_name: str + vertices: shape = (nver, 3) + triangles: shape = (ntri, 3) + texture: shape = (256,256,3) + uv_coords: shape = (nver, 3) max value<=1 + ''' + if obj_name.split('.')[-1] != 'obj': + obj_name = obj_name + '.obj' + mtl_name = obj_name.replace('.obj', '.mtl') + texture_name = obj_name.replace('.obj', '_texture.png') + + triangles = triangles.copy() + triangles += 1 # mesh lab start with 1 + + # write obj + with open(obj_name, 'w') as f: + # first line: write mtlib(material library) + s = "mtllib {}\n".format(os.path.abspath(mtl_name)) + f.write(s) + + # write vertices + for i in range(vertices.shape[0]): + s = 'v {} {} {}\n'.format(vertices[i, 0], vertices[i, 1], vertices[i, 2]) + f.write(s) + + # write uv coords + for i in range(uv_coords.shape[0]): + # s = 'vt {} {}\n'.format(uv_coords[i,0], 1 - uv_coords[i,1]) + s = 'vt {} {}\n'.format(uv_coords[i,0], uv_coords[i,1]) + f.write(s) + + f.write("usemtl FaceTexture\n") + + # write f: ver ind/ uv ind + for i in range(triangles.shape[0]): + s = 'f {}/{} {}/{} {}/{}\n'.format(triangles[i,2], triangles[i,2], triangles[i,1], triangles[i,1], triangles[i,0], triangles[i,0]) + f.write(s) + + # write mtl + with open(mtl_name, 'w') as f: + f.write("newmtl FaceTexture\n") + s = 'map_Kd {}\n'.format(os.path.abspath(texture_name)) # map to image + f.write(s) + + # write texture as png + imsave(texture_name, texture) + + +def write_obj_with_colors_texture(obj_name, vertices, triangles, colors, texture, uv_coords): + ''' Save 3D face model with texture. 
+ Ref: https://github.com/patrikhuber/eos/blob/bd00155ebae4b1a13b08bf5a991694d682abbada/include/eos/core/Mesh.hpp + Args: + obj_name: str + vertices: shape = (nver, 3) + triangles: shape = (ntri, 3) + colors: shape = (nver, 3) + texture: shape = (256,256,3) + uv_coords: shape = (nver, 3) max value<=1 + ''' + if obj_name.split('.')[-1] != 'obj': + obj_name = obj_name + '.obj' + mtl_name = obj_name.replace('.obj', '.mtl') + texture_name = obj_name.replace('.obj', '_texture.png') + + triangles = triangles.copy() + triangles += 1 # mesh lab start with 1 + + # write obj + with open(obj_name, 'w') as f: + # first line: write mtlib(material library) + s = "mtllib {}\n".format(os.path.abspath(mtl_name)) + f.write(s) + + # write vertices + for i in range(vertices.shape[0]): + s = 'v {} {} {} {} {} {}\n'.format(vertices[i, 0], vertices[i, 1], vertices[i, 2], colors[i, 0], colors[i, 1], colors[i, 2]) + f.write(s) + + # write uv coords + for i in range(uv_coords.shape[0]): + # s = 'vt {} {}\n'.format(uv_coords[i,0], 1 - uv_coords[i,1]) + s = 'vt {} {}\n'.format(uv_coords[i,0], uv_coords[i,1]) + f.write(s) + + f.write("usemtl FaceTexture\n") + + # write f: ver ind/ uv ind + for i in range(triangles.shape[0]): + # s = 'f {}/{} {}/{} {}/{}\n'.format(triangles[i,0], triangles[i,0], triangles[i,1], triangles[i,1], triangles[i,2], triangles[i,2]) + s = 'f {}/{} {}/{} {}/{}\n'.format(triangles[i,2], triangles[i,2], triangles[i,1], triangles[i,1], triangles[i,0], triangles[i,0]) + f.write(s) + + # write mtl + with open(mtl_name, 'w') as f: + f.write("newmtl FaceTexture\n") + s = 'map_Kd {}\n'.format(os.path.abspath(texture_name)) # map to image + f.write(s) + + # write texture as png + io.imsave(texture_name, texture) \ No newline at end of file diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/light.py b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/light.py new file mode 100644 index 
0000000000000000000000000000000000000000..bde57114bdd57b6e9a2bac59942bfcbfbf17cbe3 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/light.py @@ -0,0 +1,215 @@ +''' +Functions about lighting mesh(changing colors/texture of mesh). +1. add light to colors/texture (shade each vertex) +2. fit light according to colors/texture & image. + +Preparation knowledge: +lighting: https://cs184.eecs.berkeley.edu/lecture/pipeline +spherical harmonics in human face: '3D Face Reconstruction from a Single Image Using a Single Reference Face Shape' +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +def get_normal(vertices, triangles): + ''' calculate normal direction in each vertex + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + Returns: + normal: [nver, 3] + ''' + pt0 = vertices[triangles[:, 0], :] # [ntri, 3] + pt1 = vertices[triangles[:, 1], :] # [ntri, 3] + pt2 = vertices[triangles[:, 2], :] # [ntri, 3] + tri_normal = np.cross(pt0 - pt1, pt0 - pt2) # [ntri, 3]. normal of each triangle + + normal = np.zeros_like(vertices) # [nver, 3] + for i in range(triangles.shape[0]): + normal[triangles[i, 0], :] = normal[triangles[i, 0], :] + tri_normal[i, :] + normal[triangles[i, 1], :] = normal[triangles[i, 1], :] + tri_normal[i, :] + normal[triangles[i, 2], :] = normal[triangles[i, 2], :] + tri_normal[i, :] + + # normalize to unit length + mag = np.sum(normal**2, 1) # [nver] + zero_ind = (mag == 0) + mag[zero_ind] = 1; + normal[zero_ind, 0] = np.ones((np.sum(zero_ind))) + + normal = normal/np.sqrt(mag[:,np.newaxis]) + + return normal + +# TODO: test +def add_light_sh(vertices, triangles, colors, sh_coeff): + ''' + In 3d face, usually assume: + 1. The surface of face is Lambertian(reflect only the low frequencies of lighting) + 2. 
Lighting can be an arbitrary combination of point sources + --> can be expressed in terms of spherical harmonics(omit the lighting coefficients) + I = albedo * (sh(n) x sh_coeff) + + albedo: n x 1 + sh_coeff: 9 x 1 + Y(n) = (1, n_x, n_y, n_z, n_xn_y, n_xn_z, n_yn_z, n_x^2 - n_y^2, 3n_z^2 - 1)': n x 9 + # Y(n) = (1, n_x, n_y, n_z)': n x 4 + + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + colors: [nver, 3] albedo + sh_coeff: [9, 1] spherical harmonics coefficients + + Returns: + lit_colors: [nver, 3] + ''' + assert vertices.shape[0] == colors.shape[0] + nver = vertices.shape[0] + normal = get_normal(vertices, triangles) # [nver, 3] + sh = np.array((np.ones(nver), n[:,0], n[:,1], n[:,2], n[:,0]*n[:,1], n[:,0]*n[:,2], n[:,1]*n[:,2], n[:,0]**2 - n[:,1]**2, 3*(n[:,2]**2) - 1)) # [nver, 9] + ref = sh.dot(sh_coeff) #[nver, 1] + lit_colors = colors*ref + return lit_colors + + +def add_light(vertices, triangles, colors, light_positions = 0, light_intensities = 0): + ''' Gouraud shading. add point lights. + In 3d face, usually assume: + 1. The surface of face is Lambertian(reflect only the low frequencies of lighting) + 2. Lighting can be an arbitrary combination of point sources + 3. 
No specular (unless skin is oil, 23333) + + Ref: https://cs184.eecs.berkeley.edu/lecture/pipeline + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + light_positions: [nlight, 3] + light_intensities: [nlight, 3] + Returns: + lit_colors: [nver, 3] + ''' + nver = vertices.shape[0] + normals = get_normal(vertices, triangles) # [nver, 3] + + # ambient + # La = ka*Ia + + # diffuse + # Ld = kd*(I/r^2)max(0, nxl) + direction_to_lights = vertices[np.newaxis, :, :] - light_positions[:, np.newaxis, :] # [nlight, nver, 3] + direction_to_lights_n = np.sqrt(np.sum(direction_to_lights**2, axis = 2)) # [nlight, nver] + direction_to_lights = direction_to_lights/direction_to_lights_n[:, :, np.newaxis] + normals_dot_lights = normals[np.newaxis, :, :]*direction_to_lights # [nlight, nver, 3] + normals_dot_lights = np.sum(normals_dot_lights, axis = 2) # [nlight, nver] + diffuse_output = colors[np.newaxis, :, :]*normals_dot_lights[:, :, np.newaxis]*light_intensities[:, np.newaxis, :] + diffuse_output = np.sum(diffuse_output, axis = 0) # [nver, 3] + + # specular + # h = (v + l)/(|v + l|) bisector + # Ls = ks*(I/r^2)max(0, nxh)^p + # increasing p narrows the reflectionlob + + lit_colors = diffuse_output # only diffuse part here. + lit_colors = np.minimum(np.maximum(lit_colors, 0), 1) + return lit_colors + + + +## TODO. estimate light(sh coeff) +## -------------------------------- estimate. can not use now. 
+def fit_light(image, vertices, colors, triangles, vis_ind, lamb = 10, max_iter = 3): + [h, w, c] = image.shape + + # surface normal + norm = get_normal(vertices, triangles) + + nver = vertices.shape[1] + + # vertices --> corresponding image pixel + pt2d = vertices[:2, :] + + pt2d[0,:] = np.minimum(np.maximum(pt2d[0,:], 0), w - 1) + pt2d[1,:] = np.minimum(np.maximum(pt2d[1,:], 0), h - 1) + pt2d = np.round(pt2d).astype(np.int32) # 2 x nver + + image_pixel = image[pt2d[1,:], pt2d[0,:], :] # nver x 3 + image_pixel = image_pixel.T # 3 x nver + + # vertices --> corresponding mean texture pixel with illumination + # Spherical Harmonic Basis + harmonic_dim = 9 + nx = norm[0,:]; + ny = norm[1,:]; + nz = norm[2,:]; + harmonic = np.zeros((nver, harmonic_dim)) + + pi = np.pi + harmonic[:,0] = np.sqrt(1/(4*pi)) * np.ones((nver,)); + harmonic[:,1] = np.sqrt(3/(4*pi)) * nx; + harmonic[:,2] = np.sqrt(3/(4*pi)) * ny; + harmonic[:,3] = np.sqrt(3/(4*pi)) * nz; + harmonic[:,4] = 1/2. * np.sqrt(3/(4*pi)) * (2*nz**2 - nx**2 - ny**2); + harmonic[:,5] = 3 * np.sqrt(5/(12*pi)) * (ny*nz); + harmonic[:,6] = 3 * np.sqrt(5/(12*pi)) * (nx*nz); + harmonic[:,7] = 3 * np.sqrt(5/(12*pi)) * (nx*ny); + harmonic[:,8] = 3/2. 
* np.sqrt(5/(12*pi)) * (nx*nx - ny*ny); + + ''' + I' = sum(albedo * lj * hj) j = 0:9 (albedo = tex) + set A = albedo*h (n x 9) + alpha = lj (9 x 1) + Y = I (n x 1) + Y' = A.dot(alpha) + + opt function: + ||Y - A*alpha|| + lambda*(alpha'*alpha) + result: + A'*(Y - A*alpha) + lambda*alpha = 0 + ==> + (A'*A*alpha - lambda)*alpha = A'*Y + left: 9 x 9 + right: 9 x 1 + ''' + n_vis_ind = len(vis_ind) + n = n_vis_ind*c + + Y = np.zeros((n, 1)) + A = np.zeros((n, 9)) + light = np.zeros((3, 1)) + + for k in range(c): + Y[k*n_vis_ind:(k+1)*n_vis_ind, :] = image_pixel[k, vis_ind][:, np.newaxis] + A[k*n_vis_ind:(k+1)*n_vis_ind, :] = texture[k, vis_ind][:, np.newaxis] * harmonic[vis_ind, :] + Ac = texture[k, vis_ind][:, np.newaxis] + Yc = image_pixel[k, vis_ind][:, np.newaxis] + light[k] = (Ac.T.dot(Yc))/(Ac.T.dot(Ac)) + + for i in range(max_iter): + + Yc = Y.copy() + for k in range(c): + Yc[k*n_vis_ind:(k+1)*n_vis_ind, :] /= light[k] + + # update alpha + equation_left = np.dot(A.T, A) + lamb*np.eye(harmonic_dim); # why + ? + equation_right = np.dot(A.T, Yc) + alpha = np.dot(np.linalg.inv(equation_left), equation_right) + + # update light + for k in range(c): + Ac = A[k*n_vis_ind:(k+1)*n_vis_ind, :].dot(alpha) + Yc = Y[k*n_vis_ind:(k+1)*n_vis_ind, :] + light[k] = (Ac.T.dot(Yc))/(Ac.T.dot(Ac)) + + appearance = np.zeros_like(texture) + for k in range(c): + tmp = np.dot(harmonic*texture[k, :][:, np.newaxis], alpha*light[k]) + appearance[k,:] = tmp.T + + appearance = np.minimum(np.maximum(appearance, 0), 1) + + return appearance + diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/render.py b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/render.py new file mode 100644 index 0000000000000000000000000000000000000000..eb0d92d31ee98dd17cffc9dd67180107f52340f2 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/render.py @@ -0,0 +1,287 @@ +''' +functions about rendering mesh(from 3d obj to 2d image). 
+only use rasterization render here. +Note that: +1. Generally, render func includes camera, light, raterize. Here no camera and light(I write these in other files) +2. Generally, the input vertices are normalized to [-1,1] and cetered on [0, 0]. (in world space) + Here, the vertices are using image coords, which centers on [w/2, h/2] with the y-axis pointing to oppisite direction. +Means: render here only conducts interpolation.(I just want to make the input flexible) + +Preparation knowledge: +z-buffer: https://cs184.eecs.berkeley.edu/lecture/pipeline + +Author: Yao Feng +Mail: yaofeng1995@gmail.com +''' +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from time import time + +def isPointInTri(point, tri_points): + ''' Judge whether the point is in the triangle + Method: + http://blackpawn.com/texts/pointinpoly/ + Args: + point: (2,). [u, v] or [x, y] + tri_points: (3 vertices, 2 coords). three vertices(2d points) of a triangle. + Returns: + bool: true for in triangle + ''' + tp = tri_points + + # vectors + v0 = tp[2,:] - tp[0,:] + v1 = tp[1,:] - tp[0,:] + v2 = point - tp[0,:] + + # dot products + dot00 = np.dot(v0.T, v0) + dot01 = np.dot(v0.T, v1) + dot02 = np.dot(v0.T, v2) + dot11 = np.dot(v1.T, v1) + dot12 = np.dot(v1.T, v2) + + # barycentric coordinates + if dot00*dot11 - dot01*dot01 == 0: + inverDeno = 0 + else: + inverDeno = 1/(dot00*dot11 - dot01*dot01) + + u = (dot11*dot02 - dot01*dot12)*inverDeno + v = (dot00*dot12 - dot01*dot02)*inverDeno + + # check if point in triangle + return (u >= 0) & (v >= 0) & (u + v < 1) + +def get_point_weight(point, tri_points): + ''' Get the weights of the position + Methods: https://gamedev.stackexchange.com/questions/23743/whats-the-most-efficient-way-to-find-barycentric-coordinates + -m1.compute the area of the triangles formed by embedding the point P inside the triangle + -m2.Christer Ericson's book "Real-Time Collision Detection". 
faster.(used) + Args: + point: (2,). [u, v] or [x, y] + tri_points: (3 vertices, 2 coords). three vertices(2d points) of a triangle. + Returns: + w0: weight of v0 + w1: weight of v1 + w2: weight of v3 + ''' + tp = tri_points + # vectors + v0 = tp[2,:] - tp[0,:] + v1 = tp[1,:] - tp[0,:] + v2 = point - tp[0,:] + + # dot products + dot00 = np.dot(v0.T, v0) + dot01 = np.dot(v0.T, v1) + dot02 = np.dot(v0.T, v2) + dot11 = np.dot(v1.T, v1) + dot12 = np.dot(v1.T, v2) + + # barycentric coordinates + if dot00*dot11 - dot01*dot01 == 0: + inverDeno = 0 + else: + inverDeno = 1/(dot00*dot11 - dot01*dot01) + + u = (dot11*dot02 - dot01*dot12)*inverDeno + v = (dot00*dot12 - dot01*dot02)*inverDeno + + w0 = 1 - u - v + w1 = v + w2 = u + + return w0, w1, w2 + +def rasterize_triangles(vertices, triangles, h, w): + ''' + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + h: height + w: width + Returns: + depth_buffer: [h, w] saves the depth, here, the bigger the z, the fronter the point. + triangle_buffer: [h, w] saves the tri id(-1 for no triangle). + barycentric_weight: [h, w, 3] saves corresponding barycentric weight. + + # Each triangle has 3 vertices & Each vertex has 3 coordinates x, y, z. + # h, w is the size of rendering + ''' + # initial + depth_buffer = np.zeros([h, w]) - 999999. #+ np.min(vertices[2,:]) - 999999. 
# set the initial z to the farest position + triangle_buffer = np.zeros([h, w], dtype = np.int32) - 1 # if tri id = -1, the pixel has no triangle correspondance + barycentric_weight = np.zeros([h, w, 3], dtype = np.float32) # + + for i in range(triangles.shape[0]): + tri = triangles[i, :] # 3 vertex indices + + # the inner bounding box + umin = max(int(np.ceil(np.min(vertices[tri, 0]))), 0) + umax = min(int(np.floor(np.max(vertices[tri, 0]))), w-1) + + vmin = max(int(np.ceil(np.min(vertices[tri, 1]))), 0) + vmax = min(int(np.floor(np.max(vertices[tri, 1]))), h-1) + + if umax depth_buffer[v, u]: + depth_buffer[v, u] = point_depth + triangle_buffer[v, u] = i + barycentric_weight[v, u, :] = np.array([w0, w1, w2]) + + return depth_buffer, triangle_buffer, barycentric_weight + + +def render_colors_ras(vertices, triangles, colors, h, w, c = 3): + ''' render mesh with colors(rasterize triangle first) + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + colors: [nver, 3] + h: height + w: width + c: channel + Returns: + image: [h, w, c]. rendering. + ''' + assert vertices.shape[0] == colors.shape[0] + + depth_buffer, triangle_buffer, barycentric_weight = rasterize_triangles(vertices, triangles, h, w) + + triangle_buffer_flat = np.reshape(triangle_buffer, [-1]) # [h*w] + barycentric_weight_flat = np.reshape(barycentric_weight, [-1, c]) #[h*w, c] + weight = barycentric_weight_flat[:, :, np.newaxis] # [h*w, 3(ver in tri), 1] + + colors_flat = colors[triangles[triangle_buffer_flat, :], :] # [h*w(tri id in pixel), 3(ver in tri), c(color in ver)] + colors_flat = weight*colors_flat # [h*w, 3, 3] + colors_flat = np.sum(colors_flat, 1) #[h*w, 3]. add tri. 
+ + image = np.reshape(colors_flat, [h, w, c]) + # mask = (triangle_buffer[:,:] > -1).astype(np.float32) + # image = image*mask[:,:,np.newaxis] + return image + + +def render_colors(vertices, triangles, colors, h, w, c = 3): + ''' render mesh with colors + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + colors: [nver, 3] + h: height + w: width + Returns: + image: [h, w, c]. + ''' + assert vertices.shape[0] == colors.shape[0] + + # initial + image = np.zeros((h, w, c)) + depth_buffer = np.zeros([h, w]) - 999999. + + for i in range(triangles.shape[0]): + tri = triangles[i, :] # 3 vertex indices + + # the inner bounding box + umin = max(int(np.ceil(np.min(vertices[tri, 0]))), 0) + umax = min(int(np.floor(np.max(vertices[tri, 0]))), w-1) + + vmin = max(int(np.ceil(np.min(vertices[tri, 1]))), 0) + vmax = min(int(np.floor(np.max(vertices[tri, 1]))), h-1) + + if umax depth_buffer[v, u]: + depth_buffer[v, u] = point_depth + image[v, u, :] = w0*colors[tri[0], :] + w1*colors[tri[1], :] + w2*colors[tri[2], :] + return image + + +def render_texture(vertices, triangles, texture, tex_coords, tex_triangles, h, w, c = 3, mapping_type = 'nearest'): + ''' render mesh with texture map + Args: + vertices: [nver], 3 + triangles: [ntri, 3] + texture: [tex_h, tex_w, 3] + tex_coords: [ntexcoords, 3] + tex_triangles: [ntri, 3] + h: height of rendering + w: width of rendering + c: channel + mapping_type: 'bilinear' or 'nearest' + ''' + assert triangles.shape[0] == tex_triangles.shape[0] + tex_h, tex_w, _ = texture.shape + + # initial + image = np.zeros((h, w, c)) + depth_buffer = np.zeros([h, w]) - 999999. 
+ + for i in range(triangles.shape[0]): + tri = triangles[i, :] # 3 vertex indices + tex_tri = tex_triangles[i, :] # 3 tex indice + + # the inner bounding box + umin = max(int(np.ceil(np.min(vertices[tri, 0]))), 0) + umax = min(int(np.floor(np.max(vertices[tri, 0]))), w-1) + + vmin = max(int(np.ceil(np.min(vertices[tri, 1]))), 0) + vmax = min(int(np.floor(np.max(vertices[tri, 1]))), h-1) + + if umax depth_buffer[v, u]: + # update depth + depth_buffer[v, u] = point_depth + + # tex coord + tex_xy = w0*tex_coords[tex_tri[0], :] + w1*tex_coords[tex_tri[1], :] + w2*tex_coords[tex_tri[2], :] + tex_xy[0] = max(min(tex_xy[0], float(tex_w - 1)), 0.0); + tex_xy[1] = max(min(tex_xy[1], float(tex_h - 1)), 0.0); + + # nearest + if mapping_type == 'nearest': + tex_xy = np.round(tex_xy).astype(np.int32) + tex_value = texture[tex_xy[1], tex_xy[0], :] + + # bilinear + elif mapping_type == 'bilinear': + # next 4 pixels + ul = texture[int(np.floor(tex_xy[1])), int(np.floor(tex_xy[0])), :] + ur = texture[int(np.floor(tex_xy[1])), int(np.ceil(tex_xy[0])), :] + dl = texture[int(np.ceil(tex_xy[1])), int(np.floor(tex_xy[0])), :] + dr = texture[int(np.ceil(tex_xy[1])), int(np.ceil(tex_xy[0])), :] + + yd = tex_xy[1] - np.floor(tex_xy[1]) + xd = tex_xy[0] - np.floor(tex_xy[0]) + tex_value = ul*(1-xd)*(1-yd) + ur*xd*(1-yd) + dl*(1-xd)*yd + dr*xd*yd + + image[v, u, :] = tex_value + return image \ No newline at end of file diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/transform.py b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..ab56e370ce6218650b4fa7cd6f503e3ac73be854 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/transform.py @@ -0,0 +1,385 @@ +''' +Functions about transforming mesh(changing the position: modify vertices). +1. forward: transform(transform, camera, project). +2. 
backward: estimate transform matrix from correspondences. + +Preparation knowledge: +transform&camera model: +https://cs184.eecs.berkeley.edu/lecture/transforms-2 +Part I: camera geometry and single view geometry in MVGCV +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import math +from math import cos, sin + +def angle2matrix(angles): + ''' get rotation matrix from three rotation angles(degree). right-handed. + Args: + angles: [3,]. x, y, z angles + x: pitch. positive for looking down. + y: yaw. positive for looking left. + z: roll. positive for tilting head right. + Returns: + R: [3, 3]. rotation matrix. + ''' + x, y, z = np.deg2rad(angles[0]), np.deg2rad(angles[1]), np.deg2rad(angles[2]) + # x + Rx=np.array([[1, 0, 0], + [0, cos(x), -sin(x)], + [0, sin(x), cos(x)]]) + # y + Ry=np.array([[ cos(y), 0, sin(y)], + [ 0, 1, 0], + [-sin(y), 0, cos(y)]]) + # z + Rz=np.array([[cos(z), -sin(z), 0], + [sin(z), cos(z), 0], + [ 0, 0, 1]]) + + R=Rz.dot(Ry.dot(Rx)) + return R.astype(np.float32) + +def angle2matrix_3ddfa(angles): + ''' get rotation matrix from three rotation angles(radian). The same as in 3DDFA. + Args: + angles: [3,]. x, y, z angles + x: pitch. + y: yaw. + z: roll. + Returns: + R: 3x3. rotation matrix. + ''' + # x, y, z = np.deg2rad(angles[0]), np.deg2rad(angles[1]), np.deg2rad(angles[2]) + x, y, z = angles[0], angles[1], angles[2] + + # x + Rx=np.array([[1, 0, 0], + [0, cos(x), sin(x)], + [0, -sin(x), cos(x)]]) + # y + Ry=np.array([[ cos(y), 0, -sin(y)], + [ 0, 1, 0], + [sin(y), 0, cos(y)]]) + # z + Rz=np.array([[cos(z), sin(z), 0], + [-sin(z), cos(z), 0], + [ 0, 0, 1]]) + R = Rx.dot(Ry).dot(Rz) + return R.astype(np.float32) + + +## ------------------------------------------ 1. transform(transform, project, camera). +## ---------- 3d-3d transform. Transform obj in world space +def rotate(vertices, angles): + ''' rotate vertices. + X_new = R.dot(X). 
X: 3 x 1 + Args: + vertices: [nver, 3]. + rx, ry, rz: degree angles + rx: pitch. positive for looking down + ry: yaw. positive for looking left + rz: roll. positive for tilting head right + Returns: + rotated vertices: [nver, 3] + ''' + R = angle2matrix(angles) + rotated_vertices = vertices.dot(R.T) + + return rotated_vertices + +def similarity_transform(vertices, s, R, t3d): + ''' similarity transform. dof = 7. + 3D: s*R.dot(X) + t + Homo: M = [[sR, t],[0^T, 1]]. M.dot(X) + Args:(float32) + vertices: [nver, 3]. + s: [1,]. scale factor. + R: [3,3]. rotation matrix. + t3d: [3,]. 3d translation vector. + Returns: + transformed vertices: [nver, 3] + ''' + t3d = np.squeeze(np.array(t3d, dtype = np.float32)) + transformed_vertices = s * vertices.dot(R.T) + t3d[np.newaxis, :] + + return transformed_vertices + + +## -------------- Camera. from world space to camera space +# Ref: https://cs184.eecs.berkeley.edu/lecture/transforms-2 +def normalize(x): + epsilon = 1e-12 + norm = np.sqrt(np.sum(x**2, axis = 0)) + norm = np.maximum(norm, epsilon) + return x/norm + +def lookat_camera(vertices, eye, at = None, up = None): + """ 'look at' transformation: from world space to camera space + standard camera space: + camera located at the origin. + looking down negative z-axis. + vertical vector is y-axis. + Xcam = R(X - C) + Homo: [[R, -RC], [0, 1]] + Args: + vertices: [nver, 3] + eye: [3,] the XYZ world space position of the camera. + at: [3,] a position along the center of the camera's gaze. 
+ up: [3,] up direction + Returns: + transformed_vertices: [nver, 3] + """ + if at is None: + at = np.array([0, 0, 0], np.float32) + if up is None: + up = np.array([0, 1, 0], np.float32) + + eye = np.array(eye).astype(np.float32) + at = np.array(at).astype(np.float32) + z_aixs = -normalize(at - eye) # look forward + x_aixs = normalize(np.cross(up, z_aixs)) # look right + y_axis = np.cross(z_aixs, x_aixs) # look up + + R = np.stack((x_aixs, y_axis, z_aixs))#, axis = 0) # 3 x 3 + transformed_vertices = vertices - eye # translation + transformed_vertices = transformed_vertices.dot(R.T) # rotation + return transformed_vertices + +## --------- 3d-2d project. from camera space to image plane +# generally, image plane only keeps x,y channels, here reserve z channel for calculating z-buffer. +def orthographic_project(vertices): + ''' scaled orthographic projection(just delete z) + assumes: variations in depth over the object is small relative to the mean distance from camera to object + x -> x*f/z, y -> x*f/z, z -> f. + for point i,j. zi~=zj. so just delete z + ** often used in face + Homo: P = [[1,0,0,0], [0,1,0,0], [0,0,1,0]] + Args: + vertices: [nver, 3] + Returns: + projected_vertices: [nver, 3] if isKeepZ=True. [nver, 2] if isKeepZ=False. + ''' + return vertices.copy() + +def perspective_project(vertices, fovy, aspect_ratio = 1., near = 0.1, far = 1000.): + ''' perspective projection. + Args: + vertices: [nver, 3] + fovy: vertical angular field of view. degree. 
+ aspect_ratio : width / height of field of view + near : depth of near clipping plane + far : depth of far clipping plane + Returns: + projected_vertices: [nver, 3] + ''' + fovy = np.deg2rad(fovy) + top = near*np.tan(fovy) + bottom = -top + right = top*aspect_ratio + left = -right + + #-- homo + P = np.array([[near/right, 0, 0, 0], + [0, near/top, 0, 0], + [0, 0, -(far+near)/(far-near), -2*far*near/(far-near)], + [0, 0, -1, 0]]) + vertices_homo = np.hstack((vertices, np.ones((vertices.shape[0], 1)))) # [nver, 4] + projected_vertices = vertices_homo.dot(P.T) + projected_vertices = projected_vertices/projected_vertices[:,3:] + projected_vertices = projected_vertices[:,:3] + projected_vertices[:,2] = -projected_vertices[:,2] + + #-- non homo. only fovy + # projected_vertices = vertices.copy() + # projected_vertices[:,0] = -(near/right)*vertices[:,0]/vertices[:,2] + # projected_vertices[:,1] = -(near/top)*vertices[:,1]/vertices[:,2] + return projected_vertices + + +def to_image(vertices, h, w, is_perspective = False): + ''' change vertices to image coord system + 3d system: XYZ, center(0, 0, 0) + 2d image: x(u), y(v). center(w/2, h/2), flip y-axis. + Args: + vertices: [nver, 3] + h: height of the rendering + w : width of the rendering + Returns: + projected_vertices: [nver, 3] + ''' + image_vertices = vertices.copy() + if is_perspective: + # if perspective, the projected vertices are normalized to [-1, 1]. so change it to image size first. + image_vertices[:,0] = image_vertices[:,0]*w/2 + image_vertices[:,1] = image_vertices[:,1]*h/2 + # move to center of image + image_vertices[:,0] = image_vertices[:,0] + w/2 + image_vertices[:,1] = image_vertices[:,1] + h/2 + # flip vertices along y-axis. + image_vertices[:,1] = h - image_vertices[:,1] - 1 + return image_vertices + + +#### -------------------------------------------2. estimate transform matrix from correspondences. +def estimate_affine_matrix_3d23d(X, Y): + ''' Using least-squares solution + Args: + X: [n, 3]. 
3d points(fixed) + Y: [n, 3]. corresponding 3d points(moving). Y = PX + Returns: + P_Affine: (3, 4). Affine camera matrix (the third row is [0, 0, 0, 1]). + ''' + X_homo = np.hstack((X, np.ones([X.shape[1],1]))) #n x 4 + P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4 + return P + +def estimate_affine_matrix_3d22d(X, x): + ''' Using Golden Standard Algorithm for estimating an affine camera + matrix P from world to image correspondences. + See Alg.7.2. in MVGCV + Code Ref: https://github.com/patrikhuber/eos/blob/master/include/eos/fitting/affine_camera_estimation.hpp + x_homo = X_homo.dot(P_Affine) + Args: + X: [n, 3]. corresponding 3d points(fixed) + x: [n, 2]. n>=4. 2d points(moving). x = PX + Returns: + P_Affine: [3, 4]. Affine camera matrix + ''' + X = X.T; x = x.T + assert(x.shape[1] == X.shape[1]) + n = x.shape[1] + assert(n >= 4) + + #--- 1. normalization + # 2d points + mean = np.mean(x, 1) # (2,) + x = x - np.tile(mean[:, np.newaxis], [1, n]) + average_norm = np.mean(np.sqrt(np.sum(x**2, 0))) + scale = np.sqrt(2) / average_norm + x = scale * x + + T = np.zeros((3,3), dtype = np.float32) + T[0, 0] = T[1, 1] = scale + T[:2, 2] = -mean*scale + T[2, 2] = 1 + + # 3d points + X_homo = np.vstack((X, np.ones((1, n)))) + mean = np.mean(X, 1) # (3,) + X = X - np.tile(mean[:, np.newaxis], [1, n]) + m = X_homo[:3,:] - X + average_norm = np.mean(np.sqrt(np.sum(X**2, 0))) + scale = np.sqrt(3) / average_norm + X = scale * X + + U = np.zeros((4,4), dtype = np.float32) + U[0, 0] = U[1, 1] = U[2, 2] = scale + U[:3, 3] = -mean*scale + U[3, 3] = 1 + + # --- 2. equations + A = np.zeros((n*2, 8), dtype = np.float32); + X_homo = np.vstack((X, np.ones((1, n)))).T + A[:n, :4] = X_homo + A[n:, 4:] = X_homo + b = np.reshape(x, [-1, 1]) + + # --- 3. solution + p_8 = np.linalg.pinv(A).dot(b) + P = np.zeros((3, 4), dtype = np.float32) + P[0, :] = p_8[:4, 0] + P[1, :] = p_8[4:, 0] + P[-1, -1] = 1 + + # --- 4. 
denormalization + P_Affine = np.linalg.inv(T).dot(P.dot(U)) + return P_Affine + +def P2sRt(P): + ''' decompositing camera matrix P + Args: + P: (3, 4). Affine Camera Matrix. + Returns: + s: scale factor. + R: (3, 3). rotation matrix. + t: (3,). translation. + ''' + t = P[:, 3] + R1 = P[0:1, :3] + R2 = P[1:2, :3] + s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2.0 + r1 = R1/np.linalg.norm(R1) + r2 = R2/np.linalg.norm(R2) + r3 = np.cross(r1, r2) + + R = np.concatenate((r1, r2, r3), 0) + return s, R, t + +#Ref: https://www.learnopencv.com/rotation-matrix-to-euler-angles/ +def isRotationMatrix(R): + ''' checks if a matrix is a valid rotation matrix(whether orthogonal or not) + ''' + Rt = np.transpose(R) + shouldBeIdentity = np.dot(Rt, R) + I = np.identity(3, dtype = R.dtype) + n = np.linalg.norm(I - shouldBeIdentity) + return n < 1e-6 + +def matrix2angle(R): + ''' get three Euler angles from Rotation Matrix + Args: + R: (3,3). rotation matrix + Returns: + x: pitch + y: yaw + z: roll + ''' + assert(isRotationMatrix) + sy = math.sqrt(R[0,0] * R[0,0] + R[1,0] * R[1,0]) + + singular = sy < 1e-6 + + if not singular : + x = math.atan2(R[2,1] , R[2,2]) + y = math.atan2(-R[2,0], sy) + z = math.atan2(R[1,0], R[0,0]) + else : + x = math.atan2(-R[1,2], R[1,1]) + y = math.atan2(-R[2,0], sy) + z = 0 + + # rx, ry, rz = np.rad2deg(x), np.rad2deg(y), np.rad2deg(z) + rx, ry, rz = x*180/np.pi, y*180/np.pi, z*180/np.pi + return rx, ry, rz + +# def matrix2angle(R): +# ''' compute three Euler angles from a Rotation Matrix. Ref: http://www.gregslabaugh.net/publications/euler.pdf +# Args: +# R: (3,3). 
rotation matrix +# Returns: +# x: yaw +# y: pitch +# z: roll +# ''' +# # assert(isRotationMatrix(R)) + +# if R[2,0] !=1 or R[2,0] != -1: +# x = math.asin(R[2,0]) +# y = math.atan2(R[2,1]/cos(x), R[2,2]/cos(x)) +# z = math.atan2(R[1,0]/cos(x), R[0,0]/cos(x)) + +# else:# Gimbal lock +# z = 0 #can be anything +# if R[2,0] == -1: +# x = np.pi/2 +# y = z + math.atan2(R[0,1], R[0,2]) +# else: +# x = -np.pi/2 +# y = -z + math.atan2(-R[0,1], -R[0,2]) + +# return x, y, z \ No newline at end of file diff --git a/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/vis.py b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/vis.py new file mode 100644 index 0000000000000000000000000000000000000000..9db972f2c463a310fa4abde16d6a7baf02797c90 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/mesh_numpy/vis.py @@ -0,0 +1,24 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import matplotlib.pyplot as plt +from skimage import measure +from mpl_toolkits.mplot3d import Axes3D + +def plot_mesh(vertices, triangles, subplot = [1,1,1], title = 'mesh', el = 90, az = -90, lwdt=.1, dist = 6, color = "grey"): + ''' + plot the mesh + Args: + vertices: [nver, 3] + triangles: [ntri, 3] + ''' + ax = plt.subplot(subplot[0], subplot[1], subplot[2], projection = '3d') + ax.plot_trisurf(vertices[:, 0], vertices[:, 1], vertices[:, 2], triangles = triangles, lw = lwdt, color = color, alpha = 1) + ax.axis("off") + ax.view_init(elev = el, azim = az) + ax.dist = dist + plt.title(title) + +### -------------- Todo: use vtk to visualize mesh? or visvis? or VisPy? 
diff --git a/insightface/python-package/insightface/thirdparty/face3d/morphable_model/__init__.py b/insightface/python-package/insightface/thirdparty/face3d/morphable_model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..19fc6a60231fc08fa728059bfc88db82703b0475 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/morphable_model/__init__.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from .. import mesh +from .morphabel_model import MorphabelModel +from . import load \ No newline at end of file diff --git a/insightface/python-package/insightface/thirdparty/face3d/morphable_model/fit.py b/insightface/python-package/insightface/thirdparty/face3d/morphable_model/fit.py new file mode 100644 index 0000000000000000000000000000000000000000..480532c57c1c2824e0652be29d719a22b0ee871a --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/morphable_model/fit.py @@ -0,0 +1,272 @@ +''' +Estimating parameters about vertices: shape para, exp para, pose para(s, R, t) +''' +import numpy as np +from .. import mesh + +''' TODO: a clear document. 
+Given: image_points, 3D Model, Camera Matrix(s, R, t2d) +Estimate: shape parameters, expression parameters + +Inference: + + projected_vertices = s*P*R(mu + shape + exp) + t2d --> image_points + s*P*R*shape + s*P*R(mu + exp) + t2d --> image_poitns + + # Define: + X = vertices + x_hat = projected_vertices + x = image_points + A = s*P*R + b = s*P*R(mu + exp) + t2d + ==> + x_hat = A*shape + b (2 x n) + + A*shape (2 x n) + shape = reshape(shapePC * sp) (3 x n) + shapePC*sp : (3n x 1) + + * flatten: + x_hat_flatten = A*shape + b_flatten (2n x 1) + A*shape (2n x 1) + --> A*shapePC (2n x 199) sp: 199 x 1 + + # Define: + pc_2d = A* reshape(shapePC) + pc_2d_flatten = flatten(pc_2d) (2n x 199) + + =====> + x_hat_flatten = pc_2d_flatten * sp + b_flatten ---> x_flatten (2n x 1) + + Goals: + (ignore flatten, pc_2d-->pc) + min E = || x_hat - x || + lambda*sum(sp/sigma)^2 + = || pc * sp + b - x || + lambda*sum(sp/sigma)^2 + + Solve: + d(E)/d(sp) = 0 + 2 * pc' * (pc * sp + b - x) + 2 * lambda * sp / (sigma' * sigma) = 0 + + Get: + (pc' * pc + lambda / (sigma'* sigma)) * sp = pc' * (x - b) + +''' + +def estimate_shape(x, shapeMU, shapePC, shapeEV, expression, s, R, t2d, lamb = 3000): + ''' + Args: + x: (2, n). image points (to be fitted) + shapeMU: (3n, 1) + shapePC: (3n, n_sp) + shapeEV: (n_sp, 1) + expression: (3, n) + s: scale + R: (3, 3). rotation matrix + t2d: (2,). 
2d translation + lambda: regulation coefficient + + Returns: + shape_para: (n_sp, 1) shape parameters(coefficients) + ''' + x = x.copy() + assert(shapeMU.shape[0] == shapePC.shape[0]) + assert(shapeMU.shape[0] == x.shape[1]*3) + + dof = shapePC.shape[1] + + n = x.shape[1] + sigma = shapeEV + t2d = np.array(t2d) + P = np.array([[1, 0, 0], [0, 1, 0]], dtype = np.float32) + A = s*P.dot(R) + + # --- calc pc + pc_3d = np.resize(shapePC.T, [dof, n, 3]) # 199 x n x 3 + pc_3d = np.reshape(pc_3d, [dof*n, 3]) + pc_2d = pc_3d.dot(A.T.copy()) # 199 x n x 2 + + pc = np.reshape(pc_2d, [dof, -1]).T # 2n x 199 + + # --- calc b + # shapeMU + mu_3d = np.resize(shapeMU, [n, 3]).T # 3 x n + # expression + exp_3d = expression + # + b = A.dot(mu_3d + exp_3d) + np.tile(t2d[:, np.newaxis], [1, n]) # 2 x n + b = np.reshape(b.T, [-1, 1]) # 2n x 1 + + # --- solve + equation_left = np.dot(pc.T, pc) + lamb * np.diagflat(1/sigma**2) + x = np.reshape(x.T, [-1, 1]) + equation_right = np.dot(pc.T, x - b) + + shape_para = np.dot(np.linalg.inv(equation_left), equation_right) + + return shape_para + +def estimate_expression(x, shapeMU, expPC, expEV, shape, s, R, t2d, lamb = 2000): + ''' + Args: + x: (2, n). image points (to be fitted) + shapeMU: (3n, 1) + expPC: (3n, n_ep) + expEV: (n_ep, 1) + shape: (3, n) + s: scale + R: (3, 3). rotation matrix + t2d: (2,). 
2d translation + lambda: regulation coefficient + + Returns: + exp_para: (n_ep, 1) shape parameters(coefficients) + ''' + x = x.copy() + assert(shapeMU.shape[0] == expPC.shape[0]) + assert(shapeMU.shape[0] == x.shape[1]*3) + + dof = expPC.shape[1] + + n = x.shape[1] + sigma = expEV + t2d = np.array(t2d) + P = np.array([[1, 0, 0], [0, 1, 0]], dtype = np.float32) + A = s*P.dot(R) + + # --- calc pc + pc_3d = np.resize(expPC.T, [dof, n, 3]) + pc_3d = np.reshape(pc_3d, [dof*n, 3]) + pc_2d = pc_3d.dot(A.T) + pc = np.reshape(pc_2d, [dof, -1]).T # 2n x 29 + + # --- calc b + # shapeMU + mu_3d = np.resize(shapeMU, [n, 3]).T # 3 x n + # expression + shape_3d = shape + # + b = A.dot(mu_3d + shape_3d) + np.tile(t2d[:, np.newaxis], [1, n]) # 2 x n + b = np.reshape(b.T, [-1, 1]) # 2n x 1 + + # --- solve + equation_left = np.dot(pc.T, pc) + lamb * np.diagflat(1/sigma**2) + x = np.reshape(x.T, [-1, 1]) + equation_right = np.dot(pc.T, x - b) + + exp_para = np.dot(np.linalg.inv(equation_left), equation_right) + + return exp_para + + +# ---------------- fit +def fit_points(x, X_ind, model, n_sp, n_ep, max_iter = 4): + ''' + Args: + x: (n, 2) image points + X_ind: (n,) corresponding Model vertex indices + model: 3DMM + max_iter: iteration + Returns: + sp: (n_sp, 1). shape parameters + ep: (n_ep, 1). 
exp parameters + s, R, t + ''' + x = x.copy().T + + #-- init + sp = np.zeros((n_sp, 1), dtype = np.float32) + ep = np.zeros((n_ep, 1), dtype = np.float32) + + #-------------------- estimate + X_ind_all = np.tile(X_ind[np.newaxis, :], [3, 1])*3 + X_ind_all[1, :] += 1 + X_ind_all[2, :] += 2 + valid_ind = X_ind_all.flatten('F') + + shapeMU = model['shapeMU'][valid_ind, :] + shapePC = model['shapePC'][valid_ind, :n_sp] + expPC = model['expPC'][valid_ind, :n_ep] + + for i in range(max_iter): + X = shapeMU + shapePC.dot(sp) + expPC.dot(ep) + X = np.reshape(X, [int(len(X)/3), 3]).T + + #----- estimate pose + P = mesh.transform.estimate_affine_matrix_3d22d(X.T, x.T) + s, R, t = mesh.transform.P2sRt(P) + rx, ry, rz = mesh.transform.matrix2angle(R) + #print('Iter:{}; estimated pose: s {}, rx {}, ry {}, rz {}, t1 {}, t2 {}'.format(i, s, rx, ry, rz, t[0], t[1])) + + #----- estimate shape + # expression + shape = shapePC.dot(sp) + shape = np.reshape(shape, [int(len(shape)/3), 3]).T + ep = estimate_expression(x, shapeMU, expPC, model['expEV'][:n_ep,:], shape, s, R, t[:2], lamb = 20) + + # shape + expression = expPC.dot(ep) + expression = np.reshape(expression, [int(len(expression)/3), 3]).T + if i == 0 : + sp = estimate_shape(x, shapeMU, shapePC, model['shapeEV'][:n_sp,:], expression, s, R, t[:2], lamb = 40) + + return sp, ep, s, R, t + + +# ---------------- fitting process +def fit_points_for_show(x, X_ind, model, n_sp, n_ep, max_iter = 4): + ''' + Args: + x: (n, 2) image points + X_ind: (n,) corresponding Model vertex indices + model: 3DMM + max_iter: iteration + Returns: + sp: (n_sp, 1). shape parameters + ep: (n_ep, 1). 
exp parameters + s, R, t + ''' + x = x.copy().T + + #-- init + sp = np.zeros((n_sp, 1), dtype = np.float32) + ep = np.zeros((n_ep, 1), dtype = np.float32) + + #-------------------- estimate + X_ind_all = np.tile(X_ind[np.newaxis, :], [3, 1])*3 + X_ind_all[1, :] += 1 + X_ind_all[2, :] += 2 + valid_ind = X_ind_all.flatten('F') + + shapeMU = model['shapeMU'][valid_ind, :] + shapePC = model['shapePC'][valid_ind, :n_sp] + expPC = model['expPC'][valid_ind, :n_ep] + + s = 4e-04 + R = mesh.transform.angle2matrix([0, 0, 0]) + t = [0, 0, 0] + lsp = []; lep = []; ls = []; lR = []; lt = [] + for i in range(max_iter): + X = shapeMU + shapePC.dot(sp) + expPC.dot(ep) + X = np.reshape(X, [int(len(X)/3), 3]).T + lsp.append(sp); lep.append(ep); ls.append(s), lR.append(R), lt.append(t) + + #----- estimate pose + P = mesh.transform.estimate_affine_matrix_3d22d(X.T, x.T) + s, R, t = mesh.transform.P2sRt(P) + lsp.append(sp); lep.append(ep); ls.append(s), lR.append(R), lt.append(t) + + #----- estimate shape + # expression + shape = shapePC.dot(sp) + shape = np.reshape(shape, [int(len(shape)/3), 3]).T + ep = estimate_expression(x, shapeMU, expPC, model['expEV'][:n_ep,:], shape, s, R, t[:2], lamb = 20) + lsp.append(sp); lep.append(ep); ls.append(s), lR.append(R), lt.append(t) + + # shape + expression = expPC.dot(ep) + expression = np.reshape(expression, [int(len(expression)/3), 3]).T + sp = estimate_shape(x, shapeMU, shapePC, model['shapeEV'][:n_sp,:], expression, s, R, t[:2], lamb = 40) + + # print('ls', ls) + # print('lR', lR) + return np.array(lsp), np.array(lep), np.array(ls), np.array(lR), np.array(lt) diff --git a/insightface/python-package/insightface/thirdparty/face3d/morphable_model/load.py b/insightface/python-package/insightface/thirdparty/face3d/morphable_model/load.py new file mode 100644 index 0000000000000000000000000000000000000000..0b80665bde45447139ec3a4c066a9677ce8322b0 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/morphable_model/load.py 
@@ -0,0 +1,110 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import scipy.io as sio + +### --------------------------------- load BFM data +def load_BFM(model_path): + ''' load BFM 3DMM model + Args: + model_path: path to BFM model. + Returns: + model: (nver = 53215, ntri = 105840). nver: number of vertices. ntri: number of triangles. + 'shapeMU': [3*nver, 1] + 'shapePC': [3*nver, 199] + 'shapeEV': [199, 1] + 'expMU': [3*nver, 1] + 'expPC': [3*nver, 29] + 'expEV': [29, 1] + 'texMU': [3*nver, 1] + 'texPC': [3*nver, 199] + 'texEV': [199, 1] + 'tri': [ntri, 3] (start from 1, should sub 1 in python and c++) + 'tri_mouth': [114, 3] (start from 1, as a supplement to mouth triangles) + 'kpt_ind': [68,] (start from 1) + PS: + You can change codes according to your own saved data. + Just make sure the model has corresponding attributes. + ''' + C = sio.loadmat(model_path) + model = C['model'] + model = model[0,0] + + # change dtype from double(np.float64) to np.float32, + # since big matrix process(espetially matrix dot) is too slow in python. + model['shapeMU'] = (model['shapeMU'] + model['expMU']).astype(np.float32) + model['shapePC'] = model['shapePC'].astype(np.float32) + model['shapeEV'] = model['shapeEV'].astype(np.float32) + model['expEV'] = model['expEV'].astype(np.float32) + model['expPC'] = model['expPC'].astype(np.float32) + + # matlab start with 1. change to 0 in python. + model['tri'] = model['tri'].T.copy(order = 'C').astype(np.int32) - 1 + model['tri_mouth'] = model['tri_mouth'].T.copy(order = 'C').astype(np.int32) - 1 + + # kpt ind + model['kpt_ind'] = (np.squeeze(model['kpt_ind']) - 1).astype(np.int32) + + return model + +def load_BFM_info(path = 'BFM_info.mat'): + ''' load 3DMM model extra information + Args: + path: path to BFM info. 
+ Returns: + model_info: + 'symlist': 2 x 26720 + 'symlist_tri': 2 x 52937 + 'segbin': 4 x n (0: nose, 1: eye, 2: mouth, 3: cheek) + 'segbin_tri': 4 x ntri + 'face_contour': 1 x 28 + 'face_contour_line': 1 x 512 + 'face_contour_front': 1 x 28 + 'face_contour_front_line': 1 x 512 + 'nose_hole': 1 x 142 + 'nose_hole_right': 1 x 71 + 'nose_hole_left': 1 x 71 + 'parallel': 17 x 1 cell + 'parallel_face_contour': 28 x 1 cell + 'uv_coords': n x 2 + ''' + C = sio.loadmat(path) + model_info = C['model_info'] + model_info = model_info[0,0] + return model_info + +def load_uv_coords(path = 'BFM_UV.mat'): + ''' load uv coords of BFM + Args: + path: path to data. + Returns: + uv_coords: [nver, 2]. range: 0-1 + ''' + C = sio.loadmat(path) + uv_coords = C['UV'].copy(order = 'C') + return uv_coords + +def load_pncc_code(path = 'pncc_code.mat'): + ''' load pncc code of BFM + PNCC code: Defined in 'Face Alignment Across Large Poses: A 3D Solution Xiangyu' + download at http://www.cbsr.ia.ac.cn/users/xiangyuzhu/projects/3DDFA/main.htm. + Args: + path: path to data. 
+ Returns: + pncc_code: [nver, 3] + ''' + C = sio.loadmat(path) + pncc_code = C['vertex_code'].T + return pncc_code + +## +def get_organ_ind(model_info): + ''' get nose, eye, mouth index + ''' + valid_bin = model_info['segbin'].astype(bool) + organ_ind = np.nonzero(valid_bin[0,:])[0] + for i in range(1, valid_bin.shape[0] - 1): + organ_ind = np.union1d(organ_ind, np.nonzero(valid_bin[i,:])[0]) + return organ_ind.astype(np.int32) diff --git a/insightface/python-package/insightface/thirdparty/face3d/morphable_model/morphabel_model.py b/insightface/python-package/insightface/thirdparty/face3d/morphable_model/morphabel_model.py new file mode 100644 index 0000000000000000000000000000000000000000..abd677dacc76b93ee253114045b43c66a66bdc99 --- /dev/null +++ b/insightface/python-package/insightface/thirdparty/face3d/morphable_model/morphabel_model.py @@ -0,0 +1,143 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import scipy.io as sio +from .. import mesh +from . import fit +from . import load + +class MorphabelModel(object): + """docstring for MorphabelModel + model: nver: number of vertices. ntri: number of triangles. *: must have. ~: can generate ones array for place holder. + 'shapeMU': [3*nver, 1]. * + 'shapePC': [3*nver, n_shape_para]. * + 'shapeEV': [n_shape_para, 1]. ~ + 'expMU': [3*nver, 1]. ~ + 'expPC': [3*nver, n_exp_para]. ~ + 'expEV': [n_exp_para, 1]. ~ + 'texMU': [3*nver, 1]. ~ + 'texPC': [3*nver, n_tex_para]. ~ + 'texEV': [n_tex_para, 1]. ~ + 'tri': [ntri, 3] (start from 1, should sub 1 in python and c++). * + 'tri_mouth': [114, 3] (start from 1, as a supplement to mouth triangles). ~ + 'kpt_ind': [68,] (start from 1). 
~ + """ + def __init__(self, model_path, model_type = 'BFM'): + super( MorphabelModel, self).__init__() + if model_type=='BFM': + self.model = load.load_BFM(model_path) + else: + print('sorry, not support other 3DMM model now') + exit() + + # fixed attributes + self.nver = self.model['shapePC'].shape[0]/3 + self.ntri = self.model['tri'].shape[0] + self.n_shape_para = self.model['shapePC'].shape[1] + self.n_exp_para = self.model['expPC'].shape[1] + self.n_tex_para = self.model['texMU'].shape[1] + + self.kpt_ind = self.model['kpt_ind'] + self.triangles = self.model['tri'] + self.full_triangles = np.vstack((self.model['tri'], self.model['tri_mouth'])) + + # ------------------------------------- shape: represented with mesh(vertices & triangles(fixed)) + def get_shape_para(self, type = 'random'): + if type == 'zero': + sp = np.random.zeros((self.n_shape_para, 1)) + elif type == 'random': + sp = np.random.rand(self.n_shape_para, 1)*1e04 + return sp + + def get_exp_para(self, type = 'random'): + if type == 'zero': + ep = np.zeros((self.n_exp_para, 1)) + elif type == 'random': + ep = -1.5 + 3*np.random.random([self.n_exp_para, 1]) + ep[6:, 0] = 0 + + return ep + + def generate_vertices(self, shape_para, exp_para): + ''' + Args: + shape_para: (n_shape_para, 1) + exp_para: (n_exp_para, 1) + Returns: + vertices: (nver, 3) + ''' + vertices = self.model['shapeMU'] + self.model['shapePC'].dot(shape_para) + self.model['expPC'].dot(exp_para) + vertices = np.reshape(vertices, [int(3), int(len(vertices)/3)], 'F').T + + return vertices + + # -------------------------------------- texture: here represented with rgb value(colors) in vertices. 
+ def get_tex_para(self, type = 'random'): + if type == 'zero': + tp = np.zeros((self.n_tex_para, 1)) + elif type == 'random': + tp = np.random.rand(self.n_tex_para, 1) + return tp + + def generate_colors(self, tex_para): + ''' + Args: + tex_para: (n_tex_para, 1) + Returns: + colors: (nver, 3) + ''' + colors = self.model['texMU'] + self.model['texPC'].dot(tex_para*self.model['texEV']) + colors = np.reshape(colors, [int(3), int(len(colors)/3)], 'F').T/255. + + return colors + + + # ------------------------------------------- transformation + # ------------- transform + def rotate(self, vertices, angles): + ''' rotate face + Args: + vertices: [nver, 3] + angles: [3] x, y, z rotation angle(degree) + x: pitch. positive for looking down + y: yaw. positive for looking left + z: roll. positive for tilting head right + Returns: + vertices: rotated vertices + ''' + return mesh.transform.rotate(vertices, angles) + + def transform(self, vertices, s, angles, t3d): + R = mesh.transform.angle2matrix(angles) + return mesh.transform.similarity_transform(vertices, s, R, t3d) + + def transform_3ddfa(self, vertices, s, angles, t3d): # only used for processing 300W_LP data + R = mesh.transform.angle2matrix_3ddfa(angles) + return mesh.transform.similarity_transform(vertices, s, R, t3d) + + # --------------------------------------------------- fitting + def fit(self, x, X_ind, max_iter = 4, isShow = False): + ''' fit 3dmm & pose parameters + Args: + x: (n, 2) image points + X_ind: (n,) corresponding Model vertex indices + max_iter: iteration + isShow: whether to reserve middle results for show + Returns: + fitted_sp: (n_sp, 1). shape parameters + fitted_ep: (n_ep, 1). 
exp parameters + s, angles, t + ''' + if isShow: + fitted_sp, fitted_ep, s, R, t = fit.fit_points_for_show(x, X_ind, self.model, n_sp = self.n_shape_para, n_ep = self.n_exp_para, max_iter = max_iter) + angles = np.zeros((R.shape[0], 3)) + for i in range(R.shape[0]): + angles[i] = mesh.transform.matrix2angle(R[i]) + else: + fitted_sp, fitted_ep, s, R, t = fit.fit_points(x, X_ind, self.model, n_sp = self.n_shape_para, n_ep = self.n_exp_para, max_iter = max_iter) + angles = mesh.transform.matrix2angle(R) + return fitted_sp, fitted_ep, s, angles, t + + diff --git a/insightface/python-package/insightface/utils/__init__.py b/insightface/python-package/insightface/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d3ccb4e652e5e09aa7dba1d70b1d59ed26458fff --- /dev/null +++ b/insightface/python-package/insightface/utils/__init__.py @@ -0,0 +1,18 @@ +from __future__ import absolute_import + +#from . import bbox +#from . import viz +#from . import random +#from . import metrics +#from . 
import parallel + +from .storage import download, ensure_available, download_onnx +from .filesystem import get_model_dir +from .filesystem import makedirs, try_import_dali +from .constant import * +#from .bbox import bbox_iou +#from .block import recursive_visit, set_lr_mult, freeze_bn +#from .lr_scheduler import LRSequential, LRScheduler +#from .plot_history import TrainingHistory +#from .export_helper import export_block +#from .sync_loader_helper import split_data, split_and_load diff --git a/insightface/python-package/insightface/utils/constant.py b/insightface/python-package/insightface/utils/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..8860ff077ae7227235591edfc84c0cdc227a6432 --- /dev/null +++ b/insightface/python-package/insightface/utils/constant.py @@ -0,0 +1,3 @@ + +DEFAULT_MP_NAME = 'buffalo_l' + diff --git a/insightface/python-package/insightface/utils/download.py b/insightface/python-package/insightface/utils/download.py new file mode 100644 index 0000000000000000000000000000000000000000..5cda84dede45b81dcd99161d87792b6c409fa279 --- /dev/null +++ b/insightface/python-package/insightface/utils/download.py @@ -0,0 +1,95 @@ +""" +This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/download.py +""" +import os +import hashlib +import requests +from tqdm import tqdm + + +def check_sha1(filename, sha1_hash): + """Check whether the sha1 hash of the file content matches the expected hash. + Parameters + ---------- + filename : str + Path to the file. + sha1_hash : str + Expected sha1 hash in hexadecimal digits. + Returns + ------- + bool + Whether the file content matches the expected hash. 
+ """ + sha1 = hashlib.sha1() + with open(filename, 'rb') as f: + while True: + data = f.read(1048576) + if not data: + break + sha1.update(data) + + sha1_file = sha1.hexdigest() + l = min(len(sha1_file), len(sha1_hash)) + return sha1.hexdigest()[0:l] == sha1_hash[0:l] + + +def download_file(url, path=None, overwrite=False, sha1_hash=None): + """Download an given URL + Parameters + ---------- + url : str + URL to download + path : str, optional + Destination path to store downloaded file. By default stores to the + current directory with same name as in url. + overwrite : bool, optional + Whether to overwrite destination file if already exists. + sha1_hash : str, optional + Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified + but doesn't match. + Returns + ------- + str + The file path of the downloaded file. + """ + if path is None: + fname = url.split('/')[-1] + else: + path = os.path.expanduser(path) + if os.path.isdir(path): + fname = os.path.join(path, url.split('/')[-1]) + else: + fname = path + + if overwrite or not os.path.exists(fname) or ( + sha1_hash and not check_sha1(fname, sha1_hash)): + dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname))) + if not os.path.exists(dirname): + os.makedirs(dirname) + + print('Downloading %s from %s...' % (fname, url)) + r = requests.get(url, stream=True) + if r.status_code != 200: + raise RuntimeError("Failed downloading url %s" % url) + total_length = r.headers.get('content-length') + with open(fname, 'wb') as f: + if total_length is None: # no content length header + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + else: + total_length = int(total_length) + for chunk in tqdm(r.iter_content(chunk_size=1024), + total=int(total_length / 1024. 
+ 0.5), + unit='KB', + unit_scale=False, + dynamic_ncols=True): + f.write(chunk) + + if sha1_hash and not check_sha1(fname, sha1_hash): + raise UserWarning('File {} is downloaded but the content hash does not match. ' \ + 'The repo may be outdated or download may be incomplete. ' \ + 'If the "repo_url" is overridden, consider switching to ' \ + 'the default repo.'.format(fname)) + + return fname diff --git a/insightface/python-package/insightface/utils/face_align.py b/insightface/python-package/insightface/utils/face_align.py new file mode 100644 index 0000000000000000000000000000000000000000..226628b39cf743947df230feffbb97bf5c585e1d --- /dev/null +++ b/insightface/python-package/insightface/utils/face_align.py @@ -0,0 +1,103 @@ +import cv2 +import numpy as np +from skimage import transform as trans + + +arcface_dst = np.array( + [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366], + [41.5493, 92.3655], [70.7299, 92.2041]], + dtype=np.float32) + +def estimate_norm(lmk, image_size=112,mode='arcface'): + assert lmk.shape == (5, 2) + assert image_size%112==0 or image_size%128==0 + if image_size%112==0: + ratio = float(image_size)/112.0 + diff_x = 0 + else: + ratio = float(image_size)/128.0 + diff_x = 8.0*ratio + dst = arcface_dst * ratio + dst[:,0] += diff_x + tform = trans.SimilarityTransform() + tform.estimate(lmk, dst) + M = tform.params[0:2, :] + return M + +def norm_crop(img, landmark, image_size=112, mode='arcface'): + M = estimate_norm(landmark, image_size, mode) + warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0) + return warped + +def norm_crop2(img, landmark, image_size=112, mode='arcface'): + M = estimate_norm(landmark, image_size, mode) + warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0) + return warped, M + +def square_crop(im, S): + if im.shape[0] > im.shape[1]: + height = S + width = int(float(im.shape[1]) / im.shape[0] * S) + scale = float(S) / im.shape[0] + else: + width = S + height = 
int(float(im.shape[0]) / im.shape[1] * S) + scale = float(S) / im.shape[1] + resized_im = cv2.resize(im, (width, height)) + det_im = np.zeros((S, S, 3), dtype=np.uint8) + det_im[:resized_im.shape[0], :resized_im.shape[1], :] = resized_im + return det_im, scale + + +def transform(data, center, output_size, scale, rotation): + scale_ratio = scale + rot = float(rotation) * np.pi / 180.0 + #translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio) + t1 = trans.SimilarityTransform(scale=scale_ratio) + cx = center[0] * scale_ratio + cy = center[1] * scale_ratio + t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy)) + t3 = trans.SimilarityTransform(rotation=rot) + t4 = trans.SimilarityTransform(translation=(output_size / 2, + output_size / 2)) + t = t1 + t2 + t3 + t4 + M = t.params[0:2] + cropped = cv2.warpAffine(data, + M, (output_size, output_size), + borderValue=0.0) + return cropped, M + + +def trans_points2d(pts, M): + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i] = new_pt[0:2] + + return new_pts + + +def trans_points3d(pts, M): + scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1]) + #print(scale) + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i][0:2] = new_pt[0:2] + new_pts[i][2] = pts[i][2] * scale + + return new_pts + + +def trans_points(pts, M): + if pts.shape[1] == 2: + return trans_points2d(pts, M) + else: + return trans_points3d(pts, M) + diff --git a/insightface/python-package/insightface/utils/filesystem.py b/insightface/python-package/insightface/utils/filesystem.py new file mode 100644 index 
0000000000000000000000000000000000000000..01e3851975bdcbbf7f5eeb7e68e70a36dc040535 --- /dev/null +++ b/insightface/python-package/insightface/utils/filesystem.py @@ -0,0 +1,157 @@ +""" +This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/filesystem.py +""" +import os +import os.path as osp +import errno + + +def get_model_dir(name, root='~/.insightface'): + root = os.path.expanduser(root) + model_dir = osp.join(root, 'models', name) + return model_dir + +def makedirs(path): + """Create directory recursively if not exists. + Similar to `makedir -p`, you can skip checking existence before this function. + + Parameters + ---------- + path : str + Path of the desired dir + """ + try: + os.makedirs(path) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + + +def try_import(package, message=None): + """Try import specified package, with custom message support. + + Parameters + ---------- + package : str + The name of the targeting package. + message : str, default is None + If not None, this function will raise customized error message when import error is found. + + + Returns + ------- + module if found, raise ImportError otherwise + + """ + try: + return __import__(package) + except ImportError as e: + if not message: + raise e + raise ImportError(message) + + +def try_import_cv2(): + """Try import cv2 at runtime. + + Returns + ------- + cv2 module if found. Raise ImportError otherwise + + """ + msg = "cv2 is required, you can install by package manager, e.g. 'apt-get', \ + or `pip install opencv-python --user` (note that this is unofficial PYPI package)." + + return try_import('cv2', msg) + + +def try_import_mmcv(): + """Try import mmcv at runtime. + + Returns + ------- + mmcv module if found. Raise ImportError otherwise + + """ + msg = "mmcv is required, you can install by first `pip install Cython --user` \ + and then `pip install mmcv --user` (note that this is unofficial PYPI package)." 
+ + return try_import('mmcv', msg) + + +def try_import_rarfile(): + """Try import rarfile at runtime. + + Returns + ------- + rarfile module if found. Raise ImportError otherwise + + """ + msg = "rarfile is required, you can install by first `sudo apt-get install unrar` \ + and then `pip install rarfile --user` (note that this is unofficial PYPI package)." + + return try_import('rarfile', msg) + + +def import_try_install(package, extern_url=None): + """Try import the specified package. + If the package not installed, try use pip to install and import if success. + + Parameters + ---------- + package : str + The name of the package trying to import. + extern_url : str or None, optional + The external url if package is not hosted on PyPI. + For example, you can install a package using: + "pip install git+http://github.com/user/repo/tarball/master/egginfo=xxx". + In this case, you can pass the url to the extern_url. + + Returns + ------- + + The imported python module. + + """ + try: + return __import__(package) + except ImportError: + try: + from pip import main as pipmain + except ImportError: + from pip._internal import main as pipmain + + # trying to install package + url = package if extern_url is None else extern_url + pipmain(['install', '--user', + url]) # will raise SystemExit Error if fails + + # trying to load again + try: + return __import__(package) + except ImportError: + import sys + import site + user_site = site.getusersitepackages() + if user_site not in sys.path: + sys.path.append(user_site) + return __import__(package) + return __import__(package) + + +def try_import_dali(): + """Try import NVIDIA DALI at runtime. + """ + try: + dali = __import__('nvidia.dali', fromlist=['pipeline', 'ops', 'types']) + dali.Pipeline = dali.pipeline.Pipeline + except ImportError: + + class dali: + class Pipeline: + def __init__(self): + raise NotImplementedError( + "DALI not found, please check if you installed it correctly." 
+ ) + + return dali diff --git a/insightface/python-package/insightface/utils/storage.py b/insightface/python-package/insightface/utils/storage.py new file mode 100644 index 0000000000000000000000000000000000000000..5bf37e2d17b28dee2a8839484778815f87fc4a9c --- /dev/null +++ b/insightface/python-package/insightface/utils/storage.py @@ -0,0 +1,52 @@ + +import os +import os.path as osp +import zipfile +from .download import download_file + +BASE_REPO_URL = 'https://github.com/deepinsight/insightface/releases/download/v0.7' + +def download(sub_dir, name, force=False, root='~/.insightface'): + _root = os.path.expanduser(root) + dir_path = os.path.join(_root, sub_dir, name) + if osp.exists(dir_path) and not force: + return dir_path + print('download_path:', dir_path) + zip_file_path = os.path.join(_root, sub_dir, name + '.zip') + model_url = "%s/%s.zip"%(BASE_REPO_URL, name) + download_file(model_url, + path=zip_file_path, + overwrite=True) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + with zipfile.ZipFile(zip_file_path) as zf: + zf.extractall(dir_path) + #os.remove(zip_file_path) + return dir_path + +def ensure_available(sub_dir, name, root='~/.insightface'): + return download(sub_dir, name, force=False, root=root) + +def download_onnx(sub_dir, model_file, force=False, root='~/.insightface', download_zip=False): + _root = os.path.expanduser(root) + model_root = osp.join(_root, sub_dir) + new_model_file = osp.join(model_root, model_file) + if osp.exists(new_model_file) and not force: + return new_model_file + if not osp.exists(model_root): + os.makedirs(model_root) + print('download_path:', new_model_file) + if not download_zip: + model_url = "%s/%s"%(BASE_REPO_URL, model_file) + download_file(model_url, + path=new_model_file, + overwrite=True) + else: + model_url = "%s/%s.zip"%(BASE_REPO_URL, model_file) + zip_file_path = new_model_file+".zip" + download_file(model_url, + path=zip_file_path, + overwrite=True) + with zipfile.ZipFile(zip_file_path) as zf: + 
zf.extractall(model_root) + return new_model_file diff --git a/insightface/python-package/insightface/utils/transform.py b/insightface/python-package/insightface/utils/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..06531d257b694211a0b9a09c9d741b9b2ff53bfe --- /dev/null +++ b/insightface/python-package/insightface/utils/transform.py @@ -0,0 +1,116 @@ +import cv2 +import math +import numpy as np +from skimage import transform as trans + + +def transform(data, center, output_size, scale, rotation): + scale_ratio = scale + rot = float(rotation) * np.pi / 180.0 + #translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio) + t1 = trans.SimilarityTransform(scale=scale_ratio) + cx = center[0] * scale_ratio + cy = center[1] * scale_ratio + t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy)) + t3 = trans.SimilarityTransform(rotation=rot) + t4 = trans.SimilarityTransform(translation=(output_size / 2, + output_size / 2)) + t = t1 + t2 + t3 + t4 + M = t.params[0:2] + cropped = cv2.warpAffine(data, + M, (output_size, output_size), + borderValue=0.0) + return cropped, M + + +def trans_points2d(pts, M): + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i] = new_pt[0:2] + + return new_pts + + +def trans_points3d(pts, M): + scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1]) + #print(scale) + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i][0:2] = new_pt[0:2] + new_pts[i][2] = pts[i][2] * scale + + return new_pts + + +def trans_points(pts, M): + if pts.shape[1] == 2: + return trans_points2d(pts, M) + else: + return 
trans_points3d(pts, M) + +def estimate_affine_matrix_3d23d(X, Y): + ''' Using least-squares solution + Args: + X: [n, 3]. 3d points(fixed) + Y: [n, 3]. corresponding 3d points(moving). Y = PX + Returns: + P_Affine: (3, 4). Affine camera matrix (the third row is [0, 0, 0, 1]). + ''' + X_homo = np.hstack((X, np.ones([X.shape[0],1]))) #n x 4 + P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4 + return P + +def P2sRt(P): + ''' decompositing camera matrix P + Args: + P: (3, 4). Affine Camera Matrix. + Returns: + s: scale factor. + R: (3, 3). rotation matrix. + t: (3,). translation. + ''' + t = P[:, 3] + R1 = P[0:1, :3] + R2 = P[1:2, :3] + s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2.0 + r1 = R1/np.linalg.norm(R1) + r2 = R2/np.linalg.norm(R2) + r3 = np.cross(r1, r2) + + R = np.concatenate((r1, r2, r3), 0) + return s, R, t + +def matrix2angle(R): + ''' get three Euler angles from Rotation Matrix + Args: + R: (3,3). rotation matrix + Returns: + x: pitch + y: yaw + z: roll + ''' + sy = math.sqrt(R[0,0] * R[0,0] + R[1,0] * R[1,0]) + + singular = sy < 1e-6 + + if not singular : + x = math.atan2(R[2,1] , R[2,2]) + y = math.atan2(-R[2,0], sy) + z = math.atan2(R[1,0], R[0,0]) + else : + x = math.atan2(-R[1,2], R[1,1]) + y = math.atan2(-R[2,0], sy) + z = 0 + + # rx, ry, rz = np.rad2deg(x), np.rad2deg(y), np.rad2deg(z) + rx, ry, rz = x*180/np.pi, y*180/np.pi, z*180/np.pi + return rx, ry, rz + diff --git a/insightface/python-package/pyproject.toml b/insightface/python-package/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..20612d7f8a1bcbda15fc7ee1142656f7c98ae31d --- /dev/null +++ b/insightface/python-package/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools", "numpy", "cython"] +build-backend = "setuptools.build_meta" diff --git a/insightface/python-package/setup.py b/insightface/python-package/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..9b0fa157abebf578d6e7d1c1b0cb70bac895a9ac 
--- /dev/null +++ b/insightface/python-package/setup.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +import os +import io +import glob +import numpy +import re +import shutil +import sys +from setuptools import setup, find_packages +from distutils.core import Extension +from Cython.Distutils import build_ext +from Cython.Build import cythonize + +def read(*names, **kwargs): + with io.open(os.path.join(os.path.dirname(__file__), *names), + encoding=kwargs.get("encoding", "utf8")) as fp: + return fp.read() + +def find_version(*file_paths): + version_file = read(*file_paths) + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + version_file, re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") + +pypandoc_enabled = True +try: + import pypandoc + print('pandoc enabled') + long_description = pypandoc.convert_file('README.md', 'rst') +except (IOError, ImportError, ModuleNotFoundError): + print('WARNING: pandoc not enabled') + long_description = open('README.md').read() + pypandoc_enabled = False + +#import pypandoc +#long_description = pypandoc.convert('README.md', 'rst') +VERSION = find_version('insightface', '__init__.py') + +requirements = [ + 'numpy', + 'onnx', + 'tqdm', + 'requests', + 'matplotlib', + 'Pillow', + 'scipy', + #'opencv-python', + 'scikit-learn', + 'scikit-image', + 'easydict', + 'cython', + 'albumentations', + 'prettytable', +] + +extensions = [ + Extension("insightface.thirdparty.face3d.mesh.cython.mesh_core_cython", + ["insightface/thirdparty/face3d/mesh/cython/mesh_core_cython.pyx", "insightface/thirdparty/face3d/mesh/cython/mesh_core.cpp"], language='c++'), + ] +data_images = list(glob.glob('insightface/data/images/*.jpg')) +data_images += list(glob.glob('insightface/data/images/*.png')) + +data_mesh = list(glob.glob('insightface/thirdparty/face3d/mesh/cython/*.h')) +data_mesh += list(glob.glob('insightface/thirdparty/face3d/mesh/cython/*.c')) +data_mesh += 
list(glob.glob('insightface/thirdparty/face3d/mesh/cython/*.py*')) + +data_objects = list(glob.glob('insightface/data/objects/*.pkl')) + +data_files = [ ('insightface/data/images', data_images) ] +data_files += [ ('insightface/data/objects', data_objects) ] +data_files += [ ('insightface/thirdparty/face3d/mesh/cython', data_mesh) ] + +ext_modules=cythonize(extensions) +setup( + # Metadata + name='insightface', + version=VERSION, + author='InsightFace Contributors', + author_email='contact@insightface.ai', + url='https://github.com/deepinsight/insightface', + description='InsightFace Python Library', + long_description=long_description, + long_description_content_type='text/markdown', + license='MIT', + # Package info + packages=find_packages(exclude=('docs', 'tests', 'scripts')), + data_files=data_files, + zip_safe=True, + include_package_data=True, + entry_points={"console_scripts": ["insightface-cli=insightface.commands.insightface_cli:main"]}, + install_requires=requirements, + headers=['insightface/thirdparty/face3d/mesh/cython/mesh_core.h'], + ext_modules=ext_modules, + include_dirs=numpy.get_include(), +) + +print('pypandoc enabled:', pypandoc_enabled) + diff --git a/insightface/recognition/README.md b/insightface/recognition/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c7ddef1b62516ce9f4bcb116f6ac548a76f78b8f --- /dev/null +++ b/insightface/recognition/README.md @@ -0,0 +1,48 @@ +## Face Recognition + + +
+ +
+ + +## Introduction + +These are the face recognition methods of [InsightFace](https://insightface.ai) + + +
+ +
+ + +### Datasets + + Please refer to [datasets](_datasets_) page for the details of face recognition datasets used for training and evaluation. + +### Evaluation + + Please refer to [evaluation](_evaluation_) page for the details of face recognition evaluation. + + +## Methods + + +Supported methods: + +- [x] [ArcFace_torch (CVPR'2019)](arcface_torch) +- [x] [ArcFace_mxnet (CVPR'2019)](arcface_mxnet) +- [x] [ArcFace_Paddle (CVPR'2019)](arcface_paddle) +- [x] [Arcface_oneflow](arcface_oneflow) +- [x] [SubCenter ArcFace (ECCV'2020)](subcenter_arcface) +- [x] [VPL (CVPR'2021)](vpl) +- [x] [PartialFC_torch (CVPR'2022)](arcface_torch) +- [x] [PartialFC_mxnet (CVPR'2022)](partial_fc) +- [x] [IDMMD (NeurIPS'2022)](https://github.com/deepinsight/insightface/tree/master/recognition/idmmd) + + +## Contributing + +We appreciate all contributions to improve the face recognition model zoo of InsightFace. + + diff --git a/insightface/recognition/_datasets_/README.md b/insightface/recognition/_datasets_/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d6fec717344abacd6f98ad8e59f24377aac64955 --- /dev/null +++ b/insightface/recognition/_datasets_/README.md @@ -0,0 +1,150 @@ +# Face Recognition Datasets + +## Training Datasets (Updating) + +### CASIA-Webface (10K ids/0.5M images) [1] + +[Baidu](https://pan.baidu.com/s/1AfHdPsxJZBD8kBJeIhmq1w) + +[GDrive](https://drive.google.com/file/d/1KxNCrXzln0lal3N4JiYl9cFOIhT78y1l/view?usp=sharing) + +### CelebA (10K ids/0.2M images) [2] + +### UMDFace (8K ids/0.37M images) [3] + +[Baidu](https://pan.baidu.com/s/1aGutJwNWpV-lA0f_7eNsGQ) + +[GDrive](https://drive.google.com/file/d/1azhEHoJjVmifuzBVKJwl-sDbLZ-Wzp4O/view?usp=sharing) + +### VGG2 (9K ids/3.31M images) [4] + +[Baidu](https://pan.baidu.com/s/1c3KeLzy) + +[GDrive](https://drive.google.com/file/d/1dyVQ7X3d28eAcjV3s3o0MT-HyODp_v3R/view?usp=sharing) + +### MS1M-IBUG (85K ids/3.8M images) [5,6] + +[Baidu](https://pan.baidu.com/s/1nxmSCch) + +### 
MS1M-ArcFace (85K ids/5.8M images) [5,7] + +[Baidu](https://pan.baidu.com/s/1S6LJZGdqcZRle1vlcMzHOQ) + +[GDrive](https://drive.google.com/file/d/1SXS4-Am3bsKSK615qbYdbA_FMVh3sAvR/view?usp=sharing) + +### MS1M-RetinaFace + +[Baidu](https://pan.baidu.com/s/1RBnaW88PC6cKqtYwgfVX8Q) (code:8eb3) + +[GDrive](https://drive.google.com/file/d/1JgmzL9OLTqDAZE86pBgETtSQL4USKTFy/view?usp=sharing) + +### Asian-Celeb (94K ids/2.8M images)[8] + +[Baidu](https://pan.baidu.com/s/12wSgofDy1flFf6lOyAxJRg) + +### Glint360K (360K ids/17M images)[17] + +[Baidu](https://pan.baidu.com/s/1GsYqTTt7_Dn8BfxxsLFN0w) (code:o3az) +magnet uri: `magnet:?xt=urn:btih:E5F46EE502B9E76DA8CC3A0E4F7C17E4000C7B1E&dn=glint360k` + +### Glint-Mini (91K ids/5.2M images)[17] +[Baidu](https://pan.baidu.com/s/10IzEyP-Z9dWFcxxj9jdJpQ) (code:10m5) + +### DeepGlint (181K ids/6.75M images) [8] + +[baidu](https://pan.baidu.com/s/1yApUbklBgRgOyOV4o3J8Eg) + +### WebFace260M [18] + +[Link](https://www.face-benchmark.org/download.html) + + +### IMDB-Face (59K ids/1.7M images) [9] + +### Celeb500k (500K ids/50M images) [10] + +### MegaFace(train) (672K ids/4.7M images) [11] + +[Baidu](https://pan.baidu.com/s/1uy366DjUiGc3AvhuamRLyw) (code:5f8m) + +[GDrive](https://drive.google.com/file/d/1O4FxijSXoEIe6fLfOocqF4VFMh5B4d89/view?usp=sharing) + +### DigiFace-1M (110K ids/1.22M images) [19] + +[Website](https://microsoft.github.io/DigiFace1M/) + +[Github](https://github.com/microsoft/DigiFace1M) + +## Validation Datasets + +### CFP-FP (500 ids/7K images/7K pairs)[12] + +### AgeDB-30 (570 ids/12,240 images/6K pairs)[13,6] + +### LFW (5749 ids/13233 images/6K pairs)[14] + +### CALFW (5749 ids/13233 images/6K pairs)[15] + +### CPLFW (5749 ids/13233 images/6K pairs)[16] + +## Image Test Datasets + +### MegaFace + +testsuite: [GDrive](https://drive.google.com/file/d/1KBwp0U9oZgZj7SYDXRxUnnH7Lwvd9XMy/view?usp=sharing) + +### IJB (IJB-B, IJB-C) + +testsuite: 
[GDrive](https://drive.google.com/file/d/1aC4zf2Bn0xCVH_ZtEuQipR2JvRb1bf8o/view?usp=sharing) + +### TrillionPairs + +### NIST + +## Video Test Datasets + +### YTF + +### IQIYI + +## Reference + +[1] Dong Yi, Zhen Lei, Shengcai Liao, Stan Z. Li. Learning Face Representation from Scratch. arXiv:1411.7923, 2014. + +[2] Ziwei Liu, Ping Luo, Xiaogang Wang, Xiaoou Tang. Deep Learning Face Attributes in the Wild, ICCV, 2015. + +[3] Bansal Ankan, Nanduri Anirudh, Castillo Carlos D, Ranjan Rajeev, Chellappa, Rama. UMDFaces: An Annotated Face Dataset for Training Deep Networks, arXiv:1611.01484v2, 2016. + +[4] Qiong Cao, Li Shen, Weidi Xie, Omkar M. Parkhi, Andrew Zisserman. VGGFace2: A dataset for recognising faces across pose and age. FG, 2018. + +[5] Yandong Guo, Lei Zhang, Yuxiao Hu, Xiaodong He, Jianfeng Gao. Ms-celeb-1m: A dataset and benchmark for large-scale face recognition. ECCV, 2016. + +[6] Jiankang Deng, Yuxiang Zhou, Stefanos Zafeiriou. Marginal loss for deep face recognition, CVPRW, 2017. + +[7] Jiankang Deng, Jia Guo, Stefanos Zafeiriou. Arcface: Additive angular margin loss for deep face recognition, arXiv:1801.07698, 2018. + +[8] [http://trillionpairs.deepglint.com/](http://trillionpairs.deepglint.com/) + +[9] Wang Fei, Chen Liren, Li Cheng, Huang Shiyao, Chen Yanjie, Qian Chen, Loy, Chen Change. The Devil of Face Recognition is in the Noise, ECCV, 2018. + +[10] Cao Jiajiong, Li Yingming, Zhang Zhongfei, Celeb-500K: A Large Training Dataset for Face Recognition, ICIP, 2018. + +[11] Nech Aaron, Kemelmacher-Shlizerman Ira, Level Playing Field For Million Scale Face Recognition, CVPR, 2017. + +[12] Sengupta Soumyadip, Chen Jun-Cheng, Castillo Carlos, Patel Vishal M, Chellappa Rama, Jacobs David W, + Frontal to profile face verification in the wild, WACV, 2016. 
+ +[13] Moschoglou, Stylianos and Papaioannou, Athanasios and Sagonas, Christos and Deng, Jiankang and Kotsia, Irene and Zafeiriou, Stefanos, Agedb: the first manually collected, in-the-wild age database, CVPRW, 2017. + +[14] Gary B. Huang, Manu Ramesh, Tamara Berg, and Erik Learned-Miller. +Labeled Faces in the Wild: A Database for Studying Face Recognition in Unconstrained Environments, 2007. + +[15] Zheng Tianyue, Deng Weihong, Hu Jiani, Cross-age lfw: A database for studying cross-age face recognition in unconstrained environments, arXiv:1708.08197, 2017. + +[16] Zheng, Tianyue, and Weihong Deng. Cross-Pose LFW: A Database for Studying Cross-Pose Face Recognition in Unconstrained Environments, 2018. + +[17] An, Xiang and Zhu, Xuhan and Xiao, Yang and Wu, Lan and Zhang, Ming and Gao, Yuan and Qin, Bin and Zhang, Debing and Fu Ying. Partial FC: Training 10 Million Identities on a Single Machine, arxiv:2010.05222, 2020. + +[18] Zheng Zhu, Guan Huang, Jiankang Deng, Yun Ye, Junjie Huang, Xinze Chen, Jiagang Zhu, Tian Yang, Jiwen Lu, Dalong Du, Jie Zhou. WebFace260M: A Benchmark Unveiling the Power of Million-scale Deep Face Recognition + +[19] Gwangbin Bae, Martin de La Gorce, Tadas Baltrusaitis, Charlie Hewitt, Dong Chen, Julien Valentin, Roberto Cipolla, Jingjing Shen. DigiFace-1M: 1 Million Digital Face Images for Face Recognition. 
WACV 2023 \ No newline at end of file diff --git a/insightface/recognition/_evaluation_/ijb/README.md b/insightface/recognition/_evaluation_/ijb/README.md new file mode 100644 index 0000000000000000000000000000000000000000..79ccd2d83cbaafaab035926a433370a5bb8aa17c --- /dev/null +++ b/insightface/recognition/_evaluation_/ijb/README.md @@ -0,0 +1,36 @@ +To reproduce the figures and tables in the notebook, please download everything (model, code, data and meta info) from here: +[GDrive](https://drive.google.com/file/d/1aC4zf2Bn0xCVH_ZtEuQipR2JvRb1bf8o/view?usp=sharing) +or +[Baidu Cloud](https://pan.baidu.com/s/1oer0p4_mcOrs4cfdeWfbFg) + +Updated Meta data (1:1 and 1:N): + + [Baidu Cloud](https://pan.baidu.com/s/1x-ytzg4zkCTOTtklUgAhfg) (code:7g8o) ; + [GDrive](https://drive.google.com/file/d/1MXzrU_zUESSx_242pRUnVvW_wDzfU8Ky/view?usp=sharing) + +Please apply for the IJB-B and IJB-C by yourself and strictly follow their distribution licenses. + +## Aknowledgement + +Great thanks for Weidi Xie's instruction [2,3,4,5] to evaluate ArcFace [1] on IJB-B[6] and IJB-C[7] (1:1 protocol). + +Great thanks for Yuge Huang's code [8] to evaluate ArcFace [1] on IJB-B[6] and IJB-C[7] (1:N protocol). + +## Reference + +[1] Jiankang Deng, Jia Guo, Niannan Xue, Stefanos Zafeiriou. Arcface: Additive angular margin loss for deep face recognition[J]. arXiv:1801.07698, 2018. + +[2] https://github.com/ox-vgg/vgg_face2. + +[3] Qiong Cao, Li Shen, Weidi Xie, Omkar M Parkhi, Andrew Zisserman. VGGFace2: A dataset for recognising faces across pose and age. FG, 2018. + +[4] Weidi Xie, Andrew Zisserman. Multicolumn Networks for Face Recognition. BMVC 2018. + +[5] Weidi Xie, Li Shen, Andrew Zisserman. Comparator Networks. ECCV, 2018. + +[6] Whitelam, Cameron, Emma Taborsky, Austin Blanton, Brianna Maze, Jocelyn C. Adams, Tim Miller, Nathan D. Kalka et al. IARPA Janus Benchmark-B Face Dataset. CVPR Workshops, 2017. + +[7] Maze, Brianna, Jocelyn Adams, James A. 
Duncan, Nathan Kalka, Tim Miller, Charles Otto, Anil K. Jain et al. IARPA Janus Benchmark–C: Face Dataset and Protocol. ICB, 2018. + +[8] Yuge Huang, Pengcheng Shen, Ying Tai, Shaoxin Li, Xiaoming Liu, Jilin Li, Feiyue Huang, Rongrong Ji. Distribution Distillation Loss: Generic Approach for Improving Face Recognition from Hard Samples. arXiv:2002.03662. + diff --git a/insightface/recognition/_evaluation_/ijb/example.sh b/insightface/recognition/_evaluation_/ijb/example.sh new file mode 100755 index 0000000000000000000000000000000000000000..63f82698ce93717bbd11d6a23077d5b8b8900b30 --- /dev/null +++ b/insightface/recognition/_evaluation_/ijb/example.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +python -u ijb_11.py --model-prefix ./pretrained_models/r100-arcface/model --model-epoch 1 --gpu 0 --target IJBC --job arcface > ijbc_11.log 2>&1 & + +python -u ijb_1n.py --model-prefix ./pretrained_models/r100-arcface/model --model-epoch 1 --gpu 0 --target IJBB --job arcface > ijbb_1n.log 2>&1 & + diff --git a/insightface/recognition/_evaluation_/ijb/ijb_11.py b/insightface/recognition/_evaluation_/ijb/ijb_11.py new file mode 100644 index 0000000000000000000000000000000000000000..53b3326970c053b6deb1438eefc023fe5f4d0476 --- /dev/null +++ b/insightface/recognition/_evaluation_/ijb/ijb_11.py @@ -0,0 +1,380 @@ +# coding: utf-8 + +import os +import numpy as np +#import cPickle +import pickle +import pandas as pd +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import timeit +import sklearn +import argparse +from sklearn.metrics import roc_curve, auc +from sklearn import preprocessing +import cv2 +import sys +import glob +sys.path.append('./recognition') +from embedding import Embedding +from menpo.visualize import print_progress +from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap +from prettytable import PrettyTable +from pathlib import Path +import warnings +warnings.filterwarnings("ignore") + +parser = 
argparse.ArgumentParser(description='do ijb test') +# general +parser.add_argument('--model-prefix', default='', help='path to load model.') +parser.add_argument('--model-epoch', default=1, type=int, help='') +parser.add_argument('--gpu', default=7, type=int, help='gpu id') +parser.add_argument('--batch-size', default=32, type=int, help='') +parser.add_argument('--job', default='insightface', type=str, help='job name') +parser.add_argument('--target', + default='IJBC', + type=str, + help='target, set to IJBC or IJBB') +args = parser.parse_args() + +target = args.target +model_path = args.model_prefix +gpu_id = args.gpu +epoch = args.model_epoch +use_norm_score = True # if Ture, TestMode(N1) +use_detector_score = True # if Ture, TestMode(D1) +use_flip_test = True # if Ture, TestMode(F1) +job = args.job + + +def read_template_media_list(path): + #ijb_meta = np.loadtxt(path, dtype=str) + ijb_meta = pd.read_csv(path, sep=' ', header=None).values + templates = ijb_meta[:, 1].astype(np.int) + medias = ijb_meta[:, 2].astype(np.int) + return templates, medias + + +# In[ ]: + + +def read_template_pair_list(path): + #pairs = np.loadtxt(path, dtype=str) + pairs = pd.read_csv(path, sep=' ', header=None).values + #print(pairs.shape) + #print(pairs[:, 0].astype(np.int)) + t1 = pairs[:, 0].astype(np.int) + t2 = pairs[:, 1].astype(np.int) + label = pairs[:, 2].astype(np.int) + return t1, t2, label + + +# In[ ]: + + +def read_image_feature(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +# In[ ]: + + +def get_image_feature(img_path, img_list_path, model_path, epoch, gpu_id): + img_list = open(img_list_path) + embedding = Embedding(model_path, epoch, gpu_id) + files = img_list.readlines() + print('files:', len(files)) + faceness_scores = [] + img_feats = [] + for img_index, each_line in enumerate(files): + if img_index % 500 == 0: + print('processing', img_index) + name_lmk_score = each_line.strip().split(' ') + img_name = 
os.path.join(img_path, name_lmk_score[0]) + img = cv2.imread(img_name) + lmk = np.array([float(x) for x in name_lmk_score[1:-1]], + dtype=np.float32) + lmk = lmk.reshape((5, 2)) + img_feats.append(embedding.get(img, lmk)) + faceness_scores.append(name_lmk_score[-1]) + img_feats = np.array(img_feats).astype(np.float32) + faceness_scores = np.array(faceness_scores).astype(np.float32) + + #img_feats = np.ones( (len(files), 1024), dtype=np.float32) * 0.01 + #faceness_scores = np.ones( (len(files), ), dtype=np.float32 ) + return img_feats, faceness_scores + + +# In[ ]: + + +def image2template_feature(img_feats=None, templates=None, medias=None): + # ========================================================== + # 1. face image feature l2 normalization. img_feats:[number_image x feats_dim] + # 2. compute media feature. + # 3. compute template feature. + # ========================================================== + unique_templates = np.unique(templates) + template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) + + for count_template, uqt in enumerate(unique_templates): + (ind_t, ) = np.where(templates == uqt) + face_norm_feats = img_feats[ind_t] + face_medias = medias[ind_t] + unique_medias, unique_media_counts = np.unique(face_medias, + return_counts=True) + media_norm_feats = [] + for u, ct in zip(unique_medias, unique_media_counts): + (ind_m, ) = np.where(face_medias == u) + if ct == 1: + media_norm_feats += [face_norm_feats[ind_m]] + else: # image features from the same video will be aggregated into one feature + media_norm_feats += [ + np.mean(face_norm_feats[ind_m], axis=0, keepdims=True) + ] + media_norm_feats = np.array(media_norm_feats) + # media_norm_feats = media_norm_feats / np.sqrt(np.sum(media_norm_feats ** 2, -1, keepdims=True)) + template_feats[count_template] = np.sum(media_norm_feats, axis=0) + if count_template % 2000 == 0: + print('Finish Calculating {} template features.'.format( + count_template)) + #template_norm_feats = 
template_feats / np.sqrt(np.sum(template_feats ** 2, -1, keepdims=True)) + template_norm_feats = sklearn.preprocessing.normalize(template_feats) + #print(template_norm_feats.shape) + return template_norm_feats, unique_templates + + +# In[ ]: + + +def verification(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + # ========================================================== + # Compute set-to-set Similarity Score. + # ========================================================== + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + + score = np.zeros((len(p1), )) # save cosine distance between pairs + + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation + sublists = [ + total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize) + ] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +# In[ ]: +def verification2(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1), )) # save cosine distance between pairs + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation + sublists = [ + total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize) + ] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = 
template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def read_score(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +# # Step1: Load Meta Data + +# In[ ]: + +assert target == 'IJBC' or target == 'IJBB' + +# ============================================================= +# load image and template relationships for template feature embedding +# tid --> template id, mid --> media id +# format: +# image_name tid mid +# ============================================================= +start = timeit.default_timer() +templates, medias = read_template_media_list( + os.path.join('%s/meta' % target, '%s_face_tid_mid.txt' % target.lower())) +stop = timeit.default_timer() +print('Time: %.2f s. ' % (stop - start)) + +# In[ ]: + +# ============================================================= +# load template pairs for template-to-template verification +# tid : template id, label : 1/0 +# format: +# tid_1 tid_2 label +# ============================================================= +start = timeit.default_timer() +p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % target, + '%s_template_pair_label.txt' % target.lower())) +stop = timeit.default_timer() +print('Time: %.2f s. 
' % (stop - start)) + +# # Step 2: Get Image Features + +# In[ ]: + +# ============================================================= +# load image features +# format: +# img_feats: [image_num x feats_dim] (227630, 512) +# ============================================================= +start = timeit.default_timer() +img_path = './%s/loose_crop' % target +img_list_path = './%s/meta/%s_name_5pts_score.txt' % (target, target.lower()) +img_feats, faceness_scores = get_image_feature(img_path, img_list_path, + model_path, epoch, gpu_id) +stop = timeit.default_timer() +print('Time: %.2f s. ' % (stop - start)) +print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0], + img_feats.shape[1])) + +# # Step3: Get Template Features + +# In[ ]: + +# ============================================================= +# compute template features from image features. +# ============================================================= +start = timeit.default_timer() +# ========================================================== +# Norm feature before aggregation into template feature? +# Feature norm from embedding network and faceness score are able to decrease weights for noise samples (not face). +# ========================================================== +# 1. FaceScore (Feature Norm) +# 2. 
FaceScore (Detector) + +if use_flip_test: + # concat --- F1 + #img_input_feats = img_feats + # add --- F2 + img_input_feats = img_feats[:, 0:img_feats.shape[1] // + 2] + img_feats[:, img_feats.shape[1] // 2:] +else: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + +if use_norm_score: + img_input_feats = img_input_feats +else: + # normalise features to remove norm information + img_input_feats = img_input_feats / np.sqrt( + np.sum(img_input_feats**2, -1, keepdims=True)) + +if use_detector_score: + print(img_input_feats.shape, faceness_scores.shape) + #img_input_feats = img_input_feats * np.matlib.repmat(faceness_scores[:,np.newaxis], 1, img_input_feats.shape[1]) + img_input_feats = img_input_feats * faceness_scores[:, np.newaxis] +else: + img_input_feats = img_input_feats + +template_norm_feats, unique_templates = image2template_feature( + img_input_feats, templates, medias) +stop = timeit.default_timer() +print('Time: %.2f s. ' % (stop - start)) + +# # Step 4: Get Template Similarity Scores + +# In[ ]: + +# ============================================================= +# compute verification scores between template pairs. +# ============================================================= +start = timeit.default_timer() +score = verification(template_norm_feats, unique_templates, p1, p2) +stop = timeit.default_timer() +print('Time: %.2f s. 
' % (stop - start)) + +# In[ ]: + +save_path = './%s_result' % target + +if not os.path.exists(save_path): + os.makedirs(save_path) + +score_save_file = os.path.join(save_path, "%s.npy" % job) +np.save(score_save_file, score) + +# # Step 5: Get ROC Curves and TPR@FPR Table + +# In[ ]: + +files = [score_save_file] +methods = [] +scores = [] +for file in files: + methods.append(Path(file).stem) + scores.append(np.load(file)) + +methods = np.array(methods) +scores = dict(zip(methods, scores)) +colours = dict( + zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2'))) +#x_labels = [1/(10**x) for x in np.linspace(6, 0, 6)] +x_labels = [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1] +tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels]) +fig = plt.figure() +for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + roc_auc = auc(fpr, tpr) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) # select largest tpr at same fpr + plt.plot(fpr, + tpr, + color=colours[method], + lw=1, + label=('[%s (AUC = %0.4f %%)]' % + (method.split('-')[-1], roc_auc * 100))) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, target)) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + #tpr_fpr_row.append('%.4f' % tpr[min_index]) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) +plt.xlim([10**-6, 0.1]) +plt.ylim([0.3, 1.0]) +plt.grid(linestyle='--', linewidth=1) +plt.xticks(x_labels) +plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True)) +plt.xscale('log') +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('ROC on IJB') +plt.legend(loc="lower right") +#plt.show() +fig.savefig(os.path.join(save_path, '%s.pdf' % job)) +print(tpr_fpr_table) diff --git a/insightface/recognition/_evaluation_/ijb/ijb_1n.py b/insightface/recognition/_evaluation_/ijb/ijb_1n.py new file mode 100644 index 
0000000000000000000000000000000000000000..85ea4051f5443e53fa07c585843de9696425429d --- /dev/null +++ b/insightface/recognition/_evaluation_/ijb/ijb_1n.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python +# coding: utf-8 +import os +import numpy as np +import timeit +import sklearn +import cv2 +import sys +import argparse +import glob +import numpy.matlib +import heapq +import math +from datetime import datetime as dt + +from sklearn import preprocessing +sys.path.append('./recognition') +from embedding import Embedding +from menpo.visualize import print_progress +from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap + + +def read_template_subject_id_list(path): + ijb_meta = np.loadtxt(path, dtype=str, skiprows=1, delimiter=',') + templates = ijb_meta[:, 0].astype(np.int) + subject_ids = ijb_meta[:, 1].astype(np.int) + return templates, subject_ids + + +def read_template_media_list(path): + ijb_meta = np.loadtxt(path, dtype=str) + templates = ijb_meta[:, 1].astype(np.int) + medias = ijb_meta[:, 2].astype(np.int) + return templates, medias + + +def read_template_pair_list(path): + pairs = np.loadtxt(path, dtype=str) + t1 = pairs[:, 0].astype(np.int) + t2 = pairs[:, 1].astype(np.int) + label = pairs[:, 2].astype(np.int) + return t1, t2, label + + +#def get_image_feature(feature_path, faceness_path): +# img_feats = np.loadtxt(feature_path) +# faceness_scores = np.loadtxt(faceness_path) +# return img_feats, faceness_scores +def get_image_feature(img_path, img_list_path, model_path, epoch, gpu_id): + img_list = open(img_list_path) + embedding = Embedding(model_path, epoch, gpu_id) + files = img_list.readlines() + print('files:', len(files)) + faceness_scores = [] + img_feats = [] + for img_index, each_line in enumerate(files): + if img_index % 500 == 0: + print('processing', img_index) + name_lmk_score = each_line.strip().split(' ') + img_name = os.path.join(img_path, name_lmk_score[0]) + img = cv2.imread(img_name) + lmk = np.array([float(x) for x in 
name_lmk_score[1:-1]], + dtype=np.float32) + lmk = lmk.reshape((5, 2)) + img_feats.append(embedding.get(img, lmk)) + faceness_scores.append(name_lmk_score[-1]) + img_feats = np.array(img_feats).astype(np.float32) + faceness_scores = np.array(faceness_scores).astype(np.float32) + + #img_feats = np.ones( (len(files), 1024), dtype=np.float32) * 0.01 + #faceness_scores = np.ones( (len(files), ), dtype=np.float32 ) + return img_feats, faceness_scores + + +def image2template_feature(img_feats=None, + templates=None, + medias=None, + choose_templates=None, + choose_ids=None): + # ========================================================== + # 1. face image feature l2 normalization. img_feats:[number_image x feats_dim] + # 2. compute media feature. + # 3. compute template feature. + # ========================================================== + unique_templates, indices = np.unique(choose_templates, return_index=True) + unique_subjectids = choose_ids[indices] + template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) + + for count_template, uqt in enumerate(unique_templates): + (ind_t, ) = np.where(templates == uqt) + face_norm_feats = img_feats[ind_t] + face_medias = medias[ind_t] + unique_medias, unique_media_counts = np.unique(face_medias, + return_counts=True) + media_norm_feats = [] + for u, ct in zip(unique_medias, unique_media_counts): + (ind_m, ) = np.where(face_medias == u) + if ct == 1: + media_norm_feats += [face_norm_feats[ind_m]] + else: # image features from the same video will be aggregated into one feature + media_norm_feats += [ + np.mean(face_norm_feats[ind_m], 0, keepdims=True) + ] + media_norm_feats = np.array(media_norm_feats) + # media_norm_feats = media_norm_feats / np.sqrt(np.sum(media_norm_feats ** 2, -1, keepdims=True)) + template_feats[count_template] = np.sum(media_norm_feats, 0) + if count_template % 2000 == 0: + print('Finish Calculating {} template features.'.format( + count_template)) + template_norm_feats = template_feats / 
np.sqrt( + np.sum(template_feats**2, -1, keepdims=True)) + return template_norm_feats, unique_templates, unique_subjectids + + +def verification(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + # ========================================================== + # Compute set-to-set Similarity Score. + # ========================================================== + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + + score = np.zeros((len(p1), )) # save cosine distance between pairs + + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation + sublists = [ + total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize) + ] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def read_score(path): + with open(path, 'rb') as fid: + img_feats = cPickle.load(fid) + return img_feats + + +def evaluation(query_feats, gallery_feats, mask): + Fars = [0.01, 0.1] + print(query_feats.shape) + print(gallery_feats.shape) + + query_num = query_feats.shape[0] + gallery_num = gallery_feats.shape[0] + + similarity = np.dot(query_feats, gallery_feats.T) + print('similarity shape', similarity.shape) + top_inds = np.argsort(-similarity) + print(top_inds.shape) + + # calculate top1 + correct_num = 0 + for i in range(query_num): + j = top_inds[i, 0] + if j == mask[i]: + correct_num += 1 + print("top1 = {}".format(correct_num / query_num)) + # calculate top5 + correct_num = 0 + for i in range(query_num): + j = top_inds[i, 0:5] + if mask[i] in j: + correct_num += 1 + print("top5 = 
{}".format(correct_num / query_num)) + # calculate 10 + correct_num = 0 + for i in range(query_num): + j = top_inds[i, 0:10] + if mask[i] in j: + correct_num += 1 + print("top10 = {}".format(correct_num / query_num)) + + neg_pair_num = query_num * gallery_num - query_num + print(neg_pair_num) + required_topk = [math.ceil(query_num * x) for x in Fars] + top_sims = similarity + # calculate fars and tprs + pos_sims = [] + for i in range(query_num): + gt = mask[i] + pos_sims.append(top_sims[i, gt]) + top_sims[i, gt] = -2.0 + + pos_sims = np.array(pos_sims) + print(pos_sims.shape) + neg_sims = top_sims[np.where(top_sims > -2.0)] + print("neg_sims num = {}".format(len(neg_sims))) + neg_sims = heapq.nlargest(max(required_topk), neg_sims) # heap sort + print("after sorting , neg_sims num = {}".format(len(neg_sims))) + for far, pos in zip(Fars, required_topk): + th = neg_sims[pos - 1] + recall = np.sum(pos_sims > th) / query_num + print("far = {:.10f} pr = {:.10f} th = {:.10f}".format( + far, recall, th)) + + +def gen_mask(query_ids, reg_ids): + mask = [] + for query_id in query_ids: + pos = [i for i, x in enumerate(reg_ids) if query_id == x] + if len(pos) != 1: + raise RuntimeError( + "RegIdsError with id = {}, duplicate = {} ".format( + query_id, len(pos))) + mask.append(pos[0]) + return mask + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='do ijb 1n test') + # general + parser.add_argument('--model-prefix', + default='', + help='path to load model.') + parser.add_argument('--model-epoch', default=1, type=int, help='') + parser.add_argument('--gpu', default=7, type=int, help='gpu id') + parser.add_argument('--batch-size', default=32, type=int, help='') + parser.add_argument('--job', + default='insightface', + type=str, + help='job name') + parser.add_argument('--target', + default='IJBC', + type=str, + help='target, set to IJBC or IJBB') + args = parser.parse_args() + target = args.target + model_path = args.model_prefix + gpu_id = args.gpu 
+ epoch = args.model_epoch + meta_dir = "%s/meta" % args.target #meta root dir + if target == 'IJBC': + gallery_s1_record = "%s_1N_gallery_G1.csv" % (args.target.lower()) + gallery_s2_record = "%s_1N_gallery_G2.csv" % (args.target.lower()) + else: + gallery_s1_record = "%s_1N_gallery_S1.csv" % (args.target.lower()) + gallery_s2_record = "%s_1N_gallery_S2.csv" % (args.target.lower()) + gallery_s1_templates, gallery_s1_subject_ids = read_template_subject_id_list( + os.path.join(meta_dir, gallery_s1_record)) + print(gallery_s1_templates.shape, gallery_s1_subject_ids.shape) + + gallery_s2_templates, gallery_s2_subject_ids = read_template_subject_id_list( + os.path.join(meta_dir, gallery_s2_record)) + print(gallery_s2_templates.shape, gallery_s2_templates.shape) + + gallery_templates = np.concatenate( + [gallery_s1_templates, gallery_s2_templates]) + gallery_subject_ids = np.concatenate( + [gallery_s1_subject_ids, gallery_s2_subject_ids]) + print(gallery_templates.shape, gallery_subject_ids.shape) + + media_record = "%s_face_tid_mid.txt" % args.target.lower() + total_templates, total_medias = read_template_media_list( + os.path.join(meta_dir, media_record)) + print("total_templates", total_templates.shape, total_medias.shape) + #load image features + start = timeit.default_timer() + feature_path = '' #feature path + face_path = '' #face path + img_path = './%s/loose_crop' % target + img_list_path = './%s/meta/%s_name_5pts_score.txt' % (target, + target.lower()) + #img_feats, faceness_scores = get_image_feature(feature_path, face_path) + img_feats, faceness_scores = get_image_feature(img_path, img_list_path, + model_path, epoch, gpu_id) + print('img_feats', img_feats.shape) + print('faceness_scores', faceness_scores.shape) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0], + img_feats.shape[1])) + + # compute template features from image features. 
+ start = timeit.default_timer() + # ========================================================== + # Norm feature before aggregation into template feature? + # Feature norm from embedding network and faceness score are able to decrease weights for noise samples (not face). + # ========================================================== + use_norm_score = True # if True, TestMode(N1) + use_detector_score = True # if True, TestMode(D1) + use_flip_test = True # if True, TestMode(F1) + + if use_flip_test: + # concat --- F1 + #img_input_feats = img_feats + # add --- F2 + img_input_feats = img_feats[:, 0:int( + img_feats.shape[1] / 2)] + img_feats[:, + int(img_feats.shape[1] / 2):] + else: + img_input_feats = img_feats[:, 0:int(img_feats.shape[1] / 2)] + + if use_norm_score: + img_input_feats = img_input_feats + else: + # normalise features to remove norm information + img_input_feats = img_input_feats / np.sqrt( + np.sum(img_input_feats**2, -1, keepdims=True)) + + if use_detector_score: + img_input_feats = img_input_feats * np.matlib.repmat( + faceness_scores[:, np.newaxis], 1, img_input_feats.shape[1]) + else: + img_input_feats = img_input_feats + print("input features shape", img_input_feats.shape) + + #load gallery feature + gallery_templates_feature, gallery_unique_templates, gallery_unique_subject_ids = image2template_feature( + img_input_feats, total_templates, total_medias, gallery_templates, + gallery_subject_ids) + stop = timeit.default_timer() + print('Time: %.2f s. 
' % (stop - start)) + print("gallery_templates_feature", gallery_templates_feature.shape) + print("gallery_unique_subject_ids", gallery_unique_subject_ids.shape) + #np.savetxt("gallery_templates_feature.txt", gallery_templates_feature) + #np.savetxt("gallery_unique_subject_ids.txt", gallery_unique_subject_ids) + + #load prope feature + probe_mixed_record = "%s_1N_probe_mixed.csv" % target.lower() + probe_mixed_templates, probe_mixed_subject_ids = read_template_subject_id_list( + os.path.join(meta_dir, probe_mixed_record)) + print(probe_mixed_templates.shape, probe_mixed_subject_ids.shape) + probe_mixed_templates_feature, probe_mixed_unique_templates, probe_mixed_unique_subject_ids = image2template_feature( + img_input_feats, total_templates, total_medias, probe_mixed_templates, + probe_mixed_subject_ids) + print("probe_mixed_templates_feature", probe_mixed_templates_feature.shape) + print("probe_mixed_unique_subject_ids", + probe_mixed_unique_subject_ids.shape) + #np.savetxt("probe_mixed_templates_feature.txt", probe_mixed_templates_feature) + #np.savetxt("probe_mixed_unique_subject_ids.txt", probe_mixed_unique_subject_ids) + + #root_dir = "" #feature root dir + #gallery_id_path = "" #id filepath + #gallery_feats_path = "" #feature filelpath + #print("{}: start loading gallery feat {}".format(dt.now(), gallery_id_path)) + #gallery_ids, gallery_feats = load_feat_file(root_dir, gallery_id_path, gallery_feats_path) + #print("{}: end loading gallery feat".format(dt.now())) + # + #probe_id_path = "probe_mixed_unique_subject_ids.txt" #probe id filepath + #probe_feats_path = "probe_mixed_templates_feature.txt" #probe feats filepath + #print("{}: start loading probe feat {}".format(dt.now(), probe_id_path)) + #probe_ids, probe_feats = load_feat_file(root_dir, probe_id_path, probe_feats_path) + #print("{}: end loading probe feat".format(dt.now())) + + gallery_ids = gallery_unique_subject_ids + gallery_feats = gallery_templates_feature + probe_ids = 
probe_mixed_unique_subject_ids + probe_feats = probe_mixed_templates_feature + + mask = gen_mask(probe_ids, gallery_ids) + + print("{}: start evaluation".format(dt.now())) + evaluation(probe_feats, gallery_feats, mask) + print("{}: end evaluation".format(dt.now())) diff --git a/insightface/recognition/_evaluation_/ijb/ijb_evals.py b/insightface/recognition/_evaluation_/ijb/ijb_evals.py new file mode 100755 index 0000000000000000000000000000000000000000..9652c89a41c0b168b383c5b448deff4121c0f247 --- /dev/null +++ b/insightface/recognition/_evaluation_/ijb/ijb_evals.py @@ -0,0 +1,655 @@ +#!/usr/bin/env python3 +import os +import cv2 +import numpy as np +import pandas as pd +from tqdm import tqdm +from skimage import transform +from sklearn.preprocessing import normalize +from sklearn.metrics import roc_curve, auc + + +class Mxnet_model_interf: + def __init__(self, model_file, layer="fc1", image_size=(112, 112)): + import mxnet as mx + + self.mx = mx + cvd = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if len(cvd) > 0 and int(cvd) != -1: + ctx = [self.mx.gpu(ii) for ii in range(len(cvd.split(",")))] + else: + ctx = [self.mx.cpu()] + + prefix, epoch = model_file.split(",") + print(">>>> loading mxnet model:", prefix, epoch, ctx) + sym, arg_params, aux_params = self.mx.model.load_checkpoint(prefix, int(epoch)) + all_layers = sym.get_internals() + sym = all_layers[layer + "_output"] + model = self.mx.mod.Module(symbol=sym, context=ctx, label_names=None) + model.bind(data_shapes=[("data", (1, 3, image_size[0], image_size[1]))]) + model.set_params(arg_params, aux_params) + self.model = model + + def __call__(self, imgs): + # print(imgs.shape, imgs[0]) + imgs = imgs.transpose(0, 3, 1, 2) + data = self.mx.nd.array(imgs) + db = self.mx.io.DataBatch(data=(data,)) + self.model.forward(db, is_train=False) + emb = self.model.get_outputs()[0].asnumpy() + return emb + + +class Torch_model_interf: + def __init__(self, model_file, image_size=(112, 112)): + import torch + + 
self.torch = torch + cvd = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + device_name = "cuda:0" if len(cvd) > 0 and int(cvd) != -1 else "cpu" + self.device = self.torch.device(device_name) + try: + self.model = self.torch.jit.load(model_file, map_location=device_name) + except: + print("Error: %s is weights only, please load and save the entire model by `torch.jit.save`" % model_file) + self.model = None + + def __call__(self, imgs): + # print(imgs.shape, imgs[0]) + imgs = imgs.transpose(0, 3, 1, 2).copy().astype("float32") + imgs = (imgs - 127.5) * 0.0078125 + output = self.model(self.torch.from_numpy(imgs).to(self.device).float()) + return output.cpu().detach().numpy() + + +class ONNX_model_interf: + def __init__(self, model_file, image_size=(112, 112)): + import onnxruntime as ort + ort.set_default_logger_severity(3) + self.ort_session = ort.InferenceSession(model_file) + self.output_names = [self.ort_session.get_outputs()[0].name] + self.input_name = self.ort_session.get_inputs()[0].name + + def __call__(self, imgs): + imgs = imgs.transpose(0, 3, 1, 2).astype("float32") + imgs = (imgs - 127.5) * 0.0078125 + outputs = self.ort_session.run(self.output_names, {self.input_name: imgs}) + return outputs[0] + + +def keras_model_interf(model_file): + import tensorflow as tf + from tensorflow_addons.layers import StochasticDepth + + for gpu in tf.config.experimental.list_physical_devices("GPU"): + tf.config.experimental.set_memory_growth(gpu, True) + + mm = tf.keras.models.load_model(model_file, compile=False) + return lambda imgs: mm((tf.cast(imgs, "float32") - 127.5) * 0.0078125).numpy() + + +def face_align_landmark(img, landmark, image_size=(112, 112), method="similar"): + tform = transform.AffineTransform() if method == "affine" else transform.SimilarityTransform() + src = np.array( + [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366], [41.5493, 92.3655], [70.729904, 92.2041]], dtype=np.float32 + ) + tform.estimate(landmark, src) + # ndimage = 
transform.warp(img, tform.inverse, output_shape=image_size) + # ndimage = (ndimage * 255).astype(np.uint8) + M = tform.params[0:2, :] + ndimage = cv2.warpAffine(img, M, image_size, borderValue=0.0) + if len(ndimage.shape) == 2: + ndimage = np.stack([ndimage, ndimage, ndimage], -1) + else: + ndimage = cv2.cvtColor(ndimage, cv2.COLOR_BGR2RGB) + return ndimage + + +def read_IJB_meta_columns_to_int(file_path, columns, sep=" ", skiprows=0, header=None): + # meta = np.loadtxt(file_path, skiprows=skiprows, delimiter=sep) + meta = pd.read_csv(file_path, sep=sep, skiprows=skiprows, header=header).values + return (meta[:, ii].astype("int") for ii in columns) + + +def extract_IJB_data_11(data_path, subset, save_path=None, force_reload=False): + if save_path == None: + save_path = os.path.join(data_path, subset + "_backup.npz") + if not force_reload and os.path.exists(save_path): + print(">>>> Reload from backup: %s ..." % save_path) + aa = np.load(save_path) + return ( + aa["templates"], + aa["medias"], + aa["p1"], + aa["p2"], + aa["label"], + aa["img_names"], + aa["landmarks"], + aa["face_scores"], + ) + + if subset == "IJBB": + media_list_path = os.path.join(data_path, "IJBB/meta/ijbb_face_tid_mid.txt") + pair_list_path = os.path.join(data_path, "IJBB/meta/ijbb_template_pair_label.txt") + img_path = os.path.join(data_path, "IJBB/loose_crop") + img_list_path = os.path.join(data_path, "IJBB/meta/ijbb_name_5pts_score.txt") + else: + media_list_path = os.path.join(data_path, "IJBC/meta/ijbc_face_tid_mid.txt") + pair_list_path = os.path.join(data_path, "IJBC/meta/ijbc_template_pair_label.txt") + img_path = os.path.join(data_path, "IJBC/loose_crop") + img_list_path = os.path.join(data_path, "IJBC/meta/ijbc_name_5pts_score.txt") + + print(">>>> Loading templates and medias...") + templates, medias = read_IJB_meta_columns_to_int(media_list_path, columns=[1, 2]) # ['1.jpg', '1', '69544'] + print("templates: %s, medias: %s, unique templates: %s" % (templates.shape, medias.shape, 
np.unique(templates).shape)) + # templates: (227630,), medias: (227630,), unique templates: (12115,) + + print(">>>> Loading pairs...") + p1, p2, label = read_IJB_meta_columns_to_int(pair_list_path, columns=[0, 1, 2]) # ['1', '11065', '1'] + print("p1: %s, unique p1: %s" % (p1.shape, np.unique(p1).shape)) + print("p2: %s, unique p2: %s" % (p2.shape, np.unique(p2).shape)) + print("label: %s, label value counts: %s" % (label.shape, dict(zip(*np.unique(label, return_counts=True))))) + # p1: (8010270,), unique p1: (1845,) + # p2: (8010270,), unique p2: (10270,) # 10270 + 1845 = 12115 --> np.unique(templates).shape + # label: (8010270,), label value counts: {0: 8000000, 1: 10270} + + print(">>>> Loading images...") + with open(img_list_path, "r") as ff: + # 1.jpg 46.060 62.026 87.785 60.323 68.851 77.656 52.162 99.875 86.450 98.648 0.999 + img_records = np.array([ii.strip().split(" ") for ii in ff.readlines()]) + + img_names = np.array([os.path.join(img_path, ii) for ii in img_records[:, 0]]) + landmarks = img_records[:, 1:-1].astype("float32").reshape(-1, 5, 2) + face_scores = img_records[:, -1].astype("float32") + print("img_names: %s, landmarks: %s, face_scores: %s" % (img_names.shape, landmarks.shape, face_scores.shape)) + # img_names: (227630,), landmarks: (227630, 5, 2), face_scores: (227630,) + print("face_scores value counts:", dict(zip(*np.histogram(face_scores, bins=9)[::-1]))) + # {0.1: 2515, 0.2: 0, 0.3: 62, 0.4: 94, 0.5: 136, 0.6: 197, 0.7: 291, 0.8: 538, 0.9: 223797} + + print(">>>> Saving backup to: %s ..." 
% save_path) + np.savez( + save_path, + templates=templates, + medias=medias, + p1=p1, + p2=p2, + label=label, + img_names=img_names, + landmarks=landmarks, + face_scores=face_scores, + ) + print() + return templates, medias, p1, p2, label, img_names, landmarks, face_scores + + +def extract_gallery_prob_data(data_path, subset, save_path=None, force_reload=False): + if save_path == None: + save_path = os.path.join(data_path, subset + "_gallery_prob_backup.npz") + if not force_reload and os.path.exists(save_path): + print(">>>> Reload from backup: %s ..." % save_path) + aa = np.load(save_path) + return ( + aa["s1_templates"], + aa["s1_subject_ids"], + aa["s2_templates"], + aa["s2_subject_ids"], + aa["probe_mixed_templates"], + aa["probe_mixed_subject_ids"], + ) + + if subset == "IJBC": + meta_dir = os.path.join(data_path, "IJBC/meta") + gallery_s1_record = os.path.join(meta_dir, "ijbc_1N_gallery_G1.csv") + gallery_s2_record = os.path.join(meta_dir, "ijbc_1N_gallery_G2.csv") + probe_mixed_record = os.path.join(meta_dir, "ijbc_1N_probe_mixed.csv") + else: + meta_dir = os.path.join(data_path, "IJBB/meta") + gallery_s1_record = os.path.join(meta_dir, "ijbb_1N_gallery_S1.csv") + gallery_s2_record = os.path.join(meta_dir, "ijbb_1N_gallery_S2.csv") + probe_mixed_record = os.path.join(meta_dir, "ijbb_1N_probe_mixed.csv") + + print(">>>> Loading gallery feature...") + s1_templates, s1_subject_ids = read_IJB_meta_columns_to_int(gallery_s1_record, columns=[0, 1], skiprows=1, sep=",") + s2_templates, s2_subject_ids = read_IJB_meta_columns_to_int(gallery_s2_record, columns=[0, 1], skiprows=1, sep=",") + print("s1 gallery: %s, ids: %s, unique: %s" % (s1_templates.shape, s1_subject_ids.shape, np.unique(s1_templates).shape)) + print("s2 gallery: %s, ids: %s, unique: %s" % (s2_templates.shape, s2_subject_ids.shape, np.unique(s2_templates).shape)) + + print(">>>> Loading prope feature...") + probe_mixed_templates, probe_mixed_subject_ids = read_IJB_meta_columns_to_int( + 
probe_mixed_record, columns=[0, 1], skiprows=1, sep="," + ) + print("probe_mixed_templates: %s, unique: %s" % (probe_mixed_templates.shape, np.unique(probe_mixed_templates).shape)) + print("probe_mixed_subject_ids: %s, unique: %s" % (probe_mixed_subject_ids.shape, np.unique(probe_mixed_subject_ids).shape)) + + print(">>>> Saving backup to: %s ..." % save_path) + np.savez( + save_path, + s1_templates=s1_templates, + s1_subject_ids=s1_subject_ids, + s2_templates=s2_templates, + s2_subject_ids=s2_subject_ids, + probe_mixed_templates=probe_mixed_templates, + probe_mixed_subject_ids=probe_mixed_subject_ids, + ) + print() + return s1_templates, s1_subject_ids, s2_templates, s2_subject_ids, probe_mixed_templates, probe_mixed_subject_ids + + +def get_embeddings(model_interf, img_names, landmarks, batch_size=64, flip=True): + steps = int(np.ceil(len(img_names) / batch_size)) + embs, embs_f = [], [] + for batch_id in tqdm(range(0, len(img_names), batch_size), "Embedding", total=steps): + batch_imgs, batch_landmarks = img_names[batch_id : batch_id + batch_size], landmarks[batch_id : batch_id + batch_size] + ndimages = [face_align_landmark(cv2.imread(img), landmark) for img, landmark in zip(batch_imgs, batch_landmarks)] + ndimages = np.stack(ndimages) + embs.extend(model_interf(ndimages)) + if flip: + embs_f.extend(model_interf(ndimages[:, :, ::-1, :])) + return np.array(embs), np.array(embs_f) + + +def process_embeddings(embs, embs_f=[], use_flip_test=True, use_norm_score=False, use_detector_score=True, face_scores=None): + print(">>>> process_embeddings: Norm {}, Detect_score {}, Flip {}".format(use_norm_score, use_detector_score, use_flip_test)) + if use_flip_test and len(embs_f) != 0: + embs = embs + embs_f + if use_norm_score: + embs = normalize(embs) + if use_detector_score and face_scores is not None: + embs = embs * np.expand_dims(face_scores, -1) + return embs + + +def image2template_feature(img_feats=None, templates=None, medias=None, choose_templates=None, 
choose_ids=None): + if choose_templates is not None: # 1:N + unique_templates, indices = np.unique(choose_templates, return_index=True) + unique_subjectids = choose_ids[indices] + else: # 1:1 + unique_templates = np.unique(templates) + unique_subjectids = None + + # template_feats = np.zeros((len(unique_templates), img_feats.shape[1]), dtype=img_feats.dtype) + template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) + for count_template, uqt in tqdm(enumerate(unique_templates), "Extract template feature", total=len(unique_templates)): + (ind_t,) = np.where(templates == uqt) + face_norm_feats = img_feats[ind_t] + face_medias = medias[ind_t] + unique_medias, unique_media_counts = np.unique(face_medias, return_counts=True) + media_norm_feats = [] + for u, ct in zip(unique_medias, unique_media_counts): + (ind_m,) = np.where(face_medias == u) + if ct == 1: + media_norm_feats += [face_norm_feats[ind_m]] + else: # image features from the same video will be aggregated into one feature + media_norm_feats += [np.mean(face_norm_feats[ind_m], 0, keepdims=True)] + media_norm_feats = np.array(media_norm_feats) + # media_norm_feats = media_norm_feats / np.sqrt(np.sum(media_norm_feats ** 2, -1, keepdims=True)) + template_feats[count_template] = np.sum(media_norm_feats, 0) + template_norm_feats = normalize(template_feats) + return template_norm_feats, unique_templates, unique_subjectids + + +def verification_11(template_norm_feats=None, unique_templates=None, p1=None, p2=None, batch_size=10000): + try: + print(">>>> Trying cupy.") + import cupy as cp + + template_norm_feats = cp.array(template_norm_feats) + score_func = lambda feat1, feat2: cp.sum(feat1 * feat2, axis=-1).get() + test = score_func(template_norm_feats[:batch_size], template_norm_feats[:batch_size]) + except: + score_func = lambda feat1, feat2: np.sum(feat1 * feat2, -1) + + template2id = np.zeros(max(unique_templates) + 1, dtype=int) + template2id[unique_templates] = np.arange(len(unique_templates)) + + 
steps = int(np.ceil(len(p1) / batch_size)) + score = [] + for id in tqdm(range(steps), "Verification"): + feat1 = template_norm_feats[template2id[p1[id * batch_size : (id + 1) * batch_size]]] + feat2 = template_norm_feats[template2id[p2[id * batch_size : (id + 1) * batch_size]]] + score.extend(score_func(feat1, feat2)) + return np.array(score) + + +def evaluation_1N(query_feats, gallery_feats, query_ids, reg_ids, fars=[0.01, 0.1]): + print("query_feats: %s, gallery_feats: %s" % (query_feats.shape, gallery_feats.shape)) + similarity = np.dot(query_feats, gallery_feats.T) # (19593, 3531) + + top_1_count, top_5_count, top_10_count = 0, 0, 0 + pos_sims, neg_sims, non_gallery_sims = [], [], [] + for index, query_id in enumerate(query_ids): + if query_id in reg_ids: + gallery_label = np.argwhere(reg_ids == query_id)[0, 0] + index_sorted = np.argsort(similarity[index])[::-1] + + top_1_count += gallery_label in index_sorted[:1] + top_5_count += gallery_label in index_sorted[:5] + top_10_count += gallery_label in index_sorted[:10] + + pos_sims.append(similarity[index][reg_ids == query_id][0]) + neg_sims.append(similarity[index][reg_ids != query_id]) + else: + non_gallery_sims.append(similarity[index]) + total_pos = len(pos_sims) + pos_sims, neg_sims, non_gallery_sims = np.array(pos_sims), np.array(neg_sims), np.array(non_gallery_sims) + print("pos_sims: %s, neg_sims: %s, non_gallery_sims: %s" % (pos_sims.shape, neg_sims.shape, non_gallery_sims.shape)) + print("top1: %f, top5: %f, top10: %f" % (top_1_count / total_pos, top_5_count / total_pos, top_10_count / total_pos)) + + correct_pos_cond = pos_sims > neg_sims.max(1) + non_gallery_sims_sorted = np.sort(non_gallery_sims.max(1))[::-1] + threshes, recalls = [], [] + for far in fars: + # thresh = non_gallery_sims_sorted[int(np.ceil(non_gallery_sims_sorted.shape[0] * far)) - 1] + thresh = non_gallery_sims_sorted[max(int((non_gallery_sims_sorted.shape[0]) * far) - 1, 0)] + recall = np.logical_and(correct_pos_cond, pos_sims > 
thresh).sum() / pos_sims.shape[0] + threshes.append(thresh) + recalls.append(recall) + # print("FAR = {:.10f} TPIR = {:.10f} th = {:.10f}".format(far, recall, thresh)) + cmc_scores = list(zip(neg_sims, pos_sims.reshape(-1, 1))) + list(zip(non_gallery_sims, [None] * non_gallery_sims.shape[0])) + return top_1_count, top_5_count, top_10_count, threshes, recalls, cmc_scores + + +class IJB_test: + def __init__(self, model_file, data_path, subset, batch_size=64, force_reload=False, restore_embs=None): + templates, medias, p1, p2, label, img_names, landmarks, face_scores = extract_IJB_data_11( + data_path, subset, force_reload=force_reload + ) + if model_file != None: + if model_file.endswith(".h5"): + interf_func = keras_model_interf(model_file) + elif model_file.endswith(".pth") or model_file.endswith(".pt"): + interf_func = Torch_model_interf(model_file) + elif model_file.endswith(".onnx") or model_file.endswith(".ONNX"): + interf_func = ONNX_model_interf(model_file) + else: + interf_func = Mxnet_model_interf(model_file) + self.embs, self.embs_f = get_embeddings(interf_func, img_names, landmarks, batch_size=batch_size) + elif restore_embs != None: + print(">>>> Reload embeddings from:", restore_embs) + aa = np.load(restore_embs) + if "embs" in aa and "embs_f" in aa: + self.embs, self.embs_f = aa["embs"], aa["embs_f"] + else: + print("ERROR: %s NOT containing embs / embs_f" % restore_embs) + exit(1) + print(">>>> Done.") + self.data_path, self.subset, self.force_reload = data_path, subset, force_reload + self.templates, self.medias, self.p1, self.p2, self.label = templates, medias, p1, p2, label + self.face_scores = face_scores.astype(self.embs.dtype) + + def run_model_test_single(self, use_flip_test=True, use_norm_score=False, use_detector_score=True): + img_input_feats = process_embeddings( + self.embs, + self.embs_f, + use_flip_test=use_flip_test, + use_norm_score=use_norm_score, + use_detector_score=use_detector_score, + face_scores=self.face_scores, + ) + 
template_norm_feats, unique_templates, _ = image2template_feature(img_input_feats, self.templates, self.medias) + score = verification_11(template_norm_feats, unique_templates, self.p1, self.p2) + return score + + def run_model_test_bunch(self): + from itertools import product + + scores, names = [], [] + for use_norm_score, use_detector_score, use_flip_test in product([True, False], [True, False], [True, False]): + name = "N{:d}D{:d}F{:d}".format(use_norm_score, use_detector_score, use_flip_test) + print(">>>>", name, use_norm_score, use_detector_score, use_flip_test) + names.append(name) + scores.append(self.run_model_test_single(use_flip_test, use_norm_score, use_detector_score)) + return scores, names + + def run_model_test_1N(self, npoints=100): + fars_cal = [10 ** ii for ii in np.arange(-4, 0, 4 / npoints)] + [1] # plot in range [10-4, 1] + fars_show_idx = np.arange(len(fars_cal))[:: npoints // 4] # npoints=100, fars_show=[0.0001, 0.001, 0.01, 0.1, 1.0] + + g1_templates, g1_ids, g2_templates, g2_ids, probe_mixed_templates, probe_mixed_ids = extract_gallery_prob_data( + self.data_path, self.subset, force_reload=self.force_reload + ) + img_input_feats = process_embeddings( + self.embs, + self.embs_f, + use_flip_test=True, + use_norm_score=False, + use_detector_score=True, + face_scores=self.face_scores, + ) + g1_templates_feature, g1_unique_templates, g1_unique_ids = image2template_feature( + img_input_feats, self.templates, self.medias, g1_templates, g1_ids + ) + g2_templates_feature, g2_unique_templates, g2_unique_ids = image2template_feature( + img_input_feats, self.templates, self.medias, g2_templates, g2_ids + ) + probe_mixed_templates_feature, probe_mixed_unique_templates, probe_mixed_unique_subject_ids = image2template_feature( + img_input_feats, self.templates, self.medias, probe_mixed_templates, probe_mixed_ids + ) + print("g1_templates_feature:", g1_templates_feature.shape) # (1772, 512) + print("g2_templates_feature:", g2_templates_feature.shape) # 
(1759, 512) + + print("probe_mixed_templates_feature:", probe_mixed_templates_feature.shape) # (19593, 512) + print("probe_mixed_unique_subject_ids:", probe_mixed_unique_subject_ids.shape) # (19593,) + + print(">>>> Gallery 1") + g1_top_1_count, g1_top_5_count, g1_top_10_count, g1_threshes, g1_recalls, g1_cmc_scores = evaluation_1N( + probe_mixed_templates_feature, g1_templates_feature, probe_mixed_unique_subject_ids, g1_unique_ids, fars_cal + ) + print(">>>> Gallery 2") + g2_top_1_count, g2_top_5_count, g2_top_10_count, g2_threshes, g2_recalls, g2_cmc_scores = evaluation_1N( + probe_mixed_templates_feature, g2_templates_feature, probe_mixed_unique_subject_ids, g2_unique_ids, fars_cal + ) + print(">>>> Mean") + query_num = probe_mixed_templates_feature.shape[0] + top_1 = (g1_top_1_count + g2_top_1_count) / query_num + top_5 = (g1_top_5_count + g2_top_5_count) / query_num + top_10 = (g1_top_10_count + g2_top_10_count) / query_num + print("[Mean] top1: %f, top5: %f, top10: %f" % (top_1, top_5, top_10)) + + mean_tpirs = (np.array(g1_recalls) + np.array(g2_recalls)) / 2 + show_result = {} + for id, far in enumerate(fars_cal): + if id in fars_show_idx: + show_result.setdefault("far", []).append(far) + show_result.setdefault("g1_tpir", []).append(g1_recalls[id]) + show_result.setdefault("g1_thresh", []).append(g1_threshes[id]) + show_result.setdefault("g2_tpir", []).append(g2_recalls[id]) + show_result.setdefault("g2_thresh", []).append(g2_threshes[id]) + show_result.setdefault("mean_tpir", []).append(mean_tpirs[id]) + print(pd.DataFrame(show_result).set_index("far").to_markdown()) + return fars_cal, mean_tpirs, g1_cmc_scores, g2_cmc_scores + + +def plot_roc_and_calculate_tpr(scores, names=None, label=None): + print(">>>> plot roc and calculate tpr...") + score_dict = {} + for id, score in enumerate(scores): + name = None if names is None else names[id] + if isinstance(score, str) and score.endswith(".npz"): + aa = np.load(score) + score = aa.get("scores", []) + label = 
aa["label"] if label is None and "label" in aa else label + score_name = aa.get("names", []) + for ss, nn in zip(score, score_name): + score_dict[nn] = ss + elif isinstance(score, str) and score.endswith(".npy"): + name = name if name is not None else os.path.splitext(os.path.basename(score))[0] + score_dict[name] = np.load(score) + elif isinstance(score, str) and score.endswith(".txt"): + # IJB meta data like ijbb_template_pair_label.txt + label = pd.read_csv(score, sep=" ", header=None).values[:, 2] + else: + name = name if name is not None else str(id) + score_dict[name] = score + if label is None: + print("Error: Label data is not provided") + return None, None + + x_labels = [10 ** (-ii) for ii in range(1, 7)[::-1]] + fpr_dict, tpr_dict, roc_auc_dict, tpr_result = {}, {}, {}, {} + for name, score in score_dict.items(): + fpr, tpr, _ = roc_curve(label, score) + roc_auc = auc(fpr, tpr) + fpr, tpr = np.flipud(fpr), np.flipud(tpr) # select largest tpr at same fpr + tpr_result[name] = [tpr[np.argmin(abs(fpr - ii))] for ii in x_labels] + fpr_dict[name], tpr_dict[name], roc_auc_dict[name] = fpr, tpr, roc_auc + tpr_result_df = pd.DataFrame(tpr_result, index=x_labels).T + tpr_result_df['AUC'] = pd.Series(roc_auc_dict) + tpr_result_df.columns.name = "Methods" + print(tpr_result_df.to_markdown()) + # print(tpr_result_df) + + try: + import matplotlib.pyplot as plt + + fig = plt.figure() + for name in score_dict: + plt.plot(fpr_dict[name], tpr_dict[name], lw=1, label="[%s (AUC = %0.4f%%)]" % (name, roc_auc_dict[name] * 100)) + title = "ROC on IJB" + name.split("IJB")[-1][0] if "IJB" in name else "ROC on IJB" + + plt.xlim([10 ** -6, 0.1]) + plt.xscale("log") + plt.xticks(x_labels) + plt.xlabel("False Positive Rate") + plt.ylim([0.3, 1.0]) + plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True)) + plt.ylabel("True Positive Rate") + + plt.grid(linestyle="--", linewidth=1) + plt.title(title) + plt.legend(loc="lower right", fontsize='x-small') + plt.tight_layout() + plt.show() + 
except: + print("matplotlib plot failed") + fig = None + + return tpr_result_df, fig + + +def plot_dir_far_cmc_scores(scores, names=None): + try: + import matplotlib.pyplot as plt + + fig = plt.figure() + for id, score in enumerate(scores): + name = None if names is None else names[id] + if isinstance(score, str) and score.endswith(".npz"): + aa = np.load(score) + score, name = aa.get("scores")[0], aa.get("names")[0] + fars, tpirs = score[0], score[1] + name = name if name is not None else str(id) + + auc_value = auc(fars, tpirs) + label = "[%s (AUC = %0.4f%%)]" % (name, auc_value * 100) + plt.plot(fars, tpirs, lw=1, label=label) + + plt.xlabel("False Alarm Rate") + plt.xlim([0.0001, 1]) + plt.xscale("log") + plt.ylabel("Detection & Identification Rate (%)") + plt.ylim([0, 1]) + + plt.grid(linestyle="--", linewidth=1) + plt.legend(fontsize='x-small') + plt.tight_layout() + plt.show() + except: + print("matplotlib plot failed") + fig = None + + return fig + + +def parse_arguments(argv): + import argparse + + default_save_result_name = "IJB_result/{model_name}_{subset}_{type}.npz" + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-m", "--model_file", type=str, default=None, help="Saved model, keras h5 / pytorch jit pth / onnx / mxnet") + parser.add_argument("-d", "--data_path", type=str, default="./", help="Dataset path containing IJBB and IJBC sub folder") + parser.add_argument("-s", "--subset", type=str, default="IJBC", help="Subset test target, could be IJBB / IJBC") + parser.add_argument("-b", "--batch_size", type=int, default=128, help="Batch size for get_embeddings") + parser.add_argument( + "-R", "--save_result", type=str, default=default_save_result_name, help="Filename for saving / restore result" + ) + parser.add_argument("-L", "--save_label", action="store_true", help="Save label data, useful for plot only") + parser.add_argument("-E", "--save_embeddings", action="store_true", help="Save 
embeddings data") + parser.add_argument("-B", "--is_bunch", action="store_true", help="Run all 8 tests N{0,1}D{0,1}F{0,1}") + parser.add_argument("-N", "--is_one_2_N", action="store_true", help="Run 1:N test instead of 1:1") + parser.add_argument("-F", "--force_reload", action="store_true", help="Force reload, instead of using cache") + parser.add_argument("-P", "--plot_only", nargs="*", type=str, help="Plot saved results, Format 1 2 3 or 1, 2, 3 or *.npy") + args = parser.parse_known_args(argv)[0] + + if args.plot_only != None and len(args.plot_only) != 0: + # Plot only + from glob2 import glob + + score_files = [] + for ss in args.plot_only: + score_files.extend(glob(ss.replace(",", "").strip())) + args.plot_only = score_files + elif args.model_file == None and args.save_result == default_save_result_name: + print("Please provide -m MODEL_FILE, see `--help` for usage.") + exit(1) + elif args.model_file != None: + if args.model_file.endswith(".h5") or args.model_file.endswith(".pth") or args.model_file.endswith(".pt") or args.model_file.endswith(".onnx"): + # Keras model file "model.h5", pytorch model ends with `.pth` or `.pt`, onnx model ends with `.onnx` + model_name = os.path.splitext(os.path.basename(args.model_file))[0] + else: + # MXNet model file "models/r50-arcface-emore/model,1" + model_name = os.path.basename(os.path.dirname(args.model_file)) + + if args.save_result == default_save_result_name: + type = "1N" if args.is_one_2_N else "11" + args.save_result = default_save_result_name.format(model_name=model_name, subset=args.subset, type=type) + return args + + +if __name__ == "__main__": + import sys + + args = parse_arguments(sys.argv[1:]) + if args.plot_only != None and len(args.plot_only) != 0: + if args.is_one_2_N: + plot_dir_far_cmc_scores(args.plot_only) + else: + plot_roc_and_calculate_tpr(args.plot_only) + else: + save_name = os.path.splitext(os.path.basename(args.save_result))[0] + save_items = {} + save_path = os.path.dirname(args.save_result) + 
if len(save_path) != 0 and not os.path.exists(save_path): + os.makedirs(save_path) + + tt = IJB_test(args.model_file, args.data_path, args.subset, args.batch_size, args.force_reload, args.save_result) + if args.save_embeddings: # Save embeddings first, in case of any error happens later... + np.savez(args.save_result, embs=tt.embs, embs_f=tt.embs_f) + + if args.is_one_2_N: # 1:N test + fars, tpirs, _, _ = tt.run_model_test_1N() + scores = [(fars, tpirs)] + names = [save_name] + save_items.update({"scores": scores, "names": names}) + elif args.is_bunch: # All 8 tests N{0,1}D{0,1}F{0,1} + scores, names = tt.run_model_test_bunch() + names = [save_name + "_" + ii for ii in names] + label = tt.label + save_items.update({"scores": scores, "names": names}) + else: # Basic 1:1 N0D1F1 test + score = tt.run_model_test_single() + scores, names, label = [score], [save_name], tt.label + save_items.update({"scores": scores, "names": names}) + + if args.save_embeddings: + save_items.update({"embs": tt.embs, "embs_f": tt.embs_f}) + if args.save_label: + save_items.update({"label": label}) + + if args.model_file != None or args.save_embeddings: # embeddings not restored from file or should save_embeddings again + np.savez(args.save_result, **save_items) + + if args.is_one_2_N: + plot_dir_far_cmc_scores(scores=scores, names=names) + else: + plot_roc_and_calculate_tpr(scores, names=names, label=label) diff --git a/insightface/recognition/_evaluation_/ijb/ijb_onnx.py b/insightface/recognition/_evaluation_/ijb/ijb_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..eb2edbe8382c013832f1cc12a8cb7d3df44c8b36 --- /dev/null +++ b/insightface/recognition/_evaluation_/ijb/ijb_onnx.py @@ -0,0 +1,267 @@ +import argparse +import os +import pickle +import timeit + +import cv2 +import mxnet as mx +import numpy as np +import pandas as pd +import prettytable +import skimage.transform +from sklearn.metrics import roc_curve +from sklearn.preprocessing import normalize +import 
insightface +from insightface.model_zoo import ArcFaceONNX + + +SRC = np.array( + [ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 92.2041]] + , dtype=np.float32) +SRC[:, 0] += 8.0 + + +class AlignedDataSet(mx.gluon.data.Dataset): + def __init__(self, root, lines, align=True): + self.lines = lines + self.root = root + self.align = align + + def __len__(self): + return len(self.lines) + + def __getitem__(self, idx): + each_line = self.lines[idx] + name_lmk_score = each_line.strip().split(' ') + name = os.path.join(self.root, name_lmk_score[0]) + img = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB) + landmark5 = np.array([float(x) for x in name_lmk_score[1:-1]], dtype=np.float32).reshape((5, 2)) + st = skimage.transform.SimilarityTransform() + st.estimate(landmark5, SRC) + img = cv2.warpAffine(img, st.params[0:2, :], (112, 112), borderValue=0.0) + img_1 = np.expand_dims(img, 0) + img_2 = np.expand_dims(np.fliplr(img), 0) + output = np.concatenate((img_1, img_2), axis=0).astype(np.float32) + output = np.transpose(output, (0, 3, 1, 2)) + output = mx.nd.array(output) + return output + + +def extract(model_file, dataset): + model = ArcFaceONNX(model_file=model_file) + model.check() + feat_mat = np.zeros(shape=(len(dataset), 2 * model.feat_dim)) + + def batchify_fn(data): + return mx.nd.concat(*data, dim=0) + + data_loader = mx.gluon.data.DataLoader( + dataset, 128, last_batch='keep', num_workers=4, + thread_pool=True, prefetch=16, batchify_fn=batchify_fn) + num_iter = 0 + for batch in data_loader: + batch = batch.asnumpy() + feat = model.forward(batch) + feat = np.reshape(feat, (-1, model.feat_dim * 2)) + feat_mat[128 * num_iter: 128 * num_iter + feat.shape[0], :] = feat + num_iter += 1 + if num_iter % 50 == 0: + print(num_iter) + return feat_mat + + +def read_template_media_list(path): + ijb_meta = pd.read_csv(path, sep=' ', header=None).values + templates = ijb_meta[:, 1].astype(np.int) + medias = ijb_meta[:, 
2].astype(np.int) + return templates, medias + + +def read_template_pair_list(path): + pairs = pd.read_csv(path, sep=' ', header=None).values + t1 = pairs[:, 0].astype(np.int) + t2 = pairs[:, 1].astype(np.int) + label = pairs[:, 2].astype(np.int) + return t1, t2, label + + +def read_image_feature(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +def image2template_feature(img_feats=None, + templates=None, + medias=None): + unique_templates = np.unique(templates) + template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) + for count_template, uqt in enumerate(unique_templates): + (ind_t,) = np.where(templates == uqt) + face_norm_feats = img_feats[ind_t] + face_medias = medias[ind_t] + unique_medias, unique_media_counts = np.unique(face_medias, return_counts=True) + media_norm_feats = [] + for u, ct in zip(unique_medias, unique_media_counts): + (ind_m,) = np.where(face_medias == u) + if ct == 1: + media_norm_feats += [face_norm_feats[ind_m]] + else: # image features from the same video will be aggregated into one feature + media_norm_feats += [np.mean(face_norm_feats[ind_m], axis=0, keepdims=True), ] + media_norm_feats = np.array(media_norm_feats) + template_feats[count_template] = np.sum(media_norm_feats, axis=0) + if count_template % 2000 == 0: + print('Finish Calculating {} template features.'.format( + count_template)) + template_norm_feats = normalize(template_feats) + return template_norm_feats, unique_templates + + +def verification(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) + total_pairs = np.array(range(len(p1))) + batchsize = 100000 + sublists = [total_pairs[i: i + batchsize] for i in range(0, len(p1), batchsize)] + total_sublists = len(sublists) + for c, s in enumerate(sublists): 
+ feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def verification2(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) # save cosine distance between pairs + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation + sublists = [total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize)] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def main(args): + use_norm_score = True # if Ture, TestMode(N1) + use_detector_score = True # if Ture, TestMode(D1) + use_flip_test = True # if Ture, TestMode(F1) + assert args.target == 'IJBC' or args.target == 'IJBB' + + start = timeit.default_timer() + templates, medias = read_template_media_list( + os.path.join('%s/meta' % args.image_path, '%s_face_tid_mid.txt' % args.target.lower())) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + + start = timeit.default_timer() + p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % args.image_path, + '%s_template_pair_label.txt' % args.target.lower())) + stop = timeit.default_timer() + print('Time: %.2f s. 
' % (stop - start)) + + start = timeit.default_timer() + img_path = '%s/loose_crop' % args.image_path + img_list_path = '%s/meta/%s_name_5pts_score.txt' % (args.image_path, args.target.lower()) + img_list = open(img_list_path) + files = img_list.readlines() + dataset = AlignedDataSet(root=img_path, lines=files, align=True) + img_feats = extract(args.model_file, dataset) + + faceness_scores = [] + for each_line in files: + name_lmk_score = each_line.split() + faceness_scores.append(name_lmk_score[-1]) + faceness_scores = np.array(faceness_scores).astype(np.float32) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0], img_feats.shape[1])) + start = timeit.default_timer() + + if use_flip_test: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + img_feats[:, img_feats.shape[1] // 2:] + else: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + + if use_norm_score: + img_input_feats = img_input_feats + else: + img_input_feats = img_input_feats / np.sqrt(np.sum(img_input_feats ** 2, -1, keepdims=True)) + + if use_detector_score: + print(img_input_feats.shape, faceness_scores.shape) + img_input_feats = img_input_feats * faceness_scores[:, np.newaxis] + else: + img_input_feats = img_input_feats + + template_norm_feats, unique_templates = image2template_feature( + img_input_feats, templates, medias) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + + start = timeit.default_timer() + score = verification(template_norm_feats, unique_templates, p1, p2) + stop = timeit.default_timer() + print('Time: %.2f s. 
' % (stop - start)) + save_path = os.path.join(args.result_dir, "{}_result".format(args.target)) + if not os.path.exists(save_path): + os.makedirs(save_path) + score_save_file = os.path.join(save_path, "{}.npy".format(args.model_file.split('/')[-1])) + np.save(score_save_file, score) + files = [score_save_file] + methods = [] + scores = [] + for file in files: + methods.append(os.path.basename(file)) + scores.append(np.load(file)) + methods = np.array(methods) + scores = dict(zip(methods, scores)) + x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] + tpr_fpr_table = prettytable.PrettyTable(['Methods'] + [str(x) for x in x_labels]) + for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, args.target)) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) + print(tpr_fpr_table) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='do onnx ijb test') + # general + parser.add_argument('--model-file', default='', help='path to onnx model.') + parser.add_argument('--image-path', default='', type=str, help='') + parser.add_argument('--result-dir', default='.', type=str, help='') + parser.add_argument('--target', default='IJBC', type=str, help='target, set to IJBC or IJBB') + main(parser.parse_args()) diff --git a/insightface/recognition/_evaluation_/megaface/README.md b/insightface/recognition/_evaluation_/megaface/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4ee657a227642765097b9d859a192568732437 --- /dev/null +++ b/insightface/recognition/_evaluation_/megaface/README.md @@ -0,0 +1,4 @@ + +Download megaface testsuite from [baiducloud](https://pan.baidu.com/s/1Vdxc2GgbY8wIW0hVcObIwg)(code:0n6w) or 
[gdrive](https://drive.google.com/file/d/1KBwp0U9oZgZj7SYDXRxUnnH7Lwvd9XMy/view?usp=sharing). The official devkit is also included. + + diff --git a/insightface/recognition/_evaluation_/megaface/gen_megaface.py b/insightface/recognition/_evaluation_/megaface/gen_megaface.py new file mode 100644 index 0000000000000000000000000000000000000000..2e9fdb0763ddcd6d53e1462ab508a52e0fb38417 --- /dev/null +++ b/insightface/recognition/_evaluation_/megaface/gen_megaface.py @@ -0,0 +1,196 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from easydict import EasyDict as edict +import time +import sys +import numpy as np +import argparse +import struct +import cv2 +import sklearn +from sklearn.preprocessing import normalize +import mxnet as mx +from mxnet import ndarray as nd + + +def read_img(image_path): + img = cv2.imread(image_path, cv2.CV_LOAD_IMAGE_COLOR) + return img + + +def get_feature(imgs, nets): + count = len(imgs) + data = mx.nd.zeros(shape=(count * 2, 3, imgs[0].shape[0], + imgs[0].shape[1])) + for idx, img in enumerate(imgs): + img = img[:, :, ::-1] #to rgb + img = np.transpose(img, (2, 0, 1)) + for flipid in [0, 1]: + _img = np.copy(img) + if flipid == 1: + _img = _img[:, :, ::-1] + _img = nd.array(_img) + data[count * flipid + idx] = _img + + F = [] + for net in nets: + db = mx.io.DataBatch(data=(data, )) + net.model.forward(db, is_train=False) + x = net.model.get_outputs()[0].asnumpy() + embedding = x[0:count, :] + x[count:, :] + embedding = sklearn.preprocessing.normalize(embedding) + #print('emb', embedding.shape) + F.append(embedding) + F = np.concatenate(F, axis=1) + F = sklearn.preprocessing.normalize(F) + #print('F', F.shape) + return F + + +def write_bin(path, feature): + feature = list(feature) + with open(path, 'wb') as f: + f.write(struct.pack('4i', len(feature), 1, 4, 5)) + f.write(struct.pack("%df" % len(feature), *feature)) + + +def get_and_write(buffer, nets): + imgs 
= [] + for k in buffer: + imgs.append(k[0]) + features = get_feature(imgs, nets) + #print(np.linalg.norm(feature)) + assert features.shape[0] == len(buffer) + for ik, k in enumerate(buffer): + out_path = k[1] + feature = features[ik].flatten() + write_bin(out_path, feature) + + +def main(args): + + print(args) + gpuid = args.gpu + ctx = mx.gpu(gpuid) + nets = [] + image_shape = [int(x) for x in args.image_size.split(',')] + for model in args.model.split('|'): + vec = model.split(',') + assert len(vec) > 1 + prefix = vec[0] + epoch = int(vec[1]) + print('loading', prefix, epoch) + net = edict() + net.ctx = ctx + net.sym, net.arg_params, net.aux_params = mx.model.load_checkpoint( + prefix, epoch) + all_layers = net.sym.get_internals() + net.sym = all_layers['fc1_output'] + net.model = mx.mod.Module(symbol=net.sym, + context=net.ctx, + label_names=None) + net.model.bind(data_shapes=[('data', (1, 3, image_shape[1], + image_shape[2]))]) + net.model.set_params(net.arg_params, net.aux_params) + nets.append(net) + + facescrub_out = os.path.join(args.output, 'facescrub') + megaface_out = os.path.join(args.output, 'megaface') + + i = 0 + succ = 0 + buffer = [] + for line in open(args.facescrub_lst, 'r'): + if i % 1000 == 0: + print("writing fs", i, succ) + i += 1 + image_path = line.strip() + _path = image_path.split('/') + a, b = _path[-2], _path[-1] + out_dir = os.path.join(facescrub_out, a) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + image_path = os.path.join(args.facescrub_root, image_path) + img = read_img(image_path) + if img is None: + print('read error:', image_path) + continue + out_path = os.path.join(out_dir, b + "_%s.bin" % (args.algo)) + item = (img, out_path) + buffer.append(item) + if len(buffer) == args.batch_size: + get_and_write(buffer, nets) + buffer = [] + succ += 1 + if len(buffer) > 0: + get_and_write(buffer, nets) + buffer = [] + print('fs stat', i, succ) + + i = 0 + succ = 0 + buffer = [] + for line in open(args.megaface_lst, 'r'): + if 
i % 1000 == 0: + print("writing mf", i, succ) + i += 1 + image_path = line.strip() + _path = image_path.split('/') + a1, a2, b = _path[-3], _path[-2], _path[-1] + out_dir = os.path.join(megaface_out, a1, a2) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + #continue + #print(landmark) + image_path = os.path.join(args.megaface_root, image_path) + img = read_img(image_path) + if img is None: + print('read error:', image_path) + continue + out_path = os.path.join(out_dir, b + "_%s.bin" % (args.algo)) + item = (img, out_path) + buffer.append(item) + if len(buffer) == args.batch_size: + get_and_write(buffer, nets) + buffer = [] + succ += 1 + if len(buffer) > 0: + get_and_write(buffer, nets) + buffer = [] + print('mf stat', i, succ) + + +def parse_arguments(argv): + parser = argparse.ArgumentParser() + + parser.add_argument('--batch_size', type=int, help='', default=8) + parser.add_argument('--image_size', type=str, help='', default='3,112,112') + parser.add_argument('--gpu', type=int, help='', default=0) + parser.add_argument('--algo', type=str, help='', default='insightface') + parser.add_argument('--facescrub-lst', + type=str, + help='', + default='./data/facescrub_lst') + parser.add_argument('--megaface-lst', + type=str, + help='', + default='./data/megaface_lst') + parser.add_argument('--facescrub-root', + type=str, + help='', + default='./data/facescrub_images') + parser.add_argument('--megaface-root', + type=str, + help='', + default='./data/megaface_images') + parser.add_argument('--output', type=str, help='', default='./feature_out') + parser.add_argument('--model', type=str, help='', default='') + return parser.parse_args(argv) + + +if __name__ == '__main__': + main(parse_arguments(sys.argv[1:])) diff --git a/insightface/recognition/_evaluation_/megaface/remove_noises.py b/insightface/recognition/_evaluation_/megaface/remove_noises.py new file mode 100644 index 0000000000000000000000000000000000000000..aacec5404094cdbaeb72bb1ec6f0dc8ab55bf45b --- 
/dev/null +++ b/insightface/recognition/_evaluation_/megaface/remove_noises.py @@ -0,0 +1,182 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import datetime +import time +import shutil +import sys +import numpy as np +import argparse +import struct +import cv2 +import mxnet as mx +from mxnet import ndarray as nd + +feature_dim = 512 +feature_ext = 1 + + +def load_bin(path, fill=0.0): + with open(path, 'rb') as f: + bb = f.read(4 * 4) + #print(len(bb)) + v = struct.unpack('4i', bb) + #print(v[0]) + bb = f.read(v[0] * 4) + v = struct.unpack("%df" % (v[0]), bb) + feature = np.full((feature_dim + feature_ext, ), + fill, + dtype=np.float32) + feature[0:feature_dim] = v + #feature = np.array( v, dtype=np.float32) + #print(feature.shape) + #print(np.linalg.norm(feature)) + return feature + + +def write_bin(path, feature): + feature = list(feature) + with open(path, 'wb') as f: + f.write(struct.pack('4i', len(feature), 1, 4, 5)) + f.write(struct.pack("%df" % len(feature), *feature)) + + +def main(args): + + fs_noise_map = {} + for line in open(args.facescrub_noises, 'r'): + if line.startswith('#'): + continue + line = line.strip() + fname = line.split('.')[0] + p = fname.rfind('_') + fname = fname[0:p] + fs_noise_map[line] = fname + + print(len(fs_noise_map)) + + i = 0 + fname2center = {} + noises = [] + for line in open(args.facescrub_lst, 'r'): + if i % 1000 == 0: + print("reading fs", i) + i += 1 + image_path = line.strip() + _path = image_path.split('/') + a, b = _path[-2], _path[-1] + feature_path = os.path.join(args.feature_dir_input, 'facescrub', a, + "%s_%s.bin" % (b, args.algo)) + feature_dir_out = os.path.join(args.feature_dir_out, 'facescrub', a) + if not os.path.exists(feature_dir_out): + os.makedirs(feature_dir_out) + feature_path_out = os.path.join(feature_dir_out, + "%s_%s.bin" % (b, args.algo)) + #print(b) + if not b in fs_noise_map: + #shutil.copyfile(feature_path, 
def parse_arguments(argv):
    """Build and apply the CLI parser for the noise-removal step.

    All options are plain strings; defaults point at the conventional
    ./data and ./feature_out layouts.
    """
    parser = argparse.ArgumentParser()
    # (flag, default) table keeps the option list easy to scan; every
    # option is type=str with an empty help string, exactly as before.
    str_options = (
        ('--facescrub-noises', './data/facescrub_noises.txt'),
        ('--megaface-noises', './data/megaface_noises.txt'),
        ('--algo', 'insightface'),
        ('--facescrub-lst', './data/facescrub_lst'),
        ('--megaface-lst', './data/megaface_lst'),
        ('--feature-dir-input', './feature_out'),
        ('--feature-dir-out', './feature_out_clean'),
    )
    for flag, default in str_options:
        parser.add_argument(flag, type=str, help='', default=default)
    return parser.parse_args(argv)
+ +## I. CPP-Align + + - + +## II. Face Mask Renderer + +We provide a simple tool to add masks on face images automatically. + +We can use this tool to do data augmentation while training our face recognition models. + +| Face Image | OP | Mask Image | Out | +| ------- | ------ | --------- | ----------- | +| face | +F | mask | mask | +| face | +F | mask | mask | +| face | +H | mask | mask | + +**F** means FULL while **H** means HALF. + +### Prepare + +- insightface package library + + ``pip install -U insightface`` + +- insightface model pack + + ``bash> insightface-cli model.download antelope`` + +- BFM models + + Please follow the tutorial of [https://github.com/YadiraF/face3d/tree/master/examples/Data/BFM](https://github.com/YadiraF/face3d/tree/master/examples/Data/BFM) to generate `BFM.mat` and `BFM_UV.mat`. Put them into the insightface model pack directory, such as ``~/.insightface/models/antelope/`` + + +- mask images + + some mask images are included in insightface package, such as 'mask\_blue', 'mask\_white', 'mask\_black' and 'mask\_green'. + +### Add Mask to Face Image + +Please refer to `make_renderer.py` for detail example. + +(1) init renderer: +``` +import insightface +from insightface.app import MaskRenderer +tool = MaskRenderer() +tool.prepare(ctx_id=0, det_size=(128,128)) #use gpu +``` + +(2) load face and mask images +``` +from insightface.data import get_image as ins_get_image +image = ins_get_image('Tom_Hanks_54745') +mask_image = "mask_blue" +``` + +(3) build necessary params for face image, this can be done in offline. +``` +params = tool.build_params(image) +``` + +(4) do mask render, it costs about `10ms` on 224x224 UV size, CPU single thread. +``` +mask_out = tool.render_mask(image, mask_image, params) +``` + +(5) do half mask render. 
+``` +mask_half_out = tool.render_mask(image, mask_image, params, positions=[0.1, 0.5, 0.9, 0.7]) +``` diff --git a/insightface/recognition/_tools_/cpp_align/face_align.h b/insightface/recognition/_tools_/cpp_align/face_align.h new file mode 100644 index 0000000000000000000000000000000000000000..afef20bafba72528460c6b229118b368a8dfc7c2 --- /dev/null +++ b/insightface/recognition/_tools_/cpp_align/face_align.h @@ -0,0 +1,146 @@ +// +// Created by Jack Yu on 23/03/2018. +// + +#ifndef FACE_DEMO_FACEPREPROCESS_H +#define FACE_DEMO_FACEPREPROCESS_H + +#include + + +namespace FacePreprocess { + + cv::Mat meanAxis0(const cv::Mat &src) + { + int num = src.rows; + int dim = src.cols; + + // x1 y1 + // x2 y2 + + cv::Mat output(1,dim,CV_32F); + for(int i = 0 ; i < dim; i ++) + { + float sum = 0 ; + for(int j = 0 ; j < num ; j++) + { + sum+=src.at(j,i); + } + output.at(0,i) = sum/num; + } + + return output; + } + + cv::Mat elementwiseMinus(const cv::Mat &A,const cv::Mat &B) + { + cv::Mat output(A.rows,A.cols,A.type()); + + assert(B.cols == A.cols); + if(B.cols == A.cols) + { + for(int i = 0 ; i < A.rows; i ++) + { + for(int j = 0 ; j < B.cols; j++) + { + output.at(i,j) = A.at(i,j) - B.at(0,j); + } + } + } + return output; + } + + + cv::Mat varAxis0(const cv::Mat &src) + { + cv:Mat temp_ = elementwiseMinus(src,meanAxis0(src)); + cv::multiply(temp_ ,temp_ ,temp_ ); + return meanAxis0(temp_); + + } + + + + int MatrixRank(cv::Mat M) + { + Mat w, u, vt; + SVD::compute(M, w, u, vt); + Mat1b nonZeroSingularValues = w > 0.0001; + int rank = countNonZero(nonZeroSingularValues); + return rank; + + } + +// References +// ---------- +// .. 
[1] "Least-squares estimation of transformation parameters between two +// point patterns", Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573 +// +// """ +// +// Anthor:Jack Yu + cv::Mat similarTransform(cv::Mat src,cv::Mat dst) { + int num = src.rows; + int dim = src.cols; + cv::Mat src_mean = meanAxis0(src); + cv::Mat dst_mean = meanAxis0(dst); + cv::Mat src_demean = elementwiseMinus(src, src_mean); + cv::Mat dst_demean = elementwiseMinus(dst, dst_mean); + cv::Mat A = (dst_demean.t() * src_demean) / static_cast(num); + cv::Mat d(dim, 1, CV_32F); + d.setTo(1.0f); + if (cv::determinant(A) < 0) { + d.at(dim - 1, 0) = -1; + + } + cv::Mat T = cv::Mat::eye(dim + 1, dim + 1, CV_32F); + cv::Mat U, S, V; + cv::SVD::compute(A, S,U, V); + + // the SVD function in opencv differ from scipy . + + + int rank = MatrixRank(A); + if (rank == 0) { + assert(rank == 0); + + } else if (rank == dim - 1) { + if (cv::determinant(U) * cv::determinant(V) > 0) { + T.rowRange(0, dim).colRange(0, dim) = U * V; + } else { + int s = d.at(dim - 1, 0) = -1; + d.at(dim - 1, 0) = -1; + + T.rowRange(0, dim).colRange(0, dim) = U * V; + cv::Mat diag_ = cv::Mat::diag(d); + cv::Mat twp = diag_*V; //np.dot(np.diag(d), V.T) + cv::Mat B = cv::Mat::zeros(3, 3, CV_8UC1); + cv::Mat C = B.diag(0); + T.rowRange(0, dim).colRange(0, dim) = U* twp; + d.at(dim - 1, 0) = s; + } + } + else{ + cv::Mat diag_ = cv::Mat::diag(d); + cv::Mat twp = diag_*V.t(); //np.dot(np.diag(d), V.T) + cv::Mat res = U* twp; // U + T.rowRange(0, dim).colRange(0, dim) = U * diag_ * V; + } + cv::Mat var_ = varAxis0(src_demean); + float val = cv::sum(var_).val[0]; + cv::Mat res; + cv::multiply(d,S,res); + float scale = 1.0/val*cv::sum(res).val[0]; + cv::Mat temp1 = T.rowRange(0, dim).colRange(0, dim) * src_mean.t(); + cv::Mat temp2 = scale * temp1; + cv::Mat temp3 = dst_mean - temp2.t(); + T.at(0,2) = temp3.at(0); + T.at(1,2) = temp3.at(1); + T.rowRange(0, dim).colRange(0, dim) *= scale; // T[:dim, :dim] *= scale + + return T; + } + + +} 
+#endif //FACE_DEMO_FACEPREPROCESS_H diff --git a/insightface/recognition/_tools_/mask_renderer.py b/insightface/recognition/_tools_/mask_renderer.py new file mode 100644 index 0000000000000000000000000000000000000000..57fa865f3deb90af0e7e25fa36a5f41bd284ecec --- /dev/null +++ b/insightface/recognition/_tools_/mask_renderer.py @@ -0,0 +1,21 @@ +import os, sys, datetime +import numpy as np +import os.path as osp +import cv2 +import insightface +from insightface.app import MaskRenderer + + +if __name__ == "__main__": + #make sure that you have download correct insightface model pack. + #make sure that BFM.mat and BFM_UV.mat have been generated + tool = MaskRenderer() + tool.prepare(ctx_id=0, det_size=(128,128)) + image = cv2.imread("../../deploy/Tom_Hanks_54745.png") + mask_image = "mask_blue" + params = tool.build_params(image) + mask_out = tool.render_mask(image, mask_image, params) + + cv2.imwrite('output_mask.jpg', mask_out) + + diff --git a/insightface/recognition/arcface_mxnet/README.md b/insightface/recognition/arcface_mxnet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3d463f74e10bc6c9338fb7554faf5669023b2c21 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/README.md @@ -0,0 +1,95 @@ +## ArcFace with Parallel Acceleration on both Features and Centers + +### Memory Consumption and Training Speed + +![Memoryspeed](https://insightface.ai/assets/img/github/memoryspeed.png) + +Parallel acceleration on both feature x and centre W. Setting: ResNet 50, batch size 8 * 64, feature dimension 512, float point 32, GPU 8 * P40 (24GB). + +### Illustration of Main Steps + +![Memoryspeed](https://insightface.ai/assets/img/github/mainsteps.png) + +Parallel calculation by simple matrix partition. Setting: ResNet 50, batch size 8 * 64, feature dimension 512, float point 32, identity number 1 Million, GPU 8 * 1080ti (11GB). Communication cost: 1MB (feature x). Training speed: 800 samples/second. 
+ +**Note:** Replace ``train.py`` with ``train_parall.py`` in following examples if you want to use parallel acceleration. + +### Model Training + +1. Install `MXNet` with GPU support. + +``` +pip install mxnet-cu100 # mxnet-cu102 +``` + +2. Clone the InsightFace repository. We call the directory insightface as *`INSIGHTFACE_ROOT`*. + +``` +git clone --recursive https://github.com/deepinsight/insightface.git +``` + +3. Download the training set (`MS1MV2-Arcface`) and place it in *`$INSIGHTFACE_ROOT/recognition/datasets/`*. Each training dataset includes the following 6 files: + +```Shell + faces_emore/ + train.idx + train.rec + property + lfw.bin + cfp_fp.bin + agedb_30.bin +``` + +The first three files are the training dataset while the last three files are verification sets. + +4. Train deep face recognition models. +In this part, we assume you are in the directory *`$INSIGHTFACE_ROOT/recognition/ArcFace`*. + +Place and edit config file: +```Shell +cp sample_config.py config.py +vim config.py # edit dataset path etc.. +``` + +We give some examples below. Our experiments were conducted on the Tesla P40 GPU. + +(1). Train ArcFace with LResNet100E-IR. + +```Shell +CUDA_VISIBLE_DEVICES='0,1,2,3' python -u train.py --network r100 --loss arcface --dataset emore +``` + +It will output verification results of *LFW*, *CFP-FP* and *AgeDB-30* every 2000 batches. You can check all options in *config.py*. +This model can achieve *LFW 99.80+* and *MegaFace 98.3%+*. + +(2). Train CosineFace with LResNet50E-IR. + +```Shell +CUDA_VISIBLE_DEVICES='0,1,2,3' python -u train.py --network r50 --loss cosface --dataset emore +``` + +(3). Train Softmax with MobileFaceNet. + +```Shell +CUDA_VISIBLE_DEVICES='0,1,2,3' python -u train.py --network y1 --loss softmax --dataset emore +``` + +(4). Fine-turn the above Softmax model with Triplet loss. 
def IOU(Reframe, GTframe):
    """Intersection-over-union of two axis-aligned boxes.

    Boxes are [x1, y1, x2, y2].  Returns 0 when the boxes do not overlap,
    otherwise intersection_area / union_area.
    """
    ax1, ay1 = Reframe[0], Reframe[1]
    aw, ah = Reframe[2] - Reframe[0], Reframe[3] - Reframe[1]
    bx1, by1 = GTframe[0], GTframe[1]
    bw, bh = GTframe[2] - GTframe[0], GTframe[3] - GTframe[1]

    # Overlap extent along each axis: total span minus the enclosing span.
    inter_w = aw + bw - (max(ax1 + aw, bx1 + bw) - min(ax1, bx1))
    inter_h = ah + bh - (max(ay1 + ah, by1 + bh) - min(ay1, by1))

    if inter_w <= 0 or inter_h <= 0:
        return 0
    inter = inter_w * inter_h
    return inter * 1. / (aw * ah + bw * bh - inter)
2.0), 0) + bindex = np.argmax(bounding_box_size - offset_dist_squared * + 2.0) # some extra weight on the centering + #_bbox = bounding_boxes[bindex, 0:4] + _landmark = landmark[bindex] + warped = face_align.norm_crop(im, + landmark=_landmark, + image_size=args.image_size, + mode=args.align_mode) + return warped + else: + return None + + +bins = [] +issame_list = [] +pp = 0 +for line in open(os.path.join(args.data_dir, 'pairs_label.txt'), 'r'): + pp += 1 + if pp % 100 == 0: + print('processing', pp) + line = line.strip().split() + assert len(line) == 3 + path1 = os.path.join(args.data_dir, line[0]) + path2 = os.path.join(args.data_dir, line[1]) + im1 = get_norm_crop(path1) + im2 = get_norm_crop(path2) + issame = True + if line[2] == '0': + issame = False + issame_list.append(issame) + for im in [im1, im2]: + _, s = cv2.imencode('.jpg', im) + bins.append(s) + +with open(args.output, 'wb') as f: + pickle.dump((bins, issame_list), f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/insightface/recognition/arcface_mxnet/common/face_align.py b/insightface/recognition/arcface_mxnet/common/face_align.py new file mode 100644 index 0000000000000000000000000000000000000000..4f48a7691dacb54d1847a748660db0ed02371d63 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/common/face_align.py @@ -0,0 +1,71 @@ +import cv2 +import numpy as np +from skimage import transform as trans + +src1 = np.array([[51.642, 50.115], [57.617, 49.990], [35.740, 69.007], + [51.157, 89.050], [57.025, 89.702]], + dtype=np.float32) +#<--left +src2 = np.array([[45.031, 50.118], [65.568, 50.872], [39.677, 68.111], + [45.177, 86.190], [64.246, 86.758]], + dtype=np.float32) + +#---frontal +src3 = np.array([[39.730, 51.138], [72.270, 51.138], [56.000, 68.493], + [42.463, 87.010], [69.537, 87.010]], + dtype=np.float32) + +#-->right +src4 = np.array([[46.845, 50.872], [67.382, 50.118], [72.737, 68.111], + [48.167, 86.758], [67.236, 86.190]], + dtype=np.float32) + +#-->right profile +src5 = 
np.array([[54.796, 49.990], [60.771, 50.115], [76.673, 69.007], + [55.388, 89.702], [61.257, 89.050]], + dtype=np.float32) + +src = np.array([src1, src2, src3, src4, src5]) +src_map = {112: src, 224: src * 2} + +arcface_src = np.array( + [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366], + [41.5493, 92.3655], [70.7299, 92.2041]], + dtype=np.float32) + +arcface_src = np.expand_dims(arcface_src, axis=0) + +# In[66]: + + +# lmk is prediction; src is template +def estimate_norm(lmk, image_size=112, mode='arcface'): + assert lmk.shape == (5, 2) + tform = trans.SimilarityTransform() + lmk_tran = np.insert(lmk, 2, values=np.ones(5), axis=1) + min_M = [] + min_index = [] + min_error = float('inf') + if mode == 'arcface': + assert image_size == 112 + src = arcface_src + else: + src = src_map[image_size] + for i in np.arange(src.shape[0]): + tform.estimate(lmk, src[i]) + M = tform.params[0:2, :] + results = np.dot(M, lmk_tran.T) + results = results.T + error = np.sum(np.sqrt(np.sum((results - src[i])**2, axis=1))) + # print(error) + if error < min_error: + min_error = error + min_M = M + min_index = i + return min_M, min_index + + +def norm_crop(img, landmark, image_size=112, mode='arcface'): + M, pose_index = estimate_norm(landmark, image_size, mode) + warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0) + return warped diff --git a/insightface/recognition/arcface_mxnet/common/flops_counter.py b/insightface/recognition/arcface_mxnet/common/flops_counter.py new file mode 100644 index 0000000000000000000000000000000000000000..809424142da4db304ac57539f8cc87c827eaea8e --- /dev/null +++ b/insightface/recognition/arcface_mxnet/common/flops_counter.py @@ -0,0 +1,120 @@ +''' +@author: insightface +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import os +import json +import argparse +import numpy as np +import mxnet as mx + + +def is_no_bias(attr): + ret = False + 
if 'no_bias' in attr and (attr['no_bias'] == True + or attr['no_bias'] == 'True'): + ret = True + return ret + + +def count_fc_flops(input_filter, output_filter, attr): + #print(input_filter, output_filter ,attr) + ret = 2 * input_filter * output_filter + if is_no_bias(attr): + ret -= output_filter + return int(ret) + + +def count_conv_flops(input_shape, output_shape, attr): + kernel = attr['kernel'][1:-1].split(',') + kernel = [int(x) for x in kernel] + + #print('kernel', kernel) + if is_no_bias(attr): + ret = (2 * input_shape[1] * kernel[0] * kernel[1] - + 1) * output_shape[2] * output_shape[3] * output_shape[1] + else: + ret = 2 * input_shape[1] * kernel[0] * kernel[1] * output_shape[ + 2] * output_shape[3] * output_shape[1] + num_group = 1 + if 'num_group' in attr: + num_group = int(attr['num_group']) + ret /= num_group + return int(ret) + + +def count_flops(sym, **data_shapes): + all_layers = sym.get_internals() + #print(all_layers) + arg_shapes, out_shapes, aux_shapes = all_layers.infer_shape(**data_shapes) + out_shape_dict = dict(zip(all_layers.list_outputs(), out_shapes)) + + nodes = json.loads(sym.tojson())['nodes'] + nodeid_shape = {} + for nodeid, node in enumerate(nodes): + name = node['name'] + layer_name = name + "_output" + if layer_name in out_shape_dict: + nodeid_shape[nodeid] = out_shape_dict[layer_name] + #print(nodeid_shape) + FLOPs = 0 + for nodeid, node in enumerate(nodes): + flops = 0 + if node['op'] == 'Convolution': + output_shape = nodeid_shape[nodeid] + name = node['name'] + attr = node['attrs'] + input_nodeid = node['inputs'][0][0] + input_shape = nodeid_shape[input_nodeid] + flops = count_conv_flops(input_shape, output_shape, attr) + elif node['op'] == 'FullyConnected': + attr = node['attrs'] + output_shape = nodeid_shape[nodeid] + input_nodeid = node['inputs'][0][0] + input_shape = nodeid_shape[input_nodeid] + output_filter = output_shape[1] + input_filter = input_shape[1] * input_shape[2] * input_shape[3] + #assert len(input_shape)==4 
and input_shape[2]==1 and input_shape[3]==1 + flops = count_fc_flops(input_filter, output_filter, attr) + #print(node, flops) + FLOPs += flops + + return FLOPs + + +def flops_str(FLOPs): + preset = [(1e12, 'T'), (1e9, 'G'), (1e6, 'M'), (1e3, 'K')] + + for p in preset: + if FLOPs // p[0] > 0: + N = FLOPs / p[0] + ret = "%.1f%s" % (N, p[1]) + return ret + ret = "%.1f" % (FLOPs) + return ret + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='flops counter') + # general + #parser.add_argument('--model', default='../models2/y2-arcface-retinat1/model,1', help='path to load model.') + #parser.add_argument('--model', default='../models2/r100fc-arcface-retinaa/model,1', help='path to load model.') + parser.add_argument('--model', + default='../models2/r50fc-arcface-emore/model,1', + help='path to load model.') + args = parser.parse_args() + _vec = args.model.split(',') + assert len(_vec) == 2 + prefix = _vec[0] + epoch = int(_vec[1]) + print('loading', prefix, epoch) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + all_layers = sym.get_internals() + sym = all_layers['fc1_output'] + FLOPs = count_flops(sym, data=(1, 3, 112, 112)) + print('FLOPs:', FLOPs) diff --git a/insightface/recognition/arcface_mxnet/common/rec2image.py b/insightface/recognition/arcface_mxnet/common/rec2image.py new file mode 100644 index 0000000000000000000000000000000000000000..21e5ec4822fc4afe6d317c87bf9b6e47d5eb8051 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/common/rec2image.py @@ -0,0 +1,60 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import sys +import mxnet as mx +from mxnet import ndarray as nd +import random +import argparse +import cv2 +import time +import sklearn +import numpy as np + + +def main(args): + include_datasets = args.include.split(',') + rec_list = [] + for ds in include_datasets: + path_imgrec = os.path.join(ds, 'train.rec') + 
path_imgidx = os.path.join(ds, 'train.idx') + imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') # pylint: disable=redefined-variable-type + rec_list.append(imgrec) + if not os.path.exists(args.output): + os.makedirs(args.output) + for ds_id in range(len(rec_list)): + id_list = [] + imgrec = rec_list[ds_id] + s = imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + assert header.flag > 0 + print('header0 label', header.label) + header0 = (int(header.label[0]), int(header.label[1])) + seq_identity = range(int(header.label[0]), int(header.label[1])) + pp = 0 + for identity in seq_identity: + id_dir = os.path.join(args.output, "%d_%d" % (ds_id, identity)) + os.makedirs(id_dir) + pp += 1 + if pp % 10 == 0: + print('processing id', pp) + s = imgrec.read_idx(identity) + header, _ = mx.recordio.unpack(s) + imgid = 0 + for _idx in range(int(header.label[0]), int(header.label[1])): + s = imgrec.read_idx(_idx) + _header, _img = mx.recordio.unpack(s) + _img = mx.image.imdecode(_img).asnumpy()[:, :, ::-1] # to bgr + image_path = os.path.join(id_dir, "%d.jpg" % imgid) + cv2.imwrite(image_path, _img) + imgid += 1 + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='do dataset merge') + # general + parser.add_argument('--include', default='', type=str, help='') + parser.add_argument('--output', default='', type=str, help='') + args = parser.parse_args() + main(args) diff --git a/insightface/recognition/arcface_mxnet/common/rec2shufrec.py b/insightface/recognition/arcface_mxnet/common/rec2shufrec.py new file mode 100644 index 0000000000000000000000000000000000000000..cf916b4ad4c45a9d59394159fa743f5b0b76b511 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/common/rec2shufrec.py @@ -0,0 +1,72 @@ +import os +import os.path as osp +import sys +import datetime +import glob +import shutil +import numbers +import mxnet as mx +from mxnet import ndarray as nd +from mxnet import io +from mxnet import recordio +import random +import 
argparse +import cv2 +import time +import numpy as np + +def main(args): + ds = args.input + path_imgrec = osp.join(ds, 'train.rec') + path_imgidx = osp.join(ds, 'train.idx') + imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') # pylint: disable=redefined-variable-type + if not osp.exists(args.output): + os.makedirs(args.output) + writer = mx.recordio.MXRecordIO(osp.join(args.output, 'train.rec'), 'w') + s = imgrec.read_idx(0) + header, _ = recordio.unpack(s) + if header.flag > 0: + print('header0 label', header.label) + header0 = (int(header.label[0]), int(header.label[1])) + imgidx = list(range(1, int(header.label[0]))) + else: + imgidx = list(imgrec.keys) + random.shuffle(imgidx) + label_stat = None + print('total images:', len(imgidx)) + for i, idx in enumerate(imgidx): + if i%10000==0: + print('processing', i, idx) + s = imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + if label_stat is None: + label_stat = [label, label] + else: + label_stat[0] = min(label, label_stat[0]) + label_stat[1] = max(label, label_stat[1]) + wheader = mx.recordio.IRHeader(0, label, i, 0) + ws = mx.recordio.pack(wheader, img) + writer.write(ws) + print('label_stat:', label_stat) + writer.close() + if args.copy_vers: + for binfile in glob.glob(osp.join(args.input, '*.bin')): + target_file = osp.join(args.output, binfile.split('/')[-1]) + shutil.copyfile(binfile, target_file) + with open(osp.join(args.output, 'property'), 'w') as f: + f.write("%d,112,112\n"%(int(label_stat[1])+1)) + f.write("%d\n"%len(imgidx)) + f.write("shuffled\n") + f.write("%s\n"%(datetime.datetime.now())) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='convert rec to shuffled rec') + # general + parser.add_argument('--input', default='', type=str, help='') + parser.add_argument('--output', default='', type=str, help='') + parser.add_argument('--copy-vers', 
action='store_true', help='copy verification bins') + args = parser.parse_args() + main(args) diff --git a/insightface/recognition/arcface_mxnet/common/rec_builder.py b/insightface/recognition/arcface_mxnet/common/rec_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..1d517152f85c9d822160c542ce2a6e51aa2660b5 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/common/rec_builder.py @@ -0,0 +1,109 @@ +import os +import sys +import mxnet as mx +from mxnet import ndarray as nd +import random +import argparse +import cv2 +import time +import sklearn +import numpy as np + + +class SeqRecBuilder(): + def __init__(self, path, image_size=(112, 112)): + self.path = path + self.image_size = image_size + self.last_label = -1 + self.widx = 0 + if not os.path.exists(path): + os.makedirs(path) + self.writer = mx.recordio.MXIndexedRecordIO( + os.path.join(path, 'train.idx'), os.path.join(path, 'train.rec'), + 'w') + self.label_stat = [-1, -1] + + def add(self, label, img, is_image=True): + #img should be BGR + #if self.sis: + # assert label>=self.last_label + idx = self.widx + self.widx += 1 + header = mx.recordio.IRHeader(0, label, idx, 0) + if is_image: + s = mx.recordio.pack_img(header, img, quality=95, img_fmt='.jpg') + else: + s = mx.recordio.pack(header, img) + self.writer.write_idx(idx, s) + if self.label_stat[0] < 0: + self.label_stat = [label, label] + else: + self.label_stat[0] = min(self.label_stat[0], label) + self.label_stat[1] = max(self.label_stat[1], label) + + def close(self): + with open(os.path.join(self.path, 'property'), 'w') as f: + f.write("%d,%d,%d\n" % (self.label_stat[1] + 1, self.image_size[0], + self.image_size[1])) + + +class RecBuilder(): + def __init__(self, path, image_size=(112, 112)): + self.path = path + self.image_size = image_size + self.last_label = -1 + self.widx = 1 + if not os.path.exists(path): + os.makedirs(path) + self.writer = mx.recordio.MXIndexedRecordIO( + os.path.join(path, 'train.idx'), 
os.path.join(path, 'train.rec'), + 'w') + self.label_stat = [-1, -1] + self.identities = [] + + def add(self, label, imgs): + #img should be BGR + assert label >= 0 + assert label > self.last_label + assert len(imgs) > 0 + idflag = [self.widx, -1] + for img in imgs: + idx = self.widx + self.widx += 1 + header = mx.recordio.IRHeader(0, label, idx, 0) + if isinstance(img, np.ndarray): + s = mx.recordio.pack_img(header, + img, + quality=95, + img_fmt='.jpg') + else: + s = mx.recordio.pack(header, img) + self.writer.write_idx(idx, s) + idflag[1] = self.widx + self.identities.append(idflag) + if self.label_stat[0] < 0: + self.label_stat = [label, label] + else: + self.label_stat[0] = min(self.label_stat[0], label) + self.label_stat[1] = max(self.label_stat[1], label) + self.last_label = label + + def close(self): + id_idx = self.widx + for id_flag in self.identities: + idx = self.widx + self.widx += 1 + _header = mx.recordio.IRHeader(0, id_flag, idx, 0) + s = mx.recordio.pack(_header, b'') + self.writer.write_idx(idx, s) + + print('id0:', (id_idx, self.widx)) + idx = 0 + _header = mx.recordio.IRHeader(0, (id_idx, self.widx), idx, 1) + s = mx.recordio.pack(_header, b'') + self.writer.write_idx(idx, s) + print('label stat:', self.label_stat) + + with open(os.path.join(self.path, 'property'), 'w') as f: + f.write("%d,%d,%d\n" % (self.label_stat[1] + 1, self.image_size[0], + self.image_size[1])) diff --git a/insightface/recognition/arcface_mxnet/common/verification.py b/insightface/recognition/arcface_mxnet/common/verification.py new file mode 100644 index 0000000000000000000000000000000000000000..f46942a2581b56319af2f057d29bfe6e7694efd4 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/common/verification.py @@ -0,0 +1,423 @@ +"""Helper for evaluation on the Labeled Faces in the Wild dataset +""" + +# MIT License +# +# Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and 
associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import argparse +import sys +import numpy as np +from scipy import misc +from sklearn.model_selection import KFold +from scipy import interpolate +import sklearn +import cv2 +import math +import datetime +import pickle +from sklearn.decomposition import PCA +import mxnet as mx +from mxnet import ndarray as nd + + +class LFold: + def __init__(self, n_splits=2, shuffle=False): + self.n_splits = n_splits + if self.n_splits > 1: + self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle) + + def split(self, indices): + if self.n_splits > 1: + return self.k_fold.split(indices) + else: + return [(indices, indices)] + + +def calculate_roc(thresholds, + embeddings1, + embeddings2, + actual_issame, + nrof_folds=10, + pca=0): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + 
nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + #print('pca', pca) + + if pca == 0: + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + #print('train_set', train_set) + #print('test_set', test_set) + if pca > 0: + print('doing pca on', fold_idx) + embed1_train = embeddings1[train_set] + embed2_train = embeddings2[train_set] + _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) + #print(_embed_train.shape) + pca_model = PCA(n_components=pca) + pca_model.fit(_embed_train) + embed1 = pca_model.transform(embeddings1) + embed2 = pca_model.transform(embeddings2) + embed1 = sklearn.preprocessing.normalize(embed1) + embed2 = sklearn.preprocessing.normalize(embed2) + #print(embed1.shape, embed2.shape) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, dist[train_set], actual_issame[train_set]) + best_threshold_index = np.argmax(acc_train) + #print('threshold', thresholds[best_threshold_index]) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, + threshold_idx], fprs[fold_idx, + threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], + actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + return tpr, fpr, accuracy + + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = 
np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum( + np.logical_and(np.logical_not(predict_issame), + np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) + fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn) + acc = float(tp + tn) / dist.size + return tpr, fpr, acc + + +def calculate_val(thresholds, + embeddings1, + embeddings2, + actual_issame, + far_target, + nrof_folds=10): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + val = np.zeros(nrof_folds) + far = np.zeros(nrof_folds) + + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the threshold that gives FAR = far_target + far_train = np.zeros(nrof_thresholds) + for threshold_idx, threshold in enumerate(thresholds): + _, far_train[threshold_idx] = calculate_val_far( + threshold, dist[train_set], actual_issame[train_set]) + if np.max(far_train) >= far_target: + f = interpolate.interp1d(far_train, thresholds, kind='slinear') + threshold = f(far_target) + else: + threshold = 0.0 + + val[fold_idx], far[fold_idx] = calculate_val_far( + threshold, dist[test_set], actual_issame[test_set]) + + val_mean = np.mean(val) + far_mean = np.mean(far) + val_std = np.std(val) + return val_mean, val_std, far_mean + + +def calculate_val_far(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) + false_accept = np.sum( + np.logical_and(predict_issame, 
np.logical_not(actual_issame))) + n_same = np.sum(actual_issame) + n_diff = np.sum(np.logical_not(actual_issame)) + #print(true_accept, false_accept) + #print(n_same, n_diff) + val = float(true_accept) / float(n_same) + far = float(false_accept) / float(n_diff) + return val, far + + +def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + tpr, fpr, accuracy = calculate_roc(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + nrof_folds=nrof_folds, + pca=pca) + thresholds = np.arange(0, 4, 0.001) + val, val_std, far = calculate_val(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + 1e-3, + nrof_folds=nrof_folds) + return tpr, fpr, accuracy, val, val_std, far + + +def load_bin(path, image_size): + try: + with open(path, 'rb') as f: + bins, issame_list = pickle.load(f) #py2 + except UnicodeDecodeError as e: + with open(path, 'rb') as f: + bins, issame_list = pickle.load(f, encoding='bytes') #py3 + data_list = [] + for flip in [0, 1]: + data = nd.empty( + (len(issame_list) * 2, 3, image_size[0], image_size[1])) + data_list.append(data) + for i in range(len(issame_list) * 2): + _bin = bins[i] + img = mx.image.imdecode(_bin) + if img.shape[1] != image_size[0]: + img = mx.image.resize_short(img, image_size[0]) + img = nd.transpose(img, axes=(2, 0, 1)) + for flip in [0, 1]: + if flip == 1: + img = mx.ndarray.flip(data=img, axis=2) + data_list[flip][i][:] = img + if i % 1000 == 0: + print('loading bin', i) + print(data_list[0].shape) + return (data_list, issame_list) + + +def test(data_set, + mx_model, + batch_size, + nfolds=10, + data_extra=None, + label_shape=None): + print('testing verification..') + data_list = data_set[0] + issame_list = data_set[1] + model = mx_model + embeddings_list = [] + if data_extra is not None: + _data_extra = nd.array(data_extra) + time_consumed = 0.0 + if 
label_shape is None: + _label = nd.ones((batch_size, )) + else: + _label = nd.ones(label_shape) + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + #print(_data.shape, _label.shape) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data, ), label=(_label, )) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), + label=(_label, )) + model.forward(db, is_train=False) + net_out = model.get_outputs() + #_arg, _aux = model.get_params() + #__arg = {} + #for k,v in _arg.iteritems(): + # __arg[k] = v.as_in_context(_ctx) + #_arg = __arg + #_arg["data"] = _data.as_in_context(_ctx) + #_arg["softmax_label"] = _label.as_in_context(_ctx) + #for k,v in _arg.iteritems(): + # print(k,v.context) + #exe = sym.bind(_ctx, _arg ,args_grad=None, grad_req="null", aux_states=_aux) + #exe.forward(is_train=False) + #net_out = exe.outputs + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + #print(_embeddings.shape) + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + + _xnorm = 0.0 + _xnorm_cnt = 0 + for embed in embeddings_list: + for i in range(embed.shape[0]): + _em = embed[i] + _norm = np.linalg.norm(_em) + #print(_em.shape, _norm) + _xnorm += _norm + _xnorm_cnt += 1 + _xnorm /= _xnorm_cnt + + embeddings = embeddings_list[0].copy() + embeddings = sklearn.preprocessing.normalize(embeddings) + acc1 = 0.0 + std1 = 0.0 + #_, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=10) + #acc1, std1 = np.mean(accuracy), np.std(accuracy) + + #print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, 
far)) + #embeddings = np.concatenate(embeddings_list, axis=1) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + print(embeddings.shape) + print('infer time', time_consumed) + _, _, accuracy, val, val_std, far = evaluate(embeddings, + issame_list, + nrof_folds=nfolds) + acc2, std2 = np.mean(accuracy), np.std(accuracy) + return acc1, std1, acc2, std2, _xnorm, embeddings_list + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='do verification') + # general + parser.add_argument('--data-dir', default='', help='') + parser.add_argument('--model', + default='../model/softmax,50', + help='path to load model.') + parser.add_argument('--target', + default='lfw,cfp_ff,cfp_fp,agedb_30', + help='test targets.') + parser.add_argument('--gpu', default=0, type=int, help='gpu id') + parser.add_argument('--batch-size', default=32, type=int, help='') + parser.add_argument('--max', default='', type=str, help='') + parser.add_argument('--mode', default=0, type=int, help='') + parser.add_argument('--nfolds', default=10, type=int, help='') + args = parser.parse_args() + #sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common')) + #import face_image + #prop = face_image.load_property(args.data_dir) + #image_size = prop.image_size + image_size = [112, 112] + print('image_size', image_size) + ctx = mx.gpu(args.gpu) + nets = [] + vec = args.model.split(',') + prefix = args.model.split(',')[0] + epochs = [] + if len(vec) == 1: + pdir = os.path.dirname(prefix) + for fname in os.listdir(pdir): + if not fname.endswith('.params'): + continue + _file = os.path.join(pdir, fname) + if _file.startswith(prefix): + epoch = int(fname.split('.')[0].split('-')[1]) + epochs.append(epoch) + epochs = sorted(epochs, reverse=True) + if len(args.max) > 0: + _max = [int(x) for x in args.max.split(',')] + assert len(_max) == 2 + if len(epochs) > _max[1]: + epochs = epochs[_max[0]:_max[1]] + + else: + 
epochs = [int(x) for x in vec[1].split('|')] + print('model number', len(epochs)) + time0 = datetime.datetime.now() + for epoch in epochs: + print('loading', prefix, epoch) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + #arg_params, aux_params = ch_dev(arg_params, aux_params, ctx) + all_layers = sym.get_internals() + sym = all_layers['fc1_output'] + model = mx.mod.Module(symbol=sym, context=ctx, label_names=None) + #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))]) + model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], + image_size[1]))]) + model.set_params(arg_params, aux_params) + nets.append(model) + time_now = datetime.datetime.now() + diff = time_now - time0 + print('model loading time', diff.total_seconds()) + + ver_list = [] + ver_name_list = [] + for name in args.target.split(','): + path = os.path.join(args.data_dir, name + ".bin") + if os.path.exists(path): + print('loading.. 
', name) + data_set = load_bin(path, image_size) + ver_list.append(data_set) + ver_name_list.append(name) + + if args.mode == 0: + for i in range(len(ver_list)): + results = [] + for model in nets: + acc1, std1, acc2, std2, xnorm, embeddings_list = test( + ver_list[i], model, args.batch_size, args.nfolds) + print('[%s]XNorm: %f' % (ver_name_list[i], xnorm)) + print('[%s]Accuracy: %1.5f+-%1.5f' % + (ver_name_list[i], acc1, std1)) + print('[%s]Accuracy-Flip: %1.5f+-%1.5f' % + (ver_name_list[i], acc2, std2)) + results.append(acc2) + print('Max of [%s] is %1.5f' % (ver_name_list[i], np.max(results))) + elif args.mode == 1: + model = nets[0] + test_badcase(ver_list[0], model, args.batch_size, args.target) + else: + model = nets[0] + dumpR(ver_list[0], model, args.batch_size, args.target) diff --git a/insightface/recognition/arcface_mxnet/image_iter.py b/insightface/recognition/arcface_mxnet/image_iter.py new file mode 100644 index 0000000000000000000000000000000000000000..b1a77bf60dbafd368597c6763fe0ea98fd90cbc6 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/image_iter.py @@ -0,0 +1,367 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random +import logging +import sys +import numbers +import math +import sklearn +import datetime +import numpy as np +import cv2 + +import mxnet as mx +from mxnet import ndarray as nd +from mxnet import io +from mxnet import recordio + +logger = logging.getLogger() + + +class FaceImageIter(io.DataIter): + def __init__(self, + batch_size, + data_shape, + path_imgrec=None, + shuffle=False, + aug_list=None, + mean=None, + rand_mirror=False, + cutoff=0, + color_jittering=0, + images_filter=0, + data_name='data', + label_name='softmax_label', + **kwargs): + super(FaceImageIter, self).__init__() + assert path_imgrec + if path_imgrec: + logging.info('loading recordio %s...', path_imgrec) + path_imgidx = path_imgrec[0:-4] + ".idx" + self.imgrec = 
recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, + 'r') # pylint: disable=redefined-variable-type + s = self.imgrec.read_idx(0) + header, _ = recordio.unpack(s) + if header.flag > 0: + print('header0 label', header.label) + self.header0 = (int(header.label[0]), int(header.label[1])) + #assert(header.flag==1) + #self.imgidx = range(1, int(header.label[0])) + self.imgidx = [] + self.id2range = {} + self.seq_identity = range(int(header.label[0]), + int(header.label[1])) + for identity in self.seq_identity: + s = self.imgrec.read_idx(identity) + header, _ = recordio.unpack(s) + a, b = int(header.label[0]), int(header.label[1]) + count = b - a + if count < images_filter: + continue + self.id2range[identity] = (a, b) + self.imgidx += range(a, b) + print('id2range', len(self.id2range)) + else: + self.imgidx = list(self.imgrec.keys) + if shuffle: + self.seq = self.imgidx + self.oseq = self.imgidx + print(len(self.seq)) + else: + self.seq = None + + self.mean = mean + self.nd_mean = None + if self.mean: + self.mean = np.array(self.mean, dtype=np.float32).reshape(1, 1, 3) + self.nd_mean = mx.nd.array(self.mean).reshape((1, 1, 3)) + + self.check_data_shape(data_shape) + self.provide_data = [(data_name, (batch_size, ) + data_shape)] + self.batch_size = batch_size + self.data_shape = data_shape + self.shuffle = shuffle + self.image_size = '%d,%d' % (data_shape[1], data_shape[2]) + self.rand_mirror = rand_mirror + print('rand_mirror', rand_mirror) + self.cutoff = cutoff + self.color_jittering = color_jittering + self.CJA = mx.image.ColorJitterAug(0.125, 0.125, 0.125) + self.provide_label = [(label_name, (batch_size, ))] + #print(self.provide_label[0][1]) + self.cur = 0 + self.nbatch = 0 + self.is_init = False + + def reset(self): + """Resets the iterator to the beginning of the data.""" + print('call reset()') + self.cur = 0 + if self.shuffle: + random.shuffle(self.seq) + if self.seq is None and self.imgrec is not None: + self.imgrec.reset() + + def num_samples(self): + 
return len(self.seq) + + def next_sample(self): + """Helper function for reading in next sample.""" + #set total batch size, for example, 1800, and maximum size for each people, for example 45 + if self.seq is not None: + while True: + if self.cur >= len(self.seq): + raise StopIteration + idx = self.seq[self.cur] + self.cur += 1 + if self.imgrec is not None: + s = self.imgrec.read_idx(idx) + header, img = recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + return label, img, None, None + else: + label, fname, bbox, landmark = self.imglist[idx] + return label, self.read_image(fname), bbox, landmark + else: + s = self.imgrec.read() + if s is None: + raise StopIteration + header, img = recordio.unpack(s) + return header.label, img, None, None + + def brightness_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + src *= alpha + return src + + def contrast_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = nd.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = (3.0 * (1.0 - alpha) / gray.size) * nd.sum(gray) + src *= alpha + src += gray + return src + + def saturation_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = nd.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = nd.sum(gray, axis=2, keepdims=True) + gray *= (1.0 - alpha) + src *= alpha + src += gray + return src + + def color_aug(self, img, x): + #augs = [self.brightness_aug, self.contrast_aug, self.saturation_aug] + #random.shuffle(augs) + #for aug in augs: + # #print(img.shape) + # img = aug(img, x) + # #print(img.shape) + #return img + return self.CJA(img) + + def mirror_aug(self, img): + _rd = random.randint(0, 1) + if _rd == 1: + for c in range(img.shape[2]): + img[:, :, c] = np.fliplr(img[:, :, c]) + return img + + def compress_aug(self, img): + from PIL import Image + from io import BytesIO + buf = BytesIO() + img = Image.fromarray(img.asnumpy(), 'RGB') + q = random.randint(2, 20) + 
img.save(buf, format='JPEG', quality=q) + buf = buf.getvalue() + img = Image.open(BytesIO(buf)) + return nd.array(np.asarray(img, 'float32')) + + def next(self): + if not self.is_init: + self.reset() + self.is_init = True + """Returns the next batch of data.""" + #print('in next', self.cur, self.labelcur) + self.nbatch += 1 + batch_size = self.batch_size + c, h, w = self.data_shape + batch_data = nd.empty((batch_size, c, h, w)) + if self.provide_label is not None: + batch_label = nd.empty(self.provide_label[0][1]) + i = 0 + try: + while i < batch_size: + label, s, bbox, landmark = self.next_sample() + _data = self.imdecode(s) + if _data.shape[0] != self.data_shape[1]: + _data = mx.image.resize_short(_data, self.data_shape[1]) + if self.rand_mirror: + _rd = random.randint(0, 1) + if _rd == 1: + _data = mx.ndarray.flip(data=_data, axis=1) + if self.color_jittering > 0: + if self.color_jittering > 1: + _rd = random.randint(0, 1) + if _rd == 1: + _data = self.compress_aug(_data) + #print('do color aug') + _data = _data.astype('float32', copy=False) + #print(_data.__class__) + _data = self.color_aug(_data, 0.125) + if self.nd_mean is not None: + _data = _data.astype('float32', copy=False) + _data -= self.nd_mean + _data *= 0.0078125 + if self.cutoff > 0: + _rd = random.randint(0, 1) + if _rd == 1: + #print('do cutoff aug', self.cutoff) + centerh = random.randint(0, _data.shape[0] - 1) + centerw = random.randint(0, _data.shape[1] - 1) + half = self.cutoff // 2 + starth = max(0, centerh - half) + endh = min(_data.shape[0], centerh + half) + startw = max(0, centerw - half) + endw = min(_data.shape[1], centerw + half) + #print(starth, endh, startw, endw, _data.shape) + _data[starth:endh, startw:endw, :] = 128 + data = [_data] + try: + self.check_valid_image(data) + except RuntimeError as e: + logging.debug('Invalid image, skipping: %s', str(e)) + continue + #print('aa',data[0].shape) + #data = self.augmentation_transform(data) + #print('bb',data[0].shape) + for datum in 
data: + assert i < batch_size, 'Batch size must be multiples of augmenter output length' + #print(datum.shape) + batch_data[i][:] = self.postprocess_data(datum) + batch_label[i][:] = label + i += 1 + except StopIteration: + if i < batch_size: + raise StopIteration + + return io.DataBatch([batch_data], [batch_label], batch_size - i) + + def check_data_shape(self, data_shape): + """Checks if the input data shape is valid""" + if not len(data_shape) == 3: + raise ValueError( + 'data_shape should have length 3, with dimensions CxHxW') + if not data_shape[0] == 3: + raise ValueError( + 'This iterator expects inputs to have 3 channels.') + + def check_valid_image(self, data): + """Checks if the input data is valid""" + if len(data[0].shape) == 0: + raise RuntimeError('Data shape is wrong') + + def imdecode(self, s): + """Decodes a string or byte string to an NDArray. + See mx.img.imdecode for more details.""" + img = mx.image.imdecode(s) #mx.ndarray + return img + + def read_image(self, fname): + """Reads an input image `fname` and returns the decoded raw bytes. + + Example usage: + ---------- + >>> dataIter.read_image('Face.jpg') # returns decoded raw bytes. 
+ """ + with open(os.path.join(self.path_root, fname), 'rb') as fin: + img = fin.read() + return img + + def augmentation_transform(self, data): + """Transforms input data with specified augmentation.""" + for aug in self.auglist: + data = [ret for src in data for ret in aug(src)] + return data + + def postprocess_data(self, datum): + """Final postprocessing step before image is loaded into the batch.""" + return nd.transpose(datum, axes=(2, 0, 1)) + + +class FaceImageIterList(io.DataIter): + def __init__(self, iter_list): + assert len(iter_list) > 0 + self.provide_data = iter_list[0].provide_data + self.provide_label = iter_list[0].provide_label + self.iter_list = iter_list + self.cur_iter = None + + def reset(self): + self.cur_iter.reset() + + def next(self): + self.cur_iter = random.choice(self.iter_list) + while True: + try: + ret = self.cur_iter.next() + except StopIteration: + self.cur_iter.reset() + continue + return ret + +def get_face_image_iter(cfg, data_shape, path_imgrec): + print('loading:', path_imgrec, cfg.is_shuffled_rec) + if not cfg.is_shuffled_rec: + train_dataiter = FaceImageIter( + batch_size=cfg.batch_size, + data_shape=data_shape, + path_imgrec=path_imgrec, + shuffle=True, + rand_mirror=cfg.data_rand_mirror, + mean=None, + cutoff=cfg.data_cutoff, + color_jittering=cfg.data_color, + images_filter=cfg.data_images_filter, + ) + train_dataiter = mx.io.PrefetchingIter(train_dataiter) + else: + train_dataiter = mx.io.ImageRecordIter( + path_imgrec = path_imgrec, + data_shape = data_shape, + batch_size = cfg.batch_size, + rand_mirror = cfg.data_rand_mirror, + preprocess_threads = 2, + shuffle = True, + shuffle_chunk_size = 1024, + ) + return train_dataiter + +def test_face_image_iter(path_imgrec): + train_dataiter = mx.io.ImageRecordIter( + path_imgrec = path_imgrec, + data_shape = (3,112,112), + batch_size = 512, + rand_mirror = True, + preprocess_threads = 2, + shuffle = True, + shuffle_chunk_size = 1024, + ) + for batch in train_dataiter: + data 
= batch.data[0].asnumpy() + print(data.shape) + img0 = data[0] + print(img0[0,:5,:5]) + +if __name__ == '__main__': + test_face_image_iter('/train_tmp/ms1mv3shuf/train.rec') + diff --git a/insightface/recognition/arcface_mxnet/metric.py b/insightface/recognition/arcface_mxnet/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..9a64c16e4a4ed6e72e6940f4cbec0ebc06d634e5 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/metric.py @@ -0,0 +1,50 @@ +import numpy as np +import mxnet as mx + + +class AccMetric(mx.metric.EvalMetric): + def __init__(self): + self.axis = 1 + super(AccMetric, self).__init__('acc', + axis=self.axis, + output_names=None, + label_names=None) + self.losses = [] + self.count = 0 + + def update(self, labels, preds): + self.count += 1 + label = labels[0] + pred_label = preds[1] + #print('ACC', label.shape, pred_label.shape) + if pred_label.shape != label.shape: + pred_label = mx.ndarray.argmax(pred_label, axis=self.axis) + pred_label = pred_label.asnumpy().astype('int32').flatten() + label = label.asnumpy() + if label.ndim == 2: + label = label[:, 0] + label = label.astype('int32').flatten() + assert label.shape == pred_label.shape + self.sum_metric += (pred_label.flat == label.flat).sum() + self.num_inst += len(pred_label.flat) + + +class LossValueMetric(mx.metric.EvalMetric): + def __init__(self): + self.axis = 1 + super(LossValueMetric, self).__init__('lossvalue', + axis=self.axis, + output_names=None, + label_names=None) + self.losses = [] + + def update(self, labels, preds): + #label = labels[0].asnumpy() + pred = preds[-1].asnumpy() + #print('in loss', pred.shape) + #print(pred) + loss = pred[0] + self.sum_metric += loss + self.num_inst += 1.0 + #gt_label = preds[-2].asnumpy() + #print(gt_label) diff --git a/insightface/recognition/arcface_mxnet/parall_module_local_v1.py b/insightface/recognition/arcface_mxnet/parall_module_local_v1.py new file mode 100644 index 
0000000000000000000000000000000000000000..394fe2f23f13d7d4fdd05a16152c0ac1947e6fc3 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/parall_module_local_v1.py @@ -0,0 +1,612 @@ +''' +@author: insightface +''' + +import logging +import copy +import time +import os + +import mxnet as mx +import numpy as np +from mxnet import context as ctx +from mxnet.initializer import Uniform +from mxnet.module.base_module import BaseModule +from mxnet.module.module import Module +from mxnet import metric +from mxnet.model import BatchEndParam +from mxnet import io +import mxnet.ndarray as nd +from config import config + + +class ParallModule(BaseModule): + def __init__(self, + symbol, + data_names, + label_names, + logger=logging, + context=ctx.cpu(), + work_load_list=None, + asymbol=None, + args=None): + super(ParallModule, self).__init__(logger=logger) + self._symbol = symbol + self._asymbol = asymbol + self._data_names = data_names + self._label_names = label_names + self._context = context + self._work_load_list = work_load_list + self._num_classes = config.num_classes + self._batch_size = args.batch_size + self._verbose = args.verbose + self._emb_size = config.emb_size + self._local_class_start = args.local_class_start + self._iter = 0 + + self._curr_module = None + + self._num_workers = config.num_workers + self._num_ctx = len(self._context) + self._ctx_num_classes = args.ctx_num_classes + self._nd_cache = {} + self._ctx_cpu = mx.cpu() + self._ctx_single_gpu = self._context[-1] + self._fixed_param_names = None + self._curr_module = Module(self._symbol, + self._data_names, + self._label_names, + logger=self.logger, + context=self._context, + work_load_list=self._work_load_list, + fixed_param_names=self._fixed_param_names) + self._arcface_modules = [] + self._ctx_class_start = [] + for i in range(len(self._context)): + + args._ctxid = i + _module = Module(self._asymbol(args), + self._data_names, + self._label_names, + logger=self.logger, + context=mx.gpu(i), + 
work_load_list=self._work_load_list, + fixed_param_names=self._fixed_param_names) + self._arcface_modules.append(_module) + _c = args.local_class_start + i * args.ctx_num_classes + self._ctx_class_start.append(_c) + self._usekv = False + if self._usekv: + self._distkv = mx.kvstore.create('dist_sync') + self._kvinit = {} + + def _reset_bind(self): + self.binded = False + self._curr_module = None + + @property + def data_names(self): + return self._data_names + + @property + def output_names(self): + return self._symbol.list_outputs() + + @property + def data_shapes(self): + assert self.binded + return self._curr_module.data_shapes + + @property + def label_shapes(self): + assert self.binded + return self._curr_module.label_shapes + + @property + def output_shapes(self): + assert self.binded + return self._curr_module.output_shapes + + def get_export_params(self): + assert self.binded and self.params_initialized + _g, _x = self._curr_module.get_params() + g = _g.copy() + x = _x.copy() + return g, x + + def get_params(self): + assert self.binded and self.params_initialized + _g, _x = self._curr_module.get_params() + g = _g.copy() + x = _x.copy() + for _module in self._arcface_modules: + _g, _x = _module.get_params() + ag = _g.copy() + ax = _x.copy() + g.update(ag) + x.update(ax) + return g, x + + def set_params(self, + arg_params, + aux_params, + allow_missing=False, + force_init=True, + allow_extra=False): + g = arg_params + x = aux_params + #ag = {} + #ax = {} + rk = [] + for k in g: + v = g[k] + if k.startswith('fc7'): + p1 = k.find('_') + p2 = k.rfind('_') + _ctxid = int(k[p1 + 1:p2]) + self._arcface_modules[_ctxid].set_params({k: v}, {}) + rk.append(k) + for k in rk: + del g[k] + self._curr_module.set_params(g, x) + #self._arcface_module.set_params(ag, ax) + + def init_params(self, + initializer=Uniform(0.01), + arg_params=None, + aux_params=None, + allow_missing=False, + force_init=False, + allow_extra=False): + if self.params_initialized and not force_init: + 
return + assert self.binded, 'call bind before initializing the parameters' + #TODO init the same weights with all work nodes + self._curr_module.init_params(initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + allow_missing=allow_missing, + force_init=force_init, + allow_extra=allow_extra) + for _module in self._arcface_modules: + #_initializer = initializer + _initializer = mx.init.Normal(0.01) + _module.init_params(initializer=_initializer, + arg_params=None, + aux_params=None, + allow_missing=allow_missing, + force_init=force_init, + allow_extra=allow_extra) + self.params_initialized = True + + def bind(self, + data_shapes, + label_shapes=None, + for_training=True, + inputs_need_grad=False, + force_rebind=False, + shared_module=None): + print('in_bind', self.params_initialized, data_shapes, label_shapes) + if self.params_initialized: + arg_params, aux_params = self.get_params() + + # force rebinding is typically used when one want to switch from + # training to prediction phase. 
+ if force_rebind: + self._reset_bind() + + if self.binded: + self.logger.warning('Already binded, ignoring bind()') + return + + assert shared_module is None, 'shared_module for MutableModule is not supported' + self.for_training = for_training + self.inputs_need_grad = inputs_need_grad + self.binded = True + self._curr_module.bind(data_shapes, + label_shapes, + for_training, + inputs_need_grad, + force_rebind=False, + shared_module=None) + _data_shape = data_shapes[0][1] + print('_data_shape', _data_shape, label_shapes) + for _module in self._arcface_modules: + _module.bind( + [('data', + (_data_shape[0] * self._num_workers, self._emb_size))], + [('softmax_label', (_data_shape[0] * self._num_workers, ))], + for_training, + True, + force_rebind=False, + shared_module=None) + if self.params_initialized: + self.set_params(arg_params, aux_params) + + def init_optimizer(self, + kvstore='local', + optimizer='sgd', + optimizer_params=(('learning_rate', 0.01), ), + force_init=False): + assert self.binded and self.params_initialized + if self.optimizer_initialized and not force_init: + self.logger.warning('optimizer already initialized, ignoring.') + return + + self._curr_module.init_optimizer(kvstore, + optimizer[0], + optimizer_params, + force_init=force_init) + for _module in self._arcface_modules: + _module.init_optimizer(kvstore, + optimizer[1], + optimizer_params, + force_init=force_init) + self.optimizer_initialized = True + + def kv_push(self, key, value): + #if value.context!=mx.cpu(): + # value = value.as_in_context(mx.cpu()) + if not key in self._kvinit: + self._distkv.init(key, nd.zeros_like(value)) + self._kvinit[key] = 1 + self._distkv.push(key, value) + + #get fc1 and partial fc7 + def forward(self, data_batch, is_train=None): + #g,x = self.get_params() + #print('{fc7_weight[0][0]}', self._iter, g['fc7_0_weight'].asnumpy()[0][0]) + #print('{pre_fc1_weight[0][0]}', self._iter, g['pre_fc1_weight'].asnumpy()[0][0]) + + assert self.binded and 
self.params_initialized + self._curr_module.forward(data_batch, is_train=is_train) + if is_train: + self._iter += 1 + fc1, label = self._curr_module.get_outputs( + merge_multi_context=True) + global_fc1 = fc1 + self.global_label = label.as_in_context(self._ctx_cpu) + + for i, _module in enumerate(self._arcface_modules): + _label = self.global_label - self._ctx_class_start[i] + db_global_fc1 = io.DataBatch([global_fc1], [_label]) + _module.forward(db_global_fc1) #fc7 with margin + #print('forward end') + + def get_ndarray(self, context, name, shape): + key = "%s_%s" % (name, context) + #print(key) + if not key in self._nd_cache: + v = nd.zeros(shape=shape, ctx=context) + self._nd_cache[key] = v + else: + v = self._nd_cache[key] + return v + + def get_ndarray2(self, context, name, arr): + key = "%s_%s" % (name, context) + #print(key) + if not key in self._nd_cache: + v = nd.zeros(shape=arr.shape, ctx=context) + self._nd_cache[key] = v + else: + v = self._nd_cache[key] + arr.copyto(v) + return v + + def backward(self, out_grads=None): + #print('in backward') + assert self.binded and self.params_initialized + #tmp_ctx = self._ctx_cpu + tmp_ctx = self._ctx_single_gpu + fc7_outs = [] + ctx_fc7_max = self.get_ndarray(tmp_ctx, 'ctx_fc7_max', + (self._batch_size, len(self._context))) + #local_fc7_max = nd.zeros( (self.global_label.shape[0],1), ctx=mx.cpu()) + for i, _module in enumerate(self._arcface_modules): + _fc7 = _module.get_outputs(merge_multi_context=True)[0] + fc7_outs.append(_fc7) + _fc7_max = nd.max(_fc7, axis=1).as_in_context(tmp_ctx) + ctx_fc7_max[:, i] = _fc7_max + + local_fc7_max = self.get_ndarray(tmp_ctx, 'local_fc7_max', + (self._batch_size, 1)) + nd.max(ctx_fc7_max, axis=1, keepdims=True, out=local_fc7_max) + global_fc7_max = local_fc7_max + #local_fc7_sum = None + local_fc7_sum = self.get_ndarray(tmp_ctx, 'local_fc7_sum', + (self._batch_size, 1)) + local_fc7_sum[:, :] = 0.0 + for i, _module in enumerate(self._arcface_modules): + _max = 
self.get_ndarray2(fc7_outs[i].context, 'fc7_max', + global_fc7_max) + fc7_outs[i] = nd.broadcast_sub(fc7_outs[i], _max) + fc7_outs[i] = nd.exp(fc7_outs[i]) + _sum = nd.sum(fc7_outs[i], axis=1, + keepdims=True).as_in_context(tmp_ctx) + local_fc7_sum += _sum + global_fc7_sum = local_fc7_sum + + if self._iter % self._verbose == 0: + #_ctx = self._context[-1] + _ctx = self._ctx_cpu + _probs = [] + for i, _module in enumerate(self._arcface_modules): + _prob = self.get_ndarray2(_ctx, '_fc7_prob_%d' % i, + fc7_outs[i]) + _probs.append(_prob) + fc7_prob = self.get_ndarray( + _ctx, 'test_fc7_prob', + (self._batch_size, self._ctx_num_classes * len(self._context))) + nd.concat(*_probs, dim=1, out=fc7_prob) + fc7_pred = nd.argmax(fc7_prob, axis=1) + local_label = self.global_label - self._local_class_start + #local_label = self.get_ndarray2(_ctx, 'test_label', local_label) + _pred = nd.equal(fc7_pred, local_label) + print('{fc7_acc}', self._iter, nd.mean(_pred).asnumpy()[0]) + + #local_fc1_grad = [] + #fc1_grad_ctx = self._ctx_cpu + fc1_grad_ctx = self._ctx_single_gpu + local_fc1_grad = self.get_ndarray(fc1_grad_ctx, 'local_fc1_grad', + (self._batch_size, self._emb_size)) + local_fc1_grad[:, :] = 0.0 + + for i, _module in enumerate(self._arcface_modules): + _sum = self.get_ndarray2(fc7_outs[i].context, 'fc7_sum', + global_fc7_sum) + fc7_outs[i] = nd.broadcast_div(fc7_outs[i], _sum) + a = i * self._ctx_num_classes + b = (i + 1) * self._ctx_num_classes + _label = self.global_label - self._ctx_class_start[i] + _label = self.get_ndarray2(fc7_outs[i].context, 'label', _label) + onehot_label = self.get_ndarray( + fc7_outs[i].context, 'label_onehot', + (self._batch_size, self._ctx_num_classes)) + nd.one_hot(_label, + depth=self._ctx_num_classes, + on_value=1.0, + off_value=0.0, + out=onehot_label) + fc7_outs[i] -= onehot_label + _module.backward(out_grads=[fc7_outs[i]]) + #ctx_fc1_grad = _module.get_input_grads()[0].as_in_context(mx.cpu()) + ctx_fc1_grad = 
self.get_ndarray2(fc1_grad_ctx, + 'ctx_fc1_grad_%d' % i, + _module.get_input_grads()[0]) + local_fc1_grad += ctx_fc1_grad + + global_fc1_grad = local_fc1_grad + self._curr_module.backward(out_grads=[global_fc1_grad]) + + def update(self): + assert self.binded and self.params_initialized and self.optimizer_initialized + self._curr_module.update() + for i, _module in enumerate(self._arcface_modules): + _module.update() + mx.nd.waitall() + + def get_outputs(self, merge_multi_context=True): + assert self.binded and self.params_initialized + return self._curr_module.get_outputs( + merge_multi_context=merge_multi_context) + #return self._arcface_module.get_outputs(merge_multi_context=merge_multi_context) + + def get_input_grads(self, merge_multi_context=True): + assert self.binded and self.params_initialized and self.inputs_need_grad + return self._curr_module.get_input_grads( + merge_multi_context=merge_multi_context) + + def update_metric(self, eval_metric, labels): + assert self.binded and self.params_initialized + #self._curr_module.update_metric(eval_metric, labels) + #label = labels[0] + #print(label.shape) + #self._arcface_module.update_metric(eval_metric, labels) + + def install_monitor(self, mon): + """ Install monitor on all executors """ + assert self.binded + self._curr_module.install_monitor(mon) + + def forward_backward(self, data_batch): + """A convenient function that calls both ``forward`` and ``backward``.""" + self.forward(data_batch, is_train=True) # get fc1 and partial fc7 + self.backward() + + def fit(self, + train_data, + eval_data=None, + eval_metric='acc', + epoch_end_callback=None, + batch_end_callback=None, + kvstore='local', + optimizer='sgd', + optimizer_params=(('learning_rate', 0.01), ), + eval_end_callback=None, + eval_batch_end_callback=None, + initializer=Uniform(0.01), + arg_params=None, + aux_params=None, + allow_missing=False, + force_rebind=False, + force_init=False, + begin_epoch=0, + num_epoch=None, + validation_metric=None, + 
monitor=None, + sparse_row_id_fn=None): + """Trains the module parameters. + + Checkout `Module Tutorial `_ to see + a end-to-end use-case. + + Parameters + ---------- + train_data : DataIter + Train DataIter. + eval_data : DataIter + If not ``None``, will be used as validation set and the performance + after each epoch will be evaluated. + eval_metric : str or EvalMetric + Defaults to 'accuracy'. The performance measure used to display during training. + Other possible predefined metrics are: + 'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'. + epoch_end_callback : function or list of functions + Each callback will be called with the current `epoch`, `symbol`, `arg_params` + and `aux_params`. + batch_end_callback : function or list of function + Each callback will be called with a `BatchEndParam`. + kvstore : str or KVStore + Defaults to 'local'. + optimizer : str or Optimizer + Defaults to 'sgd'. + optimizer_params : dict + Defaults to ``(('learning_rate', 0.01),)``. The parameters for + the optimizer constructor. + The default value is not a dict, just to avoid pylint warning on dangerous + default values. + eval_end_callback : function or list of function + These will be called at the end of each full evaluation, with the metrics over + the entire evaluation set. + eval_batch_end_callback : function or list of function + These will be called at the end of each mini-batch during evaluation. + initializer : Initializer + The initializer is called to initialize the module parameters when they are + not already initialized. + arg_params : dict + Defaults to ``None``, if not ``None``, should be existing parameters from a trained + model or loaded from a checkpoint (previously saved model). In this case, + the value here will be used to initialize the module parameters, unless they + are already initialized by the user via a call to `init_params` or `fit`. + `arg_params` has a higher priority than `initializer`. 
+ aux_params : dict + Defaults to ``None``. Similar to `arg_params`, except for auxiliary states. + allow_missing : bool + Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params` + and `aux_params` are not ``None``. If this is ``True``, then the missing parameters + will be initialized via the `initializer`. + force_rebind : bool + Defaults to ``False``. Whether to force rebinding the executors if already bound. + force_init : bool + Defaults to ``False``. Indicates whether to force initialization even if the + parameters are already initialized. + begin_epoch : int + Defaults to 0. Indicates the starting epoch. Usually, if resumed from a + checkpoint saved at a previous training phase at epoch N, then this value should be + N+1. + num_epoch : int + Number of epochs for training. + sparse_row_id_fn : A callback function + The function takes `data_batch` as an input and returns a dict of + str -> NDArray. The resulting dict is used for pulling row_sparse + parameters from the kvstore, where the str key is the name of the param, + and the value is the row id of the param to pull. + + Examples + -------- + >>> # An example of using fit for training. + >>> # Assume training dataIter and validation dataIter are ready + >>> # Assume loading a previously checkpointed model + >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3) + >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer='sgd', + ... optimizer_params={'learning_rate':0.01, 'momentum': 0.9}, + ... arg_params=arg_params, aux_params=aux_params, + ... 
eval_metric='acc', num_epoch=10, begin_epoch=3) + """ + assert num_epoch is not None, 'please specify number of epochs' + assert arg_params is None and aux_params is None + + self.bind(data_shapes=train_data.provide_data, + label_shapes=train_data.provide_label, + for_training=True, + force_rebind=force_rebind) + if monitor is not None: + self.install_monitor(monitor) + self.init_params(initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + allow_missing=allow_missing, + force_init=force_init) + self.init_optimizer(kvstore=kvstore, + optimizer=optimizer, + optimizer_params=optimizer_params) + + if validation_metric is None: + validation_metric = eval_metric + if not isinstance(eval_metric, metric.EvalMetric): + eval_metric = metric.create(eval_metric) + epoch_eval_metric = copy.deepcopy(eval_metric) + + ################################################################################ + # training loop + ################################################################################ + for epoch in range(begin_epoch, num_epoch): + tic = time.time() + eval_metric.reset() + epoch_eval_metric.reset() + nbatch = 0 + data_iter = iter(train_data) + end_of_batch = False + next_data_batch = next(data_iter) + while not end_of_batch: + data_batch = next_data_batch + if monitor is not None: + monitor.tic() + self.forward_backward(data_batch) + self.update() + assert not isinstance(data_batch, list) + + #if isinstance(data_batch, list): + # #print('XXX') + # self.update_metric(eval_metric, + # [db.label for db in data_batch], + # pre_sliced=True) + # self.update_metric(epoch_eval_metric, + # [db.label for db in data_batch], + # pre_sliced=True) + #else: + # #print('before update metric') + # self.update_metric(eval_metric, data_batch.label) + # self.update_metric(epoch_eval_metric, data_batch.label) + #labels = data_batch.label + #labels = [self.global_label] + #self.update_metric(eval_metric, labels) + #self.update_metric(epoch_eval_metric, labels) + + 
try: + # pre fetch next batch + next_data_batch = next(data_iter) + self.prepare(next_data_batch, + sparse_row_id_fn=sparse_row_id_fn) + except StopIteration: + end_of_batch = True + + if monitor is not None: + monitor.toc_print() + + #if end_of_batch: + # eval_name_vals = epoch_eval_metric.get_name_value() + + if batch_end_callback is not None: + batch_end_params = BatchEndParam(epoch=epoch, + nbatch=nbatch, + eval_metric=None, + locals=locals()) + batch_end_callback(batch_end_params) + #for callback in _as_list(batch_end_callback): + # callback(batch_end_params) + nbatch += 1 + + # one epoch of training is finished + #for name, val in eval_name_vals: + # self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) + toc = time.time() + self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) + + # sync aux params across devices + arg_params, aux_params = self.get_params() + self.set_params(arg_params, aux_params) + + # end of 1 epoch, reset the data-iter for another epoch + train_data.reset() diff --git a/insightface/recognition/arcface_mxnet/sample_config.py b/insightface/recognition/arcface_mxnet/sample_config.py new file mode 100644 index 0000000000000000000000000000000000000000..804fb3204450637a002116cf469164993e05ff6f --- /dev/null +++ b/insightface/recognition/arcface_mxnet/sample_config.py @@ -0,0 +1,224 @@ +import numpy as np +import os +from easydict import EasyDict as edict + +config = edict() + +config.bn_mom = 0.9 +config.workspace = 256 +config.emb_size = 512 +config.ckpt_embedding = True +config.net_se = 0 +config.net_act = 'prelu' +config.net_unit = 3 +config.net_input = 1 +config.net_blocks = [1, 4, 6, 2] +config.net_output = 'E' +config.net_multiplier = 1.0 +config.val_targets = ['lfw', 'cfp_fp', 'agedb_30'] +config.ce_loss = True +config.fc7_lr_mult = 1.0 +config.fc7_wd_mult = 1.0 +config.fc7_no_bias = False +config.max_steps = 0 +config.data_rand_mirror = True +config.data_cutoff = False +config.data_color = 0 +config.data_images_filter 
= 0 +config.count_flops = True +config.memonger = False #not work now +config.is_shuffled_rec = False + +config.fp16 = False + +# network settings +network = edict() + +network.r100 = edict() +network.r100.net_name = 'fresnet' +network.r100.num_layers = 100 + +network.r100fc = edict() +network.r100fc.net_name = 'fresnet' +network.r100fc.num_layers = 100 +network.r100fc.net_output = 'FC' + +network.r50 = edict() +network.r50.net_name = 'fresnet' +network.r50.num_layers = 50 + +network.r50v1 = edict() +network.r50v1.net_name = 'fresnet' +network.r50v1.num_layers = 50 +network.r50v1.net_unit = 1 + +network.d169 = edict() +network.d169.net_name = 'fdensenet' +network.d169.num_layers = 169 +network.d169.per_batch_size = 64 +network.d169.densenet_dropout = 0.0 + +network.d201 = edict() +network.d201.net_name = 'fdensenet' +network.d201.num_layers = 201 +network.d201.per_batch_size = 64 +network.d201.densenet_dropout = 0.0 + +network.y1 = edict() +network.y1.net_name = 'fmobilefacenet' +network.y1.emb_size = 128 +network.y1.net_output = 'GDC' + +network.y2 = edict() +network.y2.net_name = 'fmobilefacenet' +network.y2.emb_size = 256 +network.y2.net_output = 'GDC' +network.y2.net_blocks = [2, 8, 16, 4] + +network.m1 = edict() +network.m1.net_name = 'fmobilenet' +network.m1.emb_size = 256 +network.m1.net_output = 'GDC' +network.m1.net_multiplier = 1.0 + +network.m05 = edict() +network.m05.net_name = 'fmobilenet' +network.m05.emb_size = 256 +network.m05.net_output = 'GDC' +network.m05.net_multiplier = 0.5 + +network.mnas = edict() +network.mnas.net_name = 'fmnasnet' +network.mnas.emb_size = 256 +network.mnas.net_output = 'GDC' +network.mnas.net_multiplier = 1.0 + +network.mnas05 = edict() +network.mnas05.net_name = 'fmnasnet' +network.mnas05.emb_size = 256 +network.mnas05.net_output = 'GDC' +network.mnas05.net_multiplier = 0.5 + +network.mnas025 = edict() +network.mnas025.net_name = 'fmnasnet' +network.mnas025.emb_size = 256 +network.mnas025.net_output = 'GDC' 
+network.mnas025.net_multiplier = 0.25 + +network.vargfacenet = edict() +network.vargfacenet.net_name = 'vargfacenet' +network.vargfacenet.net_multiplier = 1.25 +network.vargfacenet.emb_size = 512 +network.vargfacenet.net_output = 'J' + +# dataset settings +dataset = edict() + +dataset.emore = edict() +dataset.emore.dataset = 'emore' +dataset.emore.dataset_path = '../datasets/faces_emore' +dataset.emore.num_classes = 85742 +dataset.emore.image_shape = (112, 112, 3) +dataset.emore.val_targets = ['lfw', 'cfp_fp', 'agedb_30'] + +dataset.retina = edict() +dataset.retina.dataset = 'retina' +dataset.retina.dataset_path = '../datasets/ms1m-retinaface-t1' +dataset.retina.num_classes = 93431 +dataset.retina.image_shape = (112, 112, 3) +dataset.retina.val_targets = ['lfw', 'cfp_fp', 'agedb_30'] + +loss = edict() +loss.softmax = edict() +loss.softmax.loss_name = 'softmax' + +loss.nsoftmax = edict() +loss.nsoftmax.loss_name = 'margin_softmax' +loss.nsoftmax.loss_s = 64.0 +loss.nsoftmax.loss_m1 = 1.0 +loss.nsoftmax.loss_m2 = 0.0 +loss.nsoftmax.loss_m3 = 0.0 + +loss.arcface = edict() +loss.arcface.loss_name = 'margin_softmax' +loss.arcface.loss_s = 64.0 +loss.arcface.loss_m1 = 1.0 +loss.arcface.loss_m2 = 0.5 +loss.arcface.loss_m3 = 0.0 + +loss.cosface = edict() +loss.cosface.loss_name = 'margin_softmax' +loss.cosface.loss_s = 64.0 +loss.cosface.loss_m1 = 1.0 +loss.cosface.loss_m2 = 0.0 +loss.cosface.loss_m3 = 0.35 + +loss.combined = edict() +loss.combined.loss_name = 'margin_softmax' +loss.combined.loss_s = 64.0 +loss.combined.loss_m1 = 1.0 +loss.combined.loss_m2 = 0.3 +loss.combined.loss_m3 = 0.2 + +loss.triplet = edict() +loss.triplet.loss_name = 'triplet' +loss.triplet.images_per_identity = 5 +loss.triplet.triplet_alpha = 0.3 +loss.triplet.triplet_bag_size = 7200 +loss.triplet.triplet_max_ap = 0.0 +loss.triplet.per_batch_size = 60 +loss.triplet.lr = 0.05 + +loss.atriplet = edict() +loss.atriplet.loss_name = 'atriplet' +loss.atriplet.images_per_identity = 5 
+loss.atriplet.triplet_alpha = 0.35 +loss.atriplet.triplet_bag_size = 7200 +loss.atriplet.triplet_max_ap = 0.0 +loss.atriplet.per_batch_size = 60 +loss.atriplet.lr = 0.05 + +# default settings +default = edict() + +# default network +default.network = 'r100' +default.pretrained = '' +default.pretrained_epoch = 1 +# default dataset +default.dataset = 'emore' +default.loss = 'arcface' +default.frequent = 20 +default.verbose = 2000 +default.kvstore = 'device' + +default.end_epoch = 10000 +default.lr = 0.1 +default.wd = 0.0005 +default.mom = 0.9 +default.per_batch_size = 128 +default.ckpt = 3 +default.lr_steps = '100000,160000,220000' +default.models_root = './models' + + +def generate_config(_network, _dataset, _loss): + for k, v in loss[_loss].items(): + config[k] = v + if k in default: + default[k] = v + for k, v in network[_network].items(): + config[k] = v + if k in default: + default[k] = v + for k, v in dataset[_dataset].items(): + config[k] = v + if k in default: + default[k] = v + config.loss = _loss + config.network = _network + config.dataset = _dataset + config.num_workers = 1 + config.fp16 = False + if 'DMLC_NUM_WORKER' in os.environ: + config.num_workers = int(os.environ['DMLC_NUM_WORKER']) diff --git a/insightface/recognition/arcface_mxnet/symbol/fdensenet.py b/insightface/recognition/arcface_mxnet/symbol/fdensenet.py new file mode 100644 index 0000000000000000000000000000000000000000..b3d49ee876f85dc48645543d5d8ad1170928566d --- /dev/null +++ b/insightface/recognition/arcface_mxnet/symbol/fdensenet.py @@ -0,0 +1,169 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable= arguments-differ +"""DenseNet, implemented in Gluon.""" + +import sys +import os +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import symbol_utils +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Act(): + if config.net_act == 'prelu': + return nn.PReLU() + else: + return nn.Activation(config.net_act) + + +# Helpers +def _make_dense_block(num_layers, bn_size, growth_rate, dropout, stage_index): + out = nn.HybridSequential(prefix='stage%d_' % stage_index) + with out.name_scope(): + for _ in range(num_layers): + out.add(_make_dense_layer(growth_rate, bn_size, dropout)) + return out + + +def _make_dense_layer(growth_rate, bn_size, dropout): + new_features = nn.HybridSequential(prefix='') + new_features.add(nn.BatchNorm()) + #new_features.add(nn.Activation('relu')) + new_features.add(Act()) + new_features.add( + nn.Conv2D(bn_size * growth_rate, kernel_size=1, use_bias=False)) + new_features.add(nn.BatchNorm()) + #new_features.add(nn.Activation('relu')) + new_features.add(Act()) + new_features.add( + nn.Conv2D(growth_rate, kernel_size=3, padding=1, use_bias=False)) + if dropout: + new_features.add(nn.Dropout(dropout)) + + out = gluon.contrib.nn.HybridConcurrent(axis=1, prefix='') + out.add(gluon.contrib.nn.Identity()) + out.add(new_features) + + return out + + +def _make_transition(num_output_features): + out = nn.HybridSequential(prefix='') + out.add(nn.BatchNorm()) + 
#out.add(nn.Activation('relu')) + out.add(Act()) + out.add(nn.Conv2D(num_output_features, kernel_size=1, use_bias=False)) + out.add(nn.AvgPool2D(pool_size=2, strides=2)) + return out + + +# Net +class DenseNet(nn.HybridBlock): + r"""Densenet-BC model from the + `"Densely Connected Convolutional Networks" `_ paper. + + Parameters + ---------- + num_init_features : int + Number of filters to learn in the first convolution layer. + growth_rate : int + Number of filters to add each layer (`k` in the paper). + block_config : list of int + List of integers for numbers of layers in each pooling block. + bn_size : int, default 4 + Multiplicative factor for number of bottle neck layers. + (i.e. bn_size * k features in the bottleneck layer) + dropout : float, default 0 + Rate of dropout after each dense layer. + classes : int, default 1000 + Number of classification classes. + """ + def __init__(self, + num_init_features, + growth_rate, + block_config, + bn_size=4, + dropout=0, + classes=1000, + **kwargs): + + super(DenseNet, self).__init__(**kwargs) + with self.name_scope(): + self.features = nn.HybridSequential(prefix='') + self.features.add( + nn.Conv2D(num_init_features, + kernel_size=3, + strides=1, + padding=1, + use_bias=False)) + self.features.add(nn.BatchNorm()) + self.features.add(nn.Activation('relu')) + self.features.add(nn.MaxPool2D(pool_size=3, strides=2, padding=1)) + # Add dense blocks + num_features = num_init_features + for i, num_layers in enumerate(block_config): + self.features.add( + _make_dense_block(num_layers, bn_size, growth_rate, + dropout, i + 1)) + num_features = num_features + num_layers * growth_rate + if i != len(block_config) - 1: + self.features.add(_make_transition(num_features // 2)) + num_features = num_features // 2 + self.features.add(nn.BatchNorm()) + self.features.add(nn.Activation('relu')) + #self.features.add(nn.AvgPool2D(pool_size=7)) + #self.features.add(nn.Flatten()) + + #self.output = nn.Dense(classes) + + def 
hybrid_forward(self, F, x): + x = self.features(x) + #x = self.output(x) + return x + + +# Specification +densenet_spec = { + 121: (64, 32, [6, 12, 24, 16]), + 161: (96, 48, [6, 12, 36, 24]), + 169: (64, 32, [6, 12, 32, 32]), + 201: (64, 32, [6, 12, 48, 32]) +} + + +# Constructor +def get_symbol(): + num_layers = config.num_layers + num_init_features, growth_rate, block_config = densenet_spec[num_layers] + net = DenseNet(num_init_features, + growth_rate, + block_config, + dropout=config.densenet_dropout) + data = mx.sym.Variable(name='data') + data = data - 127.5 + data = data * 0.0078125 + body = net(data) + fc1 = symbol_utils.get_fc1(body, config.emb_size, config.net_output) + return fc1 diff --git a/insightface/recognition/arcface_mxnet/symbol/fmnasnet.py b/insightface/recognition/arcface_mxnet/symbol/fmnasnet.py new file mode 100644 index 0000000000000000000000000000000000000000..118beb94298cbaaef43811fdc4154bdb47a66bd9 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/symbol/fmnasnet.py @@ -0,0 +1,213 @@ +import sys +import os +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import symbol_utils +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Act(): + if config.net_act == 'prelu': + return nn.PReLU() + else: + return nn.Activation(config.net_act) + + +def ConvBlock(channels, kernel_size, strides, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, + kernel_size, + strides=strides, + padding=1, + use_bias=False), nn.BatchNorm(scale=True), + Act() + #nn.Activation('relu') + ) + return out + + +def Conv1x1(channels, is_linear=False, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add(nn.Conv2D(channels, 1, padding=0, use_bias=False), + nn.BatchNorm(scale=True)) + if not is_linear: + #out.add(nn.Activation('relu')) + out.add(Act()) + 
return out + + +def DWise(channels, strides, kernel_size=3, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, + kernel_size, + strides=strides, + padding=kernel_size // 2, + groups=channels, + use_bias=False), nn.BatchNorm(scale=True), + Act() + #nn.Activation('relu') + ) + return out + + +class SepCONV(nn.HybridBlock): + def __init__(self, + inp, + output, + kernel_size, + depth_multiplier=1, + with_bn=True, + **kwargs): + super(SepCONV, self).__init__(**kwargs) + with self.name_scope(): + self.net = nn.HybridSequential() + cn = int(inp * depth_multiplier) + + if output is None: + self.net.add( + nn.Conv2D(in_channels=inp, + channels=cn, + groups=inp, + kernel_size=kernel_size, + strides=(1, 1), + padding=kernel_size // 2, + use_bias=not with_bn)) + else: + self.net.add( + nn.Conv2D(in_channels=inp, + channels=cn, + groups=inp, + kernel_size=kernel_size, + strides=(1, 1), + padding=kernel_size // 2, + use_bias=False), + nn.BatchNorm(), + Act(), + #nn.Activation('relu'), + nn.Conv2D(in_channels=cn, + channels=output, + kernel_size=(1, 1), + strides=(1, 1), + use_bias=not with_bn)) + + self.with_bn = with_bn + self.act = Act() + #self.act = nn.Activation('relu') + if with_bn: + self.bn = nn.BatchNorm() + + def hybrid_forward(self, F, x): + x = self.net(x) + if self.with_bn: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +class ExpandedConv(nn.HybridBlock): + def __init__(self, + inp, + oup, + t, + strides, + kernel=3, + same_shape=True, + **kwargs): + super(ExpandedConv, self).__init__(**kwargs) + + self.same_shape = same_shape + self.strides = strides + with self.name_scope(): + self.bottleneck = nn.HybridSequential() + self.bottleneck.add( + Conv1x1(inp * t, prefix="expand_"), + DWise(inp * t, self.strides, kernel, prefix="dwise_"), + Conv1x1(oup, is_linear=True, prefix="linear_")) + + def hybrid_forward(self, F, x): + out = self.bottleneck(x) + if self.strides == 1 and 
self.same_shape: + out = F.elemwise_add(out, x) + return out + + +def ExpandedConvSequence(t, k, inp, oup, repeats, first_strides, **kwargs): + seq = nn.HybridSequential(**kwargs) + with seq.name_scope(): + seq.add(ExpandedConv(inp, oup, t, first_strides, k, same_shape=False)) + curr_inp = oup + for i in range(1, repeats): + seq.add(ExpandedConv(curr_inp, oup, t, 1)) + curr_inp = oup + return seq + + +class MNasNet(nn.HybridBlock): + def __init__(self, m=1.0, **kwargs): + super(MNasNet, self).__init__(**kwargs) + + self.first_oup = int(32 * m) + self.second_oup = int(16 * m) + #self.second_oup = int(32*m) + self.interverted_residual_setting = [ + # t, c, n, s, k + [3, int(24 * m), 3, 2, 3, "stage2_"], # -> 56x56 + [3, int(40 * m), 3, 2, 5, "stage3_"], # -> 28x28 + [6, int(80 * m), 3, 2, 5, "stage4_1_"], # -> 14x14 + [6, int(96 * m), 2, 1, 3, "stage4_2_"], # -> 14x14 + [6, int(192 * m), 4, 2, 5, "stage5_1_"], # -> 7x7 + [6, int(320 * m), 1, 1, 3, "stage5_2_"], # -> 7x7 + ] + self.last_channels = int(1024 * m) + + with self.name_scope(): + self.features = nn.HybridSequential() + self.features.add( + ConvBlock(self.first_oup, 3, 1, prefix="stage1_conv0_")) + self.features.add( + SepCONV(self.first_oup, + self.second_oup, + 3, + prefix="stage1_sepconv0_")) + inp = self.second_oup + for i, (t, c, n, s, k, + prefix) in enumerate(self.interverted_residual_setting): + oup = c + self.features.add( + ExpandedConvSequence(t, k, inp, oup, n, s, prefix=prefix)) + inp = oup + + self.features.add(Conv1x1(self.last_channels, prefix="stage5_3_")) + #self.features.add(nn.GlobalAvgPool2D()) + #self.features.add(nn.Flatten()) + #self.output = nn.Dense(num_classes) + def hybrid_forward(self, F, x): + x = self.features(x) + #x = self.output(x) + return x + + def num_output_channel(self): + return self.last_channels + + +def get_symbol(): + net = MNasNet(config.net_multiplier) + data = mx.sym.Variable(name='data') + data = data - 127.5 + data = data * 0.0078125 + body = net(data) + fc1 = 
def Act(data, act_type, name):
    """Activation wrapper: 'prelu' is routed through LeakyReLU, every other
    act_type through the generic Activation operator."""
    if act_type == 'prelu':
        return mx.sym.LeakyReLU(data=data, act_type='prelu', name=name)
    return mx.sym.Activation(data=data, act_type=act_type, name=name)


def Conv(data,
         num_filter=1,
         kernel=(1, 1),
         stride=(1, 1),
         pad=(0, 0),
         num_group=1,
         name=None,
         suffix=''):
    """Convolution -> BatchNorm -> activation building block.

    Operator names follow the '<name><suffix>_conv2d/_batchnorm/_relu'
    convention; the activation type comes from config.net_act.
    """
    out = mx.sym.Convolution(data=data,
                             num_filter=num_filter,
                             kernel=kernel,
                             num_group=num_group,
                             stride=stride,
                             pad=pad,
                             no_bias=True,
                             name='%s%s_conv2d' % (name, suffix))
    out = mx.sym.BatchNorm(data=out,
                           name='%s%s_batchnorm' % (name, suffix),
                           fix_gamma=False,
                           momentum=config.bn_mom)
    return Act(data=out,
               act_type=config.net_act,
               name='%s%s_relu' % (name, suffix))


def Linear(data,
           num_filter=1,
           kernel=(1, 1),
           stride=(1, 1),
           pad=(0, 0),
           num_group=1,
           name=None,
           suffix=''):
    """Convolution -> BatchNorm with no activation (linear bottleneck)."""
    out = mx.sym.Convolution(data=data,
                             num_filter=num_filter,
                             kernel=kernel,
                             num_group=num_group,
                             stride=stride,
                             pad=pad,
                             no_bias=True,
                             name='%s%s_conv2d' % (name, suffix))
    return mx.sym.BatchNorm(data=out,
                            name='%s%s_batchnorm' % (name, suffix),
                            fix_gamma=False,
                            momentum=config.bn_mom)
def ConvOnly(data,
             num_filter=1,
             kernel=(1, 1),
             stride=(1, 1),
             pad=(0, 0),
             num_group=1,
             name=None,
             suffix=''):
    """Bare convolution with no BatchNorm and no activation."""
    return mx.sym.Convolution(data=data,
                              num_filter=num_filter,
                              kernel=kernel,
                              num_group=num_group,
                              stride=stride,
                              pad=pad,
                              no_bias=True,
                              name='%s%s_conv2d' % (name, suffix))


def DResidual(data,
              num_out=1,
              kernel=(3, 3),
              stride=(2, 2),
              pad=(1, 1),
              num_group=1,
              name=None,
              suffix=''):
    """Depthwise-separable unit: 1x1 expand -> depthwise conv -> 1x1 linear
    projection (MobileFaceNet building block)."""
    expand = Conv(data=data,
                  num_filter=num_group,
                  kernel=(1, 1),
                  pad=(0, 0),
                  stride=(1, 1),
                  name='%s%s_conv_sep' % (name, suffix))
    depthwise = Conv(data=expand,
                     num_filter=num_group,
                     num_group=num_group,
                     kernel=kernel,
                     pad=pad,
                     stride=stride,
                     name='%s%s_conv_dw' % (name, suffix))
    return Linear(data=depthwise,
                  num_filter=num_out,
                  kernel=(1, 1),
                  pad=(0, 0),
                  stride=(1, 1),
                  name='%s%s_conv_proj' % (name, suffix))


def Residual(data,
             num_block=1,
             num_out=1,
             kernel=(3, 3),
             stride=(1, 1),
             pad=(1, 1),
             num_group=1,
             name=None,
             suffix=''):
    """Chain of ``num_block`` DResidual units, each wrapped in an identity
    skip connection (output = block(x) + x)."""
    out = data
    for idx in range(num_block):
        branch = DResidual(data=out,
                           num_out=num_out,
                           kernel=kernel,
                           stride=stride,
                           pad=pad,
                           num_group=num_group,
                           name='%s%s_block' % (name, suffix),
                           suffix='%d' % idx)
        out = branch + out
    return out


def get_symbol():
    """Build the MobileFaceNet embedding symbol (fc1) from the global config."""
    num_classes = config.emb_size
    print('in_network', config)
    fc_type = config.net_output
    blocks = config.net_blocks

    data = mx.symbol.Variable(name="data")
    # Normalize raw [0, 255] pixels to roughly [-1, 1].
    data = data - 127.5
    data = data * 0.0078125

    body = Conv(data,
                num_filter=64,
                kernel=(3, 3),
                pad=(1, 1),
                stride=(2, 2),
                name="conv_1")
    if blocks[0] == 1:
        body = Conv(body,
                    num_group=64,
                    num_filter=64,
                    kernel=(3, 3),
                    pad=(1, 1),
                    stride=(1, 1),
                    name="conv_2_dw")
    else:
        body = Residual(body,
                        num_block=blocks[0],
                        num_out=64,
                        kernel=(3, 3),
                        stride=(1, 1),
                        pad=(1, 1),
                        num_group=64,
                        name="res_2")

    # (downsample name, residual name, out channels, downsample groups,
    #  residual groups, residual repeats)
    # NOTE(review): res_5 uses num_group=256 while its preceding dconv_45
    # uses 512 — this reproduces the original definition; confirm intentional.
    stages = [
        ("dconv_23", "res_3", 64, 128, 128, blocks[1]),
        ("dconv_34", "res_4", 128, 256, 256, blocks[2]),
        ("dconv_45", "res_5", 128, 512, 256, blocks[3]),
    ]
    for down_name, res_name, out_ch, down_group, res_group, repeats in stages:
        body = DResidual(body,
                         num_out=out_ch,
                         kernel=(3, 3),
                         stride=(2, 2),
                         pad=(1, 1),
                         num_group=down_group,
                         name=down_name)
        body = Residual(body,
                        num_block=repeats,
                        num_out=out_ch,
                        kernel=(3, 3),
                        stride=(1, 1),
                        pad=(1, 1),
                        num_group=res_group,
                        name=res_name)

    body = Conv(body,
                num_filter=512,
                kernel=(1, 1),
                pad=(0, 0),
                stride=(1, 1),
                name="conv_6sep")
    return symbol_utils.get_fc1(body, num_classes, fc_type)
def Act(data, act_type, name):
    """Activation wrapper: 'prelu' maps to LeakyReLU, anything else to the
    generic Activation operator."""
    if act_type == 'prelu':
        return mx.sym.LeakyReLU(data=data, act_type='prelu', name=name)
    return mx.sym.Activation(data=data, act_type=act_type, name=name)


def Conv(data,
         num_filter=1,
         kernel=(1, 1),
         stride=(1, 1),
         pad=(0, 0),
         num_group=1,
         name=None,
         suffix=''):
    """Convolution -> BatchNorm (fixed gamma) -> activation."""
    out = mx.sym.Convolution(data=data,
                             num_filter=num_filter,
                             kernel=kernel,
                             num_group=num_group,
                             stride=stride,
                             pad=pad,
                             no_bias=True,
                             name='%s%s_conv2d' % (name, suffix))
    out = mx.sym.BatchNorm(data=out,
                           name='%s%s_batchnorm' % (name, suffix),
                           fix_gamma=True)
    return Act(data=out,
               act_type=config.net_act,
               name='%s%s_relu' % (name, suffix))


def ConvOnly(data,
             num_filter=1,
             kernel=(1, 1),
             stride=(1, 1),
             pad=(0, 0),
             num_group=1,
             name=None,
             suffix=''):
    """Bare convolution with no BatchNorm and no activation."""
    return mx.sym.Convolution(data=data,
                              num_filter=num_filter,
                              kernel=kernel,
                              num_group=num_group,
                              stride=stride,
                              pad=pad,
                              no_bias=True,
                              name='%s%s_conv2d' % (name, suffix))


def get_symbol():
    """Build the MobileNet-style embedding symbol (fc1) from the global config.

    The network is the classic depthwise-separable stack: each step is a 3x3
    depthwise conv (groups == channels) followed by a 1x1 pointwise conv.
    """
    num_classes = config.emb_size
    fc_type = config.net_output
    bf = int(32 * config.net_multiplier)

    data = mx.symbol.Variable(name="data")
    # Normalize raw [0, 255] pixels to roughly [-1, 1].
    data = data - 127.5
    data = data * 0.0078125

    # net_input == 0 means full-resolution input that is downsampled by the
    # stem conv; otherwise the stem keeps spatial size.
    stem_stride = (2, 2) if config.net_input == 0 else (1, 1)
    conv_1 = Conv(data,
                  num_filter=bf,
                  kernel=(3, 3),
                  pad=(1, 1),
                  stride=stem_stride,
                  name="conv_1")

    # (layer name, depthwise stride, pointwise output channels). The
    # depthwise conv keeps the running channel count; the 1x1 pointwise conv
    # maps to the new width. Names reproduce the original unrolled layers.
    plan = [
        ("conv_2", 1, bf * 2),    # 112 -> 112
        ("conv_3", 2, bf * 4),    # 112 -> 56
        ("conv_4", 1, bf * 4),
        ("conv_5", 2, bf * 8),    # 56 -> 28
        ("conv_6", 1, bf * 8),
        ("conv_7", 2, bf * 16),   # 28 -> 14
        ("conv_8", 1, bf * 16),
        ("conv_9", 1, bf * 16),
        ("conv_10", 1, bf * 16),
        ("conv_11", 1, bf * 16),
        ("conv_12", 1, bf * 16),
        ("conv_13", 2, bf * 32),  # 14 -> 7
        ("conv_14", 1, bf * 32),
    ]
    body = conv_1
    channels = bf
    for layer_name, s, out_ch in plan:
        body = Conv(body,
                    num_group=channels,
                    num_filter=channels,
                    kernel=(3, 3),
                    pad=(1, 1),
                    stride=(s, s),
                    name=layer_name + "_dw")
        body = Conv(body,
                    num_filter=out_ch,
                    kernel=(1, 1),
                    pad=(0, 0),
                    stride=(1, 1),
                    name=layer_name)
        channels = out_ch

    return symbol_utils.get_fc1(body, num_classes, fc_type)
def Conv(**kwargs):
    """Thin wrapper around mx.sym.Convolution, kept as a single extension
    point for experiments with custom weight/bias variables."""
    return mx.sym.Convolution(**kwargs)


def Act(data, act_type, name):
    """Activation wrapper: 'prelu' maps to LeakyReLU, anything else to the
    generic Activation operator."""
    if act_type == 'prelu':
        return mx.sym.LeakyReLU(data=data, act_type='prelu', name=name)
    return mx.symbol.Activation(data=data, act_type=act_type, name=name)


def residual_unit_v1(data, num_filter, stride, dim_match, name, bottle_neck,
                     **kwargs):
    """Post-activation ('v1') ResNet unit with optional SE gate.

    Parameters
    ----------
    data : Symbol
        Input symbol.
    num_filter : int
        Number of output channels.
    stride : tuple
        Stride applied by the first convolution of the unit.
    dim_match : bool
        True when input and output shapes already agree (identity shortcut).
    name : str
        Prefix for every operator in the unit.
    bottle_neck : bool
        Selects the 3-conv bottleneck form over the 2-conv basic form.
    """
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')

    def _se(x):
        # Squeeze-and-excitation: global avg pool -> 1x1 reduce -> 1x1 expand
        # -> sigmoid gate multiplied back onto the feature map.
        gate = mx.sym.Pooling(data=x,
                              global_pool=True,
                              kernel=(7, 7),
                              pool_type='avg',
                              name=name + '_se_pool1')
        gate = Conv(data=gate,
                    num_filter=num_filter // 16,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv1",
                    workspace=workspace)
        gate = Act(data=gate, act_type=act_type, name=name + '_se_relu1')
        gate = Conv(data=gate,
                    num_filter=num_filter,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv2",
                    workspace=workspace)
        gate = mx.symbol.Activation(data=gate,
                                    act_type='sigmoid',
                                    name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(x, gate)

    def _shortcut():
        # Identity when shapes match, else 1x1 projection + BatchNorm.
        if dim_match:
            sc = data
        else:
            sc = Conv(data=data,
                      num_filter=num_filter,
                      kernel=(1, 1),
                      stride=stride,
                      no_bias=True,
                      workspace=workspace,
                      name=name + '_conv1sc')
            sc = mx.sym.BatchNorm(data=sc,
                                  fix_gamma=False,
                                  eps=2e-5,
                                  momentum=bn_mom,
                                  name=name + '_sc')
        if memonger:
            sc._set_attr(mirror_stage='True')
        return sc

    if bottle_neck:
        # v1 bottleneck puts the stride on the first 1x1 conv.
        body = Conv(data=data, num_filter=int(num_filter * 0.25),
                    kernel=(1, 1), stride=stride, pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(3, 3), stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu2')
        body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                    stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv3')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn3')
    else:
        body = Conv(data=data, num_filter=num_filter, kernel=(3, 3),
                    stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, momentum=bn_mom,
                                eps=2e-5, name=name + '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, momentum=bn_mom,
                                eps=2e-5, name=name + '_bn2')

    if use_se:
        body = _se(body)
    return Act(data=body + _shortcut(), act_type=act_type,
               name=name + '_relu3')
def residual_unit_v1_L(data, num_filter, stride, dim_match, name, bottle_neck,
                       **kwargs):
    """Post-activation ResNet unit, 'L' variant: the downsampling stride is
    moved to the LAST convolution of the residual branch instead of the first.

    Parameters match residual_unit_v1: `data` input symbol, `num_filter`
    output channels, `stride` downsampling stride, `dim_match` True when
    input/output shapes agree, `name` operator prefix, `bottle_neck` selects
    the 3-conv bottleneck form.
    """
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')

    def _se(x):
        # Squeeze-and-excitation gate (pool -> reduce -> expand -> sigmoid).
        gate = mx.sym.Pooling(data=x,
                              global_pool=True,
                              kernel=(7, 7),
                              pool_type='avg',
                              name=name + '_se_pool1')
        gate = Conv(data=gate,
                    num_filter=num_filter // 16,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv1",
                    workspace=workspace)
        gate = Act(data=gate, act_type=act_type, name=name + '_se_relu1')
        gate = Conv(data=gate,
                    num_filter=num_filter,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv2",
                    workspace=workspace)
        gate = mx.symbol.Activation(data=gate,
                                    act_type='sigmoid',
                                    name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(x, gate)

    def _shortcut():
        # Identity when shapes match, else 1x1 projection + BatchNorm.
        if dim_match:
            sc = data
        else:
            sc = Conv(data=data,
                      num_filter=num_filter,
                      kernel=(1, 1),
                      stride=stride,
                      no_bias=True,
                      workspace=workspace,
                      name=name + '_conv1sc')
            sc = mx.sym.BatchNorm(data=sc,
                                  fix_gamma=False,
                                  eps=2e-5,
                                  momentum=bn_mom,
                                  name=name + '_sc')
        if memonger:
            sc._set_attr(mirror_stage='True')
        return sc

    if bottle_neck:
        body = Conv(data=data, num_filter=int(num_filter * 0.25),
                    kernel=(1, 1), stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(3, 3), stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu2')
        # Stride lives on the final 1x1 conv (the "L" difference from v1).
        body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                    stride=stride, pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv3')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn3')
    else:
        body = Conv(data=data, num_filter=num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, momentum=bn_mom,
                                eps=2e-5, name=name + '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        # Stride lives on the second 3x3 conv.
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, momentum=bn_mom,
                                eps=2e-5, name=name + '_bn2')

    if use_se:
        body = _se(body)
    return Act(data=body + _shortcut(), act_type=act_type,
               name=name + '_relu3')
def residual_unit_v2(data, num_filter, stride, dim_match, name, bottle_neck,
                     **kwargs):
    """Pre-activation ('v2') ResNet unit, fb.resnet.torch style.

    BN and activation precede each convolution; the projection shortcut is a
    bare 1x1 conv taken from the pre-activated input, and the residual sum is
    returned without a trailing activation.
    """
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')

    def _se(x):
        # Squeeze-and-excitation gate (pool -> reduce -> expand -> sigmoid).
        gate = mx.sym.Pooling(data=x,
                              global_pool=True,
                              kernel=(7, 7),
                              pool_type='avg',
                              name=name + '_se_pool1')
        gate = Conv(data=gate,
                    num_filter=num_filter // 16,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv1",
                    workspace=workspace)
        gate = Act(data=gate, act_type=act_type, name=name + '_se_relu1')
        gate = Conv(data=gate,
                    num_filter=num_filter,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv2",
                    workspace=workspace)
        gate = mx.symbol.Activation(data=gate,
                                    act_type='sigmoid',
                                    name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(x, gate)

    bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5,
                           momentum=bn_mom, name=name + '_bn1')
    act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1')

    if bottle_neck:
        body = Conv(data=act1, num_filter=int(num_filter * 0.25),
                    kernel=(1, 1), stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu2')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(3, 3), stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn3')
        body = Act(data=body, act_type=act_type, name=name + '_relu3')
        body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                    stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv3')
    else:
        body = Conv(data=act1, num_filter=num_filter, kernel=(3, 3),
                    stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, momentum=bn_mom,
                                eps=2e-5, name=name + '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu2')
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')

    if use_se:
        body = _se(body)

    if dim_match:
        shortcut = data
    else:
        # v2 projects from the pre-activated input, with no BatchNorm.
        shortcut = Conv(data=act1, num_filter=num_filter, kernel=(1, 1),
                        stride=stride, no_bias=True, workspace=workspace,
                        name=name + '_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return body + shortcut
def residual_unit_v3(data, num_filter, stride, dim_match, name, bottle_neck,
                     **kwargs):
    """BN-first ('v3') residual unit used by the ArcFace ResNets.

    Layout: BN -> Conv -> BN -> PReLU -> ... -> BN, then identity/projection
    add; no activation follows the residual sum.
    """
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')

    def _se(x):
        # Squeeze-and-excitation gate (pool -> reduce -> expand -> sigmoid).
        gate = mx.sym.Pooling(data=x,
                              global_pool=True,
                              kernel=(7, 7),
                              pool_type='avg',
                              name=name + '_se_pool1')
        gate = Conv(data=gate,
                    num_filter=num_filter // 16,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv1",
                    workspace=workspace)
        gate = Act(data=gate, act_type=act_type, name=name + '_se_relu1')
        gate = Conv(data=gate,
                    num_filter=num_filter,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv2",
                    workspace=workspace)
        gate = mx.symbol.Activation(data=gate,
                                    act_type='sigmoid',
                                    name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(x, gate)

    def _shortcut():
        # Identity when shapes match, else 1x1 projection + BatchNorm.
        if dim_match:
            sc = data
        else:
            sc = Conv(data=data,
                      num_filter=num_filter,
                      kernel=(1, 1),
                      stride=stride,
                      no_bias=True,
                      workspace=workspace,
                      name=name + '_conv1sc')
            sc = mx.sym.BatchNorm(data=sc,
                                  fix_gamma=False,
                                  eps=2e-5,
                                  momentum=bn_mom,
                                  name=name + '_sc')
        if memonger:
            sc._set_attr(mirror_stage='True')
        return sc

    bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5,
                           momentum=bn_mom, name=name + '_bn1')
    if bottle_neck:
        body = Conv(data=bn1, num_filter=int(num_filter * 0.25),
                    kernel=(1, 1), stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(3, 3), stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn3')
        body = Act(data=body, act_type=act_type, name=name + '_relu2')
        # Downsampling stride sits on the final 1x1 conv.
        body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                    stride=stride, pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv3')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn4')
    else:
        body = Conv(data=bn1, num_filter=num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn3')

    if use_se:
        body = _se(body)
    return body + _shortcut()
def residual_unit_v3_x(data, num_filter, stride, dim_match, name, bottle_neck,
                       **kwargs):
    """ResNeXt flavour of the v3 unit: grouped 1x1/3x3 convolutions at half
    width with 32 groups. Only the bottleneck form is supported."""
    assert bottle_neck
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')
    num_group = 32

    def _se(x):
        # Squeeze-and-excitation gate (pool -> reduce -> expand -> sigmoid).
        gate = mx.sym.Pooling(data=x,
                              global_pool=True,
                              kernel=(7, 7),
                              pool_type='avg',
                              name=name + '_se_pool1')
        gate = Conv(data=gate,
                    num_filter=num_filter // 16,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv1",
                    workspace=workspace)
        gate = Act(data=gate, act_type=act_type, name=name + '_se_relu1')
        gate = Conv(data=gate,
                    num_filter=num_filter,
                    kernel=(1, 1),
                    stride=(1, 1),
                    pad=(0, 0),
                    name=name + "_se_conv2",
                    workspace=workspace)
        gate = mx.symbol.Activation(data=gate,
                                    act_type='sigmoid',
                                    name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(x, gate)

    body = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5,
                            momentum=bn_mom, name=name + '_bn1')
    body = Conv(data=body, num_group=num_group,
                num_filter=int(num_filter * 0.5), kernel=(1, 1),
                stride=(1, 1), pad=(0, 0), no_bias=True,
                workspace=workspace, name=name + '_conv1')
    body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                            momentum=bn_mom, name=name + '_bn2')
    body = Act(data=body, act_type=act_type, name=name + '_relu1')
    body = Conv(data=body, num_group=num_group,
                num_filter=int(num_filter * 0.5), kernel=(3, 3),
                stride=(1, 1), pad=(1, 1), no_bias=True,
                workspace=workspace, name=name + '_conv2')
    body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                            momentum=bn_mom, name=name + '_bn3')
    body = Act(data=body, act_type=act_type, name=name + '_relu2')
    # Ungrouped 1x1 expansion back to full width; stride lives here.
    body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                stride=stride, pad=(0, 0), no_bias=True,
                workspace=workspace, name=name + '_conv3')
    body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                            momentum=bn_mom, name=name + '_bn4')

    if use_se:
        body = _se(body)

    if dim_match:
        shortcut = data
    else:
        shortcut = Conv(data=data, num_filter=num_filter, kernel=(1, 1),
                        stride=stride, no_bias=True, workspace=workspace,
                        name=name + '_conv1sc')
        shortcut = mx.sym.BatchNorm(data=shortcut, fix_gamma=False, eps=2e-5,
                                    momentum=bn_mom, name=name + '_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return body + shortcut
name=name + '_relu2') + conv3 = Conv(data=act2, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + bn4 = mx.sym.BatchNorm(data=conv3, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn4') + + if use_se: + #se begin + body = mx.sym.Pooling(data=bn4, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn4 = mx.symbol.broadcast_mul(bn4, body) + #se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return bn4 + shortcut + + +def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck, + **kwargs): + uv = kwargs.get('version_unit', 3) + version_input = kwargs.get('version_input', 1) + if uv == 1: + if version_input == 0: + return residual_unit_v1(data, num_filter, stride, dim_match, name, + bottle_neck, **kwargs) + else: + return residual_unit_v1_L(data, num_filter, stride, dim_match, + name, bottle_neck, **kwargs) + elif uv == 2: + return residual_unit_v2(data, num_filter, stride, dim_match, name, + bottle_neck, **kwargs) + elif uv == 4: + return residual_unit_v4(data, num_filter, stride, dim_match, name, + bottle_neck, **kwargs) + else: + return 
residual_unit_v3(data, num_filter, stride, dim_match, name, + bottle_neck, **kwargs) + + +def resnet(units, num_stages, filter_list, num_classes, bottle_neck): + bn_mom = config.bn_mom + workspace = config.workspace + kwargs = { + 'version_se': config.net_se, + 'version_input': config.net_input, + 'version_output': config.net_output, + 'version_unit': config.net_unit, + 'version_act': config.net_act, + 'bn_mom': bn_mom, + 'workspace': workspace, + 'memonger': config.memonger, + } + """Return ResNet symbol of + Parameters + ---------- + units : list + Number of units in each stage + num_stages : int + Number of stage + filter_list : list + Channel size of each stage + num_classes : int + Ouput size of symbol + dataset : str + Dataset type, only cifar10 and imagenet supports + workspace : int + Workspace used in convolution operator + """ + version_se = kwargs.get('version_se', 1) + version_input = kwargs.get('version_input', 1) + assert version_input >= 0 + version_output = kwargs.get('version_output', 'E') + fc_type = version_output + version_unit = kwargs.get('version_unit', 3) + act_type = kwargs.get('version_act', 'prelu') + memonger = kwargs.get('memonger', False) + print(version_se, version_input, version_output, version_unit, act_type, + memonger) + num_unit = len(units) + assert (num_unit == num_stages) + data = mx.sym.Variable(name='data') + if config.fp16: + data = mx.sym.Cast(data=data, dtype=np.float16) + if version_input == 0: + #data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') + data = mx.sym.identity(data=data, name='id') + data = data - 127.5 + data = data * 0.0078125 + body = Conv(data=data, + num_filter=filter_list[0], + kernel=(7, 7), + stride=(2, 2), + pad=(3, 3), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=act_type, name='relu0') + #body = 
mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max') + elif version_input == 2: + data = mx.sym.BatchNorm(data=data, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='bn_data') + body = Conv(data=data, + num_filter=filter_list[0], + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=act_type, name='relu0') + else: + data = mx.sym.identity(data=data, name='id') + data = data - 127.5 + data = data * 0.0078125 + body = data + body = Conv(data=body, + num_filter=filter_list[0], + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=act_type, name='relu0') + + for i in range(num_stages): + #if version_input==0: + # body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False, + # name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, **kwargs) + #else: + # body = residual_unit(body, filter_list[i+1], (2, 2), False, + # name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, **kwargs) + if i==num_stages-1 and config.fp16: + body = mx.sym.Cast(data=body, dtype=np.float32) + body = residual_unit(body, + filter_list[i + 1], (2, 2), + False, + name='stage%d_unit%d' % (i + 1, 1), + bottle_neck=bottle_neck, + **kwargs) + for j in range(units[i] - 1): + body = residual_unit(body, + filter_list[i + 1], (1, 1), + True, + name='stage%d_unit%d' % (i + 1, j + 2), + bottle_neck=bottle_neck, + **kwargs) + + if bottle_neck: + body = Conv(data=body, + num_filter=512, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + name="convd", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, 
+ name='bnd') + body = Act(data=body, act_type=act_type, name='relud') + + fc1 = symbol_utils.get_fc1(body, num_classes, fc_type) + return fc1 + + +def get_symbol(): + """ + Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py + Original author Wei Wu + """ + num_classes = config.emb_size + num_layers = config.num_layers + if num_layers >= 500: + filter_list = [64, 256, 512, 1024, 2048] + bottle_neck = True + else: + filter_list = [64, 64, 128, 256, 512] + bottle_neck = False + num_stages = 4 + if num_layers == 18: + units = [2, 2, 2, 2] + elif num_layers == 34: + units = [3, 4, 6, 3] + elif num_layers == 49: + units = [3, 4, 14, 3] + elif num_layers == 50: + units = [3, 4, 14, 3] + elif num_layers == 74: + units = [3, 6, 24, 3] + elif num_layers == 90: + units = [3, 8, 30, 3] + elif num_layers == 98: + units = [3, 4, 38, 3] + elif num_layers == 99: + units = [3, 8, 35, 3] + elif num_layers == 100: + units = [3, 13, 30, 3] + elif num_layers == 134: + units = [3, 10, 50, 3] + elif num_layers == 136: + units = [3, 13, 48, 3] + elif num_layers == 140: + units = [3, 15, 48, 3] + elif num_layers == 124: + units = [3, 13, 40, 5] + elif num_layers == 160: + units = [3, 24, 49, 3] + elif num_layers == 101: + units = [3, 4, 23, 3] + elif num_layers == 152: + units = [3, 8, 36, 3] + elif num_layers == 200: + units = [3, 24, 36, 3] + elif num_layers == 269: + units = [3, 30, 48, 8] + else: + raise ValueError( + "no experiments done on num_layers {}, you can do it yourself". 
+ format(num_layers)) + + net = resnet(units=units, + num_stages=num_stages, + filter_list=filter_list, + num_classes=num_classes, + bottle_neck=bottle_neck) + + if config.memonger: + dshape = (config.per_batch_size, config.image_shape[2], + config.image_shape[0], config.image_shape[1]) + net_mem_planned = memonger.search_plan(net, data=dshape) + old_cost = memonger.get_cost(net, data=dshape) + new_cost = memonger.get_cost(net_mem_planned, data=dshape) + + print('Old feature map cost=%d MB' % old_cost) + print('New feature map cost=%d MB' % new_cost) + net = net_mem_planned + return net diff --git a/insightface/recognition/arcface_mxnet/symbol/memonger.py b/insightface/recognition/arcface_mxnet/symbol/memonger.py new file mode 100644 index 0000000000000000000000000000000000000000..8ad610b57b821ec6b8f0087ee2569ad6fda4d177 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/symbol/memonger.py @@ -0,0 +1,175 @@ +import mxnet as mx +import math + + +def prod(shape): + """Get product of the shape. + """ + ret = 1 + for s in shape: + ret *= s + return ret + + +def is_param(name): + """Quick script to check if name is a parameter. + """ + if name == 'data': + return False + if name.endswith('weight'): + return True + if name.endswith('bias'): + return True + if name.endswith('beta'): + return True + if name.endswith('gamma'): + return True + return False + + +def make_mirror_plan(sym, threshold, plan_info=None, **kwargs): + """Memory allocation planner with a given threshold. + + The user can pass in a network configuration, + a threshold that limits memory per block. + And input shape configurations. + + Parameters + ---------- + sym : symbol + Input configuration of symbols. + The user need to pre-mark the attribute "mirror_stage" on the nodes + that can be book-kept as stage + + The algorithm will decide whether to disbale mirror on the stage nodes. 
+ + threshold: integer + A tuning parameter to tune the approximate size of each stage blocks + + plan_info: dict, optional + Used to hold plan information. + + **kwargs: + The arguments to infer shape. + + Returns + ------- + alloc_sym: symbol + A symbol with force mirror tagged on the nodes for better allocation. + """ + threshold = threshold << 20 + sym = sym.__copy__() + internals = sym.get_internals() + _, out_shapes, _ = internals.infer_shape(**kwargs) + shape_dict = list(zip(internals.list_outputs(), out_shapes)) + total_size = 0 + param_size = 0 + local_size = 0 + save_size = 0 + max_size = 0 + last_sb = None + last_local = 0 + period = 1 + last_stage = '' + stage_decision = '' + + for idx, item in enumerate(shape_dict): + sb = internals[idx] + name, shape = item + if is_param(name): + param_size += prod(shape) * 4 + continue + else: + total_size += prod(shape) * 4 + local_size += prod(shape) * 4 + sb._set_attr(force_mirroring='True') + + if sb.attr('mirror_stage') is not None: + stage = sb.attr('mirror_stage') + if stage == 'True' or stage != last_stage: + if local_size > threshold: + save_size += prod(shape) * 4 + max_size = max(max_size, local_size) + local_size = 0 + stage_decision = 'False' + sb._set_attr(force_mirroring=stage_decision) + else: + stage_decision = 'True' + pass + last_stage = stage + elif stage == last_stage and stage_decision == 'False': + save_size += prod(shape) * 4 + sb._set_attr(force_mirroring=stage_decision) + + if plan_info is not None: + plan_info['max_size'] = max_size + plan_info['save_size'] = save_size + return sym + + +def get_cost(sym, type_dict=None, **kwargs): + """Get the cost of the current symbolic plan by running bind on CPU. 
+ + sym : Symbolic Variable + + """ + texec = sym.simple_bind(ctx=mx.gpu(), + grad_req='write', + type_dict=type_dict, + **kwargs) + return int(texec.debug_str().split('\n')[-3].split()[1]) + + +def search_plan(sym, ntrial=6, type_dict=None, **kwargs): + """Quickly heurestic search over possible plans to find good memory plan. + + Parameters + ---------- + sym : symbolic + Symbolic configurations + + ntrial: integer + Additional grid search steps + """ + history = [] + threshold = 0 + min_threshold = None + min_cost = None + nbegin = 3 + + for k in range(nbegin): + info = {} + sym = make_mirror_plan(sym, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + save_size = info['save_size'] >> 20 + local_size = info['max_size'] >> 20 + guess = int(math.sqrt(save_size * local_size / 2)) + if min_cost is None or min_cost > cost: + min_cost = cost + if min_threshold is None or local_size < min_threshold: + min_threshold = local_size + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold = guess + + max_threshold = threshold * math.sqrt(2) + step = int((max_threshold - min_threshold) / ntrial) + threshold = min_threshold + step + if step > 0: + for k in range(ntrial): + sym = make_mirror_plan(sym, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold += step + + history.sort(key=lambda x: x[0]) + cost, threshold, sym = history[0] + print('Find best plan with threshold=%d, cost=%d MB' % (threshold, cost)) + return sym diff --git a/insightface/recognition/arcface_mxnet/symbol/memonger_v2.py b/insightface/recognition/arcface_mxnet/symbol/memonger_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..92963de5e4256855fa3192107b2e584c1fa449bb --- /dev/null +++ 
b/insightface/recognition/arcface_mxnet/symbol/memonger_v2.py @@ -0,0 +1,300 @@ +import mxnet as mx +import math + + +def prod(shape): + """Get product of the shape. + """ + ret = 1 + for s in shape: + ret *= s + return ret + + +def is_param(name): + """Quick script to check if name is a parameter. + """ + if name == 'data': + return False + if name.endswith('weight'): + return True + if name.endswith('bias'): + return True + if name.endswith('beta'): + return True + if name.endswith('gamma'): + return True + return False + + +def make_mirror_plan(sym, threshold, plan_info=None, **kwargs): + """Memory allocation planner with a given threshold. + + The user can pass in a network configuration, + a threshold that limits memory per block. + And input shape configurations. + + Parameters + ---------- + sym : symbol + Input configuration of symbols. + The user need to pre-mark the attribute "mirror_stage" on the nodes + that can be book-kept as stage + + The algorithm will decide whether to disbale mirror on the stage nodes. + + threshold: integer + A tuning parameter to tune the approximate size of each stage blocks + + plan_info: dict, optional + Used to hold plan information. + + **kwargs: + The arguments to infer shape. + + Returns + ------- + alloc_sym: symbol + A symbol with force mirror tagged on the nodes for better allocation. 
+ """ + threshold = threshold << 20 + sym = sym.__copy__() + internals = sym.get_internals() + _, out_shapes, _ = internals.infer_shape(**kwargs) + shape_dict = list(zip(internals.list_outputs(), out_shapes)) + total_size = 0 + param_size = 0 + local_size = 0 + save_size = 0 + max_size = 0 + last_sb = None + last_local = 0 + period = 1 + last_stage = '' + stage_decision = '' + + for idx, item in enumerate(shape_dict): + sb = internals[idx] + name, shape = item + if is_param(name): + param_size += prod(shape) * 4 + continue + else: + total_size += prod(shape) * 4 + local_size += prod(shape) * 4 + sb._set_attr(force_mirroring='True') + + if sb.attr('mirror_stage') is not None: + stage = sb.attr('mirror_stage') + if stage == 'True' or stage != last_stage: + if local_size > threshold: + save_size += prod(shape) * 4 + max_size = max(max_size, local_size) + local_size = 0 + stage_decision = 'False' + sb._set_attr(force_mirroring=stage_decision) + else: + stage_decision = 'True' + pass + last_stage = stage + elif stage == last_stage and stage_decision == 'False': + save_size += prod(shape) * 4 + sb._set_attr(force_mirroring=stage_decision) + + if plan_info is not None: + plan_info['max_size'] = max_size + plan_info['save_size'] = save_size + return sym + + +def get_cost(sym, type_dict=None, **kwargs): + """Get the cost of the current symbolic plan by running bind on CPU. + + sym : Symbolic Variable + + """ + texec = sym.simple_bind(ctx=mx.gpu(), + grad_req='write', + type_dict=type_dict, + **kwargs) + return int(texec.debug_str().split('\n')[-3].split()[1]) + + +def search_plan(sym, ntrial=6, type_dict=None, **kwargs): + """Quickly heurestic search over possible plans to find good memory plan. 
+ + Parameters + ---------- + sym : symbolic + Symbolic configurations + + ntrial: integer + Additional grid search steps + """ + history = [] + threshold = 0 + min_threshold = None + min_cost = None + nbegin = 3 + + for k in range(nbegin): + info = {} + sym = make_mirror_plan(sym, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + save_size = info['save_size'] >> 20 + local_size = info['max_size'] >> 20 + guess = int(math.sqrt(save_size * local_size / 2)) + if min_cost is None or min_cost > cost: + min_cost = cost + if min_threshold is None or local_size < min_threshold: + min_threshold = local_size + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold = guess + + max_threshold = threshold * math.sqrt(2) + step = int((max_threshold - min_threshold) / ntrial) + threshold = min_threshold + step + if step > 0: + for k in range(ntrial): + sym = make_mirror_plan(sym, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold += step + + history.sort(key=lambda x: x[0]) + cost, threshold, sym = history[0] + print('Find best plan with threshold=%d, cost=%d MB' % (threshold, cost)) + return sym + + +def make_mirror_plan_to_layer(sym, + layer_name, + threshold, + plan_info=None, + **kwargs): + """ + sym is the original symbal + layer_name is a name to which layer of the network should be set as mirror + threshhold is the approximate size of each mirror block + """ + threshold = threshold << 20 + sym = sym.__copy__() + internals = sym.get_internals() + _, out_shapes, _ = internals.infer_shape(**kwargs) + shape_dict = list(zip(internals.list_outputs(), out_shapes)) + total_size = 0 + param_size = 0 + local_size = 0 + save_size = 0 + max_size = 0 + last_stage = '' + stage_decision = '' + switch = True + + for 
idx, item in enumerate(shape_dict): + sb = internals[idx] + name, shape = item + #print(name, switch) + if is_param(name): + param_size += prod(shape) * 4 + continue + elif switch and not 'bn' in name: + total_size += prod(shape) * 4 + local_size += prod(shape) * 4 + sb._set_attr(force_mirroring='True') + print('set force_mirroring', name, total_size, local_size) + if layer_name != '' and layer_name in name: + switch = False + + if sb.attr('mirror_stage') is not None: + stage = sb.attr('mirror_stage') + #print(name, stage) + if stage == 'True' or stage != last_stage: + if local_size > threshold: + save_size += prod(shape) * 4 + max_size = max(max_size, local_size) + local_size = 0 + stage_decision = 'False' + sb._set_attr(force_mirroring=stage_decision) + else: + stage_decision = 'True' + pass + last_stage = stage + elif stage == last_stage and stage_decision == 'False': + save_size += prod(shape) * 4 + sb._set_attr(force_mirroring=stage_decision) + + if plan_info is not None: + plan_info['max_size'] = max_size + plan_info['save_size'] = save_size + return sym + + +def search_plan_to_layer(sym, + layer_name=None, + threshold=500, + ntrial=6, + type_dict=None, + **kwargs): + """Quickly heurestic search over possible plans to find good memory plan. 
+ + Parameters + ---------- + sym : symbolic + Symbolic configurations + + ntrial: integer + Additional grid search steps + """ + history = [] + min_threshold = None + min_cost = None + nbegin = 10 + + for k in range(nbegin): + info = {} + sym = make_mirror_plan_to_layer(sym, + layer_name=layer_name, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + save_size = info['save_size'] >> 20 + local_size = info['max_size'] >> 20 + guess = 300 * (k + 1) + if min_cost is None or min_cost > cost: + min_cost = cost + if min_threshold is None or local_size < min_threshold: + min_threshold = local_size + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold = guess + + max_threshold = threshold * math.sqrt(2) + step = int((max_threshold - min_threshold) / ntrial) + print(min_threshold, max_threshold, step) + threshold = min_threshold + step + if step > 0: + for k in range(ntrial): + sym = make_mirror_plan_to_layer(sym, + layer_name=layer_name, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold += step + + history.sort(key=lambda x: x[0]) + cost, threshold, sym = history[0] + print('Find best plan with threshold=%d, cost=%d MB' % (threshold, cost)) + return sym diff --git a/insightface/recognition/arcface_mxnet/symbol/symbol_utils.py b/insightface/recognition/arcface_mxnet/symbol/symbol_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3eb6f1cf96cb674aea22250cf372f6aab95590f8 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/symbol/symbol_utils.py @@ -0,0 +1,595 @@ +import sys +import os +import mxnet as mx +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Conv(**kwargs): + #name = kwargs.get('name') + #_weight = 
mx.symbol.Variable(name+'_weight') + #_bias = mx.symbol.Variable(name+'_bias', lr_mult=2.0, wd_mult=0.0) + #body = mx.sym.Convolution(weight = _weight, bias = _bias, **kwargs) + body = mx.sym.Convolution(**kwargs) + return body + + +def Act(data, act_type, name): + #ignore param act_type, set it in this function + if act_type == 'prelu': + body = mx.sym.LeakyReLU(data=data, act_type='prelu', name=name) + else: + body = mx.sym.Activation(data=data, act_type=act_type, name=name) + return body + + +bn_mom = config.bn_mom + + +def Linear(data, + num_filter=1, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + num_group=1, + name=None, + suffix=''): + conv = mx.sym.Convolution(data=data, + num_filter=num_filter, + kernel=kernel, + num_group=num_group, + stride=stride, + pad=pad, + no_bias=True, + name='%s%s_conv2d' % (name, suffix)) + bn = mx.sym.BatchNorm(data=conv, + name='%s%s_batchnorm' % (name, suffix), + fix_gamma=False, + momentum=bn_mom) + return bn + + +def get_fc1(last_conv, num_classes, fc_type, input_channel=512): + body = last_conv + if fc_type == 'Z': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + body = mx.symbol.Dropout(data=body, p=0.4) + fc1 = body + elif fc_type == 'E': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + body = mx.symbol.Dropout(data=body, p=0.4) + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'FC': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'SFC': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + 
momentum=bn_mom, + name='bn1') + body = Conv(data=body, + num_filter=input_channel, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + no_bias=True, + name="convf", + num_group=input_channel) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bnf') + body = Act(data=body, act_type=config.net_act, name='reluf') + body = Conv(data=body, + num_filter=input_channel, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="convf2") + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bnf2') + body = Act(data=body, act_type=config.net_act, name='reluf2') + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'GAP': + bn1 = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + relu1 = Act(data=bn1, act_type=config.net_act, name='relu1') + # Although kernel is not used here when global_pool=True, we should put one + pool1 = mx.sym.Pooling(data=relu1, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name='pool1') + flat = mx.sym.Flatten(data=pool1) + fc1 = mx.sym.FullyConnected(data=flat, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'GNAP': #mobilefacenet++ + filters_in = 512 # param in mobilefacenet + if num_classes > filters_in: + body = mx.sym.Convolution(data=last_conv, + num_filter=num_classes, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + name='convx') + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name='convx_bn') + body = Act(data=body, act_type=config.net_act, name='convx_relu') + filters_in = num_classes + else: + body = last_conv + body = mx.sym.BatchNorm(data=body, + fix_gamma=True, + eps=2e-5, + momentum=0.9, + 
name='bn6f') + + spatial_norm = body * body + spatial_norm = mx.sym.sum(data=spatial_norm, axis=1, keepdims=True) + spatial_sqrt = mx.sym.sqrt(spatial_norm) + #spatial_mean=mx.sym.mean(spatial_sqrt, axis=(1,2,3), keepdims=True) + spatial_mean = mx.sym.mean(spatial_sqrt) + spatial_div_inverse = mx.sym.broadcast_div(spatial_mean, spatial_sqrt) + + spatial_attention_inverse = mx.symbol.tile(spatial_div_inverse, + reps=(1, filters_in, 1, 1)) + body = body * spatial_attention_inverse + #body = mx.sym.broadcast_mul(body, spatial_div_inverse) + + fc1 = mx.sym.Pooling(body, + kernel=(7, 7), + global_pool=True, + pool_type='avg') + if num_classes < filters_in: + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=0.9, + name='bn6w') + fc1 = mx.sym.FullyConnected(data=fc1, + num_hidden=num_classes, + name='pre_fc1') + else: + fc1 = mx.sym.Flatten(data=fc1) + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=0.9, + name='fc1') + elif fc_type == "GDC": #mobilefacenet_v1 + conv_6_dw = Linear(last_conv, + num_filter=input_channel, + num_group=input_channel, + kernel=(7, 7), + pad=(0, 0), + stride=(1, 1), + name="conv_6dw7_7") + conv_6_f = mx.sym.FullyConnected(data=conv_6_dw, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=conv_6_f, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'F': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + body = mx.symbol.Dropout(data=body, p=0.4) + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='fc1') + elif fc_type == 'G': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='fc1') + elif fc_type == 'H': + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='fc1') + elif fc_type == 'I': + body = mx.sym.BatchNorm(data=body, + 
def residual_unit_v3(data, num_filter, stride, dim_match, name, **kwargs):
    """Pre-activation (BN-first) residual unit for the v3 ResNet variant.

    Parameters
    ----------
    data : mx.sym.Symbol
        Input symbol.
    num_filter : int
        Number of output channels.
    stride : tuple
        Stride of the second 3x3 convolution (and of the projection shortcut).
    dim_match : bool
        True when input and output channel counts match so the identity
        shortcut applies; otherwise a 1x1 projection is inserted.
    name : str
        Base name of the operators.
    **kwargs
        Optional ``bn_mom``, ``workspace`` and ``memonger`` overrides.
    """
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)

    def _bn(sym, suffix):
        # Every BatchNorm in this unit shares the same momentum/eps settings.
        return mx.sym.BatchNorm(data=sym, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + suffix)

    trunk = _bn(data, '_bn1')
    trunk = Conv(data=trunk, num_filter=num_filter, kernel=(3, 3),
                 stride=(1, 1), pad=(1, 1), no_bias=True,
                 workspace=workspace, name=name + '_conv1')
    trunk = _bn(trunk, '_bn2')
    trunk = Act(data=trunk, act_type=config.net_act, name=name + '_relu1')
    trunk = Conv(data=trunk, num_filter=num_filter, kernel=(3, 3),
                 stride=stride, pad=(1, 1), no_bias=True,
                 workspace=workspace, name=name + '_conv2')
    trunk = _bn(trunk, '_bn3')

    if dim_match:
        shortcut = data
    else:
        proj = Conv(data=data, num_filter=num_filter, kernel=(1, 1),
                    stride=stride, no_bias=True, workspace=workspace,
                    name=name + '_conv1sc')
        shortcut = _bn(proj, '_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return trunk + shortcut


def residual_unit_v1l(data, num_filter, stride, dim_match, name, bottle_neck):
    """ResNet v1 ("L" layout) residual unit, optionally bottlenecked and with
    an optional squeeze-and-excitation gate, as selected by the global
    ``config`` (``config.net_se``, ``config.net_act``).

    Parameters mirror ``residual_unit_v3``; ``bottle_neck`` chooses the
    1x1-3x3-1x1 bottleneck form over the plain 3x3-3x3 form.
    """
    workspace = config.workspace
    bn_mom = config.bn_mom
    memonger = False
    use_se = config.net_se
    act_type = config.net_act

    def _se_gate(body_sym):
        # Squeeze-and-excitation: global pool, reduce to 1/16 channels,
        # restore, sigmoid gate, channel-wise rescale of the trunk.
        squeeze = mx.sym.Pooling(data=body_sym, global_pool=True,
                                 kernel=(7, 7), pool_type='avg',
                                 name=name + '_se_pool1')
        squeeze = Conv(data=squeeze, num_filter=num_filter // 16,
                       kernel=(1, 1), stride=(1, 1), pad=(0, 0),
                       name=name + "_se_conv1", workspace=workspace)
        squeeze = Act(data=squeeze, act_type=act_type,
                      name=name + '_se_relu1')
        squeeze = Conv(data=squeeze, num_filter=num_filter, kernel=(1, 1),
                       stride=(1, 1), pad=(0, 0), name=name + "_se_conv2",
                       workspace=workspace)
        gate = mx.symbol.Activation(data=squeeze, act_type='sigmoid',
                                    name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(body_sym, gate)

    def _shortcut():
        # Identity when shapes match, else a 1x1 projection conv + BN.
        if dim_match:
            return data
        proj = Conv(data=data, num_filter=num_filter, kernel=(1, 1),
                    stride=stride, no_bias=True, workspace=workspace,
                    name=name + '_conv1sc')
        return mx.sym.BatchNorm(data=proj, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_sc')

    if bottle_neck:
        body = Conv(data=data, num_filter=int(num_filter * 0.25),
                    kernel=(1, 1), stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(3, 3), stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu2')
        body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                    stride=stride, pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv3')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + '_bn3')
    else:
        body = Conv(data=data, num_filter=num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, momentum=bn_mom,
                                eps=2e-5, name=name + '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, momentum=bn_mom,
                                eps=2e-5, name=name + '_bn2')

    if use_se:
        body = _se_gate(body)

    shortcut = _shortcut()
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return Act(data=body + shortcut, act_type=act_type, name=name + '_relu3')


def get_head(data, version_input, num_filter):
    """Network stem: pixel normalization followed by the input convolution.

    ``version_input == 0`` uses the classic 7x7/2 conv + 3x3/2 max-pool stem;
    any other value uses a 3x3/1 conv followed by a stride-2 residual unit.
    Raw pixels are scaled to roughly [-1, 1] via (x - 127.5) * 0.0078125.
    """
    bn_mom = config.bn_mom
    workspace = config.workspace
    data = data - 127.5
    data = data * 0.0078125
    if version_input == 0:
        body = Conv(data=data, num_filter=num_filter, kernel=(7, 7),
                    stride=(2, 2), pad=(3, 3), no_bias=True, name="conv0",
                    workspace=workspace)
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name='bn0')
        body = Act(data=body, act_type=config.net_act, name='relu0')
        body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2, 2),
                              pad=(1, 1), pool_type='max')
    else:
        # Cap the stem width at 64 channels regardless of the first stage.
        _num_filter = min(num_filter, 64)
        body = Conv(data=data, num_filter=_num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True, name="conv0",
                    workspace=workspace)
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name='bn0')
        body = Act(data=body, act_type=config.net_act, name='relu0')
        body = residual_unit_v1l(body, _num_filter, (2, 2), False,
                                 name='head', bottle_neck=False)
    return body
# VarGFaceNet symbol definition (Horizon Robotics). Implements:
# Mengjia Yan, Mengao Zhao, Zining Xu, Qian Zhang, Guoli Wang, Zhizhong Su,
# "VarGFaceNet: An Efficient Variable Group Convolutional Neural Network for
#  Lightweight Face Recognition" (https://arxiv.org/abs/1910.04985).

import os
import sys

import mxnet as mx
import symbol_utils

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from config import config


def Act(data, act_type, name):
    """Activation helper: PReLU goes through LeakyReLU, anything else through
    the generic Activation operator."""
    if act_type == 'prelu':
        return mx.sym.LeakyReLU(data=data, act_type='prelu', name=name)
    return mx.symbol.Activation(data=data, act_type=act_type, name=name)


def get_setting_params(**kwargs):
    """Collect the network-wide hyper-parameters into a single dict.

    Recognized keys and their defaults: ``bn_mom`` (0.9), ``bn_eps`` (2e-5),
    ``fix_gamma`` (False), ``use_global_stats`` (False), ``workspace`` (512),
    ``act_type`` ('prelu'), ``use_se`` (True), ``se_ratio`` (4),
    ``group_base`` (8). Unknown kwargs are ignored.
    """
    defaults = {
        'bn_mom': 0.9,
        'bn_eps': 2e-5,
        'fix_gamma': False,
        'use_global_stats': False,
        'workspace': 512,
        'act_type': 'prelu',
        'use_se': True,
        'se_ratio': 4,
        'group_base': 8,
    }
    return {key: kwargs.get(key, value) for key, value in defaults.items()}


def se_block(data, num_filter, setting_params, name):
    """Squeeze-and-excitation gate.

    Global average pool -> 1x1 conv reducing to ``num_filter // se_ratio``
    channels -> activation -> 1x1 conv restoring ``num_filter`` channels ->
    sigmoid -> channel-wise rescale of ``data``.
    """
    se_ratio = setting_params['se_ratio']
    act_type = setting_params['act_type']

    squeezed = mx.sym.Pooling(data=data, global_pool=True, pool_type='avg',
                              name=name + '_se_pool1')
    reduced = mx.sym.Convolution(data=squeezed,
                                 num_filter=num_filter // se_ratio,
                                 kernel=(1, 1), stride=(1, 1), pad=(0, 0),
                                 name=name + "_se_conv1")
    reduced = Act(data=reduced, act_type=act_type, name=name + '_se_act1')
    restored = mx.sym.Convolution(data=reduced, num_filter=num_filter,
                                  kernel=(1, 1), stride=(1, 1), pad=(0, 0),
                                  name=name + "_se_conv2")
    gate = mx.symbol.Activation(data=restored, act_type='sigmoid',
                                name=name + "_se_sigmoid")
    return mx.symbol.broadcast_mul(data, gate)
def separable_conv2d(data,
                     in_channels,
                     out_channels,
                     kernel,
                     pad,
                     setting_params,
                     stride=(1, 1),
                     factor=1,
                     bias=False,
                     bn_dw_out=True,
                     act_dw_out=True,
                     bn_pw_out=True,
                     act_pw_out=True,
                     dilate=1,
                     name=None):
    """Variable-group separable convolution.

    A grouped "depthwise" conv (groups = ``in_channels / group_base``, width
    scaled by ``factor``) followed by a 1x1 pointwise conv; each stage has an
    independently switchable BatchNorm and activation.
    """
    bn_mom = setting_params['bn_mom']
    bn_eps = setting_params['bn_eps']
    fix_gamma = setting_params['fix_gamma']
    use_global_stats = setting_params['use_global_stats']
    workspace = setting_params['workspace']
    group_base = setting_params['group_base']
    act_type = setting_params['act_type']
    assert in_channels % group_base == 0

    def _bn(sym, tag):
        # Shared BN configuration for both stages.
        return mx.sym.BatchNorm(data=sym, fix_gamma=fix_gamma, eps=bn_eps,
                                momentum=bn_mom,
                                use_global_stats=use_global_stats,
                                name=name + tag)

    # Depthwise (variable-group) stage.
    out = mx.sym.Convolution(data=data,
                             num_filter=int(in_channels * factor),
                             kernel=kernel,
                             pad=pad,
                             stride=stride,
                             no_bias=not bias,
                             num_group=int(in_channels / group_base),
                             dilate=(dilate, dilate),
                             workspace=workspace,
                             name=name + '_conv2d_depthwise')
    if bn_dw_out:
        out = _bn(out, '_conv2d_depthwise_bn')
    if act_dw_out:
        out = Act(data=out, act_type=act_type,
                  name=name + '_conv2d_depthwise_act')

    # Pointwise (1x1) stage.
    out = mx.sym.Convolution(data=out,
                             num_filter=out_channels,
                             kernel=(1, 1),
                             stride=(1, 1),
                             pad=(0, 0),
                             num_group=1,
                             no_bias=not bias,
                             workspace=workspace,
                             name=name + '_conv2d_pointwise')
    if bn_pw_out:
        out = _bn(out, '_conv2d_pointwise_bn')
    if act_pw_out:
        out = Act(data=out, act_type=act_type,
                  name=name + '_conv2d_pointwise_act')
    return out
def vargnet_block(data,
                  n_out_ch1,
                  n_out_ch2,
                  n_out_ch3,
                  setting_params,
                  factor=2,
                  dim_match=True,
                  multiplier=1,
                  kernel=(3, 3),
                  stride=(1, 1),
                  dilate=1,
                  with_dilate=False,
                  name=None):
    """Normal VarGNet unit: two separable convs on the trunk, a (possibly
    projected) shortcut, optional SE gate, then the output activation."""
    use_se = setting_params['use_se']
    act_type = setting_params['act_type']

    ch_in = int(n_out_ch1 * multiplier)
    ch_mid = int(n_out_ch2 * multiplier)
    ch_out = int(n_out_ch3 * multiplier)

    # "same" padding for the (possibly dilated) kernel.
    pad = (((kernel[0] - 1) * dilate + 1) // 2,
           ((kernel[1] - 1) * dilate + 1) // 2)

    if with_dilate:
        # Dilation replaces spatial downsampling.
        stride = (1, 1)

    if dim_match:
        short_cut = data
    else:
        short_cut = separable_conv2d(data=data,
                                     in_channels=ch_in,
                                     out_channels=ch_out,
                                     kernel=kernel,
                                     pad=pad,
                                     setting_params=setting_params,
                                     stride=stride,
                                     factor=factor,
                                     bias=False,
                                     act_pw_out=False,
                                     dilate=dilate,
                                     name=name + '_shortcut')

    trunk = separable_conv2d(data=data,
                             in_channels=ch_in,
                             out_channels=ch_mid,
                             kernel=kernel,
                             pad=pad,
                             setting_params=setting_params,
                             stride=stride,
                             factor=factor,
                             bias=False,
                             dilate=dilate,
                             name=name + '_sep1_data')
    trunk = separable_conv2d(data=trunk,
                             in_channels=ch_mid,
                             out_channels=ch_out,
                             kernel=kernel,
                             pad=pad,
                             setting_params=setting_params,
                             stride=(1, 1),
                             factor=factor,
                             bias=False,
                             dilate=dilate,
                             act_pw_out=False,
                             name=name + '_sep2_data')
    if use_se:
        trunk = se_block(data=trunk,
                         num_filter=ch_out,
                         setting_params=setting_params,
                         name=name)

    return Act(data=trunk + short_cut,
               act_type=act_type,
               name=name + '_out_data_act')


def vargnet_branch_merge_block(data,
                               n_out_ch1,
                               n_out_ch2,
                               n_out_ch3,
                               setting_params,
                               factor=2,
                               dim_match=False,
                               multiplier=1,
                               kernel=(3, 3),
                               stride=(2, 2),
                               dilate=1,
                               with_dilate=False,
                               name=None):
    """Downsampling VarGNet unit: two parallel separable-conv branches are
    summed and activated, followed by a second separable conv and a
    (projected) shortcut, then the output activation."""
    act_type = setting_params['act_type']

    ch_in = int(n_out_ch1 * multiplier)
    ch_mid = int(n_out_ch2 * multiplier)
    ch_out = int(n_out_ch3 * multiplier)

    pad = (((kernel[0] - 1) * dilate + 1) // 2,
           ((kernel[1] - 1) * dilate + 1) // 2)

    if with_dilate:
        stride = (1, 1)

    if dim_match:
        short_cut = data
    else:
        short_cut = separable_conv2d(data=data,
                                     in_channels=ch_in,
                                     out_channels=ch_out,
                                     kernel=kernel,
                                     pad=pad,
                                     setting_params=setting_params,
                                     stride=stride,
                                     factor=factor,
                                     bias=False,
                                     act_pw_out=False,
                                     dilate=dilate,
                                     name=name + '_shortcut')

    branch1 = separable_conv2d(data=data,
                               in_channels=ch_in,
                               out_channels=ch_mid,
                               kernel=kernel,
                               pad=pad,
                               setting_params=setting_params,
                               stride=stride,
                               factor=factor,
                               bias=False,
                               dilate=dilate,
                               act_pw_out=False,
                               name=name + '_sep1_data_branch')
    branch2 = separable_conv2d(data=data,
                               in_channels=ch_in,
                               out_channels=ch_mid,
                               kernel=kernel,
                               pad=pad,
                               setting_params=setting_params,
                               stride=stride,
                               factor=factor,
                               bias=False,
                               dilate=dilate,
                               act_pw_out=False,
                               name=name + '_sep2_data_branch')
    merged = Act(data=branch1 + branch2,
                 act_type=act_type,
                 name=name + '_sep1_data_act')
    trunk = separable_conv2d(data=merged,
                             in_channels=ch_mid,
                             out_channels=ch_out,
                             kernel=kernel,
                             pad=pad,
                             setting_params=setting_params,
                             stride=(1, 1),
                             factor=factor,
                             bias=False,
                             dilate=dilate,
                             act_pw_out=False,
                             name=name + '_sep2_data')
    return Act(data=trunk + short_cut,
               act_type=act_type,
               name=name + '_out_data_act')
def add_vargnet_conv_block(data,
                           stage,
                           units,
                           in_channels,
                           out_channels,
                           setting_params,
                           kernel=(3, 3),
                           stride=(2, 2),
                           multiplier=1,
                           factor=2,
                           dilate=1,
                           with_dilate=False,
                           name=None):
    """One VarGNet stage: a downsampling branch-merge unit followed by
    ``units - 1`` normal (dim-matching, stride-1) units."""
    assert stage >= 2, 'stage is {}, stage must be set >=2'.format(stage)
    data = vargnet_branch_merge_block(
        data=data,
        n_out_ch1=in_channels,
        n_out_ch2=out_channels,
        n_out_ch3=out_channels,
        setting_params=setting_params,
        factor=factor,
        dim_match=False,
        multiplier=multiplier,
        kernel=kernel,
        stride=stride,
        dilate=dilate,
        with_dilate=with_dilate,
        name=name + '_stage_{}_unit_1'.format(stage))
    # Units are numbered 2..units within the stage.
    for unit_idx in range(2, units + 1):
        data = vargnet_block(
            data=data,
            n_out_ch1=out_channels,
            n_out_ch2=out_channels,
            n_out_ch3=out_channels,
            setting_params=setting_params,
            factor=factor,
            dim_match=True,
            multiplier=multiplier,
            kernel=kernel,
            stride=(1, 1),
            dilate=dilate,
            with_dilate=with_dilate,
            name=name + '_stage_{}_unit_{}'.format(stage, unit_idx))
    return data


def add_head_block(data,
                   num_filter,
                   setting_params,
                   multiplier,
                   head_pooling=False,
                   kernel=(3, 3),
                   stride=(2, 2),
                   pad=(1, 1),
                   name=None):
    """Network stem: plain conv + BN + activation, then either a 3x3/2
    max-pool or a stride-2 VarGNet block for the first downsampling."""
    bn_mom = setting_params['bn_mom']
    bn_eps = setting_params['bn_eps']
    fix_gamma = setting_params['fix_gamma']
    use_global_stats = setting_params['use_global_stats']
    workspace = setting_params['workspace']
    act_type = setting_params['act_type']
    channels = int(num_filter * multiplier)

    stem = mx.sym.Convolution(data=data,
                              num_filter=channels,
                              kernel=kernel,
                              pad=pad,
                              stride=stride,
                              no_bias=True,
                              num_group=1,
                              workspace=workspace,
                              name=name + '_conv1')
    stem = mx.sym.BatchNorm(data=stem,
                            fix_gamma=fix_gamma,
                            eps=bn_eps,
                            momentum=bn_mom,
                            use_global_stats=use_global_stats,
                            name=name + '_conv1_bn')
    stem = Act(data=stem, act_type=act_type, name=name + '_conv1_act')

    if head_pooling:
        return mx.symbol.Pooling(data=stem,
                                 kernel=(3, 3),
                                 stride=(2, 2),
                                 pad=(1, 1),
                                 pool_type='max',
                                 name=name + '_max_pooling')
    return vargnet_block(data=stem,
                         n_out_ch1=num_filter,
                         n_out_ch2=num_filter,
                         n_out_ch3=num_filter,
                         setting_params=setting_params,
                         factor=1,
                         dim_match=False,
                         multiplier=multiplier,
                         kernel=kernel,
                         stride=(2, 2),
                         dilate=1,
                         with_dilate=False,
                         name=name + '_head_pooling')


def add_emb_block(data,
                  input_channels,
                  last_channels,
                  emb_size,
                  fc_type,
                  setting_params,
                  bias=False,
                  name=None):
    """Embedding head: optional 1x1 channel expansion, a 7x7 grouped
    depthwise conv collapsing the spatial grid, a pointwise reduction to
    ``last_channels // 2``, then the fc1 embedding layer built by
    ``symbol_utils.get_fc1``."""
    bn_mom = setting_params['bn_mom']
    bn_eps = setting_params['bn_eps']
    fix_gamma = setting_params['fix_gamma']
    use_global_stats = setting_params['use_global_stats']
    workspace = setting_params['workspace']
    act_type = setting_params['act_type']
    group_base = setting_params['group_base']
    no_bias = not bias

    def _bn(sym, tag):
        return mx.sym.BatchNorm(data=sym,
                                fix_gamma=fix_gamma,
                                eps=bn_eps,
                                momentum=bn_mom,
                                use_global_stats=use_global_stats,
                                name=name + tag)

    # Expand to the target channel count if needed.
    if input_channels != last_channels:
        data = mx.sym.Convolution(data=data,
                                  num_filter=last_channels,
                                  kernel=(1, 1),
                                  pad=(0, 0),
                                  stride=(1, 1),
                                  no_bias=no_bias,
                                  workspace=workspace,
                                  name=name + '_convx')
        data = _bn(data, '_convx_bn')
        data = Act(data=data, act_type=act_type, name=name + '_convx_act')

    # 7x7 grouped depthwise conv, no activation afterwards (BN only).
    depthwise = mx.sym.Convolution(data=data,
                                   num_filter=last_channels,
                                   num_group=int(last_channels / group_base),
                                   kernel=(7, 7),
                                   pad=(0, 0),
                                   stride=(1, 1),
                                   no_bias=no_bias,
                                   workspace=workspace,
                                   name=name + '_convx_depthwise')
    depthwise = _bn(depthwise, '_convx_depthwise_bn')

    pointwise = mx.sym.Convolution(data=depthwise,
                                   num_filter=last_channels // 2,
                                   kernel=(1, 1),
                                   pad=(0, 0),
                                   stride=(1, 1),
                                   no_bias=no_bias,
                                   workspace=workspace,
                                   name=name + '_convx_pointwise')
    pointwise = _bn(pointwise, '_convx_pointwise_bn')
    pointwise = Act(data=pointwise,
                    act_type=act_type,
                    name=name + '_convx_pointwise_act')

    return symbol_utils.get_fc1(pointwise, emb_size, fc_type)
def get_symbol():
    """Assemble the full VarGFaceNet symbol.

    Head block, three VarGNet stages (units 3/7/4, widths 32->64->128->256),
    and the embedding head. All hyper-parameters come from the global
    ``config`` module.
    """
    multiplier = config.net_multiplier
    emb_size = config.emb_size
    fc_type = config.net_output

    setting_params = get_setting_params(use_se=config.net_se,
                                        act_type=config.net_act,
                                        bn_mom=config.bn_mom,
                                        workspace=config.workspace)

    factor = 2
    head_pooling = False
    num_stage = 3
    stage_list = [2, 3, 4]
    units = [3, 7, 4]
    filter_list = [32, 64, 128, 256]
    last_channels = 1024
    dilate_list = [1, 1, 1]
    with_dilate_list = [False, False, False]

    data = mx.sym.Variable(name='data')
    data = mx.sym.identity(data=data, name='id')
    # Normalize raw pixels to roughly [-1, 1].
    data = data - 127.5
    data = data * 0.0078125

    body = add_head_block(data=data,
                          num_filter=filter_list[0],
                          setting_params=setting_params,
                          multiplier=multiplier,
                          head_pooling=head_pooling,
                          kernel=(3, 3),
                          stride=(1, 1),
                          pad=(1, 1),
                          name="vargface_head")
    for i in range(num_stage):
        body = add_vargnet_conv_block(data=body,
                                      stage=stage_list[i],
                                      units=units[i],
                                      in_channels=filter_list[i],
                                      out_channels=filter_list[i + 1],
                                      setting_params=setting_params,
                                      kernel=(3, 3),
                                      stride=(2, 2),
                                      multiplier=multiplier,
                                      factor=factor,
                                      dilate=dilate_list[i],
                                      with_dilate=with_dilate_list[i],
                                      name="vargface")
    return add_emb_block(data=body,
                         input_channels=filter_list[3],
                         last_channels=last_channels,
                         emb_size=emb_size,
                         fc_type=fc_type,
                         setting_params=setting_params,
                         bias=False,
                         name='embed')


if __name__ == '__main__':
    get_symbol()
logging +import sklearn +import pickle +import numpy as np +import mxnet as mx +from mxnet import ndarray as nd +import argparse +import mxnet.optimizer as optimizer +from config import config, default, generate_config +from metric import * +sys.path.append(os.path.join(os.path.dirname(__file__), 'common')) +import flops_counter +import verification +sys.path.append(os.path.join(os.path.dirname(__file__), 'symbol')) +import fresnet +import fmobilefacenet +import fmobilenet +import fmnasnet +import fdensenet +import vargfacenet + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +args = None + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train face network') + # general + parser.add_argument('--dataset', + default=default.dataset, + help='dataset config') + parser.add_argument('--network', + default=default.network, + help='network config') + parser.add_argument('--loss', default=default.loss, help='loss config') + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset, args.loss) + parser.add_argument('--models-root', + default=default.models_root, + help='root directory to save model.') + parser.add_argument('--pretrained', + default=default.pretrained, + help='pretrained model to load') + parser.add_argument('--pretrained-epoch', + type=int, + default=default.pretrained_epoch, + help='pretrained epoch to load') + parser.add_argument( + '--ckpt', + type=int, + default=default.ckpt, + help= + 'checkpoint saving option. 0: discard saving. 1: save when necessary. 
2: always save' + ) + parser.add_argument( + '--verbose', + type=int, + default=default.verbose, + help='do verification testing and model saving every verbose batches') + parser.add_argument('--lr', + type=float, + default=default.lr, + help='start learning rate') + parser.add_argument('--lr-steps', + type=str, + default=default.lr_steps, + help='steps of lr changing') + parser.add_argument('--wd', + type=float, + default=default.wd, + help='weight decay') + parser.add_argument('--mom', + type=float, + default=default.mom, + help='momentum') + parser.add_argument('--frequent', + type=int, + default=default.frequent, + help='') + parser.add_argument('--per-batch-size', + type=int, + default=default.per_batch_size, + help='batch size in each context') + parser.add_argument('--kvstore', + type=str, + default=default.kvstore, + help='kvstore setting') + args = parser.parse_args() + return args + + +def get_symbol(args): + embedding = eval(config.net_name).get_symbol() + all_label = mx.symbol.Variable('softmax_label') + gt_label = all_label + is_softmax = True + if config.loss_name == 'softmax': #softmax + _weight = mx.symbol.Variable("fc7_weight", + shape=(config.num_classes, + config.emb_size), + lr_mult=config.fc7_lr_mult, + wd_mult=config.fc7_wd_mult, + init=mx.init.Normal(0.01)) + if config.fc7_no_bias: + fc7 = mx.sym.FullyConnected(data=embedding, + weight=_weight, + no_bias=True, + num_hidden=config.num_classes, + name='fc7') + else: + _bias = mx.symbol.Variable('fc7_bias', lr_mult=2.0, wd_mult=0.0) + fc7 = mx.sym.FullyConnected(data=embedding, + weight=_weight, + bias=_bias, + num_hidden=config.num_classes, + name='fc7') + elif config.loss_name == 'margin_softmax': + _weight = mx.symbol.Variable("fc7_weight", + shape=(config.num_classes, + config.emb_size), + lr_mult=config.fc7_lr_mult, + wd_mult=config.fc7_wd_mult, + init=mx.init.Normal(0.01)) + s = config.loss_s + _weight = mx.symbol.L2Normalization(_weight, mode='instance') + nembedding = 
def get_symbol(args):
    """Build the training symbol: the embedding network plus the configured
    loss head (plain softmax, margin-based softmax, or a triplet-style loss).

    Returns an ``mx.symbol.Group`` whose first output is the (grad-blocked)
    embedding; subsequent outputs depend on the loss configuration.
    """
    # NOTE(review): eval() dispatch on config.net_name assumes a trusted,
    # locally-defined config — do not feed it external input.
    embedding = eval(config.net_name).get_symbol()
    all_label = mx.symbol.Variable('softmax_label')
    gt_label = all_label
    is_softmax = True
    if config.loss_name == 'softmax':
        # Plain softmax classification head.
        fc7_weight = mx.symbol.Variable("fc7_weight",
                                        shape=(config.num_classes,
                                               config.emb_size),
                                        lr_mult=config.fc7_lr_mult,
                                        wd_mult=config.fc7_wd_mult,
                                        init=mx.init.Normal(0.01))
        if config.fc7_no_bias:
            fc7 = mx.sym.FullyConnected(data=embedding,
                                        weight=fc7_weight,
                                        no_bias=True,
                                        num_hidden=config.num_classes,
                                        name='fc7')
        else:
            fc7_bias = mx.symbol.Variable('fc7_bias',
                                          lr_mult=2.0,
                                          wd_mult=0.0)
            fc7 = mx.sym.FullyConnected(data=embedding,
                                        weight=fc7_weight,
                                        bias=fc7_bias,
                                        num_hidden=config.num_classes,
                                        name='fc7')
    elif config.loss_name == 'margin_softmax':
        # ArcFace-style combined-margin head on normalized weights/features.
        fc7_weight = mx.symbol.Variable("fc7_weight",
                                        shape=(config.num_classes,
                                               config.emb_size),
                                        lr_mult=config.fc7_lr_mult,
                                        wd_mult=config.fc7_wd_mult,
                                        init=mx.init.Normal(0.01))
        s = config.loss_s
        fc7_weight = mx.symbol.L2Normalization(fc7_weight, mode='instance')
        nembedding = mx.symbol.L2Normalization(
            embedding, mode='instance', name='fc1n') * s
        fc7 = mx.sym.FullyConnected(data=nembedding,
                                    weight=fc7_weight,
                                    no_bias=True,
                                    num_hidden=config.num_classes,
                                    name='fc7')
        if config.loss_m1 != 1.0 or config.loss_m2 != 0.0 or config.loss_m3 != 0.0:
            if config.loss_m1 == 1.0 and config.loss_m2 == 0.0:
                # Pure additive (CosFace-style) margin: subtract s*m3 on the
                # ground-truth logit only.
                s_m = s * config.loss_m3
                gt_one_hot = mx.sym.one_hot(gt_label,
                                            depth=config.num_classes,
                                            on_value=s_m,
                                            off_value=0.0)
                fc7 = fc7 - gt_one_hot
            else:
                # General combined margin: cos(m1*t + m2) - m3 applied to the
                # ground-truth class angle t.
                zy = mx.sym.pick(fc7, gt_label, axis=1)
                cos_t = zy / s
                t = mx.sym.arccos(cos_t)
                if config.loss_m1 != 1.0:
                    t = t * config.loss_m1
                if config.loss_m2 > 0.0:
                    t = t + config.loss_m2
                body = mx.sym.cos(t)
                if config.loss_m3 > 0.0:
                    body = body - config.loss_m3
                new_zy = body * s
                diff = mx.sym.expand_dims(new_zy - zy, 1)
                gt_one_hot = mx.sym.one_hot(gt_label,
                                            depth=config.num_classes,
                                            on_value=1.0,
                                            off_value=0.0)
                fc7 = fc7 + mx.sym.broadcast_mul(gt_one_hot, diff)
    elif config.loss_name.find('triplet') >= 0:
        is_softmax = False
        nembedding = mx.symbol.L2Normalization(embedding,
                                               mode='instance',
                                               name='fc1n')
        # The batch is laid out as [anchors | positives | negatives].
        # Keep the original integer-division expressions: for batch sizes not
        # divisible by 3, 2*n//3 differs from 2*(n//3).
        anchor = mx.symbol.slice_axis(nembedding,
                                      axis=0,
                                      begin=0,
                                      end=args.per_batch_size // 3)
        positive = mx.symbol.slice_axis(nembedding,
                                        axis=0,
                                        begin=args.per_batch_size // 3,
                                        end=2 * args.per_batch_size // 3)
        negative = mx.symbol.slice_axis(nembedding,
                                        axis=0,
                                        begin=2 * args.per_batch_size // 3,
                                        end=args.per_batch_size)
        if config.loss_name == 'triplet':
            # Euclidean triplet loss.
            ap = anchor - positive
            an = anchor - negative
            ap = ap * ap
            an = an * an
            ap = mx.symbol.sum(ap, axis=1, keepdims=1)  # (T,1)
            an = mx.symbol.sum(an, axis=1, keepdims=1)  # (T,1)
            triplet_loss = mx.symbol.Activation(
                data=(ap - an + config.triplet_alpha), act_type='relu')
            triplet_loss = mx.symbol.mean(triplet_loss)
        else:
            # Angular triplet loss: arccos of the cosine similarities.
            ap = anchor * positive
            an = anchor * negative
            ap = mx.symbol.sum(ap, axis=1, keepdims=1)  # (T,1)
            an = mx.symbol.sum(an, axis=1, keepdims=1)  # (T,1)
            ap = mx.sym.arccos(ap)
            an = mx.sym.arccos(an)
            triplet_loss = mx.symbol.Activation(
                data=(ap - an + config.triplet_alpha), act_type='relu')
            triplet_loss = mx.symbol.mean(triplet_loss)
        triplet_loss = mx.symbol.MakeLoss(triplet_loss)
    out_list = [mx.symbol.BlockGrad(embedding)]
    if is_softmax:
        softmax = mx.symbol.SoftmaxOutput(data=fc7,
                                          label=gt_label,
                                          name='softmax',
                                          normalization='valid')
        out_list.append(softmax)
        if config.ce_loss:
            # Cross-entropy monitoring value (gradient blocked).
            body = mx.symbol.SoftmaxActivation(data=fc7)
            body = mx.symbol.log(body)
            _label = mx.sym.one_hot(gt_label,
                                    depth=config.num_classes,
                                    on_value=-1.0,
                                    off_value=0.0)
            body = body * _label
            ce_loss = mx.symbol.sum(body) / args.per_batch_size
            out_list.append(mx.symbol.BlockGrad(ce_loss))
    else:
        out_list.append(mx.sym.BlockGrad(gt_label))
        out_list.append(triplet_loss)
    return mx.symbol.Group(out_list)
def train_net(args):
    """Train the face-recognition network selected by the global ``config``.

    Side effects: creates the model output directory under
    ``args.models_root``, loads ``train.rec`` from ``config.dataset_path``,
    runs verification periodically, saves checkpoints, and may call
    ``sys.exit`` once ``config.max_steps`` is exceeded.
    """
    # Fix: use os.environ.get with a default. The original indexed
    # os.environ['CUDA_VISIBLE_DEVICES'] directly, which raises KeyError when
    # the variable is unset and made the CPU fallback below unreachable.
    ctx = []
    cvd = os.environ.get('CUDA_VISIBLE_DEVICES', '').strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = os.path.join(args.models_root,
                          '%s-%s-%s' % (args.network, args.loss, args.dataset),
                          'model')
    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    args.ctx_num = len(ctx)
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = config.image_shape[2]
    config.batch_size = args.batch_size
    config.per_batch_size = args.per_batch_size

    data_dir = config.dataset_path
    image_size = config.image_shape[0:2]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    print('image_size', image_size)
    print('num_classes', config.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    print('Called with argument:', args, config)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym = get_symbol(args)
        if config.net_name == 'spherenet':
            # NOTE(review): `spherenet` is not imported in this module; this
            # branch raises NameError if ever taken — confirm intent.
            data_shape_dict = {'data': (args.per_batch_size, ) + data_shape}
            spherenet.init_weights(sym, data_shape_dict, args.num_layers)
    else:
        print('loading', args.pretrained, args.pretrained_epoch)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            args.pretrained, args.pretrained_epoch)
        sym = get_symbol(args)

    if config.count_flops:
        # Count FLOPs of the embedding sub-graph only (up to fc1).
        all_layers = sym.get_internals()
        _sym = all_layers['fc1_output']
        FLOPs = flops_counter.count_flops(_sym,
                                          data=(1, 3, image_size[0],
                                                image_size[1]))
        _str = flops_counter.flops_str(FLOPs)
        print('Network FLOPs: %s' % _str)

    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
    )
    val_dataiter = None

    if config.loss_name.find('triplet') >= 0:
        # Triplet-style losses need the special bag-sampling iterator that
        # mines triplets with the current model.
        from triplet_image_iter import FaceImageIter
        triplet_params = [
            config.triplet_bag_size, config.triplet_alpha,
            config.triplet_max_ap
        ]
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,
            mean=mean,
            cutoff=config.data_cutoff,
            ctx_num=args.ctx_num,
            images_per_identity=config.images_per_identity,
            triplet_params=triplet_params,
            mx_model=model,
        )
        _metric = LossValueMetric()
        eval_metrics = [mx.metric.create(_metric)]
    else:
        from image_iter import get_face_image_iter
        train_dataiter = get_face_image_iter(config, data_shape, path_imgrec)
        eval_metrics = [mx.metric.create(AccMetric())]
        if config.ce_loss:
            eval_metrics.append(mx.metric.create(LossValueMetric()))

    if config.net_name == 'fresnet' or config.net_name == 'fmobilefacenet':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  # resnet style
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    # Gradients are averaged over contexts.
    _rescale = 1.0 / args.ctx_num
    opt = optimizer.SGD(learning_rate=args.lr,
                        momentum=args.mom,
                        wd=args.wd,
                        rescale_grad=_rescale)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)

    # Load every verification .bin that exists alongside the training data.
    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        # Run every verification set; return the flip-augmented accuracies.
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    highest_acc = [0.0, 0.0]  # [sum-of-accuracies, last-target accuracy]
    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        global_step[0] += 1
        mbatch = global_step[0]
        # Step learning-rate schedule: multiply by 0.1 at each listed batch.
        for step in lr_steps:
            if mbatch == step:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if len(acc_list) > 0:
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        is_highest = True
                    else:
                        if score >= highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
            if is_highest:
                do_save = True
            # ckpt modes: 0 never save, 2 always save, 3 overwrite slot 1.
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt == 2:
                do_save = True
            elif args.ckpt == 3:
                msave = 1

            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                if config.ckpt_embedding:
                    # Strip the fc7 classification layer; persist only the
                    # embedding sub-graph up to fc1.
                    all_layers = model.symbol.get_internals()
                    _sym = all_layers['fc1_output']
                    _arg = {}
                    for k in arg:
                        if not k.startswith('fc7'):
                            _arg[k] = arg[k]
                    mx.model.save_checkpoint(prefix, msave, _sym, _arg, aux)
                else:
                    mx.model.save_checkpoint(prefix, msave, model.symbol, arg,
                                             aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if config.max_steps > 0 and mbatch > config.max_steps:
            sys.exit(0)

    epoch_cb = None

    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=999999,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore=args.kvstore,
        optimizer=opt,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
parse_args() + train_net(args) + + +if __name__ == '__main__': + main() diff --git a/insightface/recognition/arcface_mxnet/train_parall.py b/insightface/recognition/arcface_mxnet/train_parall.py new file mode 100644 index 0000000000000000000000000000000000000000..ce6b916fa84bfa7a8da92d5664fd8a88c6a089bc --- /dev/null +++ b/insightface/recognition/arcface_mxnet/train_parall.py @@ -0,0 +1,451 @@ +''' +@author: insightface +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import math +import random +import logging +import pickle +import sklearn +import numpy as np +#from image_iter import FaceImageIter +from image_iter import get_face_image_iter +import mxnet as mx +from mxnet import ndarray as nd +import argparse +import mxnet.optimizer as optimizer +sys.path.append(os.path.join(os.path.dirname(__file__), 'common')) +import flops_counter +from config import config, default, generate_config +import verification +sys.path.append(os.path.join(os.path.dirname(__file__), 'symbol')) +import fresnet +import fmobilefacenet +import fmobilenet +import fmnasnet +import fdensenet +import vargfacenet + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +args = None + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train parall face network') + # general + parser.add_argument('--dataset', + default=default.dataset, + help='dataset config') + parser.add_argument('--network', + default=default.network, + help='network config') + parser.add_argument('--loss', default=default.loss, help='loss config') + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset, args.loss) + parser.add_argument('--models-root', + default=default.models_root, + help='root directory to save model.') + parser.add_argument('--pretrained', + default=default.pretrained, + help='pretrained model to load') + parser.add_argument('--pretrained-epoch', + 
type=int, + default=default.pretrained_epoch, + help='pretrained epoch to load') + parser.add_argument( + '--ckpt', + type=int, + default=default.ckpt, + help= + 'checkpoint saving option. 0: discard saving. 1: save when necessary. 2: always save' + ) + parser.add_argument( + '--verbose', + type=int, + default=default.verbose, + help='do verification testing and model saving every verbose batches') + parser.add_argument('--lr', + type=float, + default=default.lr, + help='start learning rate') + parser.add_argument('--lr-steps', + type=str, + default=default.lr_steps, + help='steps of lr changing') + parser.add_argument('--wd', + type=float, + default=default.wd, + help='weight decay') + parser.add_argument('--mom', + type=float, + default=default.mom, + help='momentum') + parser.add_argument('--frequent', + type=int, + default=default.frequent, + help='') + parser.add_argument('--per-batch-size', + type=int, + default=default.per_batch_size, + help='batch size in each context') + parser.add_argument('--kvstore', + type=str, + default=default.kvstore, + help='kvstore setting') + parser.add_argument('--worker-id', + type=int, + default=0, + help='worker id for dist training, starts from 0') + parser.add_argument('--extra-model-name', + type=str, + default='', + help='extra model name') + parser.add_argument('--fp16-scale', type=float, default=0.0, help='') + args = parser.parse_args() + return args + + +def get_symbol_embedding(embedding=None): + if embedding is None: + embedding = eval(config.net_name).get_symbol() + all_label = mx.symbol.Variable('softmax_label') + #embedding = mx.symbol.BlockGrad(embedding) + all_label = mx.symbol.BlockGrad(all_label) + out_list = [embedding, all_label] + out = mx.symbol.Group(out_list) + return out + + +def get_symbol_arcface(args): + embedding = mx.symbol.Variable('data') + all_label = mx.symbol.Variable('softmax_label') + gt_label = all_label + is_softmax = True + #print('call get_sym_arcface with', args, config) + _weight = 
mx.symbol.Variable("fc7_%d_weight" % args._ctxid, + shape=(args.ctx_num_classes, config.emb_size), + lr_mult=config.fc7_lr_mult, + wd_mult=config.fc7_wd_mult) + if config.loss_name == 'softmax': #softmax + fc7 = mx.sym.FullyConnected(data=embedding, + weight=_weight, + no_bias=True, + num_hidden=args.ctx_num_classes, + name='fc7_%d' % args._ctxid) + elif config.loss_name == 'margin_softmax': + _weight = mx.symbol.L2Normalization(_weight, mode='instance') + nembedding = mx.symbol.L2Normalization(embedding, + mode='instance', + name='fc1n_%d' % args._ctxid) + fc7 = mx.sym.FullyConnected(data=nembedding, + weight=_weight, + no_bias=True, + num_hidden=args.ctx_num_classes, + name='fc7_%d' % args._ctxid) + if config.loss_m1 != 1.0 or config.loss_m2 != 0.0 or config.loss_m3 != 0.0: + gt_one_hot = mx.sym.one_hot(gt_label, + depth=args.ctx_num_classes, + on_value=1.0, + off_value=0.0) + if config.loss_m1 == 1.0 and config.loss_m2 == 0.0: + _one_hot = gt_one_hot * config.loss_m3 + fc7 = fc7 - _one_hot + else: + fc7_onehot = fc7 * gt_one_hot + cos_t = fc7_onehot + t = mx.sym.arccos(cos_t) + if config.loss_m1 != 1.0: + t = t * config.loss_m1 + if config.loss_m2 != 0.0: + t = t + config.loss_m2 + margin_cos = mx.sym.cos(t) + if config.loss_m3 != 0.0: + margin_cos = margin_cos - config.loss_m3 + margin_fc7 = margin_cos + margin_fc7_onehot = margin_fc7 * gt_one_hot + diff = margin_fc7_onehot - fc7_onehot + fc7 = fc7 + diff + fc7 = fc7 * config.loss_s + + out_list = [] + out_list.append(fc7) + if config.loss_name == 'softmax': #softmax + out_list.append(gt_label) + out = mx.symbol.Group(out_list) + return out + + +def train_net(args): + #_seed = 727 + #random.seed(_seed) + #np.random.seed(_seed) + #mx.random.seed(_seed) + config.fp16 = False + config.fp16_scale = 0.0 + if args.fp16_scale>0.0: + config.fp16 = True + config.fp16_scale = args.fp16_scale + print('use fp16, scale=', config.fp16_scale) + ctx = [] + cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip() + if len(cvd) > 0: + 
for i in range(len(cvd.split(','))): + ctx.append(mx.gpu(i)) + if len(ctx) == 0: + ctx = [mx.cpu()] + print('use cpu') + else: + print('gpu num:', len(ctx)) + if len(args.extra_model_name) == 0: + prefix = os.path.join( + args.models_root, + '%s-%s-%s' % (args.network, args.loss, args.dataset), 'model') + else: + prefix = os.path.join( + args.models_root, '%s-%s-%s-%s' % + (args.network, args.loss, args.dataset, args.extra_model_name), + 'model') + prefix_dir = os.path.dirname(prefix) + print('prefix', prefix) + if not os.path.exists(prefix_dir): + os.makedirs(prefix_dir) + args.ctx_num = len(ctx) + if args.per_batch_size == 0: + args.per_batch_size = 128 + args.batch_size = args.per_batch_size * args.ctx_num + args.rescale_threshold = 0 + args.image_channel = config.image_shape[2] + config.batch_size = args.batch_size + config.per_batch_size = args.per_batch_size + data_dir = config.dataset_path + path_imgrec = None + path_imglist = None + image_size = config.image_shape[0:2] + assert len(image_size) == 2 + assert image_size[0] == image_size[1] + print('image_size', image_size) + print('num_classes', config.num_classes) + path_imgrec = os.path.join(data_dir, "train.rec") + + data_shape = (args.image_channel, image_size[0], image_size[1]) + + num_workers = config.num_workers + global_num_ctx = num_workers * args.ctx_num + if config.num_classes % global_num_ctx == 0: + args.ctx_num_classes = config.num_classes // global_num_ctx + else: + args.ctx_num_classes = config.num_classes // global_num_ctx + 1 + args.local_num_classes = args.ctx_num_classes * args.ctx_num + args.local_class_start = args.local_num_classes * args.worker_id + + #if len(args.partial)==0: + # local_classes_range = (0, args.num_classes) + #else: + # _vec = args.partial.split(',') + # local_classes_range = (int(_vec[0]), int(_vec[1])) + + #args.partial_num_classes = local_classes_range[1] - local_classes_range[0] + #args.partial_start = local_classes_range[0] + + print('Called with argument:', args, 
config) + mean = None + + begin_epoch = 0 + base_lr = args.lr + base_wd = args.wd + base_mom = args.mom + arg_params = None + aux_params = None + if len(args.pretrained) == 0: + esym = get_symbol_embedding() + asym = get_symbol_arcface + else: + #assert False + print('loading', args.pretrained, args.pretrained_epoch) + pretrain_esym, arg_params, aux_params = mx.model.load_checkpoint( + args.pretrained, args.pretrained_epoch) + esym = get_symbol_embedding(pretrain_esym) + asym = get_symbol_arcface + + if config.count_flops: + all_layers = esym.get_internals() + _sym = all_layers['fc1_output'] + FLOPs = flops_counter.count_flops(_sym, + data=(1, 3, image_size[0], + image_size[1])) + _str = flops_counter.flops_str(FLOPs) + print('Network FLOPs: %s' % _str) + + if config.num_workers == 1: + from parall_module_local_v1 import ParallModule + else: + from parall_module_dist import ParallModule + + model = ParallModule( + context=ctx, + symbol=esym, + data_names=['data'], + label_names=['softmax_label'], + asymbol=asym, + args=args, + ) + val_dataiter = None + + if config.net_name == 'fresnet' or config.net_name == 'fmobilefacenet': + initializer = mx.init.Xavier(rnd_type='gaussian', + factor_type="out", + magnitude=2) #resnet style + else: + initializer = mx.init.Xavier(rnd_type='uniform', + factor_type="in", + magnitude=2) + + _rescale = 1.0 / args.batch_size + if config.fp16: + opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale, multi_precision=True) + else: + opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale, multi_precision=False) + opt_fc7 = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale, multi_precision=False) + _cb = mx.callback.Speedometer(args.batch_size, args.frequent) + + ver_list = [] + ver_name_list = [] + for name in config.val_targets: + path = os.path.join(data_dir, name + ".bin") + if os.path.exists(path): + data_set = 
verification.load_bin(path, image_size) + ver_list.append(data_set) + ver_name_list.append(name) + print('ver', name) + + def ver_test(nbatch): + results = [] + for i in range(len(ver_list)): + acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test( + ver_list[i], model, args.batch_size, 10, None, None) + print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm)) + #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1)) + print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % + (ver_name_list[i], nbatch, acc2, std2)) + results.append(acc2) + return results + + highest_acc = [0.0, 0.0] #lfw and target + #for i in range(len(ver_list)): + # highest_acc.append(0.0) + global_step = [0] + save_step = [0] + lr_steps = [int(x) for x in args.lr_steps.split(',')] + print('lr_steps', lr_steps) + + def _batch_callback(param): + #global global_step + global_step[0] += 1 + mbatch = global_step[0] + for step in lr_steps: + if mbatch == step: + opt.lr *= 0.1 + opt_fc7.lr *= 0.1 + print('lr change to', opt.lr, opt_fc7.lr) + break + + _cb(param) + if mbatch % 1000 == 0: + print('lr-batch-epoch:', opt.lr, opt_fc7.lr, param.nbatch, param.epoch) + + if mbatch >= 0 and mbatch % args.verbose == 0: + acc_list = ver_test(mbatch) + save_step[0] += 1 + msave = save_step[0] + do_save = False + is_highest = False + if len(acc_list) > 0: + #lfw_score = acc_list[0] + #if lfw_score>highest_acc[0]: + # highest_acc[0] = lfw_score + # if lfw_score>=0.998: + # do_save = True + score = sum(acc_list) + if acc_list[-1] >= highest_acc[-1]: + if acc_list[-1] > highest_acc[-1]: + is_highest = True + else: + if score >= highest_acc[0]: + is_highest = True + highest_acc[0] = score + highest_acc[-1] = acc_list[-1] + #if lfw_score>=0.99: + # do_save = True + if is_highest: + do_save = True + if args.ckpt == 0: + do_save = False + elif args.ckpt == 2: + do_save = True + elif args.ckpt == 3: + msave = 1 + + if do_save: + print('saving', msave) + arg, aux = 
model.get_export_params() + all_layers = model.symbol.get_internals() + _sym = all_layers['fc1_output'] + mx.model.save_checkpoint(prefix, msave, _sym, arg, aux) + print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1])) + if config.max_steps > 0 and mbatch > config.max_steps: + msave = 0 + config.fp16 = False + print('saving last', msave) + arg, aux = model.get_export_params() + _sym = eval(config.net_name).get_symbol() + mx.model.save_checkpoint(prefix, msave, _sym, arg, aux) + sys.exit(0) + + epoch_cb = None + train_dataiter = get_face_image_iter(config, data_shape, path_imgrec) + #train_dataiter = FaceImageIter( + # batch_size=args.batch_size, + # data_shape=data_shape, + # path_imgrec=path_imgrec, + # shuffle=True, + # rand_mirror=config.data_rand_mirror, + # mean=mean, + # cutoff=config.data_cutoff, + # color_jittering=config.data_color, + # images_filter=config.data_images_filter, + #) + #train_dataiter = mx.io.PrefetchingIter(train_dataiter) + + model.fit( + train_dataiter, + begin_epoch=begin_epoch, + num_epoch=999999, + eval_data=val_dataiter, + #eval_metric = eval_metrics, + kvstore=args.kvstore, + optimizer=[opt, opt_fc7], + #optimizer_params = optimizer_params, + initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + allow_missing=True, + batch_end_callback=_batch_callback, + epoch_end_callback=epoch_cb) + + +def main(): + global args + args = parse_args() + train_net(args) + + +if __name__ == '__main__': + main() diff --git a/insightface/recognition/arcface_mxnet/triplet_image_iter.py b/insightface/recognition/arcface_mxnet/triplet_image_iter.py new file mode 100644 index 0000000000000000000000000000000000000000..1df39acf8ba898eb6360613e815435906cf23221 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/triplet_image_iter.py @@ -0,0 +1,628 @@ +# THIS FILE IS FOR EXPERIMENTS, USE image_iter.py FOR NORMAL IMAGE LOADING. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random +import logging +import sys +import numbers +import math +import sklearn +import datetime +import numpy as np +import cv2 + +import mxnet as mx +from mxnet import ndarray as nd +#from . import _ndarray_internal as _internal +#from mxnet._ndarray_internal import _cvimresize as imresize +#from ._ndarray_internal import _cvcopyMakeBorder as copyMakeBorder +from mxnet import io +from mxnet import recordio +sys.path.append(os.path.join(os.path.dirname(__file__), 'common')) +import face_preprocess + +logger = logging.getLogger() + + +class FaceImageIter(io.DataIter): + def __init__(self, + batch_size, + data_shape, + path_imgrec=None, + shuffle=False, + aug_list=None, + rand_mirror=False, + cutoff=0, + ctx_num=0, + images_per_identity=0, + triplet_params=None, + mx_model=None, + data_name='data', + label_name='softmax_label', + **kwargs): + super(FaceImageIter, self).__init__() + assert path_imgrec + assert shuffle + logging.info('loading recordio %s...', path_imgrec) + path_imgidx = path_imgrec[0:-4] + ".idx" + self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') # pylint: disable=redefined-variable-type + s = self.imgrec.read_idx(0) + header, _ = recordio.unpack(s) + assert header.flag > 0 + print('header0 label', header.label) + self.header0 = (int(header.label[0]), int(header.label[1])) + #assert(header.flag==1) + self.imgidx = range(1, int(header.label[0])) + self.id2range = {} + self.seq_identity = range(int(header.label[0]), int(header.label[1])) + for identity in self.seq_identity: + s = self.imgrec.read_idx(identity) + header, _ = recordio.unpack(s) + a, b = int(header.label[0]), int(header.label[1]) + self.id2range[identity] = (a, b) + + print('id2range', len(self.id2range)) + self.seq = self.imgidx + print(len(self.seq)) + + self.check_data_shape(data_shape) + self.provide_data = [(data_name, 
(batch_size, ) + data_shape)] + self.batch_size = batch_size + self.data_shape = data_shape + self.shuffle = shuffle + self.image_size = '%d,%d' % (data_shape[1], data_shape[2]) + self.rand_mirror = rand_mirror + print('rand_mirror', rand_mirror) + self.cutoff = cutoff + #self.cast_aug = mx.image.CastAug() + #self.color_aug = mx.image.ColorJitterAug(0.4, 0.4, 0.4) + self.ctx_num = ctx_num + self.per_batch_size = int(self.batch_size / self.ctx_num) + self.images_per_identity = images_per_identity + if self.images_per_identity > 0: + self.identities = int(self.per_batch_size / + self.images_per_identity) + self.per_identities = self.identities + self.repeat = 3000000.0 / (self.images_per_identity * + len(self.id2range)) + self.repeat = int(self.repeat) + print(self.images_per_identity, self.identities, self.repeat) + self.mx_model = mx_model + self.triplet_params = triplet_params + self.triplet_mode = False + #self.provide_label = None + self.provide_label = [(label_name, (batch_size, ))] + if self.triplet_params is not None: + assert self.images_per_identity > 0 + assert self.mx_model is not None + self.triplet_bag_size = self.triplet_params[0] + self.triplet_alpha = self.triplet_params[1] + self.triplet_max_ap = self.triplet_params[2] + assert self.triplet_bag_size > 0 + assert self.triplet_alpha >= 0.0 + assert self.triplet_alpha <= 1.0 + self.triplet_mode = True + self.triplet_cur = 0 + self.triplet_seq = [] + self.triplet_reset() + self.seq_min_size = self.batch_size * 2 + self.cur = 0 + self.nbatch = 0 + self.is_init = False + self.times = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + #self.reset() + + def pairwise_dists(self, embeddings): + nd_embedding_list = [] + for i in range(self.ctx_num): + nd_embedding = mx.nd.array(embeddings, mx.gpu(i)) + nd_embedding_list.append(nd_embedding) + nd_pdists = [] + pdists = [] + for idx in range(embeddings.shape[0]): + emb_idx = idx % self.ctx_num + nd_embedding = nd_embedding_list[emb_idx] + a_embedding = nd_embedding[idx] + body = 
mx.nd.broadcast_sub(a_embedding, nd_embedding) + body = body * body + body = mx.nd.sum_axis(body, axis=1) + nd_pdists.append(body) + if len(nd_pdists + ) == self.ctx_num or idx == embeddings.shape[0] - 1: + for x in nd_pdists: + pdists.append(x.asnumpy()) + nd_pdists = [] + return pdists + + def pick_triplets(self, embeddings, nrof_images_per_class): + emb_start_idx = 0 + triplets = [] + people_per_batch = len(nrof_images_per_class) + #self.time_reset() + pdists = self.pairwise_dists(embeddings) + #self.times[3] += self.time_elapsed() + + for i in range(people_per_batch): + nrof_images = int(nrof_images_per_class[i]) + for j in range(1, nrof_images): + #self.time_reset() + a_idx = emb_start_idx + j - 1 + #neg_dists_sqr = np.sum(np.square(embeddings[a_idx] - embeddings), 1) + neg_dists_sqr = pdists[a_idx] + #self.times[3] += self.time_elapsed() + + for pair in range( + j, nrof_images): # For every possible positive pair. + p_idx = emb_start_idx + pair + #self.time_reset() + pos_dist_sqr = np.sum( + np.square(embeddings[a_idx] - embeddings[p_idx])) + #self.times[4] += self.time_elapsed() + #self.time_reset() + neg_dists_sqr[emb_start_idx:emb_start_idx + + nrof_images] = np.NaN + if self.triplet_max_ap > 0.0: + if pos_dist_sqr > self.triplet_max_ap: + continue + all_neg = np.where( + np.logical_and( + neg_dists_sqr - pos_dist_sqr < self.triplet_alpha, + pos_dist_sqr < + neg_dists_sqr))[0] # FaceNet selection + #self.times[5] += self.time_elapsed() + #self.time_reset() + #all_neg = np.where(neg_dists_sqr-pos_dist_sqr 0: + rnd_idx = np.random.randint(nrof_random_negs) + n_idx = all_neg[rnd_idx] + triplets.append((a_idx, p_idx, n_idx)) + emb_start_idx += nrof_images + np.random.shuffle(triplets) + return triplets + + def triplet_reset(self): + #reset self.oseq by identities seq + self.triplet_cur = 0 + ids = [] + for k in self.id2range: + ids.append(k) + random.shuffle(ids) + self.triplet_seq = [] + for _id in ids: + v = self.id2range[_id] + _list = range(*v) + 
random.shuffle(_list) + if len(_list) > self.images_per_identity: + _list = _list[0:self.images_per_identity] + self.triplet_seq += _list + print('triplet_seq', len(self.triplet_seq)) + assert len(self.triplet_seq) >= self.triplet_bag_size + + def time_reset(self): + self.time_now = datetime.datetime.now() + + def time_elapsed(self): + time_now = datetime.datetime.now() + diff = time_now - self.time_now + return diff.total_seconds() + + def select_triplets(self): + self.seq = [] + while len(self.seq) < self.seq_min_size: + self.time_reset() + embeddings = None + bag_size = self.triplet_bag_size + batch_size = self.batch_size + #data = np.zeros( (bag_size,)+self.data_shape ) + #label = np.zeros( (bag_size,) ) + tag = [] + #idx = np.zeros( (bag_size,) ) + print('eval %d images..' % bag_size, self.triplet_cur) + print('triplet time stat', self.times) + if self.triplet_cur + bag_size > len(self.triplet_seq): + self.triplet_reset() + #bag_size = min(bag_size, len(self.triplet_seq)) + print('eval %d images..' 
% bag_size, self.triplet_cur) + self.times[0] += self.time_elapsed() + self.time_reset() + #print(data.shape) + data = nd.zeros(self.provide_data[0][1]) + label = None + if self.provide_label is not None: + label = nd.zeros(self.provide_label[0][1]) + ba = 0 + while True: + bb = min(ba + batch_size, bag_size) + if ba >= bb: + break + _count = bb - ba + #data = nd.zeros( (_count,)+self.data_shape ) + #_batch = self.data_iter.next() + #_data = _batch.data[0].asnumpy() + #print(_data.shape) + #_label = _batch.label[0].asnumpy() + #data[ba:bb,:,:,:] = _data + #label[ba:bb] = _label + for i in range(ba, bb): + #print(ba, bb, self.triplet_cur, i, len(self.triplet_seq)) + _idx = self.triplet_seq[i + self.triplet_cur] + s = self.imgrec.read_idx(_idx) + header, img = recordio.unpack(s) + img = self.imdecode(img) + data[i - ba][:] = self.postprocess_data(img) + _label = header.label + if not isinstance(_label, numbers.Number): + _label = _label[0] + if label is not None: + label[i - ba][:] = _label + tag.append((int(_label), _idx)) + #idx[i] = _idx + + db = mx.io.DataBatch(data=(data, )) + self.mx_model.forward(db, is_train=False) + net_out = self.mx_model.get_outputs() + #print('eval for selecting triplets',ba,bb) + #print(net_out) + #print(len(net_out)) + #print(net_out[0].asnumpy()) + net_out = net_out[0].asnumpy() + #print(net_out) + #print('net_out', net_out.shape) + if embeddings is None: + embeddings = np.zeros((bag_size, net_out.shape[1])) + embeddings[ba:bb, :] = net_out + ba = bb + assert len(tag) == bag_size + self.triplet_cur += bag_size + embeddings = sklearn.preprocessing.normalize(embeddings) + self.times[1] += self.time_elapsed() + self.time_reset() + nrof_images_per_class = [1] + for i in range(1, bag_size): + if tag[i][0] == tag[i - 1][0]: + nrof_images_per_class[-1] += 1 + else: + nrof_images_per_class.append(1) + + triplets = self.pick_triplets(embeddings, + nrof_images_per_class) # shape=(T,3) + print('found triplets', len(triplets)) + ba = 0 + while 
True: + bb = ba + self.per_batch_size // 3 + if bb > len(triplets): + break + _triplets = triplets[ba:bb] + for i in range(3): + for triplet in _triplets: + _pos = triplet[i] + _idx = tag[_pos][1] + self.seq.append(_idx) + ba = bb + self.times[2] += self.time_elapsed() + + def hard_mining_reset(self): + #import faiss + from annoy import AnnoyIndex + data = nd.zeros(self.provide_data[0][1]) + label = nd.zeros(self.provide_label[0][1]) + #label = np.zeros( self.provide_label[0][1] ) + X = None + ba = 0 + batch_num = 0 + while ba < len(self.oseq): + batch_num += 1 + if batch_num % 10 == 0: + print('loading batch', batch_num, ba) + bb = min(ba + self.batch_size, len(self.oseq)) + _count = bb - ba + for i in range(_count): + idx = self.oseq[i + ba] + s = self.imgrec.read_idx(idx) + header, img = recordio.unpack(s) + img = self.imdecode(img) + data[i][:] = self.postprocess_data(img) + label[i][:] = header.label + db = mx.io.DataBatch(data=(data, self.data_extra), label=(label, )) + self.mx_model.forward(db, is_train=False) + net_out = self.mx_model.get_outputs() + embedding = net_out[0].asnumpy() + nembedding = sklearn.preprocessing.normalize(embedding) + if _count < self.batch_size: + nembedding = nembedding[0:_count, :] + if X is None: + X = np.zeros((len(self.id2range), nembedding.shape[1]), + dtype=np.float32) + nplabel = label.asnumpy() + for i in range(_count): + ilabel = int(nplabel[i]) + #print(ilabel, ilabel.__class__) + X[ilabel] += nembedding[i] + ba = bb + X = sklearn.preprocessing.normalize(X) + d = X.shape[1] + t = AnnoyIndex(d, metric='euclidean') + for i in range(X.shape[0]): + t.add_item(i, X[i]) + print('start to build index') + t.build(20) + print(X.shape) + k = self.per_identities + self.seq = [] + for i in range(X.shape[0]): + nnlist = t.get_nns_by_item(i, k) + assert nnlist[0] == i + for _label in nnlist: + assert _label < len(self.id2range) + _id = self.header0[0] + _label + v = self.id2range[_id] + _list = range(*v) + if len(_list) < 
self.images_per_identity: + random.shuffle(_list) + else: + _list = np.random.choice(_list, + self.images_per_identity, + replace=False) + for i in range(self.images_per_identity): + _idx = _list[i % len(_list)] + self.seq.append(_idx) + #faiss_params = [20,5] + #quantizer = faiss.IndexFlatL2(d) # the other index + #index = faiss.IndexIVFFlat(quantizer, d, faiss_params[0], faiss.METRIC_L2) + #assert not index.is_trained + #index.train(X) + #index.add(X) + #assert index.is_trained + #print('trained') + #index.nprobe = faiss_params[1] + #D, I = index.search(X, k) # actual search + #print(I.shape) + #self.seq = [] + #for i in range(I.shape[0]): + # #assert I[i][0]==i + # for j in range(k): + # _label = I[i][j] + # assert _label 0: + if self.triplet_mode: + self.select_triplets() + elif not self.hard_mining: + self.seq = [] + idlist = [] + for _id in self.id2range: + v = self.id2range[_id] + idlist.append((_id, range(*v))) + for r in range(self.repeat): + if r % 10 == 0: + print('repeat', r) + if self.shuffle: + random.shuffle(idlist) + for item in idlist: + _id = item[0] + _list = item[1] + #random.shuffle(_list) + if len(_list) < self.images_per_identity: + random.shuffle(_list) + else: + _list = np.random.choice(_list, + self.images_per_identity, + replace=False) + for i in range(self.images_per_identity): + _idx = _list[i % len(_list)] + self.seq.append(_idx) + else: + self.hard_mining_reset() + print('seq len', len(self.seq)) + else: + if self.shuffle: + random.shuffle(self.seq) + if self.seq is None and self.imgrec is not None: + self.imgrec.reset() + + def num_samples(self): + return len(self.seq) + + def next_sample(self): + while True: + if self.cur >= len(self.seq): + raise StopIteration + idx = self.seq[self.cur] + self.cur += 1 + s = self.imgrec.read_idx(idx) + header, img = recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + return label, img, None, None + + def brightness_aug(self, src, x): + alpha = 
1.0 + random.uniform(-x, x) + src *= alpha + return src + + def contrast_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = np.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = (3.0 * (1.0 - alpha) / gray.size) * np.sum(gray) + src *= alpha + src += gray + return src + + def saturation_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = np.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = np.sum(gray, axis=2, keepdims=True) + gray *= (1.0 - alpha) + src *= alpha + src += gray + return src + + def color_aug(self, img, x): + augs = [self.brightness_aug, self.contrast_aug, self.saturation_aug] + random.shuffle(augs) + for aug in augs: + #print(img.shape) + img = aug(img, x) + #print(img.shape) + return img + + def mirror_aug(self, img): + _rd = random.randint(0, 1) + if _rd == 1: + for c in range(img.shape[2]): + img[:, :, c] = np.fliplr(img[:, :, c]) + return img + + def next(self): + if not self.is_init: + self.reset() + self.is_init = True + """Returns the next batch of data.""" + #print('in next', self.cur, self.labelcur) + self.nbatch += 1 + batch_size = self.batch_size + c, h, w = self.data_shape + batch_data = nd.empty((batch_size, c, h, w)) + if self.provide_label is not None: + batch_label = nd.empty(self.provide_label[0][1]) + i = 0 + try: + while i < batch_size: + label, s, bbox, landmark = self.next_sample() + _data = self.imdecode(s) + if self.rand_mirror: + _rd = random.randint(0, 1) + if _rd == 1: + _data = mx.ndarray.flip(data=_data, axis=1) + if self.cutoff > 0: + centerh = random.randint(0, _data.shape[0] - 1) + centerw = random.randint(0, _data.shape[1] - 1) + half = self.cutoff // 2 + starth = max(0, centerh - half) + endh = min(_data.shape[0], centerh + half) + startw = max(0, centerw - half) + endw = min(_data.shape[1], centerw + half) + _data = _data.astype('float32') + #print(starth, endh, startw, endw, _data.shape) + _data[starth:endh, startw:endw, :] = 127.5 + #_npdata = _data.asnumpy() + 
#if landmark is not None: + # _npdata = face_preprocess.preprocess(_npdata, bbox = bbox, landmark=landmark, image_size=self.image_size) + #if self.rand_mirror: + # _npdata = self.mirror_aug(_npdata) + #if self.mean is not None: + # _npdata = _npdata.astype(np.float32) + # _npdata -= self.mean + # _npdata *= 0.0078125 + #nimg = np.zeros(_npdata.shape, dtype=np.float32) + #nimg[self.patch[1]:self.patch[3],self.patch[0]:self.patch[2],:] = _npdata[self.patch[1]:self.patch[3], self.patch[0]:self.patch[2], :] + #_data = mx.nd.array(nimg) + data = [_data] + try: + self.check_valid_image(data) + except RuntimeError as e: + logging.debug('Invalid image, skipping: %s', str(e)) + continue + #print('aa',data[0].shape) + #data = self.augmentation_transform(data) + #print('bb',data[0].shape) + for datum in data: + assert i < batch_size, 'Batch size must be multiples of augmenter output length' + #print(datum.shape) + batch_data[i][:] = self.postprocess_data(datum) + if self.provide_label is not None: + batch_label[i][:] = label + i += 1 + except StopIteration: + if i < batch_size: + raise StopIteration + + #print('next end', batch_size, i) + _label = None + if self.provide_label is not None: + _label = [batch_label] + return io.DataBatch([batch_data], _label, batch_size - i) + + def check_data_shape(self, data_shape): + """Checks if the input data shape is valid""" + if not len(data_shape) == 3: + raise ValueError( + 'data_shape should have length 3, with dimensions CxHxW') + if not data_shape[0] == 3: + raise ValueError( + 'This iterator expects inputs to have 3 channels.') + + def check_valid_image(self, data): + """Checks if the input data is valid""" + if len(data[0].shape) == 0: + raise RuntimeError('Data shape is wrong') + + def imdecode(self, s): + """Decodes a string or byte string to an NDArray. 
+ See mx.img.imdecode for more details.""" + img = mx.image.imdecode(s) #mx.ndarray + return img + + def read_image(self, fname): + """Reads an input image `fname` and returns the decoded raw bytes. + + Example usage: + ---------- + >>> dataIter.read_image('Face.jpg') # returns decoded raw bytes. + """ + with open(os.path.join(self.path_root, fname), 'rb') as fin: + img = fin.read() + return img + + def augmentation_transform(self, data): + """Transforms input data with specified augmentation.""" + for aug in self.auglist: + data = [ret for src in data for ret in aug(src)] + return data + + def postprocess_data(self, datum): + """Final postprocessing step before image is loaded into the batch.""" + return nd.transpose(datum, axes=(2, 0, 1)) + + +class FaceImageIterList(io.DataIter): + def __init__(self, iter_list): + assert len(iter_list) > 0 + self.provide_data = iter_list[0].provide_data + self.provide_label = iter_list[0].provide_label + self.iter_list = iter_list + self.cur_iter = None + + def reset(self): + self.cur_iter.reset() + + def next(self): + self.cur_iter = random.choice(self.iter_list) + while True: + try: + ret = self.cur_iter.next() + except StopIteration: + self.cur_iter.reset() + continue + return ret diff --git a/insightface/recognition/arcface_mxnet/verification.py b/insightface/recognition/arcface_mxnet/verification.py new file mode 100644 index 0000000000000000000000000000000000000000..b8f90a64300d309fbfd664e115d1dafe592b9693 --- /dev/null +++ b/insightface/recognition/arcface_mxnet/verification.py @@ -0,0 +1,680 @@ +"""Helper for evaluation on the Labeled Faces in the Wild dataset +""" + +# MIT License +# +# Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import argparse +import sys +import numpy as np +from scipy import misc +from sklearn.model_selection import KFold +from scipy import interpolate +import sklearn +import cv2 +import math +import datetime +import pickle +from sklearn.decomposition import PCA +import mxnet as mx +from mxnet import ndarray as nd + + +class LFold: + def __init__(self, n_splits=2, shuffle=False): + self.n_splits = n_splits + if self.n_splits > 1: + self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle) + + def split(self, indices): + if self.n_splits > 1: + return self.k_fold.split(indices) + else: + return [(indices, indices)] + + +def calculate_roc(thresholds, + embeddings1, + embeddings2, + actual_issame, + nrof_folds=10, + pca=0): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + 
accuracy = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + #print('pca', pca) + + if pca == 0: + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + #print('train_set', train_set) + #print('test_set', test_set) + if pca > 0: + print('doing pca on', fold_idx) + embed1_train = embeddings1[train_set] + embed2_train = embeddings2[train_set] + _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) + #print(_embed_train.shape) + pca_model = PCA(n_components=pca) + pca_model.fit(_embed_train) + embed1 = pca_model.transform(embeddings1) + embed2 = pca_model.transform(embeddings2) + embed1 = sklearn.preprocessing.normalize(embed1) + embed2 = sklearn.preprocessing.normalize(embed2) + #print(embed1.shape, embed2.shape) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, dist[train_set], actual_issame[train_set]) + best_threshold_index = np.argmax(acc_train) + #print('threshold', thresholds[best_threshold_index]) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, + threshold_idx], fprs[fold_idx, + threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], + actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + return tpr, fpr, accuracy + + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum( + np.logical_and(np.logical_not(predict_issame), + 
np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) + fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn) + acc = float(tp + tn) / dist.size + return tpr, fpr, acc + + +def calculate_val(thresholds, + embeddings1, + embeddings2, + actual_issame, + far_target, + nrof_folds=10): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + val = np.zeros(nrof_folds) + far = np.zeros(nrof_folds) + + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the threshold that gives FAR = far_target + far_train = np.zeros(nrof_thresholds) + for threshold_idx, threshold in enumerate(thresholds): + _, far_train[threshold_idx] = calculate_val_far( + threshold, dist[train_set], actual_issame[train_set]) + if np.max(far_train) >= far_target: + f = interpolate.interp1d(far_train, thresholds, kind='slinear') + threshold = f(far_target) + else: + threshold = 0.0 + + val[fold_idx], far[fold_idx] = calculate_val_far( + threshold, dist[test_set], actual_issame[test_set]) + + val_mean = np.mean(val) + far_mean = np.mean(far) + val_std = np.std(val) + return val_mean, val_std, far_mean + + +def calculate_val_far(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) + false_accept = np.sum( + np.logical_and(predict_issame, np.logical_not(actual_issame))) + n_same = np.sum(actual_issame) + n_diff = np.sum(np.logical_not(actual_issame)) + #print(true_accept, false_accept) + #print(n_same, n_diff) + val = float(true_accept) / 
float(n_same) + far = float(false_accept) / float(n_diff) + return val, far + + +def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + tpr, fpr, accuracy = calculate_roc(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + nrof_folds=nrof_folds, + pca=pca) + thresholds = np.arange(0, 4, 0.001) + val, val_std, far = calculate_val(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + 1e-3, + nrof_folds=nrof_folds) + return tpr, fpr, accuracy, val, val_std, far + + +def load_bin(path, image_size): + try: + with open(path, 'rb') as f: + bins, issame_list = pickle.load(f) #py2 + except UnicodeDecodeError as e: + with open(path, 'rb') as f: + bins, issame_list = pickle.load(f, encoding='bytes') #py3 + data_list = [] + for flip in [0, 1]: + data = nd.empty( + (len(issame_list) * 2, 3, image_size[0], image_size[1])) + data_list.append(data) + for i in range(len(issame_list) * 2): + _bin = bins[i] + img = mx.image.imdecode(_bin) + if img.shape[1] != image_size[0]: + img = mx.image.resize_short(img, image_size[0]) + img = nd.transpose(img, axes=(2, 0, 1)) + for flip in [0, 1]: + if flip == 1: + img = mx.ndarray.flip(data=img, axis=2) + data_list[flip][i][:] = img + if i % 1000 == 0: + print('loading bin', i) + print(data_list[0].shape) + return (data_list, issame_list) + + +def test(data_set, + mx_model, + batch_size, + nfolds=10, + data_extra=None, + label_shape=None): + print('testing verification..') + data_list = data_set[0] + issame_list = data_set[1] + model = mx_model + embeddings_list = [] + if data_extra is not None: + _data_extra = nd.array(data_extra) + time_consumed = 0.0 + if label_shape is None: + _label = nd.ones((batch_size, )) + else: + _label = nd.ones(label_shape) + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < 
data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + #print(_data.shape, _label.shape) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data, ), label=(_label, )) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), + label=(_label, )) + model.forward(db, is_train=False) + net_out = model.get_outputs() + #_arg, _aux = model.get_params() + #__arg = {} + #for k,v in _arg.iteritems(): + # __arg[k] = v.as_in_context(_ctx) + #_arg = __arg + #_arg["data"] = _data.as_in_context(_ctx) + #_arg["softmax_label"] = _label.as_in_context(_ctx) + #for k,v in _arg.iteritems(): + # print(k,v.context) + #exe = sym.bind(_ctx, _arg ,args_grad=None, grad_req="null", aux_states=_aux) + #exe.forward(is_train=False) + #net_out = exe.outputs + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + #print(_embeddings.shape) + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + + _xnorm = 0.0 + _xnorm_cnt = 0 + for embed in embeddings_list: + for i in range(embed.shape[0]): + _em = embed[i] + _norm = np.linalg.norm(_em) + #print(_em.shape, _norm) + _xnorm += _norm + _xnorm_cnt += 1 + _xnorm /= _xnorm_cnt + + embeddings = embeddings_list[0].copy() + embeddings = sklearn.preprocessing.normalize(embeddings) + acc1 = 0.0 + std1 = 0.0 + #_, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=10) + #acc1, std1 = np.mean(accuracy), np.std(accuracy) + + #print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far)) + #embeddings = np.concatenate(embeddings_list, axis=1) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + 
print(embeddings.shape) + print('infer time', time_consumed) + _, _, accuracy, val, val_std, far = evaluate(embeddings, + issame_list, + nrof_folds=nfolds) + acc2, std2 = np.mean(accuracy), np.std(accuracy) + return acc1, std1, acc2, std2, _xnorm, embeddings_list + + +def test_badcase(data_set, + mx_model, + batch_size, + name='', + data_extra=None, + label_shape=None): + print('testing verification badcase..') + data_list = data_set[0] + issame_list = data_set[1] + model = mx_model + embeddings_list = [] + if data_extra is not None: + _data_extra = nd.array(data_extra) + time_consumed = 0.0 + if label_shape is None: + _label = nd.ones((batch_size, )) + else: + _label = nd.ones(label_shape) + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + #print(_data.shape, _label.shape) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data, ), label=(_label, )) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), + label=(_label, )) + model.forward(db, is_train=False) + net_out = model.get_outputs() + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + thresholds = np.arange(0, 4, 0.01) + actual_issame = np.asarray(issame_list) + nrof_folds = 10 + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = 
min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + data = data_list[0] + + pouts = [] + nouts = [] + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + #print(train_set) + #print(train_set.__class__) + for threshold_idx, threshold in enumerate(thresholds): + p2 = dist[train_set] + p3 = actual_issame[train_set] + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, p2, p3) + best_threshold_index = np.argmax(acc_train) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, + threshold_idx], fprs[fold_idx, + threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], + actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + best_threshold = thresholds[best_threshold_index] + for iid in test_set: + ida = iid * 2 + idb = ida + 1 + asame = actual_issame[iid] + _dist = dist[iid] + violate = _dist - best_threshold + if not asame: + violate *= -1.0 + if violate > 0.0: + imga = data[ida].asnumpy().transpose( + (1, 2, 0))[..., ::-1] #to bgr + imgb = data[idb].asnumpy().transpose((1, 2, 0))[..., ::-1] + #print(imga.shape, imgb.shape, violate, asame, _dist) + if asame: + pouts.append((imga, imgb, _dist, best_threshold, ida)) + else: + nouts.append((imga, imgb, _dist, best_threshold, ida)) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + acc = np.mean(accuracy) + pouts = sorted(pouts, key=lambda x: x[2], reverse=True) + nouts = sorted(nouts, key=lambda x: x[2], reverse=False) + 
print(len(pouts), len(nouts)) + print('acc', acc) + gap = 10 + image_shape = (112, 224, 3) + out_dir = "./badcases" + if not os.path.exists(out_dir): + os.makedirs(out_dir) + if len(nouts) > 0: + threshold = nouts[0][3] + else: + threshold = pouts[-1][3] + + for item in [(pouts, 'positive(false_negative).png'), + (nouts, 'negative(false_positive).png')]: + cols = 4 + rows = 8000 + outs = item[0] + if len(outs) == 0: + continue + #if len(outs)==9: + # cols = 3 + # rows = 3 + + _rows = int(math.ceil(len(outs) / cols)) + rows = min(rows, _rows) + hack = {} + + if name.startswith('cfp') and item[1].startswith('pos'): + hack = { + 0: 'manual/238_13.jpg.jpg', + 6: 'manual/088_14.jpg.jpg', + 10: 'manual/470_14.jpg.jpg', + 25: 'manual/238_13.jpg.jpg', + 28: 'manual/143_11.jpg.jpg' + } + + filename = item[1] + if len(name) > 0: + filename = name + "_" + filename + filename = os.path.join(out_dir, filename) + img = np.zeros((image_shape[0] * rows + 20, image_shape[1] * cols + + (cols - 1) * gap, 3), + dtype=np.uint8) + img[:, :, :] = 255 + text_color = (0, 0, 153) + text_color = (255, 178, 102) + text_color = (153, 255, 51) + for outi, out in enumerate(outs): + row = outi // cols + col = outi % cols + if row == rows: + break + imga = out[0].copy() + imgb = out[1].copy() + if outi in hack: + idx = out[4] + print('noise idx', idx) + aa = hack[outi] + imgb = cv2.imread(aa) + #if aa==1: + # imgb = cv2.transpose(imgb) + # imgb = cv2.flip(imgb, 1) + #elif aa==3: + # imgb = cv2.transpose(imgb) + # imgb = cv2.flip(imgb, 0) + #else: + # for ii in range(2): + # imgb = cv2.transpose(imgb) + # imgb = cv2.flip(imgb, 1) + dist = out[2] + _img = np.concatenate((imga, imgb), axis=1) + k = "%.3f" % dist + #print(k) + font = cv2.FONT_HERSHEY_SIMPLEX + cv2.putText(_img, k, (80, image_shape[0] // 2 + 7), font, 0.6, + text_color, 2) + #_filename = filename+"_%d.png"%outi + #cv2.imwrite(_filename, _img) + img[row * image_shape[0]:(row + 1) * image_shape[0], + (col * image_shape[1] + + gap * 
col):((col + 1) * image_shape[1] + gap * col), :] = _img + #threshold = outs[0][3] + font = cv2.FONT_HERSHEY_SIMPLEX + k = "threshold: %.3f" % threshold + cv2.putText(img, k, (img.shape[1] // 2 - 70, img.shape[0] - 5), font, + 0.6, text_color, 2) + cv2.imwrite(filename, img) + + +def dumpR(data_set, + mx_model, + batch_size, + name='', + data_extra=None, + label_shape=None): + print('dump verification embedding..') + data_list = data_set[0] + issame_list = data_set[1] + model = mx_model + embeddings_list = [] + if data_extra is not None: + _data_extra = nd.array(data_extra) + time_consumed = 0.0 + if label_shape is None: + _label = nd.ones((batch_size, )) + else: + _label = nd.ones(label_shape) + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + #print(_data.shape, _label.shape) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data, ), label=(_label, )) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), + label=(_label, )) + model.forward(db, is_train=False) + net_out = model.get_outputs() + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + actual_issame = np.asarray(issame_list) + outname = os.path.join('temp.bin') + with open(outname, 'wb') as f: + pickle.dump((embeddings, issame_list), + f, + protocol=pickle.HIGHEST_PROTOCOL) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='do verification') + # 
general + parser.add_argument('--data-dir', default='', help='') + parser.add_argument('--model', + default='../model/softmax,50', + help='path to load model.') + parser.add_argument('--target', + default='lfw,cfp_ff,cfp_fp,agedb_30', + help='test targets.') + parser.add_argument('--gpu', default=0, type=int, help='gpu id') + parser.add_argument('--batch-size', default=32, type=int, help='') + parser.add_argument('--max', default='', type=str, help='') + parser.add_argument('--mode', default=0, type=int, help='') + parser.add_argument('--nfolds', default=10, type=int, help='') + args = parser.parse_args() + #sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common')) + #import face_image + #prop = face_image.load_property(args.data_dir) + #image_size = prop.image_size + image_size = [112, 112] + print('image_size', image_size) + ctx = mx.gpu(args.gpu) + nets = [] + vec = args.model.split(',') + prefix = args.model.split(',')[0] + epochs = [] + if len(vec) == 1: + pdir = os.path.dirname(prefix) + for fname in os.listdir(pdir): + if not fname.endswith('.params'): + continue + _file = os.path.join(pdir, fname) + if _file.startswith(prefix): + epoch = int(fname.split('.')[0].split('-')[1]) + epochs.append(epoch) + epochs = sorted(epochs, reverse=True) + if len(args.max) > 0: + _max = [int(x) for x in args.max.split(',')] + assert len(_max) == 2 + if len(epochs) > _max[1]: + epochs = epochs[_max[0]:_max[1]] + + else: + epochs = [int(x) for x in vec[1].split('|')] + print('model number', len(epochs)) + time0 = datetime.datetime.now() + for epoch in epochs: + print('loading', prefix, epoch) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + #arg_params, aux_params = ch_dev(arg_params, aux_params, ctx) + all_layers = sym.get_internals() + sym = all_layers['fc1_output'] + model = mx.mod.Module(symbol=sym, context=ctx, label_names=None) + #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], 
label_shapes=[('softmax_label', (args.batch_size,))]) + model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], + image_size[1]))]) + model.set_params(arg_params, aux_params) + nets.append(model) + time_now = datetime.datetime.now() + diff = time_now - time0 + print('model loading time', diff.total_seconds()) + + ver_list = [] + ver_name_list = [] + for name in args.target.split(','): + path = os.path.join(args.data_dir, name + ".bin") + if os.path.exists(path): + print('loading.. ', name) + data_set = load_bin(path, image_size) + ver_list.append(data_set) + ver_name_list.append(name) + + if args.mode == 0: + for i in range(len(ver_list)): + results = [] + for model in nets: + acc1, std1, acc2, std2, xnorm, embeddings_list = test( + ver_list[i], model, args.batch_size, args.nfolds) + print('[%s]XNorm: %f' % (ver_name_list[i], xnorm)) + print('[%s]Accuracy: %1.5f+-%1.5f' % + (ver_name_list[i], acc1, std1)) + print('[%s]Accuracy-Flip: %1.5f+-%1.5f' % + (ver_name_list[i], acc2, std2)) + results.append(acc2) + print('Max of [%s] is %1.5f' % (ver_name_list[i], np.max(results))) + elif args.mode == 1: + model = nets[0] + test_badcase(ver_list[0], model, args.batch_size, args.target) + else: + model = nets[0] + dumpR(ver_list[0], model, args.batch_size, args.target) diff --git a/insightface/recognition/arcface_oneflow/README.md b/insightface/recognition/arcface_oneflow/README.md new file mode 100644 index 0000000000000000000000000000000000000000..21f980c8685e3b0cb5cbf54d2e03c2c1c294773c --- /dev/null +++ b/insightface/recognition/arcface_oneflow/README.md @@ -0,0 +1,266 @@ + +# InsightFace in OneFlow + +[English](README.md) **|** [简体中文](README_CH.md) + +It introduces how to train InsightFace in OneFlow, and do verification over the validation datasets via the well-toned networks. 
+ +## Contents + +\- [InsightFace in OneFlow](#insightface-in-oneflow) + + \- [Contents](#contents) + + \- [Background](#background) + + \- [InsightFace opensource project](#insightface-opensource-project) + + \- [Implementation in OneFlow](#implementation-in-oneflow) + + \- [Preparations](#preparations) + + \- [Install OneFlow](#install-oneflow) + + \- [Data preparations](#data-preparations) + + \- [1. Download datasets](#1-download-datasets) + + \- [2. Transformation from MS1M recordio to OFRecord](#2-transformation-from-ms1m-recordio-to-ofrecord) + + \- [Training and verification](#training-and-verification) + + \- [Training](#training) + + \- [OneFLow2ONNX](#OneFLow2ONNX) + +## Background + +### InsightFace opensource project + +[InsightFace](https://github.com/deepinsight/insightface) is an open-source 2D&3D deep face analysis toolbox, mainly based on MXNet. + +In InsightFace, it supports: + + + +- Datasets typically used for face recognition, such as CASIA-Webface、MS1M、VGG2(Provided with the form of a binary file which could run in MXNet, [here](https://github.com/deepinsight/insightface/wiki/Dataset-Zoo) is more details about the datasets and how to download. + + + +* Backbones of ResNet, MobilefaceNet, InceptionResNet_v2, and other deep-learning networks to apply in facial recognition. + +* Implementation of different loss functions, including SphereFace Loss、Softmax Loss、SphereFace Loss, etc. + + + +### Implementation in OneFlow + +Based upon the currently existing work of Insightface, OneFlow ported basic models from it, and now OneFlow supports: + + + +- Training datasets of MS1M、Glint360k, and validation datasets of Lfw、Cfp_fp and Agedb_30, scripts for training and validating. + +- Backbones of ResNet100 and MobileFaceNet to recognize faces. + +- Loss function, e.g. Softmax Loss and Margin Softmax Loss(including Arcface、Cosface and Combined Loss). 
+ +- Model parallelism and [Partial FC](https://github.com/deepinsight/insightface/tree/760d6de043d7f654c5963391271f215dab461547/recognition/partial_fc#partial-fc) optimization. + +- Model transformation via MXNet. + + + +To be coming further: + +- Additional datasets transformation. + +- Plentiful backbones. + +- Full-scale loss functions implementation. + +- Incremental tutorial on the distributed configuration. + + + +This project is open for every developer to PR, new implementation and animated discussion will be most welcome. + + + +## Preparations + +First of all, before execution, please make sure that: + +1. Install OneFlow + +2. Prepare training and validation datasets in form of OFRecord. + + + +### Install OneFlow + + + +According to steps in [Install OneFlow](https://github.com/Oneflow-Inc/oneflow#install-oneflow) install the newest release master whl packages. + +``` +python3 -m pip install oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu102/6aa719d70119b65837b25cc5f186eb19ef2b7891/index.html --user +``` + + + +### Data preparations + +According to [Load and Prepare OFRecord Datasets](https://docs.oneflow.org/en/extended_topics/how_to_make_ofdataset.html), datasets should be converted into the form of OFREcord, to test InsightFace. + + + +It has provided a set of datasets related to face recognition tasks, which have been pre-processed via face alignment or other processions already in [InsightFace](https://github.com/deepinsight/insightface). The corresponding datasets could be downloaded from [here](https://github.com/deepinsight/insightface/wiki/Dataset-Zoo) and should be converted into OFRecord, which performs better in OneFlow. 
Considering the cumbersome steps, it is suggested to download converted OFrecord datasets: + +[MS1M-ArcFace(face_emore)](http://oneflow-public.oss-cn-beijing.aliyuncs.com/face_dataset/train_ofrecord.tar.gz) + +[MS1MV3](https://oneflow-public.oss-cn-beijing.aliyuncs.com/facedata/MS1V3/oneflow/ms1m-retinaface-t1.zip) + +It illustrates how to convert downloaded datasets into OFRecords, and take MS1M-ArcFace as an example in the following. + +#### 1. Download datasets + +The structure of the downloaded MS1M-ArcFace is shown as follown: + + + +``` +faces_emore/ + +​ train.idx + +​ train.rec + +​ property + +​ lfw.bin + +​ cfp_fp.bin + +​ agedb_30.bin +``` + +The first three files are MXNet recordio format files of MS1M training dataset, the last three `.bin` files are different validation datasets. + + + +#### 2. Transformation from MS1M recordio to OFRecord +Only need to execute 2.1 or 2.2 +2.1 Use Python scripts directly + +Run +``` +python tools/mx_recordio_2_ofrecord_shuffled_npart.py --data_dir datasets/faces_emore --output_filepath faces_emore/ofrecord/train --num_part 16 +``` +And you will get the number of `part_num` parts of OFRecord, it's 16 parts in this example, it showed like this +``` +tree ofrecord/test/ +ofrecord/test/ +|-- _SUCCESS +|-- part-00000 +|-- part-00001 +|-- part-00002 +|-- part-00003 +|-- part-00004 +|-- part-00005 +|-- part-00006 +|-- part-00007 +|-- part-00008 +|-- part-00009 +|-- part-00010 +|-- part-00011 +|-- part-00012 +|-- part-00013 +|-- part-00014 +`-- part-00015 + +0 directories, 17 files +``` + + +2.2 Use Python scripts + Spark Shuffle + Spark partition + +Run + +``` +python tools/dataset_convert/mx_recordio_2_ofrecord.py --data_dir datasets/faces_emore --output_filepath faces_emore/ofrecord/train +``` + +And you will get one part of OFRecord(`part-0`) with all data in this way. Then you should use Spark to shuffle and partition. +1. 
Get jar package available +You can download Spark-oneflow-connector-assembly-0.1.0.jar via [Github](https://github.com/Oneflow-Inc/spark-oneflow-connector) or [OSS](https://oneflow-public.oss-cn-beijing.aliyuncs.com/spark-oneflow-connector/spark-oneflow-connector-assembly-0.1.1.jar) + +2. Run in Spark +Assign that you have already installed and configured Spark. +Run +``` +//Start Spark +./Spark-2.4.3-bin-hadoop2.7/bin/Spark-shell --jars ~/Spark-oneflow-connector-assembly-0.1.0.jar --driver-memory=64G --conf Spark.local.dir=/tmp/ +// shuffle and partition in 16 parts +import org.oneflow.Spark.functions._ +Spark.read.chunk("data_path").shuffle().repartition(16).write.chunk("new_data_path") +sc.formatFilenameAsOneflowStyle("new_data_path") +``` +Hence you will get 16 parts of OFRecords, it shown like this +``` +tree ofrecord/test/ +ofrecord/test/ +|-- _SUCCESS +|-- part-00000 +|-- part-00001 +|-- part-00002 +|-- part-00003 +|-- part-00004 +|-- part-00005 +|-- part-00006 +|-- part-00007 +|-- part-00008 +|-- part-00009 +|-- part-00010 +|-- part-00011 +|-- part-00012 +|-- part-00013 +|-- part-00014 +`-- part-00015 + +0 directories, 17 files +``` + + +## Training and verification + + + +### Training + +To reduce the usage cost of user, OneFlow draws close the scripts to Torch style, you can directly modify parameters via configs/*.py + +#### eager +``` +./train_ddp.sh +``` +#### Graph +``` +train_graph_distributed.sh +``` + + +### Varification + +Moreover, OneFlow offers a validation script to do verification separately, val.py, which facilitates you to check the precision of the pre-training model saved. 
+ +``` +./val.sh + +``` +## OneFLow2ONNX + +``` +pip install oneflow-onnx==0.5.1 +./convert.sh +``` \ No newline at end of file diff --git a/insightface/recognition/arcface_oneflow/README_CH.md b/insightface/recognition/arcface_oneflow/README_CH.md new file mode 100644 index 0000000000000000000000000000000000000000..5a62f825db4e10d0db9068a9bc00c4cd3f8a11a9 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/README_CH.md @@ -0,0 +1,231 @@ +# InsightFace 在 OneFlow 中的实现 + + +[English](README.md) **|** [简体中文](README_CH.md) + +本文介绍如何在 OneFlow 中训练 InsightFace,并在验证数据集上对训练好的网络进行验证。 + +## 目录 +- [InsightFace 在 OneFlow 中的实现](#insightface-在-oneflow-中的实现) + - [目录](#目录) + - [背景介绍](#背景介绍) + - [InsightFace 开源项目](#insightface-开源项目) + - [InsightFace 在 OneFlow 中的实现](#insightface-在-oneflow-中的实现-1) + - [准备工作](#准备工作) + - [安装 OneFlow](#安装-oneflow) + - [准备数据集](#准备数据集) + - [1. 下载数据集](#1-下载数据集) + - [2. 将训练数据集 MS1M 从 recordio 格式转换为 OFRecord 格式](#2-将训练数据集-ms1m-从-recordio-格式转换为-ofrecord-格式) + + - [训练和验证](#训练和验证) + - [训练](#训练) + - [验证](#验证) + - [OneFLow2ONNX](#OneFLow2ONNX) + + +## 背景介绍 + +### InsightFace 开源项目 + +[InsightFace 原仓库](https://github.com/deepinsight/insightface)是基于 MXNet 实现的人脸识别研究开源项目。 + +在该项目中,集成了: + +* CASIA-Webface、MS1M、VGG2 等用于人脸识别研究常用的数据集(以 MXNet 支持的二进制形式提供,可以从[这里](https://github.com/deepinsight/insightface/wiki/Dataset-Zoo)查看数据集的详细说明以及下载链接)。 + +* 以 ResNet、MobileFaceNet、InceptionResNet_v2 等深度学习网络作为 Backbone 的人脸识别模型。 +* 涵盖 SphereFace Loss、Softmax Loss、SphereFace Loss 等多种损失函数的实现。 + + + +### InsightFace 在 OneFlow 中的实现 + +在 InsightFace 开源项目已有的工作基础上,OneFlow 对 InsightFace 基本的人脸识别模型进行了移植,目前已实现的功能包括: + +* 支持了使用 MS1M、Glint360k 作为训练数据集,Lfw、Cfp_fp 以及 Agedb_30 作为验证数据集,提供了对网络进行训练和验证的脚本。 +* 支持 ResNet100 和 MobileFaceNet 作为人脸识别模型的 Backbone 网络。 +* 实现了 Softmax Loss 以及 Margin Softmax Loss(包括 Nsoftmax、Arcface、Cosface 和 Combined Loss 等)。 +* 实现了模型并行和 Partial FC 优化。 +* 实现了 MXNet 的模型转换。 + + +未来将计划逐步完善: + +* 更多的数据集转换。 +* 更丰富的 Backbone 网络。 +* 更全面的损失函数实现。 +* 增加分布式运行的说明。 + + + +我们对所有的开发者开放 
PR,非常欢迎您加入新的实现以及参与讨论。 + +## 准备工作 + +在开始运行前,请先确定: + +1. 安装 OneFlow。 +2. 准备训练和验证的 OFRecord 数据集。 + + + +### 安装 OneFlow + +根据 [Install OneFlow](https://github.com/Oneflow-Inc/oneflow#install-oneflow) 的步骤进行安装最新 master whl 包即可。 + +``` +python3 -m pip install oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu102/6aa719d70119b65837b25cc5f186eb19ef2b7891/index.html --user +``` + +### 准备数据集 + +根据 [加载与准备 OFRecord 数据集](https://docs.oneflow.org/extended_topics/how_to_make_ofdataset.html) 准备 ImageNet 的 OFReocord 数据集,用以进行 InsightFace 的测试。 + +[InsightFace 原仓库](https://github.com/deepinsight/insightface)中提供了一系列人脸识别任务相关的数据集,已经完成了人脸对齐等预处理过程。请从[这里](https://github.com/deepinsight/insightface/wiki/Dataset-Zoo)下载相应的数据集,并且转换成 OneFlow 可以识别的 OFRecord 格式。考虑到步骤繁琐,也可以直接下载已经转好的 OFRecord 数据集: + +[ MS1M-ArcFace(face_emore)](http://oneflow-public.oss-cn-beijing.aliyuncs.com/face_dataset/train_ofrecord.tar.gz) + +[MS1MV3](https://oneflow-public.oss-cn-beijing.aliyuncs.com/facedata/MS1V3/oneflow/ms1m-retinaface-t1.zip) + +下面以数据集 MS1M-ArcFace 为例,展示如何将下载到的数据集转换成 OFRecord 格式。 + +#### 1. 下载数据集 + +下载好的 MS1M-ArcFace 数据集,内容如下: + +``` +faces_emore/ + train.idx + train.rec + property + lfw.bin + cfp_fp.bin + agedb_30.bin +``` + + + +前三个文件是训练数据集 MS1M 的 MXNet 的 recordio 格式相关的文件,后三个 `.bin` 文件是三个不同的验证数据集。 + + + +#### 2. 
将训练数据集 MS1M 从 recordio 格式转换为 OFRecord 格式 +训练数据集转换有两种方式: (2.1部分)直接使用python脚本生成n个shuffle过的数据part,或(2.2部分)python脚本生成一个part,再根据需要用spark做shuffle和partition。 +2.1 直接使用 Python 脚本 + +运行: +``` +python tools/dataset_convert/mx_recordio_2_ofrecord_shuffled_npart.py --data_dir datasets/faces_emore --output_filepath faces_emore/ofrecord/train --num_part 16 +``` +成功后将得到 `num_part` 数量个 OFRecord,本示例中为 16 个,显示如下: + +``` +tree ofrecord/test/ +ofrecord/test/ +|-- _SUCCESS +|-- part-00000 +|-- part-00001 +|-- part-00002 +|-- part-00003 +|-- part-00004 +|-- part-00005 +|-- part-00006 +|-- part-00007 +|-- part-00008 +|-- part-00009 +|-- part-00010 +|-- part-00011 +|-- part-00012 +|-- part-00013 +|-- part-00014 +`-- part-00015 + +0 directories, 17 files +``` +2.2 Python 脚本 + Spark Shuffle + Spark Partition + +运行: + +``` +python tools/dataset_convert/mx_recordio_2_ofrecord.py --data_dir datasets/faces_emore --output_filepath faces_emore/ofrecord/train +``` +成功后将得到一个包含所有数据的 OFReocrd(`part-0`),需要进一步使用 Spark 进行 Shuffle 和 Partition。 +成功安装和部署 Spark 后, 您需要: +1. 下载工具 jar 包 + + +您可以通过 [Github](https://github.com/Oneflow-Inc/spark-oneflow-connector) 或者 [OSS](https://oneflow-public.oss-cn-beijing.aliyuncs.com/spark-oneflow-connector/spark-oneflow-connector-assembly-0.1.1.jar) 下载 Spark-oneflow-connector-assembly-0.1.0.jar 文件。 +1. 
运行 Spark 命令 + + +运行 +``` +//Start Spark +./Spark-2.4.3-bin-hadoop2.7/bin/Spark-shell --jars ~/Spark-oneflow-connector-assembly-0.1.0.jar --driver-memory=64G --conf Spark.local.dir=/tmp/ +// shuffle and partition in 16 parts +import org.oneflow.Spark.functions._ +Spark.read.chunk("data_path").shuffle().repartition(16).write.chunk("new_data_path") +sc.formatFilenameAsOneflowStyle("new_data_path") +``` +然后就可以得到 16 个 part 的 OFRecord,显示如下 +``` +tree ofrecord/test/ +ofrecord/test/ +|-- _SUCCESS +|-- part-00000 +|-- part-00001 +|-- part-00002 +|-- part-00003 +|-- part-00004 +|-- part-00005 +|-- part-00006 +|-- part-00007 +|-- part-00008 +|-- part-00009 +|-- part-00010 +|-- part-00011 +|-- part-00012 +|-- part-00013 +|-- part-00014 +`-- part-00015 + +0 directories, 17 files +``` + + + + +## 训练和验证 + +### 训练 + +为了减小用户使用的迁移成本,OneFlow 的脚本已经调整为 Torch 实现的风格,用户可以使用 configs/*.py 直接修改参数。 + + +运行脚本: + +#### eager +``` +./train_ddp.sh +``` +#### Graph +``` +train_graph_distributed.sh +``` + +### 验证 + +另外,为了方便查看保存下来的预训练模型精度,我们提供了一个仅在验证数据集上单独执行验证过程的脚本。 + +运行 + +``` +./val.sh +``` + +## OneFLow2ONNX + +``` +pip install oneflow-onnx==0.5.1 +./convert.sh +``` \ No newline at end of file diff --git a/insightface/recognition/arcface_oneflow/backbones/__init__.py b/insightface/recognition/arcface_oneflow/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2448255b4fb13af588954f18cf562b975352aad6 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/backbones/__init__.py @@ -0,0 +1,16 @@ +from .ir_resnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200 + + +def get_model(name, **kwargs): + if name == "r18": + return iresnet18(False, **kwargs) + elif name == "r34": + return iresnet34(False, **kwargs) + elif name == "r50": + return iresnet50(False, **kwargs) + elif name == "r100": + return iresnet100(False, **kwargs) + elif name == "r200": + return iresnet200(False, **kwargs) + else: + raise ValueError() diff --git 
a/insightface/recognition/arcface_oneflow/backbones/ir_resnet.py b/insightface/recognition/arcface_oneflow/backbones/ir_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..1e9a995715f34cf1ea6b1ea0093c7adb4389eab7 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/backbones/ir_resnet.py @@ -0,0 +1,219 @@ +import oneflow as flow +import oneflow.nn as nn +from typing import Type, Any, Callable, Union, List, Optional + + +def conv3x3( + in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1 +) -> nn.Conv2d: + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation, + ) + + +def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class IBasicBlock(nn.Module): + expansion = 1 + + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + ): + super(IBasicBlock, self).__init__() + if groups != 1 or base_width != 64: + raise ValueError("BasicBlock only supports groups=1 and base_width=64") + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,) + self.conv1 = conv3x3(inplanes, planes) + self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,) + self.prelu = nn.ReLU(planes) + self.conv2 = conv3x3(planes, planes, stride) + self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.bn1(x) + out = self.conv1(out) + out = self.bn2(out) + out = self.prelu(out) + out = self.conv2(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + return out + + +class 
IResNet(nn.Module): + fc_scale = 7 * 7 + + def __init__( + self, + block, + layers, + dropout=0, + num_features=512, + zero_init_residual=False, + groups=1, + width_per_group=64, + replace_stride_with_dilation=None, + fp16=False, + ): + super(IResNet, self).__init__() + self.fp16 = fp16 + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError( + "replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation) + ) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d( + 3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False + ) + self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05) + self.prelu = nn.ReLU(self.inplanes) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer( + block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0] + ) + self.layer3 = self._make_layer( + block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1] + ) + self.layer4 = self._make_layer( + block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2] + ) + self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,) + self.dropout = nn.Dropout(p=dropout, inplace=True) + self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features) + self.features = nn.BatchNorm1d(num_features, eps=1e-05) + nn.init.constant_(self.features.weight, 1.0) + self.features.weight.requires_grad = False + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if zero_init_residual: + for m in self.modules(): + if isinstance(m, IBasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, 
stride=1, dilate=False): + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.BatchNorm2d(planes * block.expansion, eps=1e-05,), + ) + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride, + downsample, + self.groups, + self.base_width, + previous_dilation, + ) + ) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + ) + ) + + return nn.Sequential(*layers) + + def forward(self, x): + + x = self.conv1(x) + x = self.bn1(x) + x = self.prelu(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.bn2(x) + x = flow.flatten(x, 1) + x = self.dropout(x) + x = self.fc(x) + x = self.features(x) + + return x + + +def _iresnet(arch, block, layers, pretrained, progress, **kwargs): + model = IResNet(block, layers, **kwargs) + if pretrained: + raise ValueError() + return model + + +def iresnet18(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet18", IBasicBlock, [2, 2, 2, 2], pretrained, progress, **kwargs + ) + + +def iresnet34(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet34", IBasicBlock, [3, 4, 6, 3], pretrained, progress, **kwargs + ) + + +def iresnet50(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet50", IBasicBlock, [3, 4, 14, 3], pretrained, progress, **kwargs + ) + + +def iresnet100(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet100", IBasicBlock, [3, 13, 30, 3], pretrained, progress, **kwargs + ) + + +def iresnet200(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet200", IBasicBlock, [6, 26, 60, 6], pretrained, progress, 
**kwargs + ) diff --git a/insightface/recognition/arcface_oneflow/configs/__init__.py b/insightface/recognition/arcface_oneflow/configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/arcface_oneflow/configs/base.py b/insightface/recognition/arcface_oneflow/configs/base.py new file mode 100644 index 0000000000000000000000000000000000000000..e4272484d2d22667f93d04bedd511fba58fe15db --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/base.py @@ -0,0 +1,68 @@ +from pickle import TRUE +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "r50" +config.resume = False +config.output = "ms1mv3_arcface_r50" + +config.dataset = "ms1m-retinaface-t1" +config.embedding_size = 512 +config.fp16 = False +config.model_parallel = False +config.sample_rate = 1.0 +config.partial_fc = False +config.graph = True +config.synthetic = False +config.scale_grad = False + +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 +config.val_image_num = {"lfw": 12000, "cfp_fp": 14000, "agedb_30": 12000} +if config.dataset == "emore": + config.ofrecord_path = "/train_tmp/faces_emore" + config.num_classes = 85742 + config.num_image = 5822653 + config.num_epoch = 16 + config.warmup_epoch = -1 + config.decay_epoch = [ + 8, + 14, + ] + config.val_targets = [ + "lfw", + ] + +elif config.dataset == "ms1m-retinaface-t1": + config.ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord" + config.num_classes = 93431 + config.num_image = 5179510 + config.num_epoch = 25 + config.warmup_epoch = -1 + config.decay_epoch = [11, 17, 22] + config.val_targets = ["lfw", "cfp_fp", "agedb_30"] + +elif config.dataset == "glint360k": + config.ofrecord_path = "/train_tmp/glint360k" + 
config.num_classes = 360232 + config.num_image = 17091657 + config.num_epoch = 20 + config.warmup_epoch = -1 + config.decay_epoch = [8, 12, 15, 18] + config.val_targets = ["lfw", "cfp_fp", "agedb_30"] + +elif config.dataset == "webface": + config.ofrecord_path = "/train_tmp/faces_webface_112x112" + config.num_classes = 10572 + config.num_image = "forget" + config.num_epoch = 34 + config.warmup_epoch = -1 + config.decay_epoch = [20, 28, 32] + config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_oneflow/configs/glint360k_mbf.py b/insightface/recognition/arcface_oneflow/configs/glint360k_mbf.py new file mode 100644 index 0000000000000000000000000000000000000000..96697a13bb9759864accbd2159d24c272fe7e7d3 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/glint360k_mbf.py @@ -0,0 +1,30 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "mbf" +config.resume = False +config.output = None +config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 2e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_oneflow/configs/glint360k_r100.py b/insightface/recognition/arcface_oneflow/configs/glint360k_r100.py new file mode 100644 index 0000000000000000000000000000000000000000..7a415180c1ddefb6729a7479005c8cfe1f97f2dc --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/glint360k_r100.py @@ 
-0,0 +1,31 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + + +config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_oneflow/configs/glint360k_r18.py b/insightface/recognition/arcface_oneflow/configs/glint360k_r18.py new file mode 100644 index 0000000000000000000000000000000000000000..031fd02c0e3da2257e941343f2f14355908c93db --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/glint360k_r18.py @@ -0,0 +1,30 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "r18" +config.resume = False +config.output = None +config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git 
a/insightface/recognition/arcface_oneflow/configs/glint360k_r34.py b/insightface/recognition/arcface_oneflow/configs/glint360k_r34.py new file mode 100644 index 0000000000000000000000000000000000000000..b072b02ec7b397768b3370f0f7cae00b5e415530 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/glint360k_r34.py @@ -0,0 +1,30 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "r34" +config.resume = False +config.output = None +config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_oneflow/configs/glint360k_r50.py b/insightface/recognition/arcface_oneflow/configs/glint360k_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..90add4875b665f48d60a405f184f931967be1618 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/glint360k_r50.py @@ -0,0 +1,30 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + 
+config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 +config.num_classes = 360232 +config.num_image = 17091657 +config.num_epoch = 20 +config.warmup_epoch = -1 +config.decay_epoch = [8, 12, 15, 18] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_oneflow/configs/ms1mv3_mbf.py b/insightface/recognition/arcface_oneflow/configs/ms1mv3_mbf.py new file mode 100644 index 0000000000000000000000000000000000000000..917fb137072bbc25ec239dc069c4460d7ba1b19d --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/ms1mv3_mbf.py @@ -0,0 +1,29 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "mbf" +config.resume = False +config.output = None +config.embedding_size = 512 +config.model_parallel = True +config.partial_fc = 1 +config.sample_rate = 0.1 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 2e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.ofrecord_path = "/train_tmp/ms1m-retinaface-t1" +config.ofrecord_part_num = 8 +config.num_classes = 93431 +config.num_image = 5179510 +config.num_epoch = 30 +config.warmup_epoch = -1 +config.decay_epoch = [10, 20, 25] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_oneflow/configs/ms1mv3_r18.py b/insightface/recognition/arcface_oneflow/configs/ms1mv3_r18.py new file mode 100644 index 0000000000000000000000000000000000000000..7b2168ad14460cb92ba3f499bc4ab8c2691e2e7d --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/ms1mv3_r18.py @@ -0,0 +1,29 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "r18" +config.resume = False 
+config.output = None +config.embedding_size = 512 +config.model_parallel = True +config.partial_fc = 1 +config.sample_rate = 0.1 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.ofrecord_path = "/train_tmp/ms1m-retinaface-t1" +config.ofrecord_part_num = 8 +config.num_classes = 93431 +config.num_image = 5179510 +config.num_epoch = 25 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_oneflow/configs/ms1mv3_r34.py b/insightface/recognition/arcface_oneflow/configs/ms1mv3_r34.py new file mode 100644 index 0000000000000000000000000000000000000000..c95aef0d03ada14931ccee1c320b343f4f1fa338 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/ms1mv3_r34.py @@ -0,0 +1,30 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "arcface" +config.network = "r34" +config.resume = False +config.output = None +config.embedding_size = 512 +config.model_parallel = True +config.partial_fc = 1 +config.sample_rate = 0.1 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + + +config.ofrecord_path = "/train_tmp/ms1m-retinaface-t1" +config.ofrecord_part_num = 8 +config.num_classes = 93431 +config.num_image = 5179510 +config.num_epoch = 25 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_oneflow/configs/ms1mv3_r50.py b/insightface/recognition/arcface_oneflow/configs/ms1mv3_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..c7dede32fa2b9ed3a0c76c7a533bed27e3948df0 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/ms1mv3_r50.py @@ -0,0 
+1,30 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.loss = "cosface" +config.network = "r50" +config.resume = False +config.output = "partial_fc" +config.embedding_size = 512 +config.model_parallel = True +config.partial_fc = 0 +config.sample_rate = 0.1 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + + +config.ofrecord_path = "/train_tmp/ms1m-retinaface-t1/ofrecord/" +config.ofrecord_part_num = 8 +config.num_classes = 93432 +config.num_image = 5179510 +config.num_epoch = 25 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_oneflow/configs/speed.py b/insightface/recognition/arcface_oneflow/configs/speed.py new file mode 100644 index 0000000000000000000000000000000000000000..cae0b8be7544d1f920044ef269e693a91ffeab40 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/configs/speed.py @@ -0,0 +1,24 @@ +from easydict import EasyDict as edict + +# configs for test speed + +config = edict() +config.loss = "arcface" +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.model_parallel = True +config.sample_rate = 1.0 +config.fp16 = False +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 + +config.synthetic = True +config.num_classes = 100000 +config.num_epoch = 30 +config.warmup_epoch = -1 +config.decay_epoch = [10, 16, 22] +config.val_targets = [] diff --git a/insightface/recognition/arcface_oneflow/convert.sh b/insightface/recognition/arcface_oneflow/convert.sh new file mode 100644 index 0000000000000000000000000000000000000000..1ad7d8abd44a717568b55a65f4341317ef661310 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/convert.sh @@ 
-0,0 +1,2 @@ + +python3 oneflow2onnx.py configs/ms1mv3_r50 --model_path /workdir/epoch_0 diff --git a/insightface/recognition/arcface_oneflow/eval/__init__.py b/insightface/recognition/arcface_oneflow/eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/arcface_oneflow/eval/onnx_helper.py b/insightface/recognition/arcface_oneflow/eval/onnx_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..b58cd5353be859cd7be680a5aa9ad079ad8f5efb --- /dev/null +++ b/insightface/recognition/arcface_oneflow/eval/onnx_helper.py @@ -0,0 +1,227 @@ +import argparse +import datetime +import os +import os.path as osp + +import cv2 +import numpy as np +import onnx +import onnxruntime +from onnx import numpy_helper + + +class ArcFaceORT: + def __init__(self, model_path): + self.model_path = model_path + + def check(self, test_img=None): + max_model_size_mb = 1024 + max_feat_dim = 512 + max_time_cost = 15 + + if not os.path.exists(self.model_path): + return "model_path not exists" + if not os.path.isdir(self.model_path): + return "model_path should be directory" + onnx_files = [] + for _file in os.listdir(self.model_path): + print("file_:", _file) + if _file.endswith(".onnx"): + onnx_files.append(osp.join(self.model_path, _file)) + if len(onnx_files) == 0: + return "do not have onnx files" + self.model_file = sorted(onnx_files)[-1] + print("use onnx-model:", self.model_file) + try: + session = onnxruntime.InferenceSession(self.model_file, None) + except: + return "load onnx failed" + + input_cfg = session.get_inputs()[0] + input_shape = input_cfg.shape + print("input-shape:", input_shape) + if len(input_shape) != 4: + return "length of input_shape should be 4" + if not isinstance(input_shape[0], str): + # return "input_shape[0] should be str to support batch-inference" + print("reset input-shape[0] to None") + model = onnx.load(self.model_file) + 
model.graph.input[0].type.tensor_type.shape.dim[0].dim_param = "None" + new_model_file = osp.join(self.model_path, "zzzzrefined.onnx") + onnx.save(model, new_model_file) + self.model_file = new_model_file + print("use new onnx-model:", self.model_file) + try: + session = onnxruntime.InferenceSession(self.model_file, None) + except: + return "load onnx failed" + + input_cfg = session.get_inputs()[0] + input_shape = input_cfg.shape + print("new-input-shape:", input_shape) + + self.image_size = tuple(input_shape[2:4][::-1]) + + input_name = input_cfg.name + outputs = session.get_outputs() + output_names = [] + for o in outputs: + output_names.append(o.name) + # print(o.name, o.shape) + if len(output_names) != 1: + return "number of output nodes should be 1" + self.session = session + self.input_name = input_name + self.output_names = output_names + + model = onnx.load(self.model_file) + graph = model.graph + if len(graph.node) < 8: + return "too small onnx graph" + + input_size = (112, 112) + self.crop = None + if True: + crop_file = osp.join(self.model_path, "crop.txt") + if osp.exists(crop_file): + lines = open(crop_file, "r").readlines() + if len(lines) != 6: + return "crop.txt should contain 6 lines" + lines = [int(x) for x in lines] + self.crop = lines[:4] + input_size = tuple(lines[4:6]) + if input_size != self.image_size: + return "input-size is inconsistant with onnx model input, %s vs %s" % ( + input_size, + self.image_size, + ) + + self.model_size_mb = os.path.getsize(self.model_file) / float(1024 * 1024) + if self.model_size_mb > max_model_size_mb: + return "max model size exceed, given %.3f-MB" % self.model_size_mb + + input_mean = None + input_std = None + if True: + pn_file = osp.join(self.model_path, "pixel_norm.txt") + if osp.exists(pn_file): + lines = open(pn_file, "r").readlines() + if len(lines) != 2: + return "pixel_norm.txt should contain 2 lines" + input_mean = float(lines[0]) + input_std = float(lines[1]) + if input_mean is not None or input_std 
is not None: + if input_mean is None or input_std is None: + return "please set input_mean and input_std simultaneously" + else: + find_sub = False + find_mul = False + for nid, node in enumerate(graph.node[:8]): + print(nid, node.name) + if node.name.startswith("Sub") or node.name.startswith("_minus"): + find_sub = True + if node.name.startswith("Mul") or node.name.startswith("_mul"): + find_mul = True + if find_sub and find_mul: + # mxnet arcface model + input_mean = 0.0 + input_std = 1.0 + else: + input_mean = 127.5 + input_std = 127.5 + self.input_mean = input_mean + self.input_std = input_std + for initn in graph.initializer: + weight_array = numpy_helper.to_array(initn) + + dt = weight_array.dtype + if dt.itemsize < 4: + return "invalid weight type - (%s:%s)" % (initn.name, dt.name) + if test_img is None: + test_img = np.random.randint( + 0, 255, size=(self.image_size[1], self.image_size[0], 3), dtype=np.uint8 + ) + else: + test_img = cv2.resize(test_img, self.image_size) + feat, cost = self.benchmark(test_img) + if feat.shape[1] > max_feat_dim: + return "max feat dim exceed, given %d" % feat.shape[1] + self.feat_dim = feat.shape[1] + cost_ms = cost * 1000 + if cost_ms > max_time_cost: + return "max time cost exceed, given %.4f" % cost_ms + self.cost_ms = cost_ms + print( + "check stat:, model-size-mb: %.4f, feat-dim: %d, time-cost-ms: %.4f, input-mean: %.3f, input-std: %.3f" + % ( + self.model_size_mb, + self.feat_dim, + self.cost_ms, + self.input_mean, + self.input_std, + ) + ) + return None + + def meta_info(self): + return { + "model-size-mb": self.model_size_mb, + "feature-dim": self.feat_dim, + "infer": self.cost_ms, + } + + def forward(self, imgs): + if not isinstance(imgs, list): + imgs = [imgs] + input_size = self.image_size + if self.crop is not None: + nimgs = [] + for img in imgs: + nimg = img[self.crop[1] : self.crop[3], self.crop[0] : self.crop[2], :] + if nimg.shape[0] != input_size[1] or nimg.shape[1] != input_size[0]: + nimg = 
    def benchmark(self, img):
        """Time the ONNX session on a single image and return (output, cost).

        The image is cropped/resized exactly as in ``forward``, then the
        session is run 50 times; the returned cost is the 6th-fastest run
        (``costs[5]``), a cheap way to discard warm-up and scheduler noise.

        :param img: HWC BGR image (cv2 convention; swapRB=True converts to RGB).
        :return: (net_out, cost) - last network output and the selected
            per-run wall time in seconds.
        """
        input_size = self.image_size
        if self.crop is not None:
            # crop = [x1, y1, x2, y2] loaded from crop.txt during check().
            nimg = img[self.crop[1] : self.crop[3], self.crop[0] : self.crop[2], :]
            if nimg.shape[0] != input_size[1] or nimg.shape[1] != input_size[0]:
                nimg = cv2.resize(nimg, input_size)
            img = nimg
        # Normalization constants were inferred in check() (pixel_norm.txt or
        # the Sub/Mul-node heuristic for mxnet-exported models).
        blob = cv2.dnn.blobFromImage(
            img,
            1.0 / self.input_std,
            input_size,
            (self.input_mean, self.input_mean, self.input_mean),
            swapRB=True,
        )
        costs = []
        for _ in range(50):
            ta = datetime.datetime.now()
            net_out = self.session.run(self.output_names, {self.input_name: blob})[0]
            tb = datetime.datetime.now()
            cost = (tb - ta).total_seconds()
            costs.append(cost)
        costs = sorted(costs)
        cost = costs[5]
        return net_out, cost
def extract(model_root, dataset):
    """Run the ONNX ArcFace model over *dataset* and return per-item features.

    Each dataset item yields two stacked images (original + horizontal flip,
    see AlignedDataSet.__getitem__), so the result has one row per item and
    ``2 * feat_dim`` columns.

    :param model_root: directory holding the ONNX model (passed to ArcFaceORT).
    :param dataset: mx.gluon Dataset whose items are (2, 3, H, W) tensors.
    :return: np.ndarray of shape (len(dataset), 2 * feat_dim).
    """
    model = ArcFaceORT(model_path=model_root)
    model.check()
    feat_mat = np.zeros(shape=(len(dataset), 2 * model.feat_dim))

    def batchify_fn(data):
        # Concatenate the per-item (2, C, H, W) tensors into one flat batch.
        return mx.nd.concat(*data, dim=0)

    data_loader = mx.gluon.data.DataLoader(
        dataset,
        128,
        last_batch="keep",
        num_workers=4,
        thread_pool=True,
        prefetch=16,
        batchify_fn=batchify_fn,
    )
    num_iter = 0
    for batch in data_loader:
        batch = batch.asnumpy()
        # Normalize with the mean/std discovered by model.check().
        batch = (batch - model.input_mean) / model.input_std
        feat = model.session.run(model.output_names, {model.input_name: batch})[0]
        # Fold each flipped copy back beside its original: 2*feat_dim per item.
        feat = np.reshape(feat, (-1, model.feat_dim * 2))
        # NOTE(review): the row offset is hard-wired to the loader batch size
        # of 128 above -- keep the two in sync if the batch size changes.
        feat_mat[128 * num_iter : 128 * num_iter + feat.shape[0], :] = feat
        num_iter += 1
        if num_iter % 50 == 0:
            print(num_iter)
    return feat_mat
def read_template_pair_list(path):
    """Read an IJB template-pair label file.

    Each row has three space-separated integer columns: first template id,
    second template id, and the is-same label.

    :param path: path to the pair-label text file.
    :return: (t1, t2, label) - three integer np.ndarrays of equal length.
    """
    pairs = pd.read_csv(path, sep=" ", header=None).values
    # BUG FIX: ``np.int`` was deprecated in NumPy 1.20 and removed in 1.24,
    # so this function crashed on modern NumPy. The builtin ``int`` is the
    # supported spelling and astype maps it to the platform default int.
    t1 = pairs[:, 0].astype(int)
    t2 = pairs[:, 1].astype(int)
    label = pairs[:, 2].astype(int)
    return t1, t2, label
def verification2(template_norm_feats=None, unique_templates=None, p1=None, p2=None):
    """Score every template pair (p1[i], p2[i]) by cosine similarity.

    Features are assumed L2-normalized, so the dot product of a pair is its
    cosine similarity. Pairs are processed in fixed-size chunks to bound
    memory on the ~15M IJB pair lists.

    :param template_norm_feats: (num_templates, dim) normalized features.
    :param unique_templates: template id for each feature row.
    :param p1, p2: parallel arrays of template ids to compare.
    :return: (len(p1),) array of similarity scores.
    """
    # Map a raw template id to its row index in template_norm_feats.
    template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int)
    for row, template_id in enumerate(unique_templates):
        template2id[template_id] = row

    score = np.zeros((len(p1),))  # cosine similarity per pair
    all_pairs = np.array(range(len(p1)))
    batchsize = 100000  # chunked to keep the intermediate products small
    chunks = [all_pairs[start : start + batchsize] for start in range(0, len(p1), batchsize)]
    total = len(chunks)
    for chunk_idx, chunk in enumerate(chunks):
        left = template_norm_feats[template2id[p1[chunk]]]
        right = template_norm_feats[template2id[p2[chunk]]]
        score[chunk] = np.sum(left * right, -1).flatten()
        if chunk_idx % 10 == 0:
            print("Finish {}/{} pairs.".format(chunk_idx, total))
    return score
" % (stop - start)) + + start = timeit.default_timer() + img_path = "%s/loose_crop" % args.image_path + img_list_path = "%s/meta/%s_name_5pts_score.txt" % ( + args.image_path, + args.target.lower(), + ) + img_list = open(img_list_path) + files = img_list.readlines() + dataset = AlignedDataSet(root=img_path, lines=files, align=True) + img_feats = extract(args.model_root, dataset) + + faceness_scores = [] + for each_line in files: + name_lmk_score = each_line.split() + faceness_scores.append(name_lmk_score[-1]) + faceness_scores = np.array(faceness_scores).astype(np.float32) + stop = timeit.default_timer() + print("Time: %.2f s. " % (stop - start)) + print("Feature Shape: ({} , {}) .".format(img_feats.shape[0], img_feats.shape[1])) + start = timeit.default_timer() + + if use_flip_test: + img_input_feats = ( + img_feats[:, 0 : img_feats.shape[1] // 2] + + img_feats[:, img_feats.shape[1] // 2 :] + ) + else: + img_input_feats = img_feats[:, 0 : img_feats.shape[1] // 2] + + if use_norm_score: + img_input_feats = img_input_feats + else: + img_input_feats = img_input_feats / np.sqrt( + np.sum(img_input_feats ** 2, -1, keepdims=True) + ) + + if use_detector_score: + print(img_input_feats.shape, faceness_scores.shape) + img_input_feats = img_input_feats * faceness_scores[:, np.newaxis] + else: + img_input_feats = img_input_feats + + template_norm_feats, unique_templates = image2template_feature( + img_input_feats, templates, medias + ) + stop = timeit.default_timer() + print("Time: %.2f s. " % (stop - start)) + + start = timeit.default_timer() + score = verification(template_norm_feats, unique_templates, p1, p2) + stop = timeit.default_timer() + print("Time: %.2f s. 
" % (stop - start)) + save_path = os.path.join(args.result_dir, "{}_result".format(args.target)) + if not os.path.exists(save_path): + os.makedirs(save_path) + score_save_file = os.path.join(save_path, "{}.npy".format(args.model_root)) + np.save(score_save_file, score) + files = [score_save_file] + methods = [] + scores = [] + for file in files: + methods.append(os.path.basename(file)) + scores.append(np.load(file)) + methods = np.array(methods) + scores = dict(zip(methods, scores)) + x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] + tpr_fpr_table = prettytable.PrettyTable(["Methods"] + [str(x) for x in x_labels]) + for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, args.target)) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr)))) + ) + tpr_fpr_row.append("%.2f" % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) + print(tpr_fpr_table) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="do ijb test") + # general + parser.add_argument("--model-root", default="", help="path to load model.") + parser.add_argument("--image-path", default="", type=str, help="") + parser.add_argument("--result-dir", default=".", type=str, help="") + parser.add_argument( + "--target", default="IJBC", type=str, help="target, set to IJBC or IJBB" + ) + main(parser.parse_args()) diff --git a/insightface/recognition/arcface_oneflow/eval/verification.py b/insightface/recognition/arcface_oneflow/eval/verification.py new file mode 100644 index 0000000000000000000000000000000000000000..8fc5b61f38b9f7cdda1e6ec9ccd92662b913d517 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/eval/verification.py @@ -0,0 +1,327 @@ +"""Helper for evaluation on the Labeled Faces in the Wild dataset +""" + +# MIT License +# +# Copyright (c) 2016 
David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
class LFold:
    """Thin wrapper around sklearn's KFold with a single-fold fallback.

    With ``n_splits > 1`` it delegates to KFold; with one split it returns a
    single (train, test) pair where both sides are the full index set, which
    KFold itself does not allow.
    """

    def __init__(self, n_splits=2, shuffle=False):
        self.n_splits = n_splits
        if n_splits > 1:
            self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)

    def split(self, indices):
        """Return an iterable of (train_indices, test_indices) pairs."""
        if self.n_splits <= 1:
            # Degenerate case: evaluate on everything, no held-out fold.
            return [(indices, indices)]
        return self.k_fold.split(indices)
def calculate_accuracy(threshold, dist, actual_issame):
    """Score a distance threshold against ground-truth same/different labels.

    A pair is predicted "same" iff its distance is strictly below *threshold*.

    :param threshold: distance cutoff.
    :param dist: array of pairwise distances.
    :param actual_issame: boolean array of ground-truth labels.
    :return: (tpr, fpr, accuracy); a rate whose denominator is empty is 0.
    """
    predict_issame = np.less(dist, threshold)
    predict_diff = np.logical_not(predict_issame)
    actual_diff = np.logical_not(actual_issame)

    tp = np.sum(predict_issame & actual_issame)
    fp = np.sum(predict_issame & actual_diff)
    tn = np.sum(predict_diff & actual_diff)
    fn = np.sum(predict_diff & actual_issame)

    # Guard the degenerate folds with no positives (or no negatives).
    tpr = float(tp) / float(tp + fn) if tp + fn != 0 else 0
    fpr = float(fp) / float(fp + tn) if fp + tn != 0 else 0
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc
def calculate_val_far(threshold, dist, actual_issame):
    """Compute VAL and FAR at a distance threshold.

    VAL (validation rate) is the fraction of genuine pairs accepted; FAR
    (false accept rate) is the fraction of impostor pairs accepted. A pair is
    accepted iff its distance is strictly below *threshold*.

    :param threshold: distance cutoff.
    :param dist: array of pairwise distances.
    :param actual_issame: boolean array of ground-truth labels.
    :return: (val, far); a rate with no pairs in its denominator is 0.0.
    """
    predict_issame = np.less(dist, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)
    n_diff = np.sum(np.logical_not(actual_issame))
    # BUG FIX: a fold can contain no genuine (or no impostor) pairs; the
    # original divided unconditionally and raised ZeroDivisionError there.
    val = float(true_accept) / float(n_same) if n_same != 0 else 0.0
    far = float(false_accept) / float(n_diff) if n_diff != 0 else 0.0
    return val, far
127.5) * 0.00784313725 + data_list[flip][i] = flow.tensor(img, dtype=flow.float) + + if i % 1000 == 0: + logging.info("loading bin:%d", i) + logging.info(data_list[0].shape) + return data_list, issame_list + + +@flow.no_grad() +def test(data_set, backbone, batch_size, nfolds=10, is_consistent=False): + logging.info("testing verification..") + data_list = data_set[0] + issame_list = data_set[1] + embeddings_list = [] + time_consumed = 0.0 + if is_consistent: + placement = flow.env.all_device_placement("cpu") + sbp = flow.sbp.split(0) + + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + img = data[bb - batch_size : bb] + time0 = datetime.datetime.now() + with flow.no_grad(): + if is_consistent: + img = img.to_consistent(placement=placement, sbp=sbp) + net_out = backbone(img.to("cuda")) + + if is_consistent: + _embeddings = net_out.to_local().numpy() + else: + _embeddings = net_out.detach().numpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count) :, :] + ba = bb + embeddings_list.append(embeddings) + + _xnorm = 0.0 + _xnorm_cnt = 0 + for embed in embeddings_list: + for i in range(embed.shape[0]): + _em = embed[i] + _norm = np.linalg.norm(_em) + _xnorm += _norm + _xnorm_cnt += 1 + _xnorm /= _xnorm_cnt + + embeddings = embeddings_list[0].copy() + embeddings = sklearn.preprocessing.normalize(embeddings) + acc1 = 0.0 + std1 = 0.0 + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + logging.info(embeddings.shape) + logging.info("infer time:%f" % time_consumed) + _, _, accuracy, val, val_std, far = evaluate( + embeddings, issame_list, nrof_folds=nfolds + ) + acc2, std2 = np.mean(accuracy), 
def dumpR(data_set, backbone, batch_size, name="", data_extra=None, label_shape=None):
    """Compute verification embeddings and pickle them to ``temp.bin``.

    Follows the same batching scheme as ``test()`` above: both flip variants
    in ``data_set[0]`` are embedded, the two embedding matrices are summed,
    L2-normalized, and dumped together with the is-same labels.

    BUG FIX: the original body was leftover mxnet code referencing undefined
    names (``nd``, ``mx``, ``model``, ``_label``, ``_data_extra``), so any
    call raised NameError. The inference loop is rewritten to use *backbone*
    exactly as the sibling ``test()`` does in eager (non-consistent) mode.

    :param data_set: (data_list, issame_list) as produced by load_bin_cv.
    :param backbone: oneflow module mapping a CUDA image batch to embeddings.
    :param batch_size: inference batch size.
    :param name: unused, kept for interface compatibility.
    :param data_extra: unused legacy mxnet parameter, kept for compatibility.
    :param label_shape: unused legacy mxnet parameter, kept for compatibility.
    """
    print("dump verification embedding..")
    data_list = data_set[0]
    issame_list = data_set[1]
    embeddings_list = []
    time_consumed = 0.0
    for i in range(len(data_list)):
        data = data_list[i]
        embeddings = None
        ba = 0
        while ba < data.shape[0]:
            bb = min(ba + batch_size, data.shape[0])
            count = bb - ba
            # Full-width slice ending at bb (mirrors test()); the overlap with
            # the previous batch is discarded below via (batch_size - count).
            _data = data[bb - batch_size : bb]
            time0 = datetime.datetime.now()
            with flow.no_grad():
                net_out = backbone(_data.to("cuda"))
            _embeddings = net_out.detach().numpy()
            time_now = datetime.datetime.now()
            time_consumed += (time_now - time0).total_seconds()
            if embeddings is None:
                embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
            embeddings[ba:bb, :] = _embeddings[(batch_size - count) :, :]
            ba = bb
        embeddings_list.append(embeddings)
    # Sum original + flipped embeddings, then normalize, as in test().
    embeddings = embeddings_list[0] + embeddings_list[1]
    embeddings = sklearn.preprocessing.normalize(embeddings)
    outname = os.path.join("temp.bin")
    with open(outname, "wb") as f:
        pickle.dump((embeddings, issame_list), f, protocol=pickle.HIGHEST_PROTOCOL)
def make_data_loader(args, mode, is_consistent=False, synthetic=False):
    """Build the train/validation data loader.

    :param args: config namespace; train mode reads batch_size/num_image,
        validation mode reads the val_* counterparts.
    :param mode: "train" or "validation".
    :param is_consistent: if True, place the loader on all devices (CPU) with
        split(0) SBP and load the *global* batch per step.
    :param synthetic: if True, return a random-data loader (benchmarking)
        instead of reading OFRecord files.
    :return: a callable loader yielding (image, label) batches.
    """
    assert mode in ("train", "validation")

    if mode == "train":
        total_batch_size = args.batch_size*flow.env.get_world_size()
        batch_size = args.batch_size
        num_samples = args.num_image
    else:
        total_batch_size = args.val_global_batch_size
        batch_size = args.val_batch_size
        num_samples = args.val_samples_per_epoch

    placement = None
    sbp = None

    if is_consistent:
        placement = flow.env.all_device_placement("cpu")
        sbp = flow.sbp.split(0)
        # In consistent mode each step consumes the global batch, which the
        # split(0) SBP then shards across ranks.
        batch_size = total_batch_size

    if synthetic:

        data_loader = SyntheticDataLoader(
            batch_size=batch_size,
            num_classes=args.num_classes,
            placement=placement,
            sbp=sbp,
        )
        return data_loader.to("cuda")

    ofrecord_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode=mode,
        dataset_size=num_samples,
        batch_size=batch_size,
        total_batch_size=total_batch_size,
        data_part_num=args.ofrecord_part_num,
        placement=placement,
        sbp=sbp,
    )
    return ofrecord_data_loader
class Train_Module(flow.nn.Module):
    """Backbone + FC7 margin head wired for eager or graph (consistent) mode.

    In graph mode the backbone is broadcast across the placement while the FC
    head is either broadcast (data parallel) or split(0) over the class axis
    (model parallel, optionally with partial-fc sampling).
    """

    def __init__(self, cfg, backbone, placement, world_size):
        super(Train_Module, self).__init__()
        self.placement = placement

        if cfg.graph:
            if cfg.model_parallel:
                input_size = cfg.embedding_size
                # Per-rank share of the class axis.
                # NOTE(review): integer division truncates when num_classes is
                # not divisible by world_size -- confirm classes aren't dropped.
                output_size = int(cfg.num_classes/world_size)
                self.fc = FC7(input_size, output_size, cfg, partial_fc=cfg.partial_fc).to_consistent(
                    placement=placement, sbp=flow.sbp.split(0))
            else:
                self.fc = FC7(cfg.embedding_size, cfg.num_classes, cfg).to_consistent(
                    placement=placement, sbp=flow.sbp.broadcast)
            self.backbone = backbone.to_consistent(
                placement=placement, sbp=flow.sbp.broadcast)
        else:
            # Eager/DDP path: plain local modules.
            self.backbone = backbone
            self.fc = FC7(cfg.embedding_size, cfg.num_classes, cfg)

    def forward(self, x, labels):
        """Embed *x* and apply the FC head; returns logits (and the possibly
        remapped labels when FC7 runs in consistent/partial-fc mode)."""
        x = self.backbone(x)
        if x.is_consistent:
            # FC7 needs the full embedding on every rank before the split fc.
            x = x.to_consistent(sbp=flow.sbp.broadcast)
        x = self.fc(x, labels)
        return x
self.load_state_dict() + else: + logging.info("Model resume failed! load path is None ") + + # optimizer + self.optimizer = make_optimizer(cfg, self.train_module) + + # data + self.train_data_loader = make_data_loader( + cfg, 'train', self.cfg.graph, self.cfg.synthetic) + + # loss + if cfg.loss == "cosface": + self.margin_softmax = flow.nn.CombinedMarginLoss( + 1, 0., 0.4).to("cuda") + else: + self.margin_softmax = flow.nn.CombinedMarginLoss( + 1, 0.5, 0.).to("cuda") + + self.of_cross_entropy = CrossEntropyLoss_sbp() + + # lr_scheduler + self.decay_step = self.cal_decay_step() + self.scheduler = flow.optim.lr_scheduler.MultiStepLR( + optimizer=self.optimizer, milestones=self.decay_step, gamma=0.1 + ) + + # log + self.callback_logging = CallBackLogging( + 50, rank, cfg.total_step, cfg.batch_size, world_size, None) + # val + self.callback_verification = CallBackVerification( + 600, rank, cfg.val_targets, cfg.ofrecord_path, is_consistent=cfg.graph) + # save checkpoint + self.callback_checkpoint = CallBackModelCheckpoint(rank, cfg.output) + + self.losses = AverageMeter() + self.start_epoch = 0 + self.global_step = 0 + + def __call__(self): + # Train + if self.cfg.graph: + self.train_graph() + else: + self.train_eager() + + def load_state_dict(self): + + if self.is_consistent: + state_dict = flow.load(self.load_path, consistent_src_rank=0) + elif self.rank == 0: + state_dict = flow.load(self.load_path) + else: + return + logging.info("Model resume successfully!") + self.model.load_state_dict(state_dict) + + def cal_decay_step(self): + cfg = self.cfg + num_image = cfg.num_image + total_batch_size = cfg.batch_size * self.world_size + self.warmup_step = num_image // total_batch_size * cfg.warmup_epoch + self.cfg.total_step = num_image // total_batch_size * cfg.num_epoch + logging.info("Total Step is:%d" % self.cfg.total_step) + return [x * num_image // total_batch_size for x in cfg.decay_epoch] + + def train_graph(self): + train_graph = TrainGraph(self.train_module, 
    def train_eager(self):
        """Eager-mode training loop with DistributedDataParallel.

        Per step: fetch a batch, forward through backbone+FC7, apply the
        combined margin (scaled by 64 -- presumably the ArcFace feature scale
        s=64; confirm against the loss definition), cross-entropy, backward,
        SGD step.
        """
        self.train_module = ddp(self.train_module)
        for epoch in range(self.start_epoch, self.cfg.num_epoch):
            self.train_module.train()

            one_epoch_steps = len(self.train_data_loader)
            for steps in range(one_epoch_steps):
                self.global_step += 1
                image, label = self.train_data_loader()
                image = image.to("cuda")
                label = label.to("cuda")
                features_fc7 = self.train_module(image, label)
                features_fc7 = self.margin_softmax(features_fc7, label)*64
                loss = self.of_cross_entropy(features_fc7, label)
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()

                loss = loss.numpy()
                self.losses.update(loss, 1)
                self.callback_logging(self.global_step, self.losses, epoch, False,
                                      self.scheduler.get_last_lr()[0])
                self.callback_verification(self.global_step, self.backbone)
            # NOTE(review): cal_decay_step() produces milestones measured in
            # optimizer steps, but the scheduler is stepped once per *epoch*
            # here -- the decay schedule looks off by a factor of steps/epoch;
            # confirm intended units.
            self.scheduler.step()
            self.callback_checkpoint(
                self.global_step, epoch, self.train_module)
class TrainGraph(flow.nn.Graph):
    """Static (nn.Graph) training step: data -> margin logits -> CE -> backward.

    AMP with a dynamic grad scaler is enabled under fp16; otherwise an
    optional static scaler (world-size factor) is installed when
    cfg.scale_grad is set.
    """

    def __init__(
        self,
        model,
        cfg,
        combine_margin,
        cross_entropy,
        data_loader,
        optimizer,
        lr_scheduler=None,
    ):
        super().__init__()

        if cfg.fp16:
            self.config.enable_amp(True)
            self.set_grad_scaler(make_grad_scaler())
        elif cfg.scale_grad:
            self.set_grad_scaler(make_static_grad_scaler())

        # Kernel-fusion knobs for the compiled graph.
        self.config.allow_fuse_add_to_output(True)
        self.config.allow_fuse_model_update_ops(True)

        self.model = model

        self.cross_entropy = cross_entropy
        self.combine_margin = combine_margin
        self.data_loader = data_loader
        self.add_optimizer(optimizer, lr_sch=lr_scheduler)

    def build(self):
        image, label = self.data_loader()

        image = image.to("cuda")
        label = label.to("cuda")

        # NOTE(review): assumes the model returns (logits, label), i.e. the
        # consistent path of Train_Module/FC7.forward -- the eager path
        # returns logits only; confirm this graph is only built in graph mode.
        logits, label = self.model(image, label)
        # 64 is presumably the ArcFace feature scale s -- confirm.
        logits = self.combine_margin(logits, label) * 64
        loss = self.cross_entropy(logits, label)

        loss.backward()
        return loss
+from os import mkdir +from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check +import oneflow as flow + +import logging +from backbones import get_model +from utils.utils_config import get_config +import argparse +import tempfile + + +class ModelGraph(flow.nn.Graph): + def __init__(self, model): + super().__init__() + self.backbone = model + + def build(self, x): + x = x.to("cuda") + out = self.backbone(x) + return out + + +def convert_func(cfg, model_path, out_path,image_size): + + model_module = get_model(cfg.network, dropout=0.0, + num_features=cfg.embedding_size).to("cuda") + model_module.eval() + print(model_module) + model_graph = ModelGraph(model_module) + model_graph._compile(flow.randn(1, 3, image_size, image_size).to("cuda")) + + with tempfile.TemporaryDirectory() as tmpdirname: + new_parameters = dict() + parameters = flow.load(model_path) + for key, value in parameters.items(): + if "num_batches_tracked" not in key: + if key == "fc.weight": + continue + val = value + new_key = key.replace("backbone.", "") + new_parameters[new_key] = val + model_module.load_state_dict(new_parameters) + flow.save(model_module.state_dict(), tmpdirname) + convert_to_onnx_and_check( + model_graph, flow_weight_dir=tmpdirname, onnx_model_path="./", print_outlier=True) + + +def main(args): + logging.basicConfig(level=logging.NOTSET) + logging.info(args.model_path) + cfg = get_config(args.config) + if not os.path.exists(args.out_path): + mkdir(args.out_path) + convert_func(cfg, args.model_path, args.out_path,args.image_size) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='OneFlow ArcFace val') + parser.add_argument('config', type=str, help='py config file') + parser.add_argument('--model_path', type=str, help='model path') + parser.add_argument('--image_size', type=int, + default=112, help='input image size') + parser.add_argument('--out_path', type=str, + default="onnx_model", help='out path') + diff --git 
a/insightface/recognition/arcface_oneflow/requirements.txt b/insightface/recognition/arcface_oneflow/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b31a23c62d60e2d99b56db0cac7cbdebdd0952e --- /dev/null +++ b/insightface/recognition/arcface_oneflow/requirements.txt @@ -0,0 +1,7 @@ +numpy +matplotlib +Pillow +opencv-python +scikit-learn +scipy +easydict \ No newline at end of file diff --git a/insightface/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord.py b/insightface/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord.py new file mode 100644 index 0000000000000000000000000000000000000000..b7047a3ab1e4650010545784743c49ea34f4a5c1 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord.py @@ -0,0 +1,136 @@ +import os +import sys +import struct +import argparse + +from mxnet import recordio +import oneflow.core.record.record_pb2 as of_record + + +def parse_arguement(argv): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--data_dir", + type=str, + default="insightface/datasets/faces_emore", + help="Root directory to mxnet dataset.", + ) + parser.add_argument( + "--output_filepath", + type=str, + default="./output", + help="Path to output OFRecord.", + ) + return parser.parse_args(argv) + + +def load_train_data(data_dir): + + path_imgrec = os.path.join(data_dir, "train.rec") + path_imgidx = path_imgrec[0:-4] + ".idx" + + print( + "Loading recordio {}\n\ + Corresponding record idx is {}".format( + path_imgrec, path_imgidx + ) + ) + + imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "r", key_type=int) + + # Read header0 to get some info. 
+ identity_key_start = 0 + identity_key_end = 0 + imgidx_list = [] + id2range = {} + + rec0 = imgrec.read_idx(0) + header0, img_str = recordio.unpack(rec0) + if header0.flag > 0: + identity_key_start = int(header0.label[0]) + identity_key_end = int(header0.label[1]) + imgidx_list = range(1, identity_key_start) + + # Read identity id range + for identity in range(identity_key_start, identity_key_end): + rec = imgrec.read_idx(identity) + header, s = recordio.unpack(rec) + a, b = int(header.label[0]), int(header.label[1]) + id2range[identity] = (a, b) + + else: + imgidx_list = imgrec.keys + return imgrec, imgidx_list + + +def convert_to_ofrecord(img_data): + """ Convert python dictionary format data of one image to of.Example proto. + Args: + img_data: Python dict. + Returns: + example: The converted of.Example + """ + + def _int32_feature(value): + """Wrapper for inserting int32 features into Example proto.""" + if not isinstance(value, list): + value = [value] + return of_record.Feature(int32_list=of_record.Int32List(value=value)) + + def _float_feature(value): + """Wrapper for inserting float features into Example proto.""" + if not isinstance(value, list): + value = [value] + return of_record.Feature(float_list=of_record.FloatList(value=value)) + + def _double_feature(value): + """Wrapper for inserting double features into Example proto.""" + if not isinstance(value, list): + value = [value] + return of_record.Feature(double_list=of_record.DoubleList(value=value)) + + def _bytes_feature(value): + """Wrapper for inserting bytes features into Example proto.""" + # if isinstance(value, six.string_types): + # value = six.binary_type(value, encoding='utf-8') + return of_record.Feature(bytes_list=of_record.BytesList(value=[value])) + + example = of_record.OFRecord( + feature={ + "label": _int32_feature(img_data["label"]), + "encoded": _bytes_feature(img_data["pixel_data"]), + } + ) + + return example + + +def main(args): + # Convert recordio to ofrecord + imgrec,
imgidx_list = load_train_data(data_dir=args.data_dir) + + output_dir = os.path.join(args.output_filepath, "train") + if not os.path.exists(output_dir): + os.makedirs(output_dir) + output_file = os.path.join(output_dir, "part-0") + with open(output_file, "wb") as f: + for idx in imgidx_list: + if idx % 10000 == 0: + print("Converting images: {} of {}".format(idx, len(imgidx_list))) + + img_data = {} + rec = imgrec.read_idx(idx) + header, s = recordio.unpack(rec) + img_data["label"] = int(header.label[0]) + img_data["pixel_data"] = s + + example = convert_to_ofrecord(img_data) + print("shape", len(img_data["pixel_data"])) + size = example.ByteSize() + f.write(struct.pack("q", size)) + f.write(example.SerializeToString()) + + +if __name__ == "__main__": + main(parse_arguement(sys.argv[1:])) diff --git a/insightface/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord_shuffled_npart.py b/insightface/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord_shuffled_npart.py new file mode 100644 index 0000000000000000000000000000000000000000..ba3b82c159a42f2acda47acd084b2d826e434441 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord_shuffled_npart.py @@ -0,0 +1,156 @@ +import os +import sys +import struct +import argparse +import numbers +import random + +from mxnet import recordio +import oneflow.core.record.record_pb2 as of_record + + +def parse_arguement(argv): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--data_dir", + type=str, + default="insightface/datasets/faces_emore", + help="Root directory to mxnet dataset.", + ) + parser.add_argument( + "--output_filepath", + type=str, + default="./ofrecord", + help="Path to output OFRecord.", + ) + parser.add_argument( + "--num_part", type=int, default=96, help="num_part of OFRecord to generate.", + ) + return parser.parse_args(argv) + + +def load_train_data(data_dir): + + path_imgrec = os.path.join(data_dir, "train.rec") + path_imgidx = path_imgrec[0:-4] + ".idx" 
+ + print( + "Loading recordio {}\n\ + Corresponding record idx is {}".format( + path_imgrec, path_imgidx + ) + ) + + imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "r", key_type=int) + + # Read header0 to get some info. + identity_key_start = 0 + identity_key_end = 0 + imgidx_list = [] + id2range = {} + + rec0 = imgrec.read_idx(0) + header0, img_str = recordio.unpack(rec0) + if header0.flag > 0: + identity_key_start = int(header0.label[0]) + identity_key_end = int(header0.label[1]) + imgidx_list = range(1, identity_key_start) + + # Read identity id range + for identity in range(identity_key_start, identity_key_end): + rec = imgrec.read_idx(identity) + header, s = recordio.unpack(rec) + a, b = int(header.label[0]), int(header.label[1]) + id2range[identity] = (a, b) + + else: + imgidx_list = imgrec.keys + + return imgrec, imgidx_list + + +def convert_to_ofrecord(img_data): + """ Convert python dictionary formath data of one image to of.Example proto. + Args: + img_data: Python dict. 
+ Returns: + example: The converted of.Exampl + """ + + def _int32_feature(value): + """Wrapper for inserting int32 features into Example proto.""" + if not isinstance(value, list): + value = [value] + return of_record.Feature(int32_list=of_record.Int32List(value=value)) + + def _float_feature(value): + """Wrapper for inserting float features into Example proto.""" + if not isinstance(value, list): + value = [value] + return of_record.Feature(float_list=of_record.FloatList(value=value)) + + def _double_feature(value): + """Wrapper for inserting float features into Example proto.""" + if not isinstance(value, list): + value = [value] + return of_record.Feature(double_list=of_record.DoubleList(value=value)) + + def _bytes_feature(value): + """Wrapper for inserting bytes features into Example proto.""" + # if isinstance(value, six.string_types): + # value = six.binary_type(value, encoding='utf-8') + return of_record.Feature(bytes_list=of_record.BytesList(value=[value])) + + example = of_record.OFRecord( + feature={ + "label": _int32_feature(img_data["label"]), + "encoded": _bytes_feature(img_data["pixel_data"]), + } + ) + + return example + + +def main(args): + # Convert recordio to ofrecord + imgrec, imgidx_list = load_train_data(data_dir=args.data_dir) + imgidx_list = list(imgidx_list) + random.shuffle(imgidx_list) + + output_dir = os.path.join(args.output_filepath, "train") + if not os.path.exists(output_dir): + os.makedirs(output_dir) + num_images = len(imgidx_list) + num_images_per_part = (num_images + args.num_part) // args.num_part + print("num_images", num_images, "num_images_per_part", num_images_per_part) + + for part_id in range(args.num_part): + part_name = "part-" + "{:0>5d}".format(part_id) + output_file = os.path.join(output_dir, part_name) + file_idx_start = part_id * num_images_per_part + file_idx_end = min((part_id + 1) * num_images_per_part, num_images) + print("part-" + str(part_id), "start", file_idx_start, "end", file_idx_end) + with 
open(output_file, "wb") as f: + for file_idx in range(file_idx_start, file_idx_end): + idx = imgidx_list[file_idx] + if idx % 10000 == 0: + print("Converting images: {} of {}".format(idx, len(imgidx_list))) + + img_data = {} + rec = imgrec.read_idx(idx) + header, s = recordio.unpack(rec) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + img_data["label"] = int(label) + img_data["pixel_data"] = s + + example = convert_to_ofrecord(img_data) + size = example.ByteSize() + f.write(struct.pack("q", size)) + f.write(example.SerializeToString()) + + +if __name__ == "__main__": + main(parse_arguement(sys.argv[1:])) diff --git a/insightface/recognition/arcface_oneflow/train.py b/insightface/recognition/arcface_oneflow/train.py new file mode 100644 index 0000000000000000000000000000000000000000..477db3ce3a26d41e4f328c5ad83d36d00c0a4903 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/train.py @@ -0,0 +1,43 @@ +import argparse +import logging +import os +import oneflow as flow + +from function import Trainer +from utils.utils_logging import init_logging +from utils.utils_config import get_config + + +def main(args): + cfg = get_config(args.config) + cfg.graph = args.graph + rank = flow.env.get_rank() + world_size = flow.env.get_world_size() + placement = flow.env.all_device_placement("cuda") + + os.makedirs(cfg.output, exist_ok=True) + log_root = logging.getLogger() + init_logging(log_root, rank, cfg.output) + + # root dir of loading checkpoint + load_path = None + + for key, value in cfg.items(): + num_space = 25 - len(key) + logging.info(": " + key + " " * num_space + str(value)) + + trainer = Trainer(cfg, placement, load_path, world_size, rank) + trainer() + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="OneFlow ArcFace Training") + parser.add_argument("config", type=str, help="py config file") + parser.add_argument( + "--graph", + action="store_true", + help="Run model in graph mode,else run 
model in ddp mode.", + ) + parser.add_argument("--local_rank", type=int, default=0, help="local_rank") + main(parser.parse_args()) diff --git a/insightface/recognition/arcface_oneflow/train_ddp.sh b/insightface/recognition/arcface_oneflow/train_ddp.sh new file mode 100644 index 0000000000000000000000000000000000000000..86ed5415c2405532b93b6c24c589f6a4dcd8288a --- /dev/null +++ b/insightface/recognition/arcface_oneflow/train_ddp.sh @@ -0,0 +1,25 @@ +# set -aux + +MASTER_ADDR=127.0.0.1 +MASTER_PORT=17788 +DEVICE_NUM_PER_NODE=8 +NUM_NODES=1 +NODE_RANK=0 + + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +export NCCL_DEBUG=INFO +export ONEFLOW_DEBUG_MODE=True + + +NCCL_DEBUG=INFO \ +python3 -m oneflow.distributed.launch \ +--nproc_per_node $DEVICE_NUM_PER_NODE \ +--nnodes $NUM_NODES \ +--node_rank $NODE_RANK \ +--master_addr $MASTER_ADDR \ +--master_port $MASTER_PORT \ +train.py configs/ms1mv3_r50.py diff --git a/insightface/recognition/arcface_oneflow/train_graph_distributed.sh b/insightface/recognition/arcface_oneflow/train_graph_distributed.sh new file mode 100644 index 0000000000000000000000000000000000000000..4f5725059924f6c7f6d42520f09d2fa99824449b --- /dev/null +++ b/insightface/recognition/arcface_oneflow/train_graph_distributed.sh @@ -0,0 +1,26 @@ +# set -aux + +MASTER_ADDR=127.0.0.1 +MASTER_PORT=17788 +DEVICE_NUM_PER_NODE=8 +NUM_NODES=1 +NODE_RANK=0 + + + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +#export NCCL_DEBUG=INFO +export ONEFLOW_DEBUG_MODE=True + + +#NCCL_DEBUG=INFO +python3 -m oneflow.distributed.launch \ +--nproc_per_node $DEVICE_NUM_PER_NODE \ +--nnodes $NUM_NODES \ +--node_rank $NODE_RANK \ +--master_addr $MASTER_ADDR \ +--master_port $MASTER_PORT \ +train.py configs/ms1mv3_r50.py --graph diff --git 
a/insightface/recognition/arcface_oneflow/utils/__init__.py b/insightface/recognition/arcface_oneflow/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/arcface_oneflow/utils/losses.py b/insightface/recognition/arcface_oneflow/utils/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..5d4aa1d03a24b2f3368389493cf811bcebb8879d --- /dev/null +++ b/insightface/recognition/arcface_oneflow/utils/losses.py @@ -0,0 +1,66 @@ +import oneflow as flow +from oneflow import nn + + +def get_loss(name): + if name == "cosface": + return CosFace() + elif name == "arcface": + return ArcFace() + else: + raise ValueError() + + +class CrossEntropyLoss_sbp(nn.Module): + def __init__(self): + super(CrossEntropyLoss_sbp, self).__init__() + + def forward(self, logits, label): + loss = flow._C.sparse_softmax_cross_entropy( + logits, label) + loss = flow.mean(loss) + return loss + + +def get_loss(name): + if name == "cosface": + return CosFace() + elif name == "arcface": + return ArcFace() + else: + raise ValueError() + + +class CosFace(nn.Module): + def __init__(self, s=64.0, m=0.40): + super(CosFace, self).__init__() + self.s = s + self.m = m + + def forward(self, cosine, label): + index = flow.where(label != -1)[0] + m_hot = flow.zeros(index.size()[0], cosine.size()[ + 1], device=cosine.device) + + m_hot = flow.scatter(m_hot, 1, label[index, None], self.m) + cosine = cosine[index] - m_hot + + ret = cosine * self.s + return ret + + +class ArcFace(nn.Module): + def __init__(self, s=64.0, m=0.5): + super(ArcFace, self).__init__() + self.s = s + self.m = m + + def forward(self, cosine: flow.Tensor, label): + index = flow.where(label != -1)[0] + m_hot = flow.zeros(index.size()[0], cosine.size()[ + 1], device=cosine.device) + m_hot.scatter_(1, label[index, None], self.m) + cosine.acos_() + cosine[index] += m_hot + cosine.cos_().mul_(self.s) + return cosine 
diff --git a/insightface/recognition/arcface_oneflow/utils/ofrecord_data_utils.py b/insightface/recognition/arcface_oneflow/utils/ofrecord_data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..449fe25210218da6f5656f9ff51562863ed58e7a --- /dev/null +++ b/insightface/recognition/arcface_oneflow/utils/ofrecord_data_utils.py @@ -0,0 +1,148 @@ +import oneflow as flow +import oneflow.nn as nn +import os +from typing import List, Union + + +class OFRecordDataLoader(nn.Module): + def __init__( + self, + ofrecord_root: str = "./ofrecord", + mode: str = "train", # "val" + dataset_size: int = 9469, + batch_size: int = 1, + total_batch_size: int = 1, + data_part_num: int = 8, + placement: flow.placement = None, + sbp: Union[flow.sbp.sbp, List[flow.sbp.sbp]] = None, + ): + super().__init__() + channel_last = False + output_layout = "NHWC" if channel_last else "NCHW" + assert (ofrecord_root, mode) + self.train_record_reader = flow.nn.OfrecordReader( + os.path.join(ofrecord_root, mode), + batch_size=batch_size, + data_part_num=data_part_num, + part_name_suffix_length=5, + random_shuffle=True if mode == "train" else False, + shuffle_after_epoch=True if mode == "train" else False, + placement=placement, + sbp=sbp, + ) + self.record_label_decoder = flow.nn.OfrecordRawDecoder( + "label", shape=(), dtype=flow.int32 + ) + + color_space = "RGB" + height = 112 + width = 112 + + self.record_image_decoder = flow.nn.OFRecordImageDecoder( + "encoded", color_space=color_space + ) + self.resize = ( + flow.nn.image.Resize(target_size=[height, width]) + if mode == "train" + else flow.nn.image.Resize( + resize_side="shorter", keep_aspect_ratio=True, target_size=112 + ) + ) + + self.flip = ( + flow.nn.CoinFlip(batch_size=batch_size, placement=placement, sbp=sbp) + if mode == "train" + else None + ) + + rgb_mean = [127.5, 127.5, 127.5] + rgb_std = [127.5, 127.5, 127.5] + self.crop_mirror_norm = ( + flow.nn.CropMirrorNormalize( + color_space=color_space, + 
output_layout=output_layout, + mean=rgb_mean, + std=rgb_std, + output_dtype=flow.float, + ) + if mode == "train" + else flow.nn.CropMirrorNormalize( + color_space=color_space, + output_layout=output_layout, + crop_h=0, + crop_w=0, + crop_pos_y=0.5, + crop_pos_x=0.5, + mean=rgb_mean, + std=rgb_std, + output_dtype=flow.float, + ) + ) + + self.batch_size = batch_size + self.total_batch_size = total_batch_size + self.dataset_size = dataset_size + + def __len__(self): + return self.dataset_size // self.total_batch_size + + def forward(self): + train_record = self.train_record_reader() + label = self.record_label_decoder(train_record) + image_raw_buffer = self.record_image_decoder(train_record) + + image = self.resize(image_raw_buffer)[0] + + rng = self.flip() if self.flip != None else None + image = self.crop_mirror_norm(image, rng) + + return image, label + + +class SyntheticDataLoader(flow.nn.Module): + def __init__( + self, batch_size, image_size=112, num_classes=10000, placement=None, sbp=None, + ): + super().__init__() + + self.image_shape = (batch_size, 3, image_size, image_size) + self.label_shape = (batch_size,) + self.num_classes = num_classes + self.placement = placement + self.sbp = sbp + + if self.placement is not None and self.sbp is not None: + self.image = flow.nn.Parameter( + flow.randint( + 0, + high=255, + size=self.image_shape, + dtype=flow.float32, + placement=self.placement, + sbp=self.sbp, + ), + requires_grad=False, + ) + self.label = flow.nn.Parameter( + flow.randint( + 0, + high=self.num_classes, + size=self.label_shape, + placement=self.placement, + sbp=self.sbp, + ).to(dtype=flow.int32), + requires_grad=False, + ) + else: + self.image = flow.randint( + 0, high=255, size=self.image_shape, dtype=flow.float32, device="cuda" + ) + self.label = flow.randint( + 0, high=self.num_classes, size=self.label_shape, device="cuda", + ).to(dtype=flow.int32) + + def __len__(self): + return 10000 + + def forward(self): + return self.image, self.label diff 
--git a/insightface/recognition/arcface_oneflow/utils/utils_callbacks.py b/insightface/recognition/arcface_oneflow/utils/utils_callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..6d349c34456e21fa1ab4ca54a49ed2fe019e6279 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/utils/utils_callbacks.py @@ -0,0 +1,181 @@ +import logging +import os +import time +from typing import List + +import oneflow as flow + +from eval import verification +from utils.utils_logging import AverageMeter + + +class CallBackVerification(object): + def __init__( + self, + frequent, + rank, + val_targets, + rec_prefix, + image_size=(112, 112), + world_size=1, + is_consistent=False, + ): + self.frequent: int = frequent + self.rank: int = rank + self.highest_acc: float = 0.0 + self.highest_acc_list: List[float] = [0.0] * len(val_targets) + self.ver_list: List[object] = [] + self.ver_name_list: List[str] = [] + self.world_size = world_size + self.is_consistent = is_consistent + + if self.is_consistent: + self.init_dataset( + val_targets=val_targets, data_dir=rec_prefix, image_size=image_size + ) + else: + if self.rank is 0: + self.init_dataset( + val_targets=val_targets, data_dir=rec_prefix, image_size=image_size + ) + + def ver_test(self, backbone: flow.nn.Module, global_step: int): + results = [] + for i in range(len(self.ver_list)): + + acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test( + self.ver_list[i], backbone, 10, 10, self.is_consistent + ) + logging.info( + "[%s][%d]XNorm: %f" % ( + self.ver_name_list[i], global_step, xnorm) + ) + logging.info( + "[%s][%d]Accuracy-Flip: %1.5f+-%1.5f" + % (self.ver_name_list[i], global_step, acc2, std2) + ) + if acc2 > self.highest_acc_list[i]: + self.highest_acc_list[i] = acc2 + logging.info( + "[%s][%d]Accuracy-Highest: %1.5f" + % (self.ver_name_list[i], global_step, self.highest_acc_list[i]) + ) + results.append(acc2) + + def init_dataset(self, val_targets, data_dir, image_size): + + for name in 
val_targets: + path = os.path.join(data_dir, "val", name + ".bin") + if os.path.exists(path): + data_set = verification.load_bin_cv(path, image_size) + self.ver_list.append(data_set) + self.ver_name_list.append(name) + if len(self.ver_list) == 0: + logging.info("Val targets is None !") + + def __call__(self, num_update, backbone: flow.nn.Module, backbone_graph=None): + + if self.is_consistent: + if num_update > 0 and num_update % self.frequent == 0: + backbone.eval() + self.ver_test(backbone_graph, num_update) + backbone.train() + else: + if self.rank is 0 and num_update > 0 and num_update % self.frequent == 0: + backbone.eval() + self.ver_test(backbone, num_update) + backbone.train() + + +class CallBackLogging(object): + def __init__(self, frequent, rank, total_step, batch_size, world_size, writer=None): + self.frequent: int = frequent + self.rank: int = rank + self.time_start = time.time() + self.total_step: int = total_step + self.batch_size: int = batch_size + self.world_size: int = world_size + self.writer = writer + + self.init = False + self.tic = 0 + + def __call__( + self, + global_step: int, + loss: AverageMeter, + epoch: int, + fp16: bool, + learning_rate: float, + grad_scaler=None, + ): + if self.rank == 0 and global_step % self.frequent == 0: + if self.init: + try: + speed: float = self.frequent * self.batch_size / ( + time.time() - self.tic + ) + speed_total = speed * self.world_size + except ZeroDivisionError: + speed_total = float("inf") + + time_now = (time.time() - self.time_start) / 3600 + time_total = time_now / ((global_step + 1) / self.total_step) + time_for_end = time_total - time_now + if self.writer is not None: + self.writer.add_scalar( + "time_for_end", time_for_end, global_step) + self.writer.add_scalar( + "learning_rate", learning_rate, global_step) + self.writer.add_scalar("loss", loss.avg, global_step) + if fp16: + msg = ( + "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d " + "Fp16 Grad Scale: %2.f 
Required: %1.f hours" + % ( + speed_total, + loss.avg, + learning_rate, + epoch, + global_step, + time_for_end, + ) + ) + else: + msg = ( + "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d " + "Required: %1.f hours" + % ( + speed_total, + loss.avg, + learning_rate, + epoch, + global_step, + time_for_end, + ) + ) + logging.info(msg) + loss.reset() + self.tic = time.time() + else: + self.init = True + self.tic = time.time() + + +class CallBackModelCheckpoint(object): + def __init__(self, rank, output="./"): + self.rank: int = rank + self.output: str = output + + def __call__(self, global_step, epoch, backbone, is_consistent=False): + + if global_step > 100 and backbone is not None: + path_module = os.path.join(self.output, "epoch_%d" % (epoch)) + + if is_consistent: + flow.save(backbone.state_dict(), + path_module, consistent_dst_rank=0) + else: + if self.rank == 0: + flow.save(backbone.state_dict(), path_module) + logging.info("oneflow Model Saved in '{}'".format(path_module)) diff --git a/insightface/recognition/arcface_oneflow/utils/utils_config.py b/insightface/recognition/arcface_oneflow/utils/utils_config.py new file mode 100644 index 0000000000000000000000000000000000000000..b61724bd00b0f4b4c64e2a731263296946906702 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/utils/utils_config.py @@ -0,0 +1,18 @@ +import importlib +import os.path as osp + + +def get_config(config_file): + assert config_file.startswith( + "configs/" + ), "config file setting must start with configs/" + temp_config_name = osp.basename(config_file) + temp_module_name = osp.splitext(temp_config_name)[0] + config = importlib.import_module("configs.base") + cfg = config.config + config = importlib.import_module("configs.%s" % temp_module_name) + job_cfg = config.config + cfg.update(job_cfg) + if cfg.output is None: + cfg.output = osp.join("work_dirs", temp_module_name) + return cfg diff --git 
a/insightface/recognition/arcface_oneflow/utils/utils_logging.py b/insightface/recognition/arcface_oneflow/utils/utils_logging.py new file mode 100644 index 0000000000000000000000000000000000000000..543a7e14af3f0d2f591c6b908a96d744b6d9df6d --- /dev/null +++ b/insightface/recognition/arcface_oneflow/utils/utils_logging.py @@ -0,0 +1,40 @@ +import logging +import os +import sys + + +class AverageMeter(object): + """Computes and stores the average and current value + """ + + def __init__(self): + self.val = None + self.avg = None + self.sum = None + self.count = None + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def init_logging(log_root, rank, models_root): + if rank is 0: + log_root.setLevel(logging.INFO) + formatter = logging.Formatter("Training: %(asctime)s-%(message)s") + handler_file = logging.FileHandler(os.path.join(models_root, "training.log")) + handler_stream = logging.StreamHandler(sys.stdout) + handler_file.setFormatter(formatter) + handler_stream.setFormatter(formatter) + log_root.addHandler(handler_file) + log_root.addHandler(handler_stream) + log_root.info("rank_id: %d" % rank) diff --git a/insightface/recognition/arcface_oneflow/val.py b/insightface/recognition/arcface_oneflow/val.py new file mode 100644 index 0000000000000000000000000000000000000000..eb85074a8e91754be327e52c745eeb5b863a9d9b --- /dev/null +++ b/insightface/recognition/arcface_oneflow/val.py @@ -0,0 +1,43 @@ +import oneflow as flow +from utils.utils_callbacks import CallBackVerification +from backbones import get_model +from graph import TrainGraph, EvalGraph +import logging +import argparse +from utils.utils_config import get_config + + +def main(args): + + cfg = get_config(args.config) + logging.basicConfig(level=logging.NOTSET) + logging.info(args.model_path) + + backbone = get_model(cfg.network, 
dropout=0.0, num_features=cfg.embedding_size).to( + "cuda" + ) + val_callback = CallBackVerification( + 1, 0, cfg.val_targets, cfg.ofrecord_path) + + state_dict = flow.load(args.model_path) + + new_parameters = dict() + for key, value in state_dict.items(): + if "num_batches_tracked" not in key: + if key == "fc.weight": + continue + new_key = key.replace("backbone.", "") + new_parameters[new_key] = value + + backbone.load_state_dict(new_parameters) + + infer_graph = EvalGraph(backbone, cfg) + val_callback(1000, backbone, infer_graph) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="OneFlow ArcFace val") + parser.add_argument("config", type=str, help="py config file") + parser.add_argument("--model_path", type=str, help="model path") + main(parser.parse_args()) diff --git a/insightface/recognition/arcface_oneflow/val.sh b/insightface/recognition/arcface_oneflow/val.sh new file mode 100644 index 0000000000000000000000000000000000000000..8151f400c95244719daba5c99e01b90b849ca855 --- /dev/null +++ b/insightface/recognition/arcface_oneflow/val.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python val.py configs/ms1mv3_r50 --model_path eager_test/epoch_0 diff --git a/insightface/recognition/arcface_paddle/README.md b/insightface/recognition/arcface_paddle/README.md new file mode 120000 index 0000000000000000000000000000000000000000..13c4f964bb9063f28d6e08dfb8c6b828a81d2536 --- /dev/null +++ b/insightface/recognition/arcface_paddle/README.md @@ -0,0 +1 @@ +README_en.md \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/README_cn.md b/insightface/recognition/arcface_paddle/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..840ff77dbc00d89d12e4b698d2f3c8afd6290972 --- /dev/null +++ b/insightface/recognition/arcface_paddle/README_cn.md @@ -0,0 +1,281 @@ +简体中文 | [English](README_en.md) + +# Arcface-Paddle + +* [1. 简介](#简介) +* [2. 环境准备](#环境准备) +* [3. 
数据准备](#数据准备) + * [3.1 下载数据集](#下载数据集) + * [3.2 从 MXNet 格式数据集抽取图像](#从MXNet格式数据集抽取图像) +* [4. 训练](#训练) + * [4.1 单机单卡](#单机单卡) + * [4.2 单机 8 卡](#单机8卡) +* [5. 模型评价](#模型评价) +* [6. 模型导出](#模型导出) +* [7 模型推理](#模型推理) +* [8 模型性能](#模型性能) + * [8.1 轻量化模型性能](#轻量化模型性能) + * [8.2 验证集准确率](#验证集准确率) + * [8.3 最大类别数支持](#最大类别数支持) + * [8.4 吞吐量对比](#吞吐量对比) +* [9. 全流程推理](#全流程推理) + + + +## 1. 简介 + +`Arcface-Paddle`是基于PaddlePaddle实现的,开源深度人脸检测、识别工具。`Arcface-Paddle`目前提供了三个预训练模型,包括用于人脸检测的 `BlazeFace`、用于人脸识别的 `ArcFace` 和 `MobileFace`。 + +- 本部分内容为人脸识别部分。 +- 人脸检测相关内容可以参考:[基于BlazeFace的人脸检测](../../detection/blazeface_paddle/README_cn.md)。 +- 基于PaddleInference的Whl包预测部署内容可以参考:[Whl包预测部署](https://github.com/littletomatodonkey/insight-face-paddle)。 + +注: 在此非常感谢 [GuoQuanhao](https://github.com/GuoQuanhao) 基于PaddlePaddle复现了 [Arcface的基线模型](https://github.com/GuoQuanhao/arcface-Paddle)。 + + + +## 2. 环境准备 + +请参照 [Installation](./install_cn.md) 配置实验所需环境。 + + + +## 3. 数据准备 + + + +### 3.1 下载数据集 + +数据集可以从 [insightface datasets](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_) 下载. + +* MS1M_v2: MS1M-ArcFace +* MS1M_v3: MS1M-RetinaFace + + + +### 3.2 从 MXNet 格式数据集抽取图像 + +```shell +python tools/mx_recordio_2_images.py --root_dir ms1m-retinaface-t1/ --output_dir MS1M_v3/ +``` + +当数据集抽取完后,输出的图像数据集目录结构如下: + +``` +MS1M_v3 +|_ images +| |_ 00000001.jpg +| |_ ... +| |_ 05179510.jpg +|_ label.txt +|_ agedb_30.bin +|_ cfp_ff.bin +|_ cfp_fp.bin +|_ lfw.bin +``` + +标签文件格式如下: + +``` +# 图像路径与标签的分隔符: "\t" +# 以下是 label.txt 每行的格式 +images/00000001.jpg 0 +... +``` + +如果你想使用自定义数据集训练,可以根据以上目录结构和标签文件格式组织数据。 + + + +## 4. 
训练 + + + +### 4.1 单机单卡 + +```bash +export CUDA_VISIBLE_DEVICES=1 +python tools/train.py \ + --config_file configs/ms1mv2_mobileface.py \ + --embedding_size 128 \ + --sample_ratio 1.0 \ + --loss ArcFace \ + --batch_size 512 \ + --dataset MS1M_v2 \ + --num_classes 85742 \ + --data_dir MS1M_v2/ \ + --label_file MS1M_v2/label.txt \ + --fp16 False +``` + + +### 4.2 单机 8 卡 + +为了方便训练,已经为用户准备好训练启动脚本。 + +#### 静态图模式训练 + +```bash +sh scripts/train_static.sh +``` + +#### 动态图模式训练 + +```bash +sh scripts/train_dynamic.sh +``` + +注:多机器多卡训练参见 ``paddle.distributed.launch`` API 文档。单机与多机训练不同之处在于多机需要设置 ``--ips`` 参数。 + +在训练过程中,你可以实时通过 `VisualDL` 可视化查看 loss 的变化,更多信息可以参考 [VisualDL](https://github.com/PaddlePaddle/VisualDL/)。 + + + +## 5. 模型评价 + +模型评价可以通过以下脚本启动 + +#### 静态图模式 + +```bash +sh scripts/validation_static.sh +``` + +#### 动态图模式 + +```bash +sh scripts/validation_dynamic.sh +``` + + + +## 6. 模型导出 + +PaddlePaddle 支持用预测引擎直接推理,首先,需要导出推理模型,通过以下脚本进行导出 + +#### 静态图模式 + +```bash +sh scripts/export_static.sh +``` + +#### 动态图模式 + +```bash +sh scripts/export_dynamic.sh +``` + + + +## 7. 模型推理 + +模型推理过程支持 paddle 格式的 ``save inference model`` 和 onnx 格式。 + +```bash +sh scripts/inference.sh +``` + + + +## 8. 
模型性能 + + + +### 8.1 轻量化模型性能 + +**配置:** + * CPU: Intel(R) Xeon(R) Gold 6184 CPU @ 2.40GHz + * GPU: a single NVIDIA Tesla V100 + * Precison: FP32 + * BatchSize: 64/512 + * SampleRatio: 1.0 + * Embedding Size: 128 + * MS1MV2 + +| Model structure | lfw | cfp_fp | agedb30 | CPU time cost | GPU time cost | Inference model | +| ------------------------- | ------ | ------- | ------- | -------| -------- |---- | +| MobileFace-Paddle | 0.9952 | 0.9280 | 0.9612 | 4.3ms | 2.3ms | [download link](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/mobileface_v1.0_infer.tar) | +| MobileFace-mxnet | 0.9950 | 0.8894 | 0.9591 | 7.3ms | 4.7ms | - | + +* 注: MobileFace-Paddle 是使用 MobileFaceNet_128 backbone 训练出的模型 + + + +### 8.2 验证集准确率 + +**配置:** + * GPU: 8 NVIDIA Tesla V100 32G + * Precison: Pure FP16 + * BatchSize: 128/1024 + +| Mode | Datasets | backbone | Ratio | agedb30 | cfp_fp | lfw | log | checkpoint | +| ------- | :------: | :------- | ----- | :------ | :----- | :--- | :--- | :--- | +| Static | MS1MV3 | r50 | 0.1 | 0.98317 | 0.98943| 0.99850 | [log](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/logs/static/ms1mv3_r50_static_128_fp16_0.1/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz) | +| Static | MS1MV3 | r50 | 1.0 | 0.98283 | 0.98843| 0.99850 | [log](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/logs/static/ms1mv3_r50_static_128_fp16_1.0/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_1.0_epoch_24.tgz) | +| Dynamic | MS1MV3 | r50 | 0.1 | 0.98333 | 0.98900| 0.99833 | [log](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/logs/dynamic/ms1mv3_r50_dynamic_128_fp16_0.1/training.log) | 
[checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_dynamic_128_fp16_0.1_eopch_24.tgz) | +| Dynamic | MS1MV3 | r50 | 1.0 | 0.98317 | 0.98900| 0.99833 | [log](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/logs/dynamic/ms1mv3_r50_dynamic_128_fp16_1.0/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_dynamic_128_fp16_1.0_eopch_24.tgz) | + + + +### 8.3 最大类别数支持 + +**配置:** + * GPU: 8 NVIDIA Tesla V100 32G (32510MiB) + * BatchSize: 64/512 + * SampleRatio: 0.1 + +| Mode | Precision | Res50 | Res100 | +| ------------------------- | --------- | -------- | -------- | +| Framework1 (static) | AMP | 42000000 (31792MiB)| 39000000 (31938MiB)| +| Framework2 (dynamic) | AMP | 30000000 (31702MiB)| 29000000 (32286MiB)| +| Paddle (static) | Pure FP16 | 60000000 (32018MiB)| 60000000 (32018MiB)| +| Paddle (dynamic) | Pure FP16 | 59000000 (31970MiB)| 59000000 (31970MiB)| + +* 注:在跑实验前配置环境变量 ``export FLAGS_allocator_strategy=naive_best_fit`` + + + +### 8.4 吞吐量对比 + +**配置:** + * BatchSize: 128/1024 + * SampleRatio: 0.1 + * Datasets: MS1MV3 + * V100: Driver Version: 450.80.02, CUDA Version: 11.0 + * A100: Driver Version: 460.32.03, CUDA Version: 11.2 + +![insightface_throughtput](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/images/insightface_throughtput.png) + +更多实验结果可以参考 [PLSC](https://github.com/PaddlePaddle/PLSC),PLSC (Paddle Large Scale Classification) 是 Paddle 官方开源的大规模分类库,支持单机 8 卡 NVIDIA V100 (32G) 训练 6000 万类,目前还在持续更新中,请关注。 + + + +## 9.
全流程推理 + +首先下载索引库、待识别图像与字体文件。 + +```bash +# 下载用于人脸识别的索引库,这里因为示例图像是老友记中的图像,所以使用老友记中角色的人脸图像构建的底库。 +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/demo/friends/index.bin +# 下载用于人脸识别的示例图像 +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/demo/friends/query/friends2.jpg +# 下载字体,用于可视化 +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/SourceHanSansCN-Medium.otf +``` + +`检测+识别` 串联预测的示例脚本如下。 + +```shell +# 同时使用检测+识别 +python3.7 tools/test_recognition.py --det --rec --index=index.bin --input=friends2.jpg --output="./output" +``` + +最终可视化结果保存在`output`目录下,可视化结果如下所示。 + +
+ +
+ +更多关于参数解释,索引库构建、whl包预测部署和Paddle Serving预测部署的内容可以参考: + * [Whl包预测部署](https://github.com/littletomatodonkey/insight-face-paddle) + * [Paddle Serving预测部署](./deploy/pdserving/README_CN.md) diff --git a/insightface/recognition/arcface_paddle/README_en.md b/insightface/recognition/arcface_paddle/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..6b38d9ae4d6d0a6cb0db2fa5b4137d257855fd9f --- /dev/null +++ b/insightface/recognition/arcface_paddle/README_en.md @@ -0,0 +1,284 @@ +[简体中文](README_cn.md) | English + +# Arcface-Paddle + +* [1. Introduction](#Introduction) +* [2. Environment Preparation](#Environment_Preparation) +* [3. Data Preparation](#Data_Preparation) + * [3.1 Download Dataset](#Download_Dataset) + * [3.2 Extract MXNet Dataset to images](#Extract_MXNet_Dataset_to_images) +* [4. How to Training](#How_to_Training) + * [4.1 Single Node, Single GPU](#Single_Node_Single_GPU) + * [4.2 Single Node, 8 GPUs](#Single_Node_8_GPU) +* [5. Model Evaluation](#Model_Evaluation) +* [6. Export Model](#Export_Model) +* [7 Model Inference](#Model_Inference) +* [8 Model Performance](#Model_Performance) + * [8.1 Performance of Lighting Model](#Performance_of_Lighting_Model) + * [8.2 Accuracy on Verification Datasets](#Accuracy_on_Verification_Datasets) + * [8.3 Maximum Number of Identities ](#Maximum_Number_of_Identities) + * [8.4 Throughtput](#Throughtput) +* [9. Inference Combined with Face Detection Model](#Inference_Combined_with_Face_Detection_Model) + + + + +## 1. Introduction + +`Arcface-Paddle` is an open source deep face detection and recognition toolkit, powered by PaddlePaddle. `Arcface-Paddle` provides three related pretrained models now, include `BlazeFace` for face detection, `ArcFace` and `MobileFace` for face recognition. + +- This tutorial is mainly about face recognition. +- For face detection task, please refer to: [Face detection tuturial](../../detection/blazeface_paddle/README_en.md). 
+- For Whl package inference using PaddleInference, please refer to [whl package inference](https://github.com/littletomatodonkey/insight-face-paddle). + +Note: Many thanks to [GuoQuanhao](https://github.com/GuoQuanhao) for the reproduction of the [Arcface basline using PaddlePaddle](https://github.com/GuoQuanhao/arcface-Paddle). + + + +## 2. Environment Preparation + +Please refer to [Installation](./install_en.md) to setup environment at first. + + + +## 3. Data Preparation + + + +### 3.1 Download Dataset + +Download the dataset from [insightface datasets](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_). + +* MS1M_v2: MS1M-ArcFace +* MS1M_v3: MS1M-RetinaFace + + + +### 3.2 Extract MXNet Dataset to images + +```shell +python tools/mx_recordio_2_images.py --root_dir ms1m-retinaface-t1/ --output_dir MS1M_v3/ +``` + +After finishing unzipping the dataset, the folder structure is as follows. + +``` +MS1M_v3 +|_ images +| |_ 00000001.jpg +| |_ ... +| |_ 05179510.jpg +|_ label.txt +|_ agedb_30.bin +|_ cfp_ff.bin +|_ cfp_fp.bin +|_ lfw.bin +``` + +Label file format is as follows. + +``` +# delimiter: "\t" +# the following the content of label.txt +images/00000001.jpg 0 +... +``` + +If you want to use customed dataset, you can arrange your data according to the above format. + + + +## 4. 
How to Training + + + +### 4.1 Single Node, Single GPU + +```bash +export CUDA_VISIBLE_DEVICES=1 +python tools/train.py \ + --config_file configs/ms1mv2_mobileface.py \ + --embedding_size 128 \ + --sample_ratio 1.0 \ + --loss ArcFace \ + --batch_size 512 \ + --dataset MS1M_v2 \ + --num_classes 85742 \ + --data_dir MS1M_v2/ \ + --label_file MS1M_v2/label.txt \ + --fp16 False +``` + + + +### 4.2 Single Node, 8 GPUs + +#### Static Mode + +```bash +sh scripts/train_static.sh +``` + +#### Dynamic Mode + +```bash +sh scripts/train_dynamic.sh +``` + + +During training, you can view loss changes in real time through `VisualDL`, For more information, please refer to [VisualDL](https://github.com/PaddlePaddle/VisualDL/). + + + + +## 5. Model Evaluation + +The model evaluation process can be started as follows. + +#### Static Mode + +```bash +sh scripts/validation_static.sh +``` + +#### Dynamic Mode + +```bash +sh scripts/validation_dynamic.sh +``` + + + +## 6. Export Model +PaddlePaddle supports inference using prediction engines. Firstly, you should export inference model. + +#### Static Mode + +```bash +sh scripts/export_static.sh +``` + +#### Dynamic Mode + +```bash +sh scripts/export_dynamic.sh +``` + +We also support export to onnx model, you only need to set `--export_type onnx`. + + + +## 7. Model Inference + +The model inference process supports paddle save inference model and onnx model. + +```bash +sh scripts/inference.sh +``` + + + +## 8. 
Model Performance + + + +### 8.1 Performance of Lighting Model + +**Configuration:** + * CPU: Intel(R) Xeon(R) Gold 6184 CPU @ 2.40GHz + * GPU: a single NVIDIA Tesla V100 + * Precison: FP32 + * BatchSize: 64/512 + * SampleRatio: 1.0 + * Embedding Size: 128 + * MS1MV2 + +| Model structure | lfw | cfp_fp | agedb30 | CPU time cost | GPU time cost | Inference model | +| ------------------------- | ------ | ------- | ------- | -------| -------- |---- | +| MobileFace-Paddle | 0.9952 | 0.9280 | 0.9612 | 4.3ms | 2.3ms | [download link](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/mobileface_v1.0_infer.tar) | +| MobileFace-mxnet | 0.9950 | 0.8894 | 0.9591 | 7.3ms | 4.7ms | - | + +* Note: MobileFace-Paddle training using MobileFaceNet_128 + + + +### 8.2 Accuracy on Verification Datasets + +**Configuration:** + * GPU: 8 NVIDIA Tesla V100 32G + * Precison: Pure FP16 + * BatchSize: 128/1024 + +| Mode | Datasets | backbone | Ratio | agedb30 | cfp_fp | lfw | log | checkpoint | +| ------- | :------: | :------- | ----- | :------ | :----- | :--- | :--- | :--- | +| Static | MS1MV3 | r50 | 0.1 | 0.98317 | 0.98943| 0.99850 | [log](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/logs/static/ms1mv3_r50_static_128_fp16_0.1/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz) | +| Static | MS1MV3 | r50 | 1.0 | 0.98283 | 0.98843| 0.99850 | [log](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/logs/static/ms1mv3_r50_static_128_fp16_1.0/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_1.0_epoch_24.tgz) | +| Dynamic | MS1MV3 | r50 | 0.1 | 0.98333 | 0.98900| 0.99833 | [log](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/logs/dynamic/ms1mv3_r50_dynamic_128_fp16_0.1/training.log) | 
[checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_dynamic_128_fp16_0.1_eopch_24.tgz) | +| Dynamic | MS1MV3 | r50 | 1.0 | 0.98317 | 0.98900| 0.99833 | [log](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/logs/dynamic/ms1mv3_r50_dynamic_128_fp16_1.0/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_dynamic_128_fp16_1.0_eopch_24.tgz) | + + + + +### 8.3 Maximum Number of Identities + +**Configuration:** + * GPU: 8 NVIDIA Tesla V100 32G (32510MiB) + * BatchSize: 64/512 + * SampleRatio: 0.1 + +| Mode | Precision | Res50 | Res100 | +| ------------------------- | --------- | -------- | -------- | +| Framework1 (static) | AMP | 42000000 (31792MiB)| 39000000 (31938MiB)| +| Framework2 (dynamic) | AMP | 30000000 (31702MiB)| 29000000 (32286MiB)| +| Paddle (static) | Pure FP16 | 60000000 (32018MiB)| 60000000 (32018MiB)| +| Paddle (dynamic) | Pure FP16 | 59000000 (31970MiB)| 59000000 (31970MiB)| + +**Note:** config environment variable by ``export FLAGS_allocator_strategy=naive_best_fit`` + + + +### 8.4 Throughtput + +**Configuration:** + * BatchSize: 128/1024 + * SampleRatio: 0.1 + * Datasets: MS1MV3 + * V100: Driver Version: 450.80.02, CUDA Version: 11.0 + * A100: Driver Version: 460.32.03, CUDA Version: 11.2 + +![insightface_throughtput](https://github.com/PaddlePaddle/PLSC/blob/master/experiments/arcface_paddle/images/insightface_throughtput.png) + +For more experimental results see [PLSC](https://github.com/PaddlePaddle/PLSC), which is an open source Paddle Large Scale Classification Tools powered by PaddlePaddle. It supports 60 million classes on single node 8 NVIDIA V100 (32G). + + + +## 9. Inference Combined with Face Detection Model + +Firstly, use the following commands to download the index gallery, demo image and font file for visualization. 
+ + +```bash +# Index library for the recognition process +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/demo/friends/index.bin +# Demo image +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/demo/friends/query/friends2.jpg +# Font file for visualization +wget https://raw.githubusercontent.com/littletomatodonkey/insight-face-paddle/main/SourceHanSansCN-Medium.otf +``` + +Use the following command to run the whole face recognition demo. + +```shell +# detection + recognition process +python3.7 tools/test_recognition.py --det --rec --index=index.bin --input=friends2.jpg --output="./output" +``` + +The final result is saved in folder `output/`, which is shown as follows. + +
+ +
+ +For more details about parameter explanations, index gallery construction and whl package inference, please refer to: + * [Whl package inference tutorial](https://github.com/littletomatodonkey/insight-face-paddle). + * [Paddle Serving inference](./deploy/pdserving/README.md) diff --git a/insightface/recognition/arcface_paddle/configs/__init__.py b/insightface/recognition/arcface_paddle/configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/insightface/recognition/arcface_paddle/configs/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/insightface/recognition/arcface_paddle/configs/argparser.py b/insightface/recognition/arcface_paddle/configs/argparser.py new file mode 100644 index 0000000000000000000000000000000000000000..88f40f7ad0dbacf1273d9037c4a157ac47c21993 --- /dev/null +++ b/insightface/recognition/arcface_paddle/configs/argparser.py @@ -0,0 +1,281 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import logging +import argparse +import importlib + + +def print_args(args): + logging.info('--------args----------') + for k in list(vars(args).keys()): + logging.info('%s: %s' % (k, vars(args)[k])) + logging.info('------------------------\n') + + +def str2bool(v): + return str(v).lower() in ("true", "t", "1") + + +def tostrlist(v): + if isinstance(v, list): + return v + elif isinstance(v, str): + return [e.strip() for e in v.split(',')] + + +def tointlist(v): + if isinstance(v, list): + return v + elif isinstance(v, str): + return [int(e.strip()) for e in v.split(',')] + + +def get_config(config_file): + assert config_file.startswith( + 'configs/'), 'config file setting must start with configs/' + temp_config_name = os.path.basename(config_file) + temp_module_name = os.path.splitext(temp_config_name)[0] + config = importlib.import_module("configs.config") + cfg = config.config + config = importlib.import_module("configs.%s" % temp_module_name) + job_cfg = config.config + cfg.update(job_cfg) + if cfg.output is None: + cfg.output = osp.join('work_dirs', temp_module_name) + return cfg + + +class UserNamespace(object): + pass + + +def parse_args(): + + parser = argparse.ArgumentParser(description='Paddle Face Training') + user_namespace = UserNamespace() + parser.add_argument( + '--config_file', type=str, required=True, help='config file path') + parser.parse_known_args(namespace=user_namespace) + cfg = get_config(user_namespace.config_file) + + # Model setting + parser.add_argument( + '--is_static', + type=str2bool, + default=cfg.is_static, + 
help='whether to use static mode') + parser.add_argument( + '--backbone', type=str, default=cfg.backbone, help='backbone network') + parser.add_argument( + '--classifier', + type=str, + default=cfg.classifier, + help='classification network') + parser.add_argument( + '--embedding_size', + type=int, + default=cfg.embedding_size, + help='embedding size') + parser.add_argument( + '--model_parallel', + type=str2bool, + default=cfg.model_parallel, + help='whether to use model parallel') + parser.add_argument( + '--sample_ratio', + type=float, + default=cfg.sample_ratio, + help='sample rate, use partial fc sample if sample rate less than 1.0') + parser.add_argument( + '--loss', type=str, default=cfg.loss, help='loss function') + parser.add_argument( + '--dropout', + type=float, + default=cfg.dropout, + help='probability of dropout') + + # AMP setting + parser.add_argument( + '--fp16', + type=str2bool, + default=cfg.fp16, + help='whether to use fp16 training') + parser.add_argument( + '--init_loss_scaling', + type=float, + default=cfg.init_loss_scaling, + help='The initial loss scaling factor.') + parser.add_argument( + '--max_loss_scaling', + type=float, + default=cfg.max_loss_scaling, + help='The maximum loss scaling factor.') + parser.add_argument( + '--incr_every_n_steps', + type=int, + default=cfg.incr_every_n_steps, + help='Increases loss scaling every n consecutive steps with finite gradients.' + ) + parser.add_argument( + '--decr_every_n_nan_or_inf', + type=int, + default=cfg.decr_every_n_nan_or_inf, + help='Decreases loss scaling every n accumulated steps with nan or inf gradients.' + ) + parser.add_argument( + '--incr_ratio', + type=float, + default=cfg.incr_ratio, + help='The multiplier to use when increasing the loss scaling.') + parser.add_argument( + '--decr_ratio', + type=float, + default=cfg.decr_ratio, + help='The less-than-one-multiplier to use when decreasing the loss scaling.' 
+ ) + parser.add_argument( + '--use_dynamic_loss_scaling', + type=str2bool, + default=cfg.use_dynamic_loss_scaling, + help='Whether to use dynamic loss scaling.') + parser.add_argument( + '--custom_white_list', + type=tostrlist, + default=cfg.custom_white_list, + help='fp16 custom white list.') + parser.add_argument( + '--custom_black_list', + type=tostrlist, + default=cfg.custom_black_list, + help='fp16 custom black list.') + + # Optimizer setting + parser.add_argument( + '--lr', type=float, default=cfg.lr, help='learning rate') + parser.add_argument( + '--lr_decay', + type=float, + default=cfg.lr_decay, + help='learning rate decay factor') + parser.add_argument( + '--weight_decay', + type=float, + default=cfg.weight_decay, + help='weight decay') + parser.add_argument( + '--momentum', type=float, default=cfg.momentum, help='sgd momentum') + parser.add_argument( + '--train_unit', + type=str, + default=cfg.train_unit, + help='train unit, "step" or "epoch"') + parser.add_argument( + '--warmup_num', + type=int, + default=cfg.warmup_num, + help='warmup num according train unit') + parser.add_argument( + '--train_num', + type=int, + default=cfg.train_num, + help='train num according train unit') + parser.add_argument( + '--decay_boundaries', + type=tointlist, + default=cfg.decay_boundaries, + help='piecewise decay boundaries') + + # Train dataset setting + parser.add_argument( + '--use_synthetic_dataset', + type=str2bool, + default=cfg.use_synthetic_dataset, + help='whether to use synthetic dataset') + parser.add_argument( + '--dataset', type=str, default=cfg.dataset, help='train dataset name') + parser.add_argument( + '--data_dir', + type=str, + default=cfg.data_dir, + help='train dataset directory') + parser.add_argument( + '--label_file', + type=str, + default=cfg.label_file, + help='train label file name, each line split by "\t"') + parser.add_argument( + '--is_bin', + type=str2bool, + default=cfg.is_bin, + help='whether the train data is bin or original image 
file') + parser.add_argument( + '--num_classes', + type=int, + default=cfg.num_classes, + help='classes of train dataset') + parser.add_argument( + '--batch_size', + type=int, + default=cfg.batch_size, + help='batch size of each rank') + parser.add_argument( + '--num_workers', + type=int, + default=cfg.num_workers, + help='the number workers of DataLoader') + + # Validation dataset setting + parser.add_argument( + '--do_validation_while_train', + type=str2bool, + default=cfg.do_validation_while_train, + help='do validation while train') + parser.add_argument( + '--validation_interval_step', + type=int, + default=cfg.validation_interval_step, + help='validation interval step') + parser.add_argument( + '--val_targets', + type=tostrlist, + default=cfg.val_targets, + help='val targets, list or str split by comma') + + # IO setting + parser.add_argument( + '--logdir', type=str, default=cfg.logdir, help='log dir') + parser.add_argument( + '--log_interval_step', + type=int, + default=cfg.log_interval_step, + help='log interval step') + parser.add_argument( + '--output', type=str, default=cfg.output, help='output dir') + parser.add_argument( + '--resume', type=str2bool, default=cfg.resume, help='whether to using resume training') + parser.add_argument( + '--checkpoint_dir', + type=str, + default=cfg.checkpoint_dir, + help='set checkpoint direcotry when resume training') + parser.add_argument( + '--max_num_last_checkpoint', + type=int, + default=cfg.max_num_last_checkpoint, + help='the maximum number of lastest checkpoint to keep') + + args = parser.parse_args(namespace=user_namespace) + return args diff --git a/insightface/recognition/arcface_paddle/configs/config.py b/insightface/recognition/arcface_paddle/configs/config.py new file mode 100644 index 0000000000000000000000000000000000000000..efd126fb760637bdaed91cc6f3bb695f77c02c59 --- /dev/null +++ b/insightface/recognition/arcface_paddle/configs/config.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from easydict import EasyDict as edict + +config = edict() +config.is_static = True +config.backbone = 'FresResNet100' +config.classifier = 'LargeScaleClassifier' +config.embedding_size = 512 +config.model_parallel = True +config.sample_ratio = 0.1 +config.loss = 'ArcFace' +config.dropout = 0.0 + +config.fp16 = True +config.init_loss_scaling = 128.0 +config.max_loss_scaling = 128.0 +config.incr_every_n_steps = 2000 +config.decr_every_n_nan_or_inf = 1 +config.incr_ratio = 2.0 +config.decr_ratio = 0.5 +config.use_dynamic_loss_scaling = True +config.custom_white_list = [] +config.custom_black_list = [] + +config.lr = 0.1 # for global batch size = 512 +config.lr_decay = 0.1 +config.weight_decay = 5e-4 +config.momentum = 0.9 +config.train_unit = 'step' # 'step' or 'epoch' +config.warmup_num = 1000 +config.train_num = 180000 +config.decay_boundaries = [100000, 140000, 160000] + +config.use_synthetic_dataset = False +config.dataset = "MS1M_v3" +config.data_dir = "./MS1M_v3" +config.label_file = "./MS1M_v3/label.txt" +config.is_bin = False +config.num_classes = 93431 # 85742 for MS1M_v2, 93431 for MS1M_v3 +config.batch_size = 64 # global batch size 512 of 8 GPU +config.num_workers = 8 + +config.do_validation_while_train = True +config.validation_interval_step = 2000 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] + +config.logdir = './log' +config.log_interval_step = 10 +config.output = 
'./MS1M_v3_arcface' +config.resume = False +config.checkpoint_dir = None +config.max_num_last_checkpoint = 3 diff --git a/insightface/recognition/arcface_paddle/configs/ms1mv2_mobileface.py b/insightface/recognition/arcface_paddle/configs/ms1mv2_mobileface.py new file mode 100644 index 0000000000000000000000000000000000000000..e29a062a54911bc59b19d38c6c7f793bb1f564b4 --- /dev/null +++ b/insightface/recognition/arcface_paddle/configs/ms1mv2_mobileface.py @@ -0,0 +1,54 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from easydict import EasyDict as edict + +config = edict() +config.is_static = False +config.backbone = 'MobileFaceNet_128' +config.classifier = 'LargeScaleClassifier' +config.embedding_size = 128 +config.model_parallel = True +config.sample_ratio = 1.0 +config.loss = 'ArcFace' +config.dropout = 0.0 + +config.lr = 0.1 # for global batch size = 512 +config.lr_decay = 0.1 +config.weight_decay = 5e-4 +config.momentum = 0.9 +config.train_unit = 'epoch' # 'step' or 'epoch' +config.warmup_num = 0 +config.train_num = 25 +config.decay_boundaries = [10, 16, 22] + +config.use_synthetic_dataset = False +config.dataset = "MS1M_v2" +config.data_dir = "./MS1M_v2" +config.label_file = "./MS1M_v2/label.txt" +config.is_bin = False +config.num_classes = 85742 # 85742 for MS1M_v2, 93431 for MS1M_v3 +config.batch_size = 128 # global batch size 1024 of 8 GPU +config.num_workers = 8 + +config.do_validation_while_train = True +config.validation_interval_step = 2000 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] + +config.logdir = './log' +config.log_interval_step = 100 +config.output = './MS1M_v2_arcface_MobileFaceNet_128_0.1' +config.resume = False +config.checkpoint_dir = None +config.max_num_last_checkpoint = 1 diff --git a/insightface/recognition/arcface_paddle/configs/ms1mv3_r100.py b/insightface/recognition/arcface_paddle/configs/ms1mv3_r100.py new file mode 100644 index 0000000000000000000000000000000000000000..75d200d9b3c2e3bf1dd7fa3f9f9167d383824c0c --- /dev/null +++ b/insightface/recognition/arcface_paddle/configs/ms1mv3_r100.py @@ -0,0 +1,54 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from easydict import EasyDict as edict + +config = edict() +config.is_static = True +config.backbone = 'FresResNet100' +config.classifier = 'LargeScaleClassifier' +config.embedding_size = 512 +config.model_parallel = True +config.sample_ratio = 0.1 +config.loss = 'ArcFace' +config.dropout = 0.0 + +config.lr = 0.1 # for global batch size = 512 +config.lr_decay = 0.1 +config.weight_decay = 5e-4 +config.momentum = 0.9 +config.train_unit = 'epoch' # 'step' or 'epoch' +config.warmup_num = 0 +config.train_num = 25 +config.decay_boundaries = [10, 16, 22] + +config.use_synthetic_dataset = False +config.dataset = "MS1M_v3" +config.data_dir = "./MS1M_v3" +config.label_file = "./MS1M_v3/label.txt" +config.is_bin = False +config.num_classes = 93431 # 85742 for MS1M_v2, 93431 for MS1M_v3 +config.batch_size = 128 # global batch size 512 of 8 GPU +config.num_workers = 8 + +config.do_validation_while_train = True +config.validation_interval_step = 2000 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] + +config.logdir = './log' +config.log_interval_step = 100 +config.output = './MS1M_v3_arcface' +config.resume = False +config.checkpoint_dir = None +config.max_num_last_checkpoint = 1 diff --git a/insightface/recognition/arcface_paddle/configs/ms1mv3_r50.py b/insightface/recognition/arcface_paddle/configs/ms1mv3_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..e5f556c487298ef83e2c52cdc97db04f335cffb9 --- /dev/null +++ b/insightface/recognition/arcface_paddle/configs/ms1mv3_r50.py @@ -0,0 +1,54 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from easydict import EasyDict as edict + +config = edict() +config.is_static = True +config.backbone = 'FresResNet50' +config.classifier = 'LargeScaleClassifier' +config.embedding_size = 512 +config.model_parallel = True +config.sample_ratio = 0.1 +config.loss = 'ArcFace' +config.dropout = 0.0 + +config.lr = 0.1 # for global batch size = 512 +config.lr_decay = 0.1 +config.weight_decay = 5e-4 +config.momentum = 0.9 +config.train_unit = 'epoch' # 'step' or 'epoch' +config.warmup_num = 0 +config.train_num = 25 +config.decay_boundaries = [10, 16, 22] + +config.use_synthetic_dataset = False +config.dataset = "MS1M_v3" +config.data_dir = "./MS1M_v3" +config.label_file = "./MS1M_v3/label.txt" +config.is_bin = False +config.num_classes = 93431 # 85742 for MS1M_v2, 93431 for MS1M_v3 +config.batch_size = 128 # global batch size 512 of 8 GPU +config.num_workers = 8 + +config.do_validation_while_train = True +config.validation_interval_step = 2000 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] + +config.logdir = './log' +config.log_interval_step = 100 +config.output = './MS1M_v3_arcface' +config.resume = False +config.checkpoint_dir = None +config.max_num_last_checkpoint = 1 diff --git a/insightface/recognition/arcface_paddle/datasets/__init__.py b/insightface/recognition/arcface_paddle/datasets/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..c97f8e4768b0705647724dd6c23915ccd9cb061f --- /dev/null +++ b/insightface/recognition/arcface_paddle/datasets/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .common_dataset import CommonDataset, SyntheticDataset, load_bin diff --git a/insightface/recognition/arcface_paddle/datasets/common_dataset.py b/insightface/recognition/arcface_paddle/datasets/common_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..e1d5a151a44703da5e7a59b31533689c4ef26903 --- /dev/null +++ b/insightface/recognition/arcface_paddle/datasets/common_dataset.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pickle +import paddle +import os +import cv2 +import six +import random +import paddle +import numpy as np +import logging +from PIL import Image +from io import BytesIO + +from datasets.kv_helper import read_img_from_bin + + +def transform(img): + # random horizontal flip + if random.randint(0, 1) == 0: + img = cv2.flip(img, 1) + # normalize to mean 0.5, std 0.5 + img = (img - 127.5) * 0.00784313725 + # BGR2RGB + img = img[:, :, ::-1] + img = img.transpose((2, 0, 1)) + return img + + +class CommonDataset(paddle.io.Dataset): + def __init__(self, root_dir, label_file, fp16=False, is_bin=True): + super(CommonDataset, self).__init__() + self.root_dir = root_dir + self.label_file = label_file + self.fp16 = fp16 + with open(label_file, "r") as fin: + self.full_lines = fin.readlines() + + self.delimiter = "\t" + self.is_bin = is_bin + + self.num_samples = len(self.full_lines) + logging.info("read label file finished, total num: {}" + .format(self.num_samples)) + + def __getitem__(self, idx): + + line = self.full_lines[idx] + + img_path, label = line.strip().split(self.delimiter) + img_path = os.path.join(self.root_dir, img_path) + if self.is_bin: + img = read_img_from_bin(img_path) + else: + img = cv2.imread(img_path) + + img = transform(img) + + img = img.astype('float16' if self.fp16 else 'float32') + label = np.int32(label) + + return img, label + + def __len__(self): + return self.num_samples + + +class SyntheticDataset(paddle.io.Dataset): + def __init__(self, num_classes, fp16=False): + super(SyntheticDataset, self).__init__() + self.num_classes = num_classes + self.fp16 = fp16 + self.label_list = np.random.randint( + 0, num_classes, (5179510, ), dtype=np.int32) + self.num_samples = len(self.label_list) + + def __getitem__(self, idx): + label = self.label_list[idx] + img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.uint8) + img = transform(img) + + img = img.astype('float16' if self.fp16 else 'float32') + label = np.int32(label) + + return img, 
label + + def __len__(self): + return self.num_samples + + +# 返回为 numpy +def load_bin(path, image_size): + if six.PY2: + bins, issame_list = pickle.load(open(path, 'rb')) + else: + bins, issame_list = pickle.load(open(path, 'rb'), encoding='bytes') + data_list = [] + for flip in [0, 1]: + data = np.empty( + (len(issame_list) * 2, 3, image_size[0], image_size[1])) + data_list.append(data) + for i in range(len(issame_list) * 2): + _bin = bins[i] + if six.PY2: + if not isinstance(_bin, six.string_types): + _bin = _bin.tostring() + img_ori = Image.open(StringIO(_bin)) + else: + img_ori = Image.open(BytesIO(_bin)) + for flip in [0, 1]: + img = img_ori.copy() + if flip == 1: + img = img.transpose(Image.FLIP_LEFT_RIGHT) + if img.mode != 'RGB': + img = img.convert('RGB') + img = np.array(img).astype('float32').transpose((2, 0, 1)) + img = (img - 127.5) * 0.00784313725 + data_list[flip][i][:] = img + if i % 1000 == 0: + print('loading bin', i) + print(data_list[0].shape) + return data_list, issame_list diff --git a/insightface/recognition/arcface_paddle/datasets/kv_helper.py b/insightface/recognition/arcface_paddle/datasets/kv_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..43bff7cd2bc6c3093ad687078cc50797b3210a19 --- /dev/null +++ b/insightface/recognition/arcface_paddle/datasets/kv_helper.py @@ -0,0 +1,68 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import pickle +import struct +import random +import multiprocessing +import numpy as np +import cv2 +import json + + +def readkv(f): + """readkv""" + keylendata = f.read(4) + if len(keylendata) != 4: + return None + keylen = struct.unpack('I', keylendata)[0] + if keylen > 5000: + raise Exception('wrong key len' + str(keylen)) + key = f.read(keylen) + valuelen = struct.unpack('I', f.read(4))[0] + value = f.read(valuelen) + return key, value + + +def writekv(f, k, v, flush=True): + """writekv""" + f.write(struct.pack('I', len(k))) + f.write(k) + f.write(struct.pack('I', len(v))) + f.write(v) + if flush: + f.flush() + return + + +def trans_img_to_bin(img_name, output_path): + with open(img_name, "rb") as fin: + img = fin.read() + key = os.path.split(img_name)[-1] + with open(output_path, "wb") as fout: + writekv(fout, key.encode(), pickle.dumps(img, -1)) + return + + +def read_img_from_bin(input_path): + # the file can exist many key-vals, but it just save one in fact. + with open(input_path, "rb") as fin: + r = readkv(fin) + assert r is not None + _, value = r + value = pickle.loads(value) + value = np.frombuffer(value, dtype='uint8') + img = cv2.imdecode(value, 1) + return img diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/README.md b/insightface/recognition/arcface_paddle/deploy/pdserving/README.md new file mode 100644 index 0000000000000000000000000000000000000000..946478012de65771d2d8e73257eb0f97048bc694 --- /dev/null +++ b/insightface/recognition/arcface_paddle/deploy/pdserving/README.md @@ -0,0 +1,180 @@ + +# Service deployment based on PaddleServing + +(English|[简体中文](./README_CN.md)) + + +This document will introduce how to use the [PaddleServing](https://github.com/PaddlePaddle/Serving/blob/develop/README.md) to deploy the Arcface dynamic graph model as a pipeline online service. 
+ +Some Key Features of Paddle Serving: +- Integrate with Paddle training pipeline seamlessly, most paddle models can be deployed with one line command. +- Industrial serving features supported, such as models management, online loading, online A/B testing etc. +- Highly concurrent and efficient communication between clients and servers supported. + +The introduction and tutorial of Paddle Serving service deployment framework reference [document](https://github.com/PaddlePaddle/Serving/blob/develop/README.md). + + +## Contents +- [Environmental preparation](#environmental-preparation) +- [Model conversion](#model-conversion) +- [Paddle Serving pipeline deployment](#paddle-serving-pipeline-deployment) +- [FAQ](#faq) + + +## Environmental preparation + +Arcface operating environment and Paddle Serving operating environment are needed. + +1. Please prepare Arcface operating environment reference [link](../../README_en.md). + Download the corresponding paddle whl package according to the environment, it is recommended to install version 2.2+. + + +2. The steps of PaddleServing operating environment prepare are as follows: + + Install serving which used to start the service + ``` + pip3 install paddle-serving-server==0.6.3 # for CPU + pip3 install paddle-serving-server-gpu==0.6.3 # for GPU + # Other GPU environments need to confirm the environment and then choose to execute the following commands + pip3 install paddle-serving-server-gpu==0.6.3.post101 # GPU with CUDA10.1 + TensorRT6 + pip3 install paddle-serving-server-gpu==0.6.3.post11 # GPU with CUDA11 + TensorRT7 + ``` + +3. Install the client to send requests to the service + In [download link](https://github.com/PaddlePaddle/Serving/blob/develop/doc/LATEST_PACKAGES.md) find the client installation package corresponding to the python version. + The python3.7 version is recommended here: + + ``` + pip3 install paddle-serving-client==0.6.3 + ``` + +4. 
Install serving-app + ``` + pip3 install paddle-serving-app==0.6.3 + ``` + + **note:** If you want to install the latest version of PaddleServing, refer to [link](https://github.com/PaddlePaddle/Serving/blob/develop/doc/LATEST_PACKAGES.md). + + + +## Model conversion +When using PaddleServing for service deployment, you need to convert the saved inference model into a serving model that is easy to deploy. + +Firstly, download the inference model of Arcface +``` +wget -nc -P ./inference https://paddle-model-ecology.bj.bcebos.com/model/insight-face/mobileface_v1.0_infer.tar +tar xf inference/mobileface_v1.0_infer.tar --strip-components 1 -C inference +``` +Then, you can use installed paddle_serving_client tool to convert inference model to mobile model. +``` +python3 -m paddle_serving_client.convert --dirname ./inference/ \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --serving_server ./MobileFaceNet_128_serving/ \ + --serving_client ./MobileFaceNet_128_client/ + +``` + +After the detection model is converted, there will be additional folders of `MobileFaceNet_128_serving` and `MobileFaceNet_128_client` in the current folder, with the following format: +``` +MobileFaceNet_128_serving +├── __model__ +├── __params__ +├── serving_server_conf.prototxt +└── serving_server_conf.stream.prototxt + +MobileFaceNet_128_client/ +├── serving_client_conf.prototxt +└── serving_client_conf.stream.prototxt + +``` +The recognition model is the same. + + +## Paddle Serving pipeline deployment + +1. Download the PaddleOCR code, if you have already downloaded it, you can skip this step. 
+ ``` + git clone https://github.com/deepinsight/insightface + + # Enter the working directory + cd recognition/arcface_paddle/deploy/pdserving + ``` + + The pdserver directory contains the code to start the pipeline service and send prediction requests, including: + ``` + __init__.py + config.yml # Start the service configuration file + ocr_reader.py # pre-processing and post-processing code implementation + pipeline_http_client.py # Script to send pipeline prediction request + web_service.py # Start the script of the pipeline server + ``` + +2. Run the following command to start the service. + ``` + # Start the service and save the running log in log.txt + python3 web_service.py &>log.txt & + ``` + After the service is successfully started, a log similar to the following will be printed in log.txt + ![](./imgs/start_server.png) + +3. Send service request + ``` + python3 pipeline_http_client.py + ``` + After successfully running, the predicted result of the model will be printed in the cmd window. An example of the result is: + ![](./imgs/results.png) + + Adjust the number of concurrency in config.yml to get the largest QPS. Generally, the number of concurrent detection and recognition is 2:1 + + ``` + det: + concurrency: 8 + ... + rec: + concurrency: 4 + ... + ``` + + Multiple service requests can be sent at the same time if necessary. + + The predicted performance data will be automatically written into the `PipelineServingLogs/pipeline.tracer` file. + + Tested on 700 real picture. 
The average QPS on V100 GPU can reach around 57: + + ``` + 2021-11-04 13:38:52,507 Op(ArcFace): + 2021-11-04 13:38:52,507 in[135.4579597902098 ms] + 2021-11-04 13:38:52,507 prep[0.9921311188811189 ms] + 2021-11-04 13:38:52,507 midp[3.9232132867132865 ms] + 2021-11-04 13:38:52,507 postp[0.12166258741258741 ms] + 2021-11-04 13:38:52,507 out[0.9898286713286714 ms] + 2021-11-04 13:38:52,508 idle[0.9643989520087675] + 2021-11-04 13:38:52,508 DAGExecutor: + 2021-11-04 13:38:52,508 Query count[573] + 2021-11-04 13:38:52,508 QPS[57.3 q/s] + 2021-11-04 13:38:52,509 Succ[0.9982547993019197] + 2021-11-04 13:38:52,509 Error req[394] + 2021-11-04 13:38:52,509 Latency: + 2021-11-04 13:38:52,509 ave[11.52941186736475 ms] + 2021-11-04 13:38:52,509 .50[11.492 ms] + 2021-11-04 13:38:52,509 .60[11.658 ms] + 2021-11-04 13:38:52,509 .70[11.95 ms] + 2021-11-04 13:38:52,509 .80[12.251 ms] + 2021-11-04 13:38:52,509 .90[12.736 ms] + 2021-11-04 13:38:52,509 .95[13.21 ms] + 2021-11-04 13:38:52,509 .99[13.987 ms] + 2021-11-04 13:38:52,510 Channel (server worker num[10]): + 2021-11-04 13:38:52,510 chl0(In: ['@DAGExecutor'], Out: ['ArcFace']) size[0/0] + 2021-11-04 13:38:52,510 chl1(In: ['ArcFace'], Out: ['@DAGExecutor']) size[0/0] + ``` + + +## FAQ +**Q1**: No result return after sending the request. + +**A1**: Do not set the proxy when starting the service and sending the request. You can close the proxy before starting the service and before sending the request. 
The command to close the proxy is: +``` +unset https_proxy +unset http_proxy +``` diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/README_CN.md b/insightface/recognition/arcface_paddle/deploy/pdserving/README_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..77e0d08372a434bb64f5348cad989575615f6d6f --- /dev/null +++ b/insightface/recognition/arcface_paddle/deploy/pdserving/README_CN.md @@ -0,0 +1,172 @@ +# 基于PaddleServing的服务部署 + +(简体中文|[English](./README.md)) + + +本文档将介绍如何使用[PaddleServing](https://github.com/PaddlePaddle/Serving/blob/develop/README_CN.md)工具部署 Arcface 动态图模型的pipeline在线服务。 + +PaddleServing具备以下优点: +- 支持客户端和服务端之间高并发和高效通信 +- 支持 工业级的服务能力 例如模型管理,在线加载,在线A/B测试等 +- 支持 多种编程语言 开发客户端,例如C++, Python和Java + +更多有关PaddleServing服务化部署框架介绍和使用教程参考[文档](https://github.com/PaddlePaddle/Serving/blob/develop/README_CN.md)。 + +## 目录 +- [环境准备](#环境准备) +- [模型转换](#模型转换) +- [Paddle Serving pipeline部署](#部署) +- [FAQ](#FAQ) + + +## 环境准备 + +需要准备 Arcface 的运行环境和Paddle Serving的运行环境。 + +- 准备 Arcface 的运行环境[链接](../../README_cn.md) + 根据环境下载对应的paddle whl包,推荐安装2.2+版本 + +- 准备PaddleServing的运行环境,步骤如下 + +1. 安装serving,用于启动服务 + ``` + pip3 install paddle-serving-server==0.6.3 # for CPU + pip3 install paddle-serving-server-gpu==0.6.3 # for GPU + # 其他GPU环境需要确认环境再选择执行如下命令 + pip3 install paddle-serving-server-gpu==0.6.3.post101 # GPU with CUDA10.1 + TensorRT6 + pip3 install paddle-serving-server-gpu==0.6.3.post11 # GPU with CUDA11 + TensorRT7 + ``` + +2. 安装client,用于向服务发送请求 + ``` + pip3 install paddle_serving_client==0.6.3 + ``` + +3. 
安装serving-app + ``` + pip3 install paddle-serving-app==0.6.3 + ``` + + **Note:** 如果要安装最新版本的PaddleServing参考[链接](https://github.com/PaddlePaddle/Serving/blob/develop/doc/LATEST_PACKAGES.md)。 + + +## 模型转换 + +使用PaddleServing做服务化部署时,需要将保存的inference模型转换为serving易于部署的模型。 + +首先,下载Arcface的inference模型 +``` +# 下载并解压 Arcface 模型 +wget -nc -P ./inference https://paddle-model-ecology.bj.bcebos.com/model/insight-face/mobileface_v1.0_infer.tar +tar xf inference/mobileface_v1.0_infer.tar --strip-components 1 -C inference +``` + +接下来,用安装的paddle_serving_client把下载的inference模型转换成易于server部署的模型格式。 + +``` +python3 -m paddle_serving_client.convert --dirname ./inference/ \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --serving_server ./MobileFaceNet_128_serving/ \ + --serving_client ./MobileFaceNet_128_client/ +``` + +检测模型转换完成后,会在当前文件夹多出`MobileFaceNet_128_serving/` 和`MobileFaceNet_128_client`的文件夹,具备如下格式: +``` +MobileFaceNet_128_serving +├── __model__ +├── __params__ +├── serving_server_conf.prototxt +└── serving_server_conf.stream.prototxt + +MobileFaceNet_128_client/ +├── serving_client_conf.prototxt +└── serving_client_conf.stream.prototxt + +``` + + +## Paddle Serving pipeline部署 + +1. 下载insightface代码,若已下载可跳过此步骤 + ``` + git clone https://github.com/deepinsight/insightface + + # 进入到工作目录 + cd recognition/arcface_paddle/deploy/pdserving + ``` + + pdserving目录包含启动pipeline服务和发送预测请求的代码,包括: + + ``` + __init__.py + config.yml # 启动服务的配置文件 + pipeline_http_client.py # web方式发送pipeline预测请求的脚本 + pipeline_rpc_client.py # rpc方式发送pipeline预测请求的脚本 + web_service.py # 启动pipeline服务端的脚本 + ``` + +2. 启动服务可运行如下命令: + ``` + # 启动服务,运行日志保存在log.txt + python3 web_service.py &>log.txt & + ``` + 成功启动服务后,log.txt中会打印类似如下日志 + ![](./imgs/start_server.png) + +3. 
发送服务请求: + ``` + python3 pipeline_http_client.py + ``` + 成功运行后,模型预测的结果会打印在cmd窗口中,结果示例为: + ![](./imgs/results.png) + + 调整 config.yml 中的并发个数获得最大的QPS, 一般检测和识别的并发数为2:1 + ``` + ArcFace: + #并发数,is_thread_op=True时,为线程并发;否则为进程并发 + concurrency: 8 + ... + ``` + 有需要的话可以同时发送多个服务请求 + + 预测性能数据会被自动写入 `PipelineServingLogs/pipeline.tracer` 文件中。 + + 在700张真实图片上测试,V100 GPU 上 QPS 均值可达到57左右: + + ``` + 2021-11-04 13:38:52,507 Op(ArcFace): + 2021-11-04 13:38:52,507 in[135.4579597902098 ms] + 2021-11-04 13:38:52,507 prep[0.9921311188811189 ms] + 2021-11-04 13:38:52,507 midp[3.9232132867132865 ms] + 2021-11-04 13:38:52,507 postp[0.12166258741258741 ms] + 2021-11-04 13:38:52,507 out[0.9898286713286714 ms] + 2021-11-04 13:38:52,508 idle[0.9643989520087675] + 2021-11-04 13:38:52,508 DAGExecutor: + 2021-11-04 13:38:52,508 Query count[573] + 2021-11-04 13:38:52,508 QPS[57.3 q/s] + 2021-11-04 13:38:52,509 Succ[0.9982547993019197] + 2021-11-04 13:38:52,509 Error req[394] + 2021-11-04 13:38:52,509 Latency: + 2021-11-04 13:38:52,509 ave[11.52941186736475 ms] + 2021-11-04 13:38:52,509 .50[11.492 ms] + 2021-11-04 13:38:52,509 .60[11.658 ms] + 2021-11-04 13:38:52,509 .70[11.95 ms] + 2021-11-04 13:38:52,509 .80[12.251 ms] + 2021-11-04 13:38:52,509 .90[12.736 ms] + 2021-11-04 13:38:52,509 .95[13.21 ms] + 2021-11-04 13:38:52,509 .99[13.987 ms] + 2021-11-04 13:38:52,510 Channel (server worker num[10]): + 2021-11-04 13:38:52,510 chl0(In: ['@DAGExecutor'], Out: ['ArcFace']) size[0/0] + 2021-11-04 13:38:52,510 chl1(In: ['ArcFace'], Out: ['@DAGExecutor']) size[0/0] + ``` + + +## FAQ +**Q1**: 发送请求后没有结果返回或者提示输出解码报错 + +**A1**: 启动服务和发送请求时不要设置代理,可以在启动服务前和发送请求前关闭代理,关闭代理的命令是: +``` +unset https_proxy +unset http_proxy +``` diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/__init__.py b/insightface/recognition/arcface_paddle/deploy/pdserving/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..61d5aa213694a29c4820ead6e2a74123c2df44e8 --- /dev/null +++ 
b/insightface/recognition/arcface_paddle/deploy/pdserving/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/config.yml b/insightface/recognition/arcface_paddle/deploy/pdserving/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..4ef3dd9fb5e94893597c278b0ced5fbe7ac54b90 --- /dev/null +++ b/insightface/recognition/arcface_paddle/deploy/pdserving/config.yml @@ -0,0 +1,45 @@ +#rpc端口, rpc_port和http_port不允许同时为空。当rpc_port为空且http_port不为空时,会自动将rpc_port设置为http_port+1 +rpc_port: 18091 + +#http端口, rpc_port和http_port不允许同时为空。当rpc_port可用且http_port为空时,不自动生成http_port +http_port: 9998 + +#worker_num, 最大并发数。当build_dag_each_worker=True时, 框架会创建worker_num个进程,每个进程内构建grpcSever和DAG +##当build_dag_each_worker=False时,框架会设置主线程grpc线程池的max_workers=worker_num +worker_num: 10 + +#build_dag_each_worker, False,框架在进程内创建一条DAG;True,框架会每个进程内创建多个独立的DAG +build_dag_each_worker: False + +dag: + #op资源类型, True, 为线程模型;False,为进程模型 + is_thread_op: False + + #重试次数 + retry: 10 + + #使用性能分析, True,生成Timeline性能数据,对性能有一定影响;False为不使用 + use_profile: True + + tracer: + interval_s: 10 +op: + ArcFace: + #并发数,is_thread_op=True时,为线程并发;否则为进程并发 + concurrency: 8 + + #当op配置没有server_endpoints时,从local_service_conf读取本地服务配置 + local_service_conf: + #client类型,包括brpc, grpc和local_predictor.local_predictor不启动Serving服务,进程内预测 + 
client_type: local_predictor + + #模型路径 + model_config: ./MobileFaceNet_128_serving + + #Fetch结果列表,以client_config中fetch_var的alias_name为准 + fetch_list: ["save_infer_model/scale_0.tmp_1"] + + #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 + devices: "0" + + ir_optim: True \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/imgs/demo.jpg b/insightface/recognition/arcface_paddle/deploy/pdserving/imgs/demo.jpg new file mode 100644 index 0000000000000000000000000000000000000000..82833092a200bbe88103e13e52fada2162ffb60f Binary files /dev/null and b/insightface/recognition/arcface_paddle/deploy/pdserving/imgs/demo.jpg differ diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/imgs/results.png b/insightface/recognition/arcface_paddle/deploy/pdserving/imgs/results.png new file mode 100644 index 0000000000000000000000000000000000000000..8dddb5cfe166ac07e3b82551d10578d2fe532392 --- /dev/null +++ b/insightface/recognition/arcface_paddle/deploy/pdserving/imgs/results.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84ed6a638113d9252ce41b351649ce5000744f4486e57c29368194da24b6eea7 +size 576988 diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/imgs/start_server.png b/insightface/recognition/arcface_paddle/deploy/pdserving/imgs/start_server.png new file mode 100644 index 0000000000000000000000000000000000000000..e3f00c70eace0c78a9d211923d9399af5a1fbd5c --- /dev/null +++ b/insightface/recognition/arcface_paddle/deploy/pdserving/imgs/start_server.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2581c06ca83f8d5bf712138da6fcb19d2787a9f9b3e932e6803ba46a16801177 +size 1266353 diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/pipeline_http_client.py b/insightface/recognition/arcface_paddle/deploy/pdserving/pipeline_http_client.py new file mode 100644 index 
0000000000000000000000000000000000000000..4fe3dc4037f54a089be3e0eccf98de86140e726f --- /dev/null +++ b/insightface/recognition/arcface_paddle/deploy/pdserving/pipeline_http_client.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import requests +import json +import base64 +import os + +import argparse +parser = argparse.ArgumentParser(description="args for paddleserving") +parser.add_argument("--image_dir", type=str, default="./imgs") +args = parser.parse_args() + + +def cv2_to_base64(image): + return base64.b64encode(image).decode('utf8') + + +url = "http://127.0.0.1:9998/ArcFace/prediction" + +test_img_dir = args.image_dir +for idx, img_file in enumerate(os.listdir(test_img_dir)): + with open(os.path.join(test_img_dir, img_file), 'rb') as file: + image_data1 = file.read() + + image = cv2_to_base64(image_data1) + + for i in range(1): + data = {"key": ["image"], "value": [image]} + r = requests.post(url=url, data=json.dumps(data)) + print(r.json()) + +print("==> total number of test imgs: ", len(os.listdir(test_img_dir))) \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/pipeline_rpc_client.py b/insightface/recognition/arcface_paddle/deploy/pdserving/pipeline_rpc_client.py new file mode 100644 index 0000000000000000000000000000000000000000..1c4e0b8c553f389700804da5c3ec15ccb93b4ff9 --- /dev/null +++ 
b/insightface/recognition/arcface_paddle/deploy/pdserving/pipeline_rpc_client.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from paddle_serving_server_gpu.pipeline import PipelineClient +except ImportError: + from paddle_serving_server.pipeline import PipelineClient +import numpy as np +import requests +import json +import cv2 +import base64 +import os + +client = PipelineClient() +client.connect(['127.0.0.1:18091']) + + + +def cv2_to_base64(image): + return base64.b64encode(image).decode('utf8') + +import argparse +parser = argparse.ArgumentParser(description="args for paddleserving") +parser.add_argument("--image_dir", type=str, default="../../doc/imgs/") +args = parser.parse_args() +test_img_dir = args.image_dir + +for idx, img_file in enumerate(os.listdir(test_img_dir)): + with open(os.path.join(test_img_dir, img_file), 'rb') as file: + image_data1 = file.read() + + image = cv2_to_base64(image_data1) + + for i in range(1): + ret = client.predict(feed_dict={"image": image}, fetch=["res"]) + print(ret) + +print("==> total number of test imgs: ", len(os.listdir(test_img_dir))) \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/deploy/pdserving/web_service.py b/insightface/recognition/arcface_paddle/deploy/pdserving/web_service.py new file mode 100644 index 0000000000000000000000000000000000000000..f7f4f68f9502a5f8e328ff746bf39d85acb31396 --- 
/dev/null +++ b/insightface/recognition/arcface_paddle/deploy/pdserving/web_service.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddle_serving_server.web_service import WebService, Op + +import numpy as np +import cv2 +import base64 + + +class ArcFaceOp(Op): + def init_op(self): + pass + + def preprocess(self, input_dicts, data_id, log_id): + (_, input_dict), = input_dicts.items() + data = base64.b64decode(input_dict["image"]) + data = np.frombuffer(data, np.uint8) + # Note: class variables(self.var) can only be used in process op mode + img = cv2.imdecode(data, cv2.IMREAD_COLOR) + img = cv2.resize(img,(112,112)) + # normalize to mean 0.5, std 0.5 + img = (img - 127.5) * 0.00784313725 + # BGR2RGB + img = img[:, :, ::-1] + img = img.transpose((2, 0, 1)) + img = np.expand_dims(img, 0) + img = img.astype('float32') + return {"x":img.copy()}, False, None, "" + + def postprocess(self, input_dicts, fetch_dict, log_id): + out = fetch_dict["save_infer_model/scale_0.tmp_1"] + out_dict = {"out": out} + + return out_dict, None, "" + +class ArcFaceService(WebService): + def get_pipeline_response(self, read_op): + arcface_op = ArcFaceOp(name="ArcFace", input_ops=[read_op]) + return arcface_op + + +arcface_service = ArcFaceService(name="ArcFace") +arcface_service.prepare_pipeline_config("config.yml") +arcface_service.run_service() \ No newline at end of file diff --git 
a/insightface/recognition/arcface_paddle/dynamic/backbones/__init__.py b/insightface/recognition/arcface_paddle/dynamic/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e51edd29568751cec3f2e69db470e5bbc3a7399 --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/backbones/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .mobilefacenet import MobileFaceNet_128 +from .iresnet import FresResNet50, FresResNet100 diff --git a/insightface/recognition/arcface_paddle/dynamic/backbones/iresnet.py b/insightface/recognition/arcface_paddle/dynamic/backbones/iresnet.py new file mode 100644 index 0000000000000000000000000000000000000000..9424a5f434d1a250d6641f2f9a500b4e90cc8259 --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/backbones/iresnet.py @@ -0,0 +1,337 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout, PReLU +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import XavierNormal, Constant + +import math + +__all__ = ["FresResNet50", "FresResNet100"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + epsilon=1e-05, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(bn_name + "_offset"), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance", + data_layout=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None, + data_format="NCHW"): + super(BasicBlock, self).__init__() + self.stride = stride + bn_name = "bn_" + name[3:] + "_before" + self._batch_norm = BatchNorm( + num_channels, + act=None, + epsilon=1e-05, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(bn_name + "_offset"), + moving_mean_name=bn_name + "_mean", + 
moving_variance_name=bn_name + "_variance", + data_layout=data_format) + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=1, + act=None, + name=name + "_branch2a", + data_format=data_format) + self.prelu = PReLU(num_parameters=1, name=name + "_branch2a_prelu") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act=None, + name=name + "_branch2b", + data_format=data_format) + + if shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + act=None, + name=name + "_branch1", + data_format=data_format) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self._batch_norm(inputs) + y = self.conv0(y) + y = self.prelu(y) + conv1 = self.conv1(y) + + if self.shortcut: + short = self.short(inputs) + else: + short = inputs + y = paddle.add(x=short, y=conv1) + return y + + +class FC(nn.Layer): + def __init__(self, + bn_channels, + num_channels, + num_classes, + fc_type, + dropout=0.4, + name=None, + data_format="NCHW"): + super(FC, self).__init__() + self.p = dropout + self.fc_type = fc_type + self.num_channels = num_channels + + bn_name = "bn_" + name + if fc_type == "Z": + self._batch_norm_1 = BatchNorm( + bn_channels, + act=None, + epsilon=1e-05, + param_attr=ParamAttr(name=bn_name + "_1_scale"), + bias_attr=ParamAttr(bn_name + "_1_offset"), + moving_mean_name=bn_name + "_1_mean", + moving_variance_name=bn_name + "_1_variance", + data_layout=data_format) + if self.p > 0: + self.dropout = Dropout(p=self.p, name=name + '_dropout') + + elif fc_type == "E": + self._batch_norm_1 = BatchNorm( + bn_channels, + act=None, + epsilon=1e-05, + param_attr=ParamAttr(name=bn_name + "_1_scale"), + bias_attr=ParamAttr(bn_name + "_1_offset"), + moving_mean_name=bn_name + "_1_mean", + moving_variance_name=bn_name + "_1_variance", + data_layout=data_format) + if self.p > 0: + 
self.dropout = Dropout(p=self.p, name=name + '_dropout') + self.fc = Linear( + num_channels, + num_classes, + weight_attr=ParamAttr( + initializer=XavierNormal(fan_in=0.0), name=name + ".w_0"), + bias_attr=ParamAttr( + initializer=Constant(), name=name + ".b_0")) + self._batch_norm_2 = BatchNorm( + num_classes, + act=None, + epsilon=1e-05, + param_attr=ParamAttr(name=bn_name + "_2_scale"), + bias_attr=ParamAttr(bn_name + "_2_offset"), + moving_mean_name=bn_name + "_2_mean", + moving_variance_name=bn_name + "_2_variance", + data_layout=data_format) + + elif fc_type == "FC": + self._batch_norm_1 = BatchNorm( + bn_channels, + act=None, + epsilon=1e-05, + param_attr=ParamAttr(name=bn_name + "_1_scale"), + bias_attr=ParamAttr(bn_name + "_1_offset"), + moving_mean_name=bn_name + "_1_mean", + moving_variance_name=bn_name + "_1_variance", + data_layout=data_format) + self.fc = Linear( + num_channels, + num_classes, + weight_attr=ParamAttr( + initializer=XavierNormal(fan_in=0.0), name=name + ".w_0"), + bias_attr=ParamAttr( + initializer=Constant(), name=name + ".b_0")) + self._batch_norm_2 = BatchNorm( + num_classes, + act=None, + epsilon=1e-05, + param_attr=ParamAttr(name=bn_name + "_2_scale"), + bias_attr=ParamAttr(bn_name + "_2_offset"), + moving_mean_name=bn_name + "_2_mean", + moving_variance_name=bn_name + "_2_variance", + data_layout=data_format) + + def forward(self, inputs): + if self.fc_type == "Z": + y = self._batch_norm_1(inputs) + y = paddle.reshape(y, shape=[-1, self.num_channels]) + if self.p > 0: + y = self.dropout(y) + + elif self.fc_type == "E": + y = self._batch_norm_1(inputs) + y = paddle.reshape(y, shape=[-1, self.num_channels]) + if self.p > 0: + y = self.dropout(y) + y = self.fc(y) + y = self._batch_norm_2(y) + + elif self.fc_type == "FC": + y = self._batch_norm_1(inputs) + y = paddle.reshape(y, shape=[-1, self.num_channels]) + y = self.fc(y) + y = self._batch_norm_2(y) + + return y + + +class FresResNet(nn.Layer): + def __init__(self, + layers=50, + 
num_features=512, + fc_type='E', + dropout=0.4, + input_image_channel=3, + input_image_width=112, + input_image_height=112, + data_format="NCHW"): + + super(FresResNet, self).__init__() + + self.layers = layers + self.data_format = data_format + self.input_image_channel = input_image_channel + + supported_layers = [50, 100] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 50: + units = [3, 4, 14, 3] + elif layers == 100: + units = [3, 13, 30, 3] + + num_channels = [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=3, + stride=1, + act=None, + name="conv1", + data_format=self.data_format) + self.prelu = PReLU(num_parameters=1, name="prelu1") + + self.block_list = paddle.nn.LayerList() + for block in range(len(units)): + shortcut = True + for i in range(units[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if shortcut else 1, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format)) + self.block_list.append(basic_block) + shortcut = False + + assert input_image_width % 16 == 0 + assert input_image_height % 16 == 0 + feat_w = input_image_width // 16 + feat_h = input_image_height // 16 + self.fc_channels = num_filters[-1] * feat_w * feat_h + self.fc = FC(num_filters[-1], + self.fc_channels, + num_features, + fc_type, + dropout, + name='fc') + + def forward(self, inputs): + if self.data_format == "NHWC": + inputs = paddle.tensor.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + y = self.conv(inputs) + y = self.prelu(y) + for block in self.block_list: + y = block(y) + y = self.fc(y) + return y + + +def FresResNet50(**args): + model = FresResNet(layers=50, 
**args) + return model + + +def FresResNet100(**args): + model = FresResNet(layers=100, **args) + return model diff --git a/insightface/recognition/arcface_paddle/dynamic/backbones/mobilefacenet.py b/insightface/recognition/arcface_paddle/dynamic/backbones/mobilefacenet.py new file mode 100644 index 0000000000000000000000000000000000000000..6949b2851a7e362b5bf619f01d7286d72efed42d --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/backbones/mobilefacenet.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle import nn +import math + +__all__ = ['MobileFaceNet_128'] + +MobileFaceNet_BottleNeck_Setting = [ + # t, c , n ,s + [2, 64, 5, 2], + [4, 128, 1, 2], + [2, 128, 6, 1], + [4, 128, 1, 2], + [2, 128, 2, 1] +] + + +class BottleNeck(nn.Layer): + def __init__(self, inp, oup, stride, expansion): + super().__init__() + self.connect = stride == 1 and inp == oup + + self.conv = nn.Sequential( + # 1*1 conv + nn.Conv2D( + inp, inp * expansion, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(inp * expansion), + nn.PReLU(inp * expansion), + + # 3*3 depth wise conv + nn.Conv2D( + inp * expansion, + inp * expansion, + 3, + stride, + 1, + groups=inp * expansion, + bias_attr=False), + nn.BatchNorm2D(inp * expansion), + nn.PReLU(inp * expansion), + + # 1*1 conv + nn.Conv2D( + inp * expansion, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), ) + + def forward(self, x): + if self.connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class ConvBlock(nn.Layer): + def __init__(self, inp, oup, k, s, p, dw=False, linear=False): + super().__init__() + self.linear = linear + if dw: + self.conv = nn.Conv2D( + inp, oup, k, s, p, groups=inp, bias_attr=False) + else: + self.conv = nn.Conv2D(inp, oup, k, s, p, bias_attr=False) + + self.bn = nn.BatchNorm2D(oup) + if not linear: + self.prelu = nn.PReLU(oup) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.linear: + return x + else: + return self.prelu(x) + + +class MobileFaceNet(nn.Layer): + def __init__(self, + feature_dim=128, + bottleneck_setting=MobileFaceNet_BottleNeck_Setting, + **args): + super().__init__() + self.conv1 = ConvBlock(3, 64, 3, 2, 1) + self.dw_conv1 = ConvBlock(64, 64, 3, 1, 1, dw=True) + + self.cur_channel = 64 + block = BottleNeck + self.blocks = self._make_layer(block, bottleneck_setting) + + self.conv2 = ConvBlock(128, 512, 1, 1, 0) + self.linear7 = ConvBlock(512, 512, 7, 1, 0, dw=True, linear=True) + self.linear1 = ConvBlock(512, feature_dim, 1, 1, 0, 
linear=True) + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + # ks * ks * out_ch + n = m.weight.shape[1] * m.weight.shape[2] * m.weight.shape[3] + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype=m.weight.dtype, + default_initializer=nn.initializer.Normal( + mean=0.0, std=math.sqrt(2.0 / n))) + + elif isinstance(m, (nn.BatchNorm, nn.BatchNorm2D, nn.GroupNorm)): + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype=m.weight.dtype, + default_initializer=nn.initializer.Constant(value=1.0)) + m.bias = paddle.create_parameter( + shape=m.bias.shape, + dtype=m.bias.dtype, + default_initializer=nn.initializer.Constant(value=0.0)) + + def _make_layer(self, block, setting): + layers = [] + for t, c, n, s in setting: + for i in range(n): + if i == 0: + layers.append(block(self.cur_channel, c, s, t)) + else: + layers.append(block(self.cur_channel, c, 1, t)) + self.cur_channel = c + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.dw_conv1(x) + x = self.blocks(x) + x = self.conv2(x) + x = self.linear7(x) + x = self.linear1(x) + x = x.reshape([x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]]) + return x + + +def MobileFaceNet_128(num_features=128, **args): + model = MobileFaceNet(feature_dim=num_features, **args) + return model + + +# if __name__ == "__main__": +# paddle.set_device("cpu") +# x = paddle.rand([2, 3, 112, 112]) +# net = MobileFaceNet() +# print(net) + +# x = net(x) +# print(x.shape) diff --git a/insightface/recognition/arcface_paddle/dynamic/classifiers/__init__.py b/insightface/recognition/arcface_paddle/dynamic/classifiers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0c0e52f2b5d2045af6ba6c10793e0567033e2ae9 --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/classifiers/__init__.py @@ -0,0 +1,15 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .lsc import LargeScaleClassifier diff --git a/insightface/recognition/arcface_paddle/dynamic/classifiers/lsc.py b/insightface/recognition/arcface_paddle/dynamic/classifiers/lsc.py new file mode 100644 index 0000000000000000000000000000000000000000..f22a769eca1efdf81c09b7154913ba0c32e812eb --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/classifiers/lsc.py @@ -0,0 +1,164 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import warnings +import math +import os +import paddle +import paddle.nn as nn + +class LargeScaleClassifier(nn.Layer): + """ + Author: {Xiang An, Yang Xiao, XuHan Zhu} in DeepGlint, + Partial FC: Training 10 Million Identities on a Single Machine + See the original paper: + https://arxiv.org/abs/2010.05222 + """ + + @paddle.no_grad() + def __init__(self, + rank, + world_size, + num_classes, + margin1=1.0, + margin2=0.5, + margin3=0.0, + scale=64.0, + sample_ratio=1.0, + embedding_size=512, + fp16=False, + name=None): + super(LargeScaleClassifier, self).__init__() + self.num_classes: int = num_classes + self.rank: int = rank + self.world_size: int = world_size + self.sample_ratio: float = sample_ratio + self.embedding_size: int = embedding_size + self.fp16 = fp16 + self.num_local: int = (num_classes + world_size - 1) // world_size + if num_classes % world_size != 0 and rank == world_size - 1: + self.num_local = num_classes % self.num_local + self.num_sample: int = int(self.sample_ratio * self.num_local) + self.margin1 = margin1 + self.margin2 = margin2 + self.margin3 = margin3 + self.logit_scale = scale + + self._parameter_list = [] + + if name is None: + name = 'dist@fc@rank@%05d.w' % rank + assert '.w' in name + + stddev = math.sqrt(2.0 / (self.embedding_size + self.num_local)) + param_attr = paddle.ParamAttr( + name=name, initializer=paddle.nn.initializer.Normal(std=stddev)) + + self.index = None + self.weight = self.create_parameter( + shape=[self.embedding_size, self.num_local], + attr=param_attr, + is_bias=False, + dtype='float16' if self.fp16 else 'float32') + self.weight.is_distributed = True + + if int(self.sample_ratio) < 1: + self.weight.stop_gradient = True + + def step(self, optimizer): + if int(self.sample_ratio) < 1: + warnings.warn( + "Explicitly call the function paddle._C_ops.sparse_momentum is a temporary manner. 
" + "We will merge it to optimizer in the future, please don't follow.") + + found_inf = paddle.logical_not( + paddle.all(paddle.isfinite(self._parameter_list[0].grad))) + if found_inf: + print('Found inf or nan in classifier') + else: + if self.weight.name not in optimizer._accumulators[ + optimizer._velocity_acc_str]: + optimizer._add_accumulator(optimizer._velocity_acc_str, + self.weight) + + velocity = optimizer._accumulators[ + optimizer._velocity_acc_str][self.weight.name] + _, _ = paddle._C_ops.sparse_momentum( + self.weight, + self._parameter_list[0].grad, + velocity, + self.index, + paddle.to_tensor( + optimizer.get_lr(), dtype='float32'), + self.weight, + velocity, + 'mu', + optimizer._momentum, + 'use_nesterov', + optimizer._use_nesterov, + 'regularization_method', + optimizer._regularization_method, + 'regularization_coeff', + optimizer._regularization_coeff, + 'axis', + 1) + + def clear_grad(self): + self._parameter_list = [] + + def forward(self, feature, label): + + if self.world_size > 1: + feature_list = [] + paddle.distributed.all_gather(feature_list, feature) + total_feature = paddle.concat(feature_list, axis=0) + + label_list = [] + paddle.distributed.all_gather(label_list, label) + total_label = paddle.concat(label_list, axis=0) + total_label.stop_gradient = True + else: + total_feature = feature + total_label = label + + if self.sample_ratio < 1.0: + # partial fc sample process + total_label, self.index = paddle.nn.functional.class_center_sample( + total_label, self.num_local, self.num_sample) + total_label.stop_gradient = True + self.index.stop_gradient = True + self.sub_weight = paddle.gather(self.weight, self.index, axis=1) + self.sub_weight.stop_gradient = False + self._parameter_list.append(self.sub_weight) + else: + self.sub_weight = self.weight + + norm_feature = paddle.fluid.layers.l2_normalize(total_feature, axis=1) + norm_weight = paddle.fluid.layers.l2_normalize(self.sub_weight, axis=0) + + local_logit = paddle.matmul(norm_feature, 
norm_weight) + + loss = paddle.nn.functional.margin_cross_entropy( + local_logit, + total_label, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.logit_scale, + return_softmax=False, + reduction=None, ) + + loss = paddle.mean(loss) + + return loss diff --git a/insightface/recognition/arcface_paddle/dynamic/export.py b/insightface/recognition/arcface_paddle/dynamic/export.py new file mode 100644 index 0000000000000000000000000000000000000000..a41d30aeb1282dc43f6f99c4879ffc4686b96802 --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/export.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle + +from .utils.io import Checkpoint +from . 
import backbones + + +def export(args): + checkpoint = Checkpoint( + rank=0, + world_size=1, + embedding_size=args.embedding_size, + num_classes=None, + checkpoint_dir=args.checkpoint_dir, ) + + backbone = eval("backbones.{}".format(args.backbone))( + num_features=args.embedding_size) + checkpoint.load(backbone, for_train=False, dtype='float32') + + print("Load checkpoint from '{}'.".format(args.checkpoint_dir)) + backbone.eval() + + path = os.path.join(args.output_dir, args.backbone) + + if args.export_type == 'onnx': + paddle.onnx.export( + backbone, + path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 3, 112, 112], dtype='float32') + ]) + else: + paddle.jit.save( + backbone, + path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 3, 112, 112], dtype='float32') + ]) + print("Save exported model to '{}'.".format(args.output_dir)) diff --git a/insightface/recognition/arcface_paddle/dynamic/train.py b/insightface/recognition/arcface_paddle/dynamic/train.py new file mode 100644 index 0000000000000000000000000000000000000000..778ea54f8a7d7f31969a3de45cb3f08716a3b660 --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/train.py @@ -0,0 +1,247 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import time +import os +import sys +import numpy as np +import logging + +import paddle +from visualdl import LogWriter + +from utils.logging import AverageMeter, init_logging, CallBackLogging +from datasets import CommonDataset, SyntheticDataset +from utils import losses + +from .utils.verification import CallBackVerification +from .utils.io import Checkpoint +from .utils.amp import LSCGradScaler + +from . import classifiers +from . import backbones + +RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_exhaustive_search': 1, + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + 'FLAGS_fraction_of_gpu_memory_to_use': 0.9999, +} +paddle.fluid.set_flags(RELATED_FLAGS_SETTING) + + +def train(args): + writer = LogWriter(logdir=args.logdir) + + rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) + world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) + + gpu_id = int(os.getenv("FLAGS_selected_gpus", 0)) + place = paddle.CUDAPlace(gpu_id) + + if world_size > 1: + import paddle.distributed.fleet as fleet + from .utils.data_parallel import sync_gradients, sync_params + + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True + fleet.init(is_collective=True, strategy=strategy) + + if args.use_synthetic_dataset: + trainset = SyntheticDataset(args.num_classes, fp16=args.fp16) + else: + trainset = CommonDataset( + root_dir=args.data_dir, + label_file=args.label_file, + fp16=args.fp16, + is_bin=args.is_bin) + + num_image = len(trainset) + total_batch_size = args.batch_size * world_size + steps_per_epoch = num_image // total_batch_size + if args.train_unit == 'epoch': + warmup_steps = steps_per_epoch * args.warmup_num + total_steps = steps_per_epoch * args.train_num + decay_steps = [x * steps_per_epoch for x in args.decay_boundaries] + total_epoch = args.train_num + else: + warmup_steps = args.warmup_num + total_steps = args.train_num + decay_steps = [x for x in args.decay_boundaries] + total_epoch = (total_steps + steps_per_epoch 
- 1) // steps_per_epoch + + if rank == 0: + logging.info('world_size: {}'.format(world_size)) + logging.info('total_batch_size: {}'.format(total_batch_size)) + logging.info('warmup_steps: {}'.format(warmup_steps)) + logging.info('steps_per_epoch: {}'.format(steps_per_epoch)) + logging.info('total_steps: {}'.format(total_steps)) + logging.info('total_epoch: {}'.format(total_epoch)) + logging.info('decay_steps: {}'.format(decay_steps)) + + base_lr = total_batch_size * args.lr / 512 + lr_scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=decay_steps, + values=[ + base_lr * (args.lr_decay**i) for i in range(len(decay_steps) + 1) + ]) + if warmup_steps > 0: + lr_scheduler = paddle.optimizer.lr.LinearWarmup( + lr_scheduler, warmup_steps, 0, base_lr) + + if args.fp16: + paddle.set_default_dtype("float16") + + margin_loss_params = eval("losses.{}".format(args.loss))() + backbone = eval("backbones.{}".format(args.backbone))( + num_features=args.embedding_size, dropout=args.dropout) + classifier = eval("classifiers.{}".format(args.classifier))( + rank=rank, + world_size=world_size, + num_classes=args.num_classes, + margin1=margin_loss_params.margin1, + margin2=margin_loss_params.margin2, + margin3=margin_loss_params.margin3, + scale=margin_loss_params.scale, + sample_ratio=args.sample_ratio, + embedding_size=args.embedding_size, + fp16=args.fp16) + + backbone.train() + classifier.train() + + optimizer = paddle.optimizer.Momentum( + parameters=[{ + 'params': backbone.parameters(), + }, { + 'params': classifier.parameters(), + }], + learning_rate=lr_scheduler, + momentum=args.momentum, + weight_decay=args.weight_decay) + + if args.fp16: + optimizer._dtype = 'float32' + + if world_size > 1: + # sync backbone params for data parallel + sync_params(backbone.parameters()) + + if args.do_validation_while_train: + callback_verification = CallBackVerification( + args.validation_interval_step, + rank, + args.batch_size, + args.val_targets, + args.data_dir, + fp16=args.fp16, 
) + + callback_logging = CallBackLogging(args.log_interval_step, rank, + world_size, total_steps, + args.batch_size, writer) + + checkpoint = Checkpoint( + rank=rank, + world_size=world_size, + embedding_size=args.embedding_size, + num_classes=args.num_classes, + model_save_dir=os.path.join(args.output, args.backbone), + checkpoint_dir=args.checkpoint_dir, + max_num_last_checkpoint=args.max_num_last_checkpoint) + + start_epoch = 0 + global_step = 0 + loss_avg = AverageMeter() + if args.resume: + extra_info = checkpoint.load( + backbone, classifier, optimizer, for_train=True) + start_epoch = extra_info['epoch'] + 1 + lr_state = extra_info['lr_state'] + # there last_epoch means last_step in for PiecewiseDecay + # since we always use step style for lr_scheduler + global_step = lr_state['last_epoch'] + lr_scheduler.set_state_dict(lr_state) + + train_loader = paddle.io.DataLoader( + trainset, + places=place, + num_workers=args.num_workers, + batch_sampler=paddle.io.DistributedBatchSampler( + dataset=trainset, + batch_size=args.batch_size, + shuffle=True, + drop_last=True)) + + scaler = LSCGradScaler( + enable=args.fp16, + init_loss_scaling=args.init_loss_scaling, + incr_ratio=args.incr_ratio, + decr_ratio=args.decr_ratio, + incr_every_n_steps=args.incr_every_n_steps, + decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, + use_dynamic_loss_scaling=args.use_dynamic_loss_scaling) + + for epoch in range(start_epoch, total_epoch): + train_reader_cost = 0.0 + train_run_cost = 0.0 + total_samples = 0 + reader_start = time.time() + for step, (img, label) in enumerate(train_loader): + train_reader_cost += time.time() - reader_start + global_step += 1 + train_start = time.time() + with paddle.amp.auto_cast(enable=args.fp16): + features = backbone(img) + loss_v = classifier(features, label) + + scaler.scale(loss_v).backward() + if world_size > 1: + # data parallel sync backbone gradients + sync_gradients(backbone.parameters()) + + scaler.step(optimizer) + 
classifier.step(optimizer)
+            optimizer.clear_grad()
+            classifier.clear_grad()
+
+            train_run_cost += time.time() - train_start
+            total_samples += len(img)
+
+            lr_value = optimizer.get_lr()
+            loss_avg.update(loss_v.item(), 1)
+            callback_logging(
+                global_step,
+                loss_avg,
+                epoch,
+                lr_value,
+                avg_reader_cost=train_reader_cost / args.log_interval_step,
+                avg_batch_cost=(train_reader_cost + train_run_cost) / args.log_interval_step,
+                avg_samples=total_samples / args.log_interval_step,
+                ips=total_samples / (train_reader_cost + train_run_cost))
+
+            if args.do_validation_while_train:
+                callback_verification(global_step, backbone)
+            lr_scheduler.step()
+
+            if global_step >= total_steps:
+                break
+            sys.stdout.flush()
+            if rank == 0 and global_step > 0 and global_step % args.log_interval_step == 0:
+                train_reader_cost = 0.0
+                train_run_cost = 0.0
+                total_samples = 0
+            reader_start = time.time()
+        checkpoint.save(
+            backbone, classifier, optimizer, epoch=epoch, for_train=True)
+    writer.close()
diff --git a/insightface/recognition/arcface_paddle/dynamic/utils/__init__.py b/insightface/recognition/arcface_paddle/dynamic/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e
--- /dev/null
+++ b/insightface/recognition/arcface_paddle/dynamic/utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/insightface/recognition/arcface_paddle/dynamic/utils/amp.py b/insightface/recognition/arcface_paddle/dynamic/utils/amp.py new file mode 100644 index 0000000000000000000000000000000000000000..d42c09cad5220006f295077d7efe332ef27edcca --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/utils/amp.py @@ -0,0 +1,101 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from paddle.amp import GradScaler +from paddle import _C_ops +import paddle + + +class LSCGradScaler(GradScaler): + def __init__(self, + enable=True, + init_loss_scaling=2.**15, + incr_ratio=2.0, + decr_ratio=0.5, + incr_every_n_steps=1000, + decr_every_n_nan_or_inf=2, + use_dynamic_loss_scaling=True, + max_loss_scaling=32768.0): + super(LSCGradScaler, self).__init__( + enable, init_loss_scaling, incr_ratio, decr_ratio, + incr_every_n_steps, decr_every_n_nan_or_inf, + use_dynamic_loss_scaling) + self.max_loss_scaling = max_loss_scaling + + def step(self, optimizer, classifier=None): + if not self._enable: + if classifier is not None: + classifier.step(optimizer) + return optimizer.step() + + # unscale the grad + self._unscale(optimizer) + + if self._found_inf: + self._cache_founf_inf = True + else: + optimizer.step() + if classifier is not None: + classifier.step(optimizer) + + self._cache_founf_inf = False + + if self._use_dynamic_loss_scaling: + # update the scale + self._update() 
+ + def _unscale(self, optimizer): + if not self._enable: + return + + param_grads_dict = defaultdict(list) + dist_param_grads_dict = defaultdict(list) + if getattr(optimizer, '_param_groups', None) and isinstance( + optimizer._param_groups[0], dict): + for group in optimizer._param_groups: + for param in group['params']: + if not param.is_distributed: + if param._grad_ivar() is not None: + param_grads_dict[param._grad_ivar().dtype].append( + param._grad_ivar()) + else: + if param._grad_ivar() is not None: + dist_param_grads_dict[param._grad_ivar( + ).dtype].append(param._grad_ivar()) + else: + for param in optimizer._parameter_list: + if not param.is_distributed: + if param._grad_ivar() is not None: + param_grads_dict[param._grad_ivar().dtype].append( + param._grad_ivar()) + else: + if param._grad_ivar() is not None: + dist_param_grads_dict[param._grad_ivar().dtype].append( + param._grad_ivar()) + for dtype in dist_param_grads_dict: + for grad in dist_param_grads_dict[dtype]: + self._found_inf = paddle.logical_not( + paddle.all(paddle.isfinite(grad))) + if self._found_inf: + print('Found inf or nan in classifier, dtype is', dtype) + return + + for dtype in param_grads_dict: + param_grads = param_grads_dict[dtype] + _C_ops.check_finite_and_unscale(param_grads, self._scale, + param_grads, self._found_inf) + if self._found_inf: + print('Found inf or nan in backbone, dtype is', dtype) + break diff --git a/insightface/recognition/arcface_paddle/dynamic/utils/data_parallel.py b/insightface/recognition/arcface_paddle/dynamic/utils/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..baa4372c438f3d6d2e4fafac88336e8b69c43e51 --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/utils/data_parallel.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + + +@paddle.no_grad() +def sync_params(parameters): + for param in parameters: + paddle.distributed.broadcast( + param.detach(), src=0, group=None, use_calc_stream=True) + + +@paddle.no_grad() +def sync_gradients(parameters): + grad_var_set = set() + grad_vars = [] + sparse_grad_vars = [] + + for param in parameters: + if param.trainable and (param._grad_ivar() is not None): + g_var = param._grad_ivar() + assert not g_var._is_sparse( + ), "Now, it doesn't support sparse parameters" + grad_vars.append(g_var) + assert g_var not in grad_var_set + grad_var_set.add(g_var) + + coalesced_grads_and_vars = \ + paddle.fluid.dygraph.parallel.build_groups(grad_vars, 128 * 1024 * 1024) + + nranks = paddle.distributed.get_world_size() + for coalesced_grad, _, _ in coalesced_grads_and_vars: + # need to div nranks + div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="elementwise_div", + inputs={'X': coalesced_grad, + 'Y': div_factor}, + outputs={'Out': coalesced_grad}, + attrs={'axis': -1}) + + paddle.distributed.all_reduce(coalesced_grad) + + paddle.fluid.dygraph.parallel._split_tensors(coalesced_grads_and_vars) diff --git a/insightface/recognition/arcface_paddle/dynamic/utils/io.py b/insightface/recognition/arcface_paddle/dynamic/utils/io.py new file mode 100644 index 0000000000000000000000000000000000000000..a30449c86fecbc8a94bc54465ff1bee37578a896 --- /dev/null +++ b/insightface/recognition/arcface_paddle/dynamic/utils/io.py @@ -0,0 +1,238 @@ +# Copyright (c) 2021 PaddlePaddle 
class Checkpoint(object):
    """Save and load checkpoints for model-parallel ArcFace training.

    Non-distributed (backbone) parameters are written only by rank 0, while
    model-parallel classifier shards (names containing ``dist@`` and
    ``@rank@``) are written by every rank.  ``meta.json`` records the world
    size, embedding size and class count used at save time so that the
    classifier shards can be re-split via ``rearrange_weight`` when the
    checkpoint is loaded with a different number of GPUs.
    """

    def __init__(self,
                 rank,
                 world_size,
                 embedding_size,
                 num_classes,
                 model_save_dir="./",
                 checkpoint_dir=None,
                 max_num_last_checkpoint=3):

        self.rank: int = rank
        self.world_size: int = world_size
        self.embedding_size: int = embedding_size
        self.num_classes: int = num_classes
        # Directory checkpoints are written to (one subdir per epoch).
        self.model_save_dir: str = model_save_dir
        # Directory checkpoints are read from (a single epoch subdir).
        self.checkpoint_dir: str = checkpoint_dir
        # Keep at most this many most-recent epoch subdirs (<=0 keeps all).
        self.max_num_last_checkpoint: int = max_num_last_checkpoint

    def save(self,
             backbone: paddle.nn.Layer,
             classifier: paddle.nn.Layer=None,
             optimizer=None,
             epoch=0,
             for_train=True):
        """Write one checkpoint under ``model_save_dir/<epoch>``.

        Args:
            backbone: feature-extraction network (saved by rank 0 only).
            classifier: optional model-parallel classifier; its shards are
                saved by every rank.
            optimizer: required when ``for_train`` is True; its state (minus
                gradients) is saved so training can resume.
            epoch: epoch index, used as the subdirectory name.
            for_train: when True also persist optimizer/LR-scheduler state.
        """
        model_save_dir = os.path.join(self.model_save_dir, str(epoch))
        if not os.path.exists(model_save_dir):
            # More than one process may race to create the directory;
            # ignore EEXIST and re-raise anything else.
            try:
                os.makedirs(model_save_dir)
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise
                pass

        if self.rank == 0:
            # Non-distributed params are identical on all ranks; save once.
            for name, param in backbone.state_dict().items():
                paddle.save(
                    param,
                    os.path.join(model_save_dir, param.name + '.pdparam'))

        if classifier is not None:
            # Distributed params differ per rank; every rank saves its shard.
            for name, param in classifier.state_dict().items():
                paddle.save(
                    param,
                    os.path.join(model_save_dir, param.name + '.pdparam'))

        # Pre-bind so the meta.json section below never hits a NameError
        # when save() is called with for_train=False on rank 0 (previously
        # lr_state_dict was only assigned inside the `if for_train:` branch).
        lr_state_dict = None
        if for_train:
            assert optimizer is not None
            opt_state_dict = optimizer.state_dict()
            lr_state_dict = opt_state_dict['LR_Scheduler']
            for name, opt in opt_state_dict.items():
                if '@GRAD' in name:
                    continue
                # Non-dist optimizer vars: rank 0 only.
                # Dist optimizer vars: every rank saves its own shard.
                if 'dist@' in name and '@rank@' in name or self.rank == 0:
                    paddle.save(opt,
                                os.path.join(model_save_dir, name + '.pdopt'))

        if self.rank == 0:
            # Save extra info needed for resume:
            # pretrain_world_size / embedding_size / num_classes allow the
            # fc weight shards to be re-split when the GPU count changes;
            # epoch and lr_state allow training to restart.
            config_file = os.path.join(model_save_dir, 'meta.json')
            extra_info = dict()
            extra_info["pretrain_world_size"] = self.world_size
            extra_info["embedding_size"] = self.embedding_size
            extra_info['num_classes'] = self.num_classes
            extra_info['epoch'] = epoch
            extra_info['lr_state'] = lr_state_dict
            with open(config_file, 'w') as f:
                json.dump(extra_info, f)

        logging.info("Save model to {}.".format(model_save_dir))
        # Prune old epoch directories beyond the retention window.
        if self.rank == 0 and self.max_num_last_checkpoint > 0:
            for idx in range(-1, epoch - self.max_num_last_checkpoint + 1):
                path = os.path.join(self.model_save_dir, str(idx))
                if os.path.exists(path):
                    logging.info("Remove checkpoint {}.".format(path))
                    shutil.rmtree(path)

    def load(self,
             backbone: paddle.nn.Layer,
             classifier: paddle.nn.Layer=None,
             optimizer=None,
             for_train=True,
             dtype=None):
        """Load a checkpoint from ``self.checkpoint_dir``.

        Re-splits distributed classifier/optimizer shards when the saved
        world size differs from the current one.  Returns the ``meta.json``
        dict when resuming training with a classifier, otherwise ``{}``.

        Args:
            backbone: network to receive the non-distributed parameters.
            classifier: optional model-parallel classifier to restore.
            optimizer: required when ``for_train`` is True.
            for_train: when True also restore optimizer state.
            dtype: optionally cast loaded tensors ('float32' or 'float16').
        """
        assert os.path.exists(self.checkpoint_dir)
        checkpoint_dir = os.path.abspath(self.checkpoint_dir)

        param_state_dict = {}
        opt_state_dict = {}
        dist_param_state_dict = {}

        # Shards of the distributed fc weight/bias and their momentum
        # ("velocity") buffers, keyed by saved parameter name.
        dist_weight_state_dict = {}
        dist_weight_velocity_state_dict = {}
        dist_bias_state_dict = {}
        dist_bias_velocity_state_dict = {}
        for path in os.listdir(checkpoint_dir):
            path = os.path.join(checkpoint_dir, path)
            if not os.path.isfile(path):
                continue

            basename = os.path.basename(path)
            name, ext = os.path.splitext(basename)

            if ext not in ['.pdopt', '.pdparam']:
                continue

            # Optimizer state is irrelevant for inference/fine-tune-eval.
            if not for_train and ext == '.pdopt':
                continue

            tensor = paddle.load(path, return_numpy=True)
            if dtype:
                assert dtype in ['float32', 'float16']
                tensor = tensor.astype(dtype)

            if 'dist@' in name and '@rank@' in name:
                # Bucket distributed shards by weight/bias x param/velocity.
                if '.w' in name and 'velocity' not in name:
                    dist_weight_state_dict[name] = tensor
                elif '.w' in name and 'velocity' in name:
                    dist_weight_velocity_state_dict[name] = tensor
                elif '.b' in name and 'velocity' not in name:
                    dist_bias_state_dict[name] = tensor
                elif '.b' in name and 'velocity' in name:
                    dist_bias_velocity_state_dict[name] = tensor

            else:
                if ext == '.pdparam':
                    param_state_dict[name] = tensor
                else:
                    opt_state_dict[name] = tensor

        if classifier is not None and for_train:
            meta_file = os.path.join(checkpoint_dir, 'meta.json')
            if not os.path.exists(meta_file):
                logging.error(
                    "Please make sure the checkpoint dir {} exists, and "
                    "parameters in that dir are validating.".format(
                        checkpoint_dir))
                exit()

            with open(meta_file, 'r') as handle:
                extra_info = json.load(handle)

            # Preprocess distributed parameters: sanity-check the saved
            # configuration against the current one.
            pretrain_world_size = extra_info['pretrain_world_size']
            assert pretrain_world_size > 0
            embedding_size = extra_info['embedding_size']
            assert embedding_size == self.embedding_size
            num_classes = extra_info['num_classes']
            assert num_classes == self.num_classes

            logging.info(
                "Parameters for pre-training: pretrain_world_size ({}), "
                "embedding_size ({}), and num_classes ({}).".format(
                    pretrain_world_size, embedding_size, num_classes))
            logging.info("Parameters for inference or fine-tuning: "
                         "world_size ({}).".format(self.world_size))

            # Shard names embed a zero-padded rank id; keep only this rank's.
            rank_str = '%05d' % self.rank

            dist_weight_state_dict = rearrange_weight(
                dist_weight_state_dict, pretrain_world_size, self.world_size)
            dist_bias_state_dict = rearrange_weight(
                dist_bias_state_dict, pretrain_world_size, self.world_size)
            for name, value in dist_weight_state_dict.items():
                if rank_str in name:
                    dist_param_state_dict[name] = value
            for name, value in dist_bias_state_dict.items():
                if rank_str in name:
                    dist_param_state_dict[name] = value

            if for_train:
                # Momentum buffers must be re-split exactly like the params.
                dist_weight_velocity_state_dict = rearrange_weight(
                    dist_weight_velocity_state_dict, pretrain_world_size,
                    self.world_size)
                dist_bias_velocity_state_dict = rearrange_weight(
                    dist_bias_velocity_state_dict, pretrain_world_size,
                    self.world_size)
                for name, value in dist_weight_velocity_state_dict.items():
                    if rank_str in name:
                        opt_state_dict[name] = value
                for name, value in dist_bias_velocity_state_dict.items():
                    if rank_str in name:
                        opt_state_dict[name] = value

        def map_actual_param_name(state_dict, load_state_dict):
            # Files are keyed by the internal param.name; remap to the
            # state_dict keys the layer expects.
            for name, param in state_dict.items():
                state_dict[name] = load_state_dict[param.name]
            return state_dict

        logging.info("Load checkpoint from '{}'.".format(checkpoint_dir))
        param_state_dict = map_actual_param_name(backbone.state_dict(),
                                                 param_state_dict)
        backbone.set_state_dict(param_state_dict)
        if classifier is not None:
            dist_param_state_dict = map_actual_param_name(
                classifier.state_dict(), dist_param_state_dict)
            classifier.set_state_dict(dist_param_state_dict)
        if for_train:
            assert optimizer is not None
            optimizer.set_state_dict(opt_state_dict)

        if classifier is not None and for_train:
            return extra_info
        else:
            return {}
@paddle.no_grad()
def test(data_set, backbone, batch_size, fp16=False, nfolds=10):
    """Evaluate a backbone on one verification set (e.g. LFW/CFP-FP).

    Args:
        data_set: pair of (data_list, issame_list) as produced by load_bin;
            data_list presumably holds the original and flipped image
            batches — TODO confirm against datasets.load_bin.
        backbone: embedding network; called on batches of images.
        batch_size: inference batch size.
        fp16: when True, feed images as float16.
        nfolds: number of cross-validation folds for evaluate().

    Returns:
        (acc1, std1, acc2, std2, xnorm, embeddings_list); acc1/std1 are
        always 0.0 here, acc2/std2 are the flip-augmented accuracy stats.
    """
    print('testing verification..')
    data_list = data_set[0]
    issame_list = data_set[1]
    embeddings_list = []
    time_consumed = 0.0
    for i in range(len(data_list)):
        data = data_list[i]
        embeddings = None
        ba = 0
        while ba < data.shape[0]:
            bb = min(ba + batch_size, data.shape[0])
            count = bb - ba
            # Always take a full batch ending at bb; for the last partial
            # batch this overlaps the previous one, and only the final
            # `count` embeddings are kept below.  NOTE(review): assumes
            # data.shape[0] >= batch_size, otherwise this index is negative.
            _data = data[bb - batch_size:bb]
            # Convert the numpy batch to a paddle Tensor.
            img = paddle.to_tensor(
                _data, dtype='float16' if fp16 else 'float32')
            net_out: paddle.Tensor = backbone(img)
            _embeddings = net_out.detach().cpu().numpy()
            if embeddings is None:
                embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
            # Keep only the embeddings belonging to rows [ba, bb).
            embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :]
            ba = bb
        embeddings_list.append(embeddings)

    # Mean L2 norm of all raw embeddings (reported as XNorm).
    _xnorm = 0.0
    _xnorm_cnt = 0
    for embed in embeddings_list:
        for i in range(embed.shape[0]):
            _em = embed[i]
            _norm = np.linalg.norm(_em)
            _xnorm += _norm
            _xnorm_cnt += 1
    _xnorm /= _xnorm_cnt

    embeddings = embeddings_list[0].copy()
    try:
        embeddings = sklearn.preprocessing.normalize(embeddings)
    except:
        print(embeddings)
    # acc1/std1 intentionally left at 0.0 (non-flip accuracy not computed).
    acc1 = 0.0
    std1 = 0.0
    # Flip augmentation: sum original and flipped embeddings, re-normalize.
    embeddings = embeddings_list[0] + embeddings_list[1]
    embeddings = sklearn.preprocessing.normalize(embeddings)
    _, _, accuracy, val, val_std, far = evaluate(
        embeddings, issame_list, nrof_folds=nfolds)
    acc2, std2 = np.mean(accuracy), np.std(accuracy)
    return acc1, std1, acc2, std2, _xnorm, embeddings_list


class CallBackVerification(object):
    """Periodic verification callback for training loops.

    Only rank 0 loads the .bin verification sets and runs evaluation; all
    other ranks are no-ops.  Invoke the instance with the current step and
    the backbone; every `frequent` steps it evaluates and logs accuracy,
    tracking the highest accuracy seen per target.
    """

    def __init__(self,
                 frequent,
                 rank,
                 batch_size,
                 val_targets,
                 rec_prefix,
                 fp16=False,
                 image_size=(112, 112)):
        # Evaluate every `frequent` calls with a positive step count.
        self.frequent: int = frequent
        self.rank: int = rank
        self.batch_size: int = batch_size
        self.fp16 = fp16
        # Best accuracy observed so far, one slot per verification target.
        self.highest_acc_list: List[float] = [0.0] * len(val_targets)
        self.ver_list: List[object] = []
        self.ver_name_list: List[str] = []
        if self.rank == 0:
            self.init_dataset(
                val_targets=val_targets,
                data_dir=rec_prefix,
                image_size=image_size)

    def ver_test(self, backbone: paddle.nn.Layer, global_step: int):
        """Run test() on every loaded verification set and log the results."""
        for i in range(len(self.ver_list)):
            test_start = time.time()
            acc1, std1, acc2, std2, xnorm, embeddings_list = test(
                self.ver_list[i],
                backbone,
                self.batch_size,
                fp16=self.fp16,
                nfolds=10)
            logging.info('[%s][%d]XNorm: %f' %
                         (self.ver_name_list[i], global_step, xnorm))
            logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                         (self.ver_name_list[i], global_step, acc2, std2))
            if acc2 > self.highest_acc_list[i]:
                self.highest_acc_list[i] = acc2
            logging.info('[%s][%d]Accuracy-Highest: %1.5f' % (
                self.ver_name_list[i], global_step, self.highest_acc_list[i]))
            test_end = time.time()
            logging.info("test time: {:.4f}".format(test_end - test_start))

    def init_dataset(self, val_targets, data_dir, image_size):
        """Load each `<data_dir>/<name>.bin` that exists; skip missing ones."""
        for name in val_targets:
            path = os.path.join(data_dir, name + ".bin")
            if os.path.exists(path):
                data_set = load_bin(path, image_size)
                self.ver_list.append(data_set)
                self.ver_name_list.append(name)

    def __call__(self, num_update, backbone: paddle.nn.Layer):
        """Evaluate on rank 0 every `frequent` steps (num_update > 0)."""
        if self.rank == 0 and num_update > 0 and num_update % self.frequent == 0:
            backbone.eval()
            with paddle.no_grad():
                self.ver_test(backbone, num_update)
            # Restore training mode after evaluation.
            backbone.train()
def validation(args):
    """Run standalone verification-set evaluation for a trained backbone.

    Restores backbone weights from ``args.checkpoint_dir`` in a single
    process (rank 0, world size 1, no classifier/optimizer), then evaluates
    every ``args.val_targets`` ``.bin`` dataset found under ``args.data_dir``
    via CallBackVerification.

    Args:
        args: namespace with embedding_size, checkpoint_dir, backbone,
            batch_size, val_targets and data_dir attributes.
    """
    checkpoint = Checkpoint(
        rank=0,
        world_size=1,
        embedding_size=args.embedding_size,
        num_classes=None,
        checkpoint_dir=args.checkpoint_dir, )

    # getattr instead of eval(): same attribute lookup for valid backbone
    # names, but args.backbone can no longer inject arbitrary code from the
    # command line.
    backbone = getattr(backbones, args.backbone)(
        num_features=args.embedding_size)
    checkpoint.load(backbone, for_train=False, dtype='float32')
    backbone.eval()

    # frequent=1 and rank=0 so the callback evaluates immediately below.
    callback_verification = CallBackVerification(
        1, 0, args.batch_size, args.val_targets, args.data_dir)

    callback_verification(1, backbone)
安装PaddlePaddle + +运行ArcFace-paddle需要`PaddlePaddle 2.2.0rc0`或更高版本。可以参考下面的步骤安装PaddlePaddle。 + +### 1.1 环境要求 + +- python 3.x +- cuda >= 10.1 (如果使用paddlepaddle-gpu) +- cudnn >= 7.6.4 (如果使用paddlepaddle-gpu) +- nccl >= 2.1.2 (如果使用分布式训练/评估) +- gcc >= 8.2 + +建议使用我们提供的docker运行ArcFace-paddle,有关docker、nvidia-docker使用请参考[链接](https://www.runoob.com/docker/docker-tutorial.html)。 + +在cuda10.1时,建议显卡驱动版本大于等于418.39;在使用cuda10.2时,建议显卡驱动版本大于440.33,更多cuda版本与要求的显卡驱动版本可以参考[链接](https://docs.nvidia.com/deploy/cuda-compatibility/index.html)。 + + +如果不使用docker,可以直接跳过1.2部分内容,从1.3部分开始执行。 + + +### 1.2 (建议)准备docker环境。第一次使用这个镜像,会自动下载该镜像,请耐心等待。 + +``` +# 切换到工作目录下 +cd /home/Projects +# 首次运行需创建一个docker容器,再次运行时不需要运行当前命令 +# 创建一个名字为face_paddle的docker容器,并将当前目录映射到容器的/paddle目录下 + +如果您希望在CPU环境下使用docker,使用docker而不是nvidia-docker创建docker,设置docker容器共享内存shm-size为8G,建议设置8G以上 +sudo docker run --name face_paddle -v $PWD:/paddle --shm-size=8G --network=host -it paddlepaddle/paddle:2.2.0rc0 /bin/bash + +如果希望使用GPU版本的容器,请运行以下命令创建容器。 +sudo nvidia-docker run --name face_paddle -v $PWD:/paddle --shm-size=8G --network=host -it paddlepaddle/paddle:2.2.0rc0-gpu-cuda11.2-cudnn8 /bin/bash +``` + + +您也可以访问[DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/)获取与您机器适配的镜像。 + +``` +# ctrl+P+Q可退出docker 容器,重新进入docker 容器使用如下命令 +sudo docker exec -it face_paddle /bin/bash +``` + +### 1.3 通过pip安装PaddlePaddle + +运行下面的命令,通过pip安装最新GPU版本PaddlePaddle + +```bash +pip3 install paddlepaddle-gpu==2.2.0rc0 --upgrade -i https://mirror.baidu.com/pypi/simple +``` + +如果希望在CPU环境中使用PaddlePaddle,可以运行下面的命令安装PaddlePaddle。 + +```bash +pip3 install paddlepaddle==2.2.0rc0 --upgrade -i https://mirror.baidu.com/pypi/simple +``` + +**注意:** +* 如果先安装了CPU版本的paddlepaddle,之后想切换到GPU版本,那么需要首先卸载CPU版本的paddle,再安装GPU版本的paddle,否则容易导致使用的paddle版本混乱。 +* 您也可以从源码编译安装PaddlePaddle,请参照[PaddlePaddle 安装文档](http://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 + + +### 1.4 验证是否安装成功 + +使用以下命令可以验证PaddlePaddle是否安装成功。 + +```python +import paddle +paddle.utils.run_check() +``` + 
+查看PaddlePaddle版本的命令如下: + +```bash +python3 -c "import paddle; print(paddle.__version__)" +``` + +注意: +- 从源码编译的PaddlePaddle版本号为0.0.0,请确保使用了PaddlePaddle 2.2.0rc0及之后的源码编译。 +- ArcFace-paddle基于PaddlePaddle高性能的分布式训练能力,若您从源码编译,请确保打开编译选项,**WITH_DISTRIBUTE=ON**。具体编译选项参考[编译选项表](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#id3)。 +- 在docker中运行时,为保证docker容器有足够的共享内存用于Paddle的数据读取加速,在创建docker容器时,请设置参数`--shm_size=8g`,条件允许的话可以设置为更大的值 +- 如果只希望使用识别模块,则可以跳过下面的第3部分;如果只希望使用检测模块,则可以跳过下面的第2部分。 + + +## 2. 准备识别模块的环境 + +安装`requiremnts`,命令如下。 + +```shell +pip3 install -r requirement.txt +``` + +## 3. 准备检测模块的环境 + +检测模块依赖于PaddleDetection,需要首先下载PaddleDetection的代码,并安装`requiremnts`。具体命令如下。 + +```bash +# 克隆PaddleDetection仓库 +cd +git clone https://github.com/PaddlePaddle/PaddleDetection.git + +cd PaddleDetection +# 安装其他依赖 +pip3 install -r requirements.txt +``` + +更多安装教程,请参考: [Install tutorial](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.2/docs/tutorials/INSTALL_cn.md)。 diff --git a/insightface/recognition/arcface_paddle/install_en.md b/insightface/recognition/arcface_paddle/install_en.md new file mode 100644 index 0000000000000000000000000000000000000000..10ddcc95fc0bc53c91f5105d8aa5c9e127dda577 --- /dev/null +++ b/insightface/recognition/arcface_paddle/install_en.md @@ -0,0 +1,109 @@ +[简体中文](install_cn.md) | English + +# Installation + +--- +This tutorial introduces how to install ArcFace-paddle and its requirements. + +## 1. Install PaddlePaddle + +`PaddlePaddle 2.2.0rc0` or later is required for ArcFace-paddle. You can use the following steps to install PaddlePaddle. 
+ +### 1.1 Environment requirements + +- python 3.x +- cuda >= 10.1 (necessary if you want to use paddlepaddle-gpu) +- cudnn >= 7.6.4 (necessary if you want to use paddlepaddle-gpu) +- nccl >= 2.1.2 (necessary if you want the use distributed training/eval) +- gcc >= 8.2 + +Docker is recomended to run ArcFace-paddle, for more detailed information about docker and nvidia-docker, you can refer to the [tutorial](https://www.runoob.com/docker/docker-tutorial.html). + +When you use cuda10.1, the driver version needs to be larger or equal than 418.39. When you use cuda10.2, the driver version needs to be larger or equal than 440.33. For more cuda versions and specific driver versions, you can refer to the [link](https://docs.nvidia.com/deploy/cuda-compatibility/index.html). + +If you do not want to use docker, you can skip section 1.2 and go into section 1.3 directly. + + +### 1.2 (Recommended) Prepare for a docker environment. The first time you use this docker image, it will be downloaded automatically. Please be patient. + + +``` +# Switch to the working directory +cd /home/Projects +# You need to create a docker container for the first run, and do not need to run the current command when you run it again +# Create a docker container named face_paddle and map the current directory to the /paddle directory of the container +# It is recommended to set a shared memory greater than or equal to 8G through the --shm-size parameter +sudo docker run --name face_paddle -v $PWD:/paddle --shm-size=8G --network=host -it paddlepaddle/paddle:2.2.0rc0 /bin/bash + +# Use the following command to create a container if you want to use GPU in the container +sudo nvidia-docker run --name face_paddle -v $PWD:/paddle --shm-size=8G --network=host -it paddlepaddle/paddle:2.2.0rc0-gpu-cuda11.2-cudnn8 /bin/bash +``` + +You can also visit [DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/) to get more docker images. 
+ +``` +# use ctrl+P+Q to exit docker, to re-enter docker using the following command: +sudo docker exec -it face_paddle /bin/bash +``` + +### 1.3 Install PaddlePaddle using pip + +If you want to use PaddlePaddle on GPU, you can use the following command to install PaddlePaddle. + +```bash +pip3 install paddlepaddle-gpu==2.2.0rc0 --upgrade -i https://mirror.baidu.com/pypi/simple +``` + +If you want to use PaddlePaddle on CPU, you can use the following command to install PaddlePaddle. + +```bash +pip3 install paddlepaddle==2.2.0rc0 --upgrade -i https://mirror.baidu.com/pypi/simple +``` + +**Note:** +* If you have already installed CPU version of PaddlePaddle and want to use GPU version now, you should uninstall CPU version of PaddlePaddle and then install GPU version to avoid package confusion. +* You can also compile PaddlePaddle from source code, please refer to [PaddlePaddle Installation tutorial](http://www.paddlepaddle.org.cn/install/quick) to more compilation options. + +### 1.4 Verify Installation process + +```python +import paddle +paddle.utils.run_check() +``` + +Check PaddlePaddle version: + +```bash +python3 -c "import paddle; print(paddle.__version__)" +``` + +Note: +- Make sure the compiled source code is later than PaddlePaddle2.0. +- If you want to enable distribution ability, you should assign **WITH_DISTRIBUTE=ON** when compiling. For more compilation options, please refer to [Instruction](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#id3) for more details. +- When running in docker, in order to ensure that the container has enough shared memory for dataloader acceleration of Paddle, please set the parameter `--shm_size=8g` at creating a docker container, if conditions permit, you can set it to a larger value. +- If you just want to use recognition module, you can skip section 3. If you just want to use detection module, you can skip section 2. + +## 2. 
Prepare for the environment of recognition + +Run the following command to install `requiremnts`. + +```shell +pip3 install -r requirement.txt +``` + +## 3. Prepare for the environment of detection + +The detection module depends on PaddleDetection. You need to download PaddleDetection and install `requiremnts`, the command is as follows. + + +```bash +# clone PaddleDetection repo +cd +git clone https://github.com/PaddlePaddle/PaddleDetection.git + +cd PaddleDetection +# install requiremnts +pip3 install -r requirements.txt +``` + +For more installation tutorials, please refer to [Install tutorial](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.1/docs/tutorials/INSTALL.md). diff --git a/insightface/recognition/arcface_paddle/requirement.txt b/insightface/recognition/arcface_paddle/requirement.txt new file mode 100644 index 0000000000000000000000000000000000000000..25de974effc997ea02ff4cb7e32de2a8726ee1db --- /dev/null +++ b/insightface/recognition/arcface_paddle/requirement.txt @@ -0,0 +1,16 @@ +visualdl +opencv-python +pillow +numpy +easydict +scipy +sklearn +requests +prettytable +tqdm +Pillow +scikit-learn==0.23.2 +opencv-python==4.4.0.46 +onnxruntime +onnx +paddle2onnx diff --git a/insightface/recognition/arcface_paddle/scripts/export_dynamic.sh b/insightface/recognition/arcface_paddle/scripts/export_dynamic.sh new file mode 100644 index 0000000000000000000000000000000000000000..cdc0e14616d0036c2536c702a04faeebac52b981 --- /dev/null +++ b/insightface/recognition/arcface_paddle/scripts/export_dynamic.sh @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# export_dynamic.sh: convert the epoch-24 dynamic-graph FresResNet50
# checkpoint (512-d embeddings) into a deployable Paddle inference model.
python tools/export.py \
    --is_static False \
    --export_type paddle \
    --backbone FresResNet50 \
    --embedding_size 512 \
    --checkpoint_dir MS1M_v3_arcface_dynamic_128_fp16_0.1/FresResNet50/24 \
    --output_dir MS1M_v3_arcface_dynamic_128_fp16_0.1/FresResNet50/exported_model
# export_static.sh: convert the epoch-24 static-graph FresResNet50
# checkpoint (512-d embeddings) into a deployable Paddle inference model.
python tools/export.py \
    --is_static True \
    --export_type paddle \
    --backbone FresResNet50 \
    --embedding_size 512 \
    --checkpoint_dir MS1M_v3_arcface_static_128_fp16_0.1/FresResNet50/24 \
    --output_dir MS1M_v3_arcface_static_128_fp16_0.1/FresResNet50/exported_model
# inference.sh: run single-image inference against the exported Paddle
# inference model...
python tools/inference.py \
    --export_type paddle \
    --model_file MS1M_v3_arcface_static_128_fp16_0.1/FresResNet50/exported_model/FresResNet50.pdmodel \
    --params_file MS1M_v3_arcface_static_128_fp16_0.1/FresResNet50/exported_model/FresResNet50.pdiparams \
    --image_path MS1M_v3/images/00000001.jpg

# ...and against the exported ONNX model, for output parity checking.
python tools/inference.py \
    --export_type onnx \
    --onnx_file MS1M_v3_arcface_static_128_fp16_0.1/FresResNet50/exported_model/FresResNet50.onnx \
    --image_path MS1M_v3/images/00000001.jpg

# kill_train_process.sh (separate script): force-kill every running
# train.py process, excluding the grep itself.
ps -ef | grep "train.py" | grep -v grep | awk '{print $2}' | xargs kill -9
# perf_dynamic.sh: dynamic-graph benchmark sweep.
# Runs perf_runner.sh num_test times for every combination of
# {r50, r100} backbone x {fp16, fp32} x {1, 4, 8} GPUs.
set -e

num_test=5
num_nodes=1

configs=(configs/ms1mv3_r50.py configs/ms1mv3_r100.py)
dtypes=(fp16 fp32)
gpus=("0" "0,1,2,3" "0,1,2,3,4,5,6,7")

for config in "${configs[@]}"
do
    for dtype in "${dtypes[@]}"
    do
        for gpu in "${gpus[@]}"
        do
            i=1
            while [ $i -le ${num_test} ]
            do
                # 93431 classes, batch 128/device, PFC sample ratio 0.1.
                bash scripts/perf_runner.sh $gpu $config dynamic 93431 $dtype $num_nodes 128 0.1 ${i}
                echo " >>>>>>Finished Test Case $config, $dtype, $gpu, ${i} <<<<<<<"
                let i++
                # Give processes time to exit between runs.
                sleep 20s
            done
        done
    done
done
# perf_runner.sh: one benchmark run — launch tools/train.py through
# paddle.distributed.launch on synthetic data for a fixed 200 steps,
# logging under ./logs/<descriptive run name>.
set -ex

# Positional args with defaults: gpu list, config file, graph mode,
# class count, dtype, node count, per-device batch size, PFC sample
# ratio, repeat id.
gpus=${1:-0,1,2,3,4,5,6,7}
config_file=${2:-configs/ms1mv3_r50.py}
mode=${3:-static}
num_classes=${4:-93431}
dtype=${5:-fp16}
num_nodes=${6:-1}
batch_size_per_device=${7:-128}
sample_ratio=${8:-0.1}
test_id=${9:-1}

if [ $mode = "static" ]; then
    is_static=True
else
    is_static=False
fi

if [ $dtype = "fp16" ]; then
    fp16=True
else
    fp16=False
fi

if [[ $config_file =~ r50 ]]; then
    backbone=r50
else
    backbone=r100
fi

# GPU count from the comma list's string length: "0,1,2" has length 5,
# 5/2+1 = 3.  Only valid for single-digit device ids.
gpu_num_per_node=`expr ${#gpus} / 2 + 1`

log_dir=./logs/arcface_paddle_${backbone}_${mode}_${dtype}_r${sample_ratio}_bz${batch_size_per_device}_${num_nodes}n${gpu_num_per_node}g_id${test_id}

python -m paddle.distributed.launch --gpus=${gpus} --log_dir=${log_dir} tools/train.py \
    --config_file ${config_file} \
    --is_static ${is_static} \
    --num_classes ${num_classes} \
    --fp16 ${fp16} \
    --sample_ratio ${sample_ratio} \
    --log_interval_step 1 \
    --train_unit 'step' \
    --train_num 200 \
    --warmup_num 0 \
    --use_synthetic_dataset True \
    --do_validation_while_train False
# perf_static.sh: static-graph benchmark sweep (mirror of perf_dynamic.sh
# with mode "static").  Runs perf_runner.sh num_test times for every
# combination of {r50, r100} x {fp16, fp32} x {1, 4, 8} GPUs.
set -e

num_test=5
num_nodes=1

configs=(configs/ms1mv3_r50.py configs/ms1mv3_r100.py)
dtypes=(fp16 fp32)
gpus=("0" "0,1,2,3" "0,1,2,3,4,5,6,7")

for config in "${configs[@]}"
do
    for dtype in "${dtypes[@]}"
    do
        for gpu in "${gpus[@]}"
        do
            i=1
            while [ $i -le ${num_test} ]
            do
                # 93431 classes, batch 128/device, PFC sample ratio 0.1.
                bash scripts/perf_runner.sh $gpu $config static 93431 $dtype $num_nodes 128 0.1 ${i}
                echo " >>>>>>Finished Test Case $config, $dtype, $gpu, ${i} <<<<<<<"
                let i++
                # Give processes time to exit between runs.
                sleep 20s
            done
        done
    done
done
+ +python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \ + --config_file configs/ms1mv3_r50.py \ + --is_static False \ + --backbone FresResNet50 \ + --classifier LargeScaleClassifier \ + --embedding_size 512 \ + --model_parallel True \ + --dropout 0.0 \ + --sample_ratio 0.1 \ + --loss ArcFace \ + --batch_size 128 \ + --dataset MS1M_v3 \ + --num_classes 93431 \ + --data_dir MS1M_v3/ \ + --label_file MS1M_v3/label.txt \ + --is_bin False \ + --log_interval_step 100 \ + --validation_interval_step 2000 \ + --fp16 True \ + --use_dynamic_loss_scaling True \ + --init_loss_scaling 27648.0 \ + --num_workers 8 \ + --train_unit 'epoch' \ + --warmup_num 0 \ + --train_num 25 \ + --decay_boundaries "10,16,22" \ + --output MS1M_v3_arcface_dynamic_0.1 diff --git a/insightface/recognition/arcface_paddle/scripts/train_static.sh b/insightface/recognition/arcface_paddle/scripts/train_static.sh new file mode 100644 index 0000000000000000000000000000000000000000..52ca2be1eb12da8d84872b898da29ecccde95d14 --- /dev/null +++ b/insightface/recognition/arcface_paddle/scripts/train_static.sh @@ -0,0 +1,41 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \ + --config_file configs/ms1mv3_r50.py \ + --is_static True \ + --backbone FresResNet50 \ + --classifier LargeScaleClassifier \ + --embedding_size 512 \ + --model_parallel True \ + --dropout 0.0 \ + --sample_ratio 0.1 \ + --loss ArcFace \ + --batch_size 128 \ + --dataset MS1M_v3 \ + --num_classes 93431 \ + --data_dir MS1M_v3/ \ + --label_file MS1M_v3/label.txt \ + --is_bin False \ + --log_interval_step 100 \ + --validation_interval_step 2000 \ + --fp16 True \ + --use_dynamic_loss_scaling True \ + --init_loss_scaling 27648.0 \ + --num_workers 8 \ + --train_unit 'epoch' \ + --warmup_num 0 \ + --train_num 25 \ + --decay_boundaries "10,16,22" \ + --output MS1M_v3_arcface_static_0.1 diff --git a/insightface/recognition/arcface_paddle/scripts/validation_dynamic.sh b/insightface/recognition/arcface_paddle/scripts/validation_dynamic.sh new file mode 100644 index 0000000000000000000000000000000000000000..3b635c5f058965f674ab1469b1b7b27f0e4c9292 --- /dev/null +++ b/insightface/recognition/arcface_paddle/scripts/validation_dynamic.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +python tools/validation.py \ + --is_static False \ + --backbone FresResNet50 \ + --embedding_size 512 \ + --checkpoint_dir MS1M_v3_arcface_dynamic_128_fp16_0.1/FresResNet50/24 \ + --data_dir MS1M_v3/ \ + --val_targets lfw,cfp_fp,agedb_30 \ + --batch_size 128 diff --git a/insightface/recognition/arcface_paddle/scripts/validation_static.sh b/insightface/recognition/arcface_paddle/scripts/validation_static.sh new file mode 100644 index 0000000000000000000000000000000000000000..64227f7debd86f01e2d19a0b69c96893231c579f --- /dev/null +++ b/insightface/recognition/arcface_paddle/scripts/validation_static.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python tools/validation.py \ + --is_static True \ + --backbone FresResNet50 \ + --embedding_size 512 \ + --checkpoint_dir MS1M_v3_arcface_static_128_fp16_0.1/FresResNet50/24 \ + --data_dir MS1M_v3/ \ + --val_targets lfw,cfp_fp,agedb_30 \ + --batch_size 128 diff --git a/insightface/recognition/arcface_paddle/static/backbones/__init__.py b/insightface/recognition/arcface_paddle/static/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1db1164d19b4585a6233935956425e366375fec8 --- /dev/null +++ b/insightface/recognition/arcface_paddle/static/backbones/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .iresnet import FresResNet50, FresResNet100 diff --git a/insightface/recognition/arcface_paddle/static/backbones/iresnet.py b/insightface/recognition/arcface_paddle/static/backbones/iresnet.py new file mode 100644 index 0000000000000000000000000000000000000000..73e98a934d0c73c96aab1a8996b397c57591be4a --- /dev/null +++ b/insightface/recognition/arcface_paddle/static/backbones/iresnet.py @@ -0,0 +1,249 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from collections import OrderedDict + +__all__ = [ + "FresResNet", "FresResNet50", "FresResNet100", "FresResNet101", + "FresResNet152" +] + + +class FresResNet(object): + def __init__(self, + layers=50, + num_features=512, + is_train=True, + fp16=False, + fc_type='E', + dropout=0.4): + super(FresResNet, self).__init__() + self.layers = layers + self.num_features = num_features + self.fc_type = fc_type + + self.input_dict = OrderedDict() + self.output_dict = OrderedDict() + + image = paddle.static.data( + name='image', + shape=[-1, 3, 112, 112], + dtype='float16' if fp16 else 'float32') + self.input_dict['image'] = image + if is_train: + label = paddle.static.data(name='label', shape=[-1], dtype='int32') + self.input_dict['label'] = label + + supported_layers = [50, 100, 101, 152] + assert layers in supported_layers, \ + "supported layers {}, but given {}".format(supported_layers, layers) + + if layers == 50: + units = [3, 4, 14, 3] + elif layers == 100: + units = [3, 13, 30, 3] + elif layers == 101: + units = [3, 4, 23, 3] + elif layers == 152: + units = [3, 8, 36, 3] + filter_list = [64, 64, 128, 256, 512] + num_stages = 4 + + input_blob = paddle.static.nn.conv2d( + input=image, + num_filters=filter_list[0], + filter_size=3, + stride=1, + padding=1, + groups=1, + param_attr=paddle.ParamAttr(), + bias_attr=False) + input_blob = paddle.static.nn.batch_norm( + input=input_blob, + act=None, + epsilon=1e-05, + momentum=0.9, + is_test=False if is_train else True) + # input_blob = paddle.nn.functional.relu6(input_blob) + input_blob = paddle.static.nn.prelu( + input_blob, + mode="all", + param_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(0.25))) + + for i in range(num_stages): + for j in range(units[i]): + input_blob = self.residual_unit_v3( + input_blob, + filter_list[i + 1], + 3, + 2 if j == 0 else 1, + 1, + is_train, ) + fc1 = self.get_fc1(input_blob, is_train, dropout) + + self.output_dict['feature'] = fc1 + + def 
residual_unit_v3(self, in_data, num_filter, filter_size, stride, pad, + is_train): + + bn1 = paddle.static.nn.batch_norm( + input=in_data, + act=None, + epsilon=1e-05, + momentum=0.9, + is_test=False if is_train else True) + conv1 = paddle.static.nn.conv2d( + input=bn1, + num_filters=num_filter, + filter_size=filter_size, + stride=1, + padding=1, + groups=1, + param_attr=paddle.ParamAttr(), + bias_attr=False) + bn2 = paddle.static.nn.batch_norm( + input=conv1, + act=None, + epsilon=1e-05, + momentum=0.9, + is_test=False if is_train else True) + # prelu = paddle.nn.functional.relu6(bn2) + prelu = paddle.static.nn.prelu( + bn2, + mode="all", + param_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(0.25))) + conv2 = paddle.static.nn.conv2d( + input=prelu, + num_filters=num_filter, + filter_size=filter_size, + stride=stride, + padding=pad, + groups=1, + param_attr=paddle.ParamAttr(), + bias_attr=False) + bn3 = paddle.static.nn.batch_norm( + input=conv2, + act=None, + epsilon=1e-05, + momentum=0.9, + is_test=False if is_train else True) + + if stride == 1: + input_blob = in_data + else: + input_blob = paddle.static.nn.conv2d( + input=in_data, + num_filters=num_filter, + filter_size=1, + stride=stride, + padding=0, + groups=1, + param_attr=paddle.ParamAttr(), + bias_attr=False) + + input_blob = paddle.static.nn.batch_norm( + input=input_blob, + act=None, + epsilon=1e-05, + momentum=0.9, + is_test=False if is_train else True) + + identity = paddle.add(bn3, input_blob) + return identity + + def get_fc1(self, last_conv, is_train, dropout=0.4): + body = last_conv + if self.fc_type == "Z": + body = paddle.static.nn.batch_norm( + input=body, + act=None, + epsilon=1e-05, + is_test=False if is_train else True) + if dropout > 0: + body = paddle.nn.functional.dropout( + x=body, + p=dropout, + training=is_train, + mode='upscale_in_train') + fc1 = body + elif self.fc_type == "E": + body = paddle.static.nn.batch_norm( + input=body, + act=None, + epsilon=1e-05, + 
is_test=False if is_train else True) + if dropout > 0: + body = paddle.nn.functional.dropout( + x=body, + p=dropout, + training=is_train, + mode='upscale_in_train') + fc1 = paddle.static.nn.fc( + x=body, + size=self.num_features, + weight_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierNormal( + fan_in=0.0)), + bias_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant())) + fc1 = paddle.static.nn.batch_norm( + input=fc1, + act=None, + epsilon=1e-05, + is_test=False if is_train else True) + + elif self.fc_type == "FC": + body = paddle.static.nn.batch_norm( + input=body, + act=None, + epsilon=1e-05, + is_test=False if is_train else True) + fc1 = paddle.static.nn.fc( + x=body, + size=self.num_features, + weight_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierNormal( + fan_in=0.0)), + bias_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant())) + fc1 = paddle.static.nn.batch_norm( + input=fc1, + act=None, + epsilon=1e-05, + is_test=False if is_train else True) + + return fc1 + + +def FresResNet50(**args): + model = FresResNet(layers=50, **args) + return model + + +def FresResNet100(**args): + model = FresResNet(layers=100, **args) + return model + + +def FresResNet101(**args): + model = FresResNet(layers=101, **args) + return model + + +def FresResNet152(**args): + model = FresResNet(layers=152, **args) + return model diff --git a/insightface/recognition/arcface_paddle/static/classifiers/__init__.py b/insightface/recognition/arcface_paddle/static/classifiers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0c0e52f2b5d2045af6ba6c10793e0567033e2ae9 --- /dev/null +++ b/insightface/recognition/arcface_paddle/static/classifiers/__init__.py @@ -0,0 +1,15 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .lsc import LargeScaleClassifier diff --git a/insightface/recognition/arcface_paddle/static/classifiers/lsc.py b/insightface/recognition/arcface_paddle/static/classifiers/lsc.py new file mode 100644 index 0000000000000000000000000000000000000000..bab66456cbff84144bc67146d773415d78c4e9ee --- /dev/null +++ b/insightface/recognition/arcface_paddle/static/classifiers/lsc.py @@ -0,0 +1,128 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from six.moves import reduce +from collections import OrderedDict + +import paddle + +__all__ = ["LargeScaleClassifier"] + + +class LargeScaleClassifier(object): + """ + Author: {Xiang An, Yang Xiao, XuHan Zhu} in DeepGlint, + Partial FC: Training 10 Million Identities on a Single Machine + See the original paper: + https://arxiv.org/abs/2010.05222 + """ + + def __init__(self, + feature, + label, + rank, + world_size, + num_classes, + margin1=1.0, + margin2=0.5, + margin3=0.0, + scale=64.0, + sample_ratio=1.0, + embedding_size=512, + name=None): + super(LargeScaleClassifier, self).__init__() + self.num_classes: int = num_classes + self.rank: int = rank + self.world_size: int = world_size + self.sample_ratio: float = sample_ratio + self.embedding_size: int = embedding_size + self.num_local: int = (num_classes + world_size - 1) // world_size + if num_classes % world_size != 0 and rank == world_size - 1: + self.num_local = num_classes % self.num_local + self.num_sample: int = int(self.sample_ratio * self.num_local) + self.margin1 = margin1 + self.margin2 = margin2 + self.margin3 = margin3 + self.logit_scale = scale + + self.input_dict = OrderedDict() + self.input_dict['feature'] = feature + self.input_dict['label'] = label + + self.output_dict = OrderedDict() + + if name is None: + name = 'dist@fc@rank@%05d.w' % rank + assert '.w' in name + + stddev = math.sqrt(2.0 / (self.embedding_size + self.num_local)) + param_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Normal(std=stddev)) + + weight_dtype = 'float16' if feature.dtype == paddle.float16 else 'float32' + weight = paddle.static.create_parameter( + shape=[self.embedding_size, self.num_local], + dtype=weight_dtype, + name=name, + attr=param_attr, + is_bias=False) + + # avoid allreducing gradients for distributed parameters + weight.is_distributed = True + # avoid broadcasting distributed parameters in startup program + paddle.static.default_startup_program().global_block().vars[ + 
weight.name].is_distributed = True + + if self.world_size > 1: + feature_list = [] + paddle.distributed.all_gather(feature_list, feature) + total_feature = paddle.concat(feature_list, axis=0) + + label_list = [] + paddle.distributed.all_gather(label_list, label) + total_label = paddle.concat(label_list, axis=0) + total_label.stop_gradient = True + else: + total_feature = feature + total_label = label + + total_label.stop_gradient = True + + if self.sample_ratio < 1.0: + # partial fc sample process + total_label, sampled_class_index = paddle.nn.functional.class_center_sample( + total_label, self.num_local, self.num_sample) + sampled_class_index.stop_gradient = True + weight = paddle.gather(weight, sampled_class_index, axis=1) + + norm_feature = paddle.fluid.layers.l2_normalize(total_feature, axis=1) + norm_weight = paddle.fluid.layers.l2_normalize(weight, axis=0) + + local_logit = paddle.matmul(norm_feature, norm_weight) + + loss = paddle.nn.functional.margin_cross_entropy( + local_logit, + total_label, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.logit_scale, + return_softmax=False, + reduction=None, ) + + loss.desc.set_dtype(paddle.fluid.core.VarDesc.VarType.FP32) + loss = paddle.mean(loss) + + self.output_dict['loss'] = loss diff --git a/insightface/recognition/arcface_paddle/static/export.py b/insightface/recognition/arcface_paddle/static/export.py new file mode 100644 index 0000000000000000000000000000000000000000..e1f3e077b0c29cfde4c7d96ce8f265dc280fe9b2 --- /dev/null +++ b/insightface/recognition/arcface_paddle/static/export.py @@ -0,0 +1,94 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import errno +import os +import numpy as np +import paddle + +from .utils.io import Checkpoint +from . import backbones +from .static_model import StaticModel + + +def export_onnx(path_prefix, feed_vars, fetch_vars, executor, program): + + from paddle2onnx.graph import PaddleGraph, ONNXGraph + from paddle2onnx.passes import PassManager + + opset_version = 10 + enable_onnx_checker = True + verbose = False + + paddle_graph = PaddleGraph.build_from_program(program, feed_vars, + fetch_vars, + paddle.fluid.global_scope()) + + onnx_graph = ONNXGraph.build(paddle_graph, opset_version, verbose) + onnx_graph = PassManager.run_pass(onnx_graph, ['inplace_node_pass']) + + onnx_proto = onnx_graph.export_proto(enable_onnx_checker) + + try: + # mkdir may conflict if pserver and trainer are running on the same machine + dirname = os.path.dirname(path_prefix) + os.makedirs(dirname) + except OSError as e: + if e.errno != errno.EEXIST: + raise + model_path = path_prefix + ".onnx" + if os.path.isdir(model_path): + raise ValueError("'{}' is an existing directory.".format(model_path)) + + with open(model_path, 'wb') as f: + f.write(onnx_proto.SerializeToString()) + + +def export(args): + checkpoint = Checkpoint( + rank=0, + world_size=1, + embedding_size=args.embedding_size, + num_classes=None, + checkpoint_dir=args.checkpoint_dir, ) + + test_program = paddle.static.Program() + startup_program = paddle.static.Program() + + test_model = StaticModel( + main_program=test_program, + startup_program=startup_program, + backbone_class_name=args.backbone, + 
embedding_size=args.embedding_size, + mode='test', ) + + gpu_id = int(os.getenv("FLAGS_selected_gpus", 0)) + place = paddle.CUDAPlace(gpu_id) + exe = paddle.static.Executor(place) + exe.run(startup_program) + + checkpoint.load(program=test_program, for_train=False, dtype='float32') + print("Load checkpoint from '{}'.".format(args.checkpoint_dir)) + + path = os.path.join(args.output_dir, args.backbone) + if args.export_type == 'onnx': + feed_vars = [test_model.backbone.input_dict['image'].name] + fetch_vars = [test_model.backbone.output_dict['feature']] + export_onnx(path, feed_vars, fetch_vars, exe, program=test_program) + else: + feed_vars = [test_model.backbone.input_dict['image']] + fetch_vars = [test_model.backbone.output_dict['feature']] + paddle.static.save_inference_model( + path, feed_vars, fetch_vars, exe, program=test_program) + print("Save exported model to '{}'.".format(args.output_dir)) diff --git a/insightface/recognition/arcface_paddle/static/static_model.py b/insightface/recognition/arcface_paddle/static/static_model.py new file mode 100644 index 0000000000000000000000000000000000000000..fe8f82a4b73482540f70b3631b729d7f988ca4e6 --- /dev/null +++ b/insightface/recognition/arcface_paddle/static/static_model.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import time +import os +import sys +import numpy as np + +import paddle +from visualdl import LogWriter + +from utils.logging import AverageMeter, init_logging, CallBackLogging +from utils import losses + +from .utils.optimization_pass import gather_optimization_pass, amp_pass + +from . import classifiers +from . import backbones + + +class StaticModel(object): + def __init__(self, + main_program, + startup_program, + backbone_class_name, + embedding_size, + classifier_class_name=None, + num_classes=None, + sample_ratio=0.1, + lr_scheduler=None, + momentum=0.9, + weight_decay=2e-4, + dropout=0.4, + mode='train', + fp16=False, + fp16_configs=None, + margin_loss_params=None): + + rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) + world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) + if world_size > 1: + import paddle.distributed.fleet as fleet + + self.main_program = main_program + self.startup_program = startup_program + self.backbone_class_name = backbone_class_name + self.embedding_size = embedding_size + self.classifier_class_name = classifier_class_name + self.num_classes = num_classes + self.sample_ratio = sample_ratio + self.lr_scheduler = lr_scheduler + self.momentum = momentum + self.weight_decay = weight_decay + self.mode = mode + self.fp16 = fp16 + self.fp16_configs = fp16_configs + self.margin_loss_params = margin_loss_params + + if self.mode == 'train': + assert self.classifier_class_name is not None + assert self.num_classes is not None + assert self.lr_scheduler is not None + assert self.margin_loss_params is not None + with paddle.static.program_guard(self.main_program, + self.startup_program): + with paddle.utils.unique_name.guard(): + self.backbone = eval("backbones.{}".format( + self.backbone_class_name))( + num_features=self.embedding_size, + is_train=True, + fp16=self.fp16, + dropout=dropout) + assert 'label' in self.backbone.input_dict + assert 'feature' in self.backbone.output_dict + self.classifier = eval("classifiers.{}".format( + 
self.classifier_class_name))( + feature=self.backbone.output_dict['feature'], + label=self.backbone.input_dict['label'], + rank=rank, + world_size=world_size, + num_classes=self.num_classes, + margin1=self.margin_loss_params.margin1, + margin2=self.margin_loss_params.margin2, + margin3=self.margin_loss_params.margin3, + scale=self.margin_loss_params.scale, + sample_ratio=self.sample_ratio, + embedding_size=self.embedding_size) + assert 'loss' in self.classifier.output_dict + + self.optimizer = paddle.optimizer.Momentum( + learning_rate=self.lr_scheduler, + momentum=self.momentum, + weight_decay=paddle.regularizer.L2Decay( + self.weight_decay)) + if self.fp16: + assert self.fp16_configs is not None + self.optimizer = paddle.static.amp.decorate( + optimizer=self.optimizer, + init_loss_scaling=self.fp16_configs[ + 'init_loss_scaling'], + incr_every_n_steps=self.fp16_configs[ + 'incr_every_n_steps'], + decr_every_n_nan_or_inf=self.fp16_configs[ + 'decr_every_n_nan_or_inf'], + incr_ratio=self.fp16_configs['incr_ratio'], + decr_ratio=self.fp16_configs['decr_ratio'], + use_dynamic_loss_scaling=self.fp16_configs[ + 'use_dynamic_loss_scaling'], + use_pure_fp16=self.fp16_configs['use_pure_fp16'], + amp_lists=paddle.static.amp. 
+ AutoMixedPrecisionLists( + custom_white_list=self.fp16_configs[ + 'custom_white_list'], + custom_black_list=self.fp16_configs[ + 'custom_black_list'], ), + use_fp16_guard=False) + + if world_size > 1: + dist_optimizer = fleet.distributed_optimizer( + self.optimizer) + dist_optimizer.minimize(self.classifier.output_dict[ + 'loss']) + else: + self.optimizer.minimize(self.classifier.output_dict[ + 'loss']) + if self.fp16: + self.optimizer = self.optimizer._optimizer + if self.sample_ratio < 1.0: + gather_optimization_pass(self.main_program, + 'dist@fc@rank') + if self.fp16: + amp_pass(self.main_program, 'dist@fc@rank') + + elif self.mode == 'test': + with paddle.static.program_guard(self.main_program, + self.startup_program): + with paddle.utils.unique_name.guard(): + self.backbone = eval("backbones.{}".format( + self.backbone_class_name))( + num_features=self.embedding_size, + is_train=False, + fp16=self.fp16, + dropout=dropout) + assert 'feature' in self.backbone.output_dict + + else: + raise ValueError( + "mode is error, only support 'train' and 'test' now.") diff --git a/insightface/recognition/arcface_paddle/static/train.py b/insightface/recognition/arcface_paddle/static/train.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd40393d1623294231938ed8a0e0a656d7902de --- /dev/null +++ b/insightface/recognition/arcface_paddle/static/train.py @@ -0,0 +1,241 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import os +import sys +import numpy as np +import logging + +import paddle +from visualdl import LogWriter + +from utils.logging import AverageMeter, CallBackLogging +from datasets import CommonDataset, SyntheticDataset +from utils import losses + +from .utils.verification import CallBackVerification +from .utils.io import Checkpoint + +from . import classifiers +from . import backbones +from .static_model import StaticModel + +RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_exhaustive_search': 1, + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + 'FLAGS_fraction_of_gpu_memory_to_use': 0.9999, +} +paddle.fluid.set_flags(RELATED_FLAGS_SETTING) + + +def train(args): + + writer = LogWriter(logdir=args.logdir) + + rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) + world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) + + gpu_id = int(os.getenv("FLAGS_selected_gpus", 0)) + place = paddle.CUDAPlace(gpu_id) + + if world_size > 1: + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True + fleet.init(is_collective=True, strategy=strategy) + + if args.use_synthetic_dataset: + trainset = SyntheticDataset(args.num_classes, fp16=args.fp16) + else: + trainset = CommonDataset( + root_dir=args.data_dir, + label_file=args.label_file, + fp16=args.fp16, + is_bin=args.is_bin) + + num_image = len(trainset) + total_batch_size = args.batch_size * world_size + steps_per_epoch = num_image // total_batch_size + if args.train_unit == 'epoch': + warmup_steps = steps_per_epoch * args.warmup_num + total_steps = steps_per_epoch * args.train_num + decay_steps = [x * steps_per_epoch for x in args.decay_boundaries] + total_epoch = args.train_num + else: + warmup_steps = args.warmup_num + total_steps = args.train_num + decay_steps = [x for x in args.decay_boundaries] + total_epoch = 
(total_steps + steps_per_epoch - 1) // steps_per_epoch + + if rank == 0: + logging.info('world_size: {}'.format(world_size)) + logging.info('total_batch_size: {}'.format(total_batch_size)) + logging.info('warmup_steps: {}'.format(warmup_steps)) + logging.info('steps_per_epoch: {}'.format(steps_per_epoch)) + logging.info('total_steps: {}'.format(total_steps)) + logging.info('total_epoch: {}'.format(total_epoch)) + logging.info('decay_steps: {}'.format(decay_steps)) + + base_lr = total_batch_size * args.lr / 512 + lr_scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=decay_steps, + values=[ + base_lr * (args.lr_decay**i) for i in range(len(decay_steps) + 1) + ]) + if warmup_steps > 0: + lr_scheduler = paddle.optimizer.lr.LinearWarmup( + lr_scheduler, warmup_steps, 0, base_lr) + + train_program = paddle.static.Program() + test_program = paddle.static.Program() + startup_program = paddle.static.Program() + + margin_loss_params = eval("losses.{}".format(args.loss))() + train_model = StaticModel( + main_program=train_program, + startup_program=startup_program, + backbone_class_name=args.backbone, + embedding_size=args.embedding_size, + classifier_class_name=args.classifier, + num_classes=args.num_classes, + sample_ratio=args.sample_ratio, + lr_scheduler=lr_scheduler, + momentum=args.momentum, + weight_decay=args.weight_decay, + dropout=args.dropout, + mode='train', + fp16=args.fp16, + fp16_configs={ + 'init_loss_scaling': args.init_loss_scaling, + 'incr_every_n_steps': args.incr_every_n_steps, + 'decr_every_n_nan_or_inf': args.decr_every_n_nan_or_inf, + 'incr_ratio': args.incr_ratio, + 'decr_ratio': args.decr_ratio, + 'use_dynamic_loss_scaling': args.use_dynamic_loss_scaling, + 'use_pure_fp16': args.fp16, + 'custom_white_list': args.custom_white_list, + 'custom_black_list': args.custom_black_list, + }, + margin_loss_params=margin_loss_params, ) + + if rank == 0: + with open(os.path.join(args.output, 'main_program.txt'), 'w') as f: + f.write(str(train_program)) 
+ + if rank == 0 and args.do_validation_while_train: + test_model = StaticModel( + main_program=test_program, + startup_program=startup_program, + backbone_class_name=args.backbone, + embedding_size=args.embedding_size, + dropout=args.dropout, + mode='test', + fp16=args.fp16, ) + + callback_verification = CallBackVerification( + args.validation_interval_step, rank, args.batch_size, test_program, + list(test_model.backbone.input_dict.values()), + list(test_model.backbone.output_dict.values()), args.val_targets, + args.data_dir) + + callback_logging = CallBackLogging(args.log_interval_step, rank, + world_size, total_steps, + args.batch_size, writer) + checkpoint = Checkpoint( + rank=rank, + world_size=world_size, + embedding_size=args.embedding_size, + num_classes=args.num_classes, + model_save_dir=os.path.join(args.output, args.backbone), + checkpoint_dir=args.checkpoint_dir, + max_num_last_checkpoint=args.max_num_last_checkpoint) + + exe = paddle.static.Executor(place) + exe.run(startup_program) + + start_epoch = 0 + global_step = 0 + loss_avg = AverageMeter() + if args.resume: + extra_info = checkpoint.load(program=train_program, for_train=True) + start_epoch = extra_info['epoch'] + 1 + lr_state = extra_info['lr_state'] + # there last_epoch means last_step in for PiecewiseDecay + # since we always use step style for lr_scheduler + global_step = lr_state['last_epoch'] + train_model.lr_scheduler.set_state_dict(lr_state) + + train_loader = paddle.io.DataLoader( + trainset, + feed_list=list(train_model.backbone.input_dict.values()), + places=place, + return_list=False, + num_workers=args.num_workers, + batch_sampler=paddle.io.DistributedBatchSampler( + dataset=trainset, + batch_size=args.batch_size, + shuffle=True, + drop_last=True)) + + max_loss_scaling = np.array([args.max_loss_scaling]).astype(np.float32) + for epoch in range(start_epoch, total_epoch): + train_reader_cost = 0.0 + train_run_cost = 0.0 + total_samples = 0 + reader_start = time.time() + for step, data 
in enumerate(train_loader): + train_reader_cost += time.time() - reader_start + global_step += 1 + train_start = time.time() + + loss_v = exe.run( + train_program, + feed=data, + fetch_list=[train_model.classifier.output_dict['loss']], + use_program_cache=True) + + train_run_cost += time.time() - train_start + total_samples += args.batch_size + + loss_avg.update(np.array(loss_v)[0], 1) + lr_value = train_model.optimizer.get_lr() + callback_logging( + global_step, + loss_avg, + epoch, + lr_value, + avg_reader_cost=train_reader_cost / args.log_interval_step, + avg_batch_cost=(train_reader_cost + train_run_cost) / args.log_interval_step, + avg_samples=total_samples / args.log_interval_step, + ips=total_samples / (train_reader_cost + train_run_cost)) + if rank == 0 and args.do_validation_while_train: + callback_verification(global_step) + train_model.lr_scheduler.step() + + if global_step >= total_steps: + break + sys.stdout.flush() + if rank is 0 and global_step > 0 and global_step % args.log_interval_step == 0: + train_reader_cost = 0.0 + train_run_cost = 0.0 + total_samples = 0 + reader_start = time.time() + checkpoint.save( + train_program, + lr_scheduler=train_model.lr_scheduler, + epoch=epoch, + for_train=True) + writer.close() diff --git a/insightface/recognition/arcface_paddle/static/utils/__init__.py b/insightface/recognition/arcface_paddle/static/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/insightface/recognition/arcface_paddle/static/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/insightface/recognition/arcface_paddle/static/utils/io.py b/insightface/recognition/arcface_paddle/static/utils/io.py new file mode 100644 index 0000000000000000000000000000000000000000..acc98ce38109a42d48d72038ce94747220caacee --- /dev/null +++ b/insightface/recognition/arcface_paddle/static/utils/io.py @@ -0,0 +1,197 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import errno
import os
import paddle
import logging
import numpy as np
import shutil
import json
# NOTE(review): `utils.rearrange_weight` resolves against a top-level `utils`
# package put on sys.path by the training entry point, not this
# `static/utils` package — confirm against the launcher scripts.
from utils.rearrange_weight import rearrange_weight


class Checkpoint(object):
    """Saves and restores static-graph training state.

    Tensors whose names contain both 'dist@' and '@rank@' are model-parallel
    shards (the partial-FC classifier): every rank saves/loads its own shard,
    while all other tensors are handled by rank 0 only.
    """

    def __init__(self,
                 rank,
                 world_size,
                 embedding_size,
                 num_classes,
                 model_save_dir="./",
                 checkpoint_dir=None,
                 max_num_last_checkpoint=3):
        # rank / world_size: position and size of the current process group.
        self.rank: int = rank
        self.world_size: int = world_size
        self.embedding_size: int = embedding_size
        self.num_classes: int = num_classes
        # Where new checkpoints are written (one sub-directory per epoch).
        self.model_save_dir: str = model_save_dir
        # Where `load` reads an existing checkpoint from.
        self.checkpoint_dir: str = checkpoint_dir
        # How many most-recent epoch directories to keep; older are deleted.
        self.max_num_last_checkpoint: int = max_num_last_checkpoint

    def save(self, program, lr_scheduler=None, epoch=0, for_train=True):
        """Save `program`'s parameters (plus optimizer state when
        `for_train`) under `<model_save_dir>/<epoch>/`.

        Rank 0 additionally writes `meta.json` (world size, embedding size,
        num classes, epoch, lr-scheduler state) needed for resuming and for
        re-splitting the sharded FC weight when the GPU count changes, and
        prunes checkpoints older than `max_num_last_checkpoint` epochs.
        """
        model_save_dir = os.path.join(self.model_save_dir, str(epoch))
        if not os.path.exists(model_save_dir):
            # More than one process may race to create the directory;
            # only re-raise when the failure is not "already exists".
            try:
                os.makedirs(model_save_dir)
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise
                pass

        param_state_dict = program.state_dict(mode='param')
        for name, param in param_state_dict.items():
            # Non-dist params are saved by rank 0 only; dist (sharded)
            # params must be saved by every rank.
            if 'dist@' in name and '@rank@' in name or self.rank == 0:
                paddle.save(param,
                            os.path.join(model_save_dir, name + '.pdparam'))

        if for_train:
            opt_state_dict = program.state_dict(mode='opt')
            for name, opt in opt_state_dict.items():
                # Gradients are transient; never checkpoint them.
                if '@GRAD' in name:
                    continue
                # Same rank rule as for parameters above.
                if 'dist@' in name and '@rank@' in name or self.rank == 0:
                    paddle.save(opt,
                                os.path.join(model_save_dir, name + '.pdopt'))

        if self.rank == 0:
            # Extra info for resume: pretrain_world_size / embedding_size /
            # num_classes allow the sharded FC weight to be re-split when
            # the GPU setting changes; epoch + lr_state allow restarting.
            config_file = os.path.join(model_save_dir, 'meta.json')
            extra_info = dict()
            extra_info["pretrain_world_size"] = self.world_size
            extra_info["embedding_size"] = self.embedding_size
            extra_info['num_classes'] = self.num_classes
            extra_info['epoch'] = epoch
            # NOTE(review): assumes a non-None lr_scheduler whenever
            # rank == 0 — confirm training callers always pass one.
            extra_info['lr_state'] = lr_scheduler.state_dict()
            with open(config_file, 'w') as f:
                json.dump(extra_info, f)

        logging.info("Save model to {}.".format(model_save_dir))
        # Keep only the newest `max_num_last_checkpoint` epoch directories.
        if self.rank == 0 and self.max_num_last_checkpoint > 0:
            for idx in range(-1, epoch - self.max_num_last_checkpoint + 1):
                path = os.path.join(self.model_save_dir, str(idx))
                if os.path.exists(path):
                    logging.info("Remove checkpoint {}.".format(path))
                    shutil.rmtree(path)

    def load(self, program, for_train=True, dtype=None):
        """Load a checkpoint from `self.checkpoint_dir` into `program`.

        Sharded ('dist@...@rank@...') weights/biases — and, when
        `for_train`, their momentum velocities — are re-sharded from the
        world size recorded at save time to the current one via
        `rearrange_weight`; only shards matching this rank are applied.

        dtype: optional 'float32'/'float16' cast applied to every tensor.
        Returns the checkpoint meta dict when `for_train`, else {}.
        """
        assert os.path.exists(self.checkpoint_dir)
        checkpoint_dir = os.path.abspath(self.checkpoint_dir)

        state_dict = {}
        dist_weight_state_dict = {}
        dist_weight_velocity_state_dict = {}
        dist_bias_state_dict = {}
        dist_bias_velocity_state_dict = {}
        for path in os.listdir(checkpoint_dir):
            path = os.path.join(checkpoint_dir, path)
            if not os.path.isfile(path):
                continue

            basename = os.path.basename(path)
            name, ext = os.path.splitext(basename)

            if ext not in ['.pdopt', '.pdparam']:
                continue

            # Optimizer state is only needed when resuming training.
            if not for_train and ext == '.pdopt':
                continue

            tensor = paddle.load(path, return_numpy=True)
            if dtype:
                assert dtype in ['float32', 'float16']
                tensor = tensor.astype(dtype)

            # Bucket sharded tensors by kind ('.w' weight / '.b' bias,
            # with or without momentum 'velocity') for re-sharding below.
            if 'dist@' in name and '@rank@' in name:
                if '.w' in name and 'velocity' not in name:
                    dist_weight_state_dict[name] = tensor
                elif '.w' in name and 'velocity' in name:
                    dist_weight_velocity_state_dict[name] = tensor
                elif '.b' in name and 'velocity' not in name:
                    dist_bias_state_dict[name] = tensor
                elif '.b' in name and 'velocity' in name:
                    dist_bias_velocity_state_dict[name] = tensor

            else:
                state_dict[name] = tensor

        if for_train:
            meta_file = os.path.join(checkpoint_dir, 'meta.json')
            if not os.path.exists(meta_file):
                # NOTE(review): hard exit() rather than raising — callers
                # cannot handle this failure; consider raising instead.
                logging.error(
                    "Please make sure the checkpoint dir {} exists, and "
                    "parameters in that dir are validating.".format(
                        checkpoint_dir))
                exit()

            with open(meta_file, 'r') as handle:
                extra_info = json.load(handle)

        # Preprocess distributed parameters.
        # NOTE(review): `extra_info` is only bound when `for_train` is True,
        # yet it is read unconditionally below — confirm how callers that
        # pass for_train=False (e.g. validation) avoid a NameError here.
        pretrain_world_size = extra_info['pretrain_world_size']
        assert pretrain_world_size > 0
        embedding_size = extra_info['embedding_size']
        assert embedding_size == self.embedding_size
        num_classes = extra_info['num_classes']
        assert num_classes == self.num_classes

        logging.info(
            "Parameters for pre-training: pretrain_world_size ({}), "
            "embedding_size ({}), and num_classes ({}).".format(
                pretrain_world_size, embedding_size, num_classes))
        logging.info("Parameters for inference or fine-tuning: "
                     "world_size ({}).".format(self.world_size))

        # Shard names embed a zero-padded rank id; keep only our shards.
        rank_str = '%05d' % self.rank

        dist_weight_state_dict = rearrange_weight(
            dist_weight_state_dict, pretrain_world_size, self.world_size)
        dist_bias_state_dict = rearrange_weight(
            dist_bias_state_dict, pretrain_world_size, self.world_size)
        for name, value in dist_weight_state_dict.items():
            if rank_str in name:
                state_dict[name] = value
        for name, value in dist_bias_state_dict.items():
            if rank_str in name:
                state_dict[name] = value

        if for_train:
            # Momentum velocities must be re-sharded the same way.
            dist_weight_velocity_state_dict = rearrange_weight(
                dist_weight_velocity_state_dict, pretrain_world_size,
                self.world_size)
            dist_bias_velocity_state_dict = rearrange_weight(
                dist_bias_velocity_state_dict, pretrain_world_size,
                self.world_size)
            for name, value in dist_weight_velocity_state_dict.items():
                if rank_str in name:
                    state_dict[name] = value
            for name, value in dist_bias_velocity_state_dict.items():
                if rank_str in name:
                    state_dict[name] = value

        program.set_state_dict(state_dict)
        logging.info("Load checkpoint from '{}'.".format(checkpoint_dir))
        if for_train:
            return extra_info
        else:
            return {}
def check_contains(name, name_list):
    """Return True if `name` occurs as a substring of any entry of `name_list`."""
    for n in name_list:
        if name in n:
            return True
    return False


def gather_optimization_pass(program, weight_name):
    """Fuse the `gather_grad` + `momentum` update of `weight_name` into one
    `sparse_momentum` op in `program`'s global block, then delete the two
    original ops and the dense gradient variables they used.
    """
    op_idxs = []
    gather_grad_op = None
    momentum_op = None
    # Locate the gather_grad / momentum pair touching this weight.
    for idx, op in enumerate(program.global_block().ops):
        if (op.type == 'gather_grad' or
                op.type == 'momentum') and check_contains(weight_name,
                                                          op.input_arg_names):
            op_idxs.append(idx)
            if op.type == 'momentum':
                momentum_op = op
            if op.type == 'gather_grad':
                gather_grad_op = op

    if gather_grad_op is not None and momentum_op is not None:
        # sparse_momentum consumes the *ungathered* gradient (gather_grad's
        # upstream Out@GRAD) together with the gather indices, so the dense
        # per-row gradient is never materialized.
        inputs = {
            'Param': momentum_op.input('Param'),
            'Velocity': momentum_op.input('Velocity'),
            'LearningRate': momentum_op.input('LearningRate'),
            'Grad': gather_grad_op.input('Out@GRAD'),
            'Index': gather_grad_op.input('Index'),
            'Axis': gather_grad_op.input('Axis'),
        }
        outputs = {
            'ParamOut': momentum_op.output('ParamOut'),
            'VelocityOut': momentum_op.output('VelocityOut'),
        }
        # Carry over the fp32 master weights when AMP multi-precision is on.
        if 'MasterParam' in momentum_op.input_names and len(
                momentum_op.input('MasterParam')) > 0:
            inputs['MasterParam'] = momentum_op.input('MasterParam')
        if 'MasterParamOut' in momentum_op.output_names and len(
                momentum_op.output('MasterParamOut')) > 0:
            outputs['MasterParamOut'] = momentum_op.output('MasterParamOut')

        # Copy every momentum attribute verbatim; only `axis` comes from
        # the gather.
        attrs = {
            'mu': momentum_op.attr('mu'),
            'use_nesterov': momentum_op.attr('use_nesterov'),
            'regularization_method': momentum_op.attr('regularization_method'),
            'regularization_coeff': momentum_op.attr('regularization_coeff'),
            'multi_precision': momentum_op.attr('multi_precision'),
            'rescale_grad': momentum_op.attr('rescale_grad'),
            'op_device': momentum_op.attr('op_device'),
            'op_namescope': momentum_op.attr('op_namescope'),
            'op_role': momentum_op.attr('op_role'),
            'op_role_var': momentum_op.input('Param'),
            'axis': gather_grad_op.attr('axis'),
        }
        # Insert after the last of the fused ops so all inputs exist.
        program.global_block()._insert_op(
            op_idxs[-1] + 1,
            type='sparse_momentum',
            inputs=inputs,
            outputs=outputs,
            attrs=attrs)

    # Remove fused ops back-to-front so earlier indices stay valid.
    for idx in reversed(op_idxs):
        program.global_block()._remove_op(idx, sync=False)

    # Drop the now-unused dense gradient variables of this weight.
    var_names = []
    for idx, name in enumerate(program.global_block().vars):
        if '@GRAD' in name and weight_name in name:
            var_names.append(name)
    for name in var_names:
        program.global_block()._remove_var(name, sync=False)
    program.global_block()._sync_with_cpp()


def amp_pass(program, weight_name):
    """Strip `weight_name`'s dense gradient vars from the AMP bookkeeping
    ops (`update_loss_scaling`, `check_finite_and_unscale`) — presumably
    because `gather_optimization_pass` removed those vars; confirm the two
    passes are always applied together.
    """
    for idx, op in enumerate(program.global_block().ops):
        if (op.type == 'update_loss_scaling' or
                op.type == 'check_finite_and_unscale'):
            # Filter this weight's gradient out of the op's X inputs.
            input_idxs = []
            input_arg_names = op.input("X")
            for i, name in enumerate(input_arg_names):
                if '@GRAD' in name and weight_name in name:
                    input_idxs.append(i)
            if len(input_idxs) > 0:
                for i in reversed(input_idxs):
                    input_arg_names.pop(i)
                op.desc.set_input("X", input_arg_names)

            # Same filtering for the Out outputs.
            output_idxs = []
            output_arg_names = op.output("Out")
            for i, name in enumerate(output_arg_names):
                if '@GRAD' in name and weight_name in name:
                    output_idxs.append(i)
            if len(output_idxs) > 0:
                for i in reversed(output_idxs):
                    output_arg_names.pop(i)
                op.desc.set_output("Out", output_arg_names)

        # check_finite_and_unscale additionally records grads in
        # op_role_var; keep that attribute consistent too.
        if op.type == 'check_finite_and_unscale':
            op_role_idxs = []
            op_role_var = op.attr("op_role_var")
            for i, name in enumerate(op_role_var):
                if '@GRAD' in name and weight_name in name:
                    op_role_idxs.append(i)
            if len(op_role_idxs) > 0:
                for i in reversed(op_role_idxs):
                    op_role_var.pop(i)
                op.desc._set_attr("op_role_var", op_role_var)
import time
import os
import numpy as np
import sklearn
import paddle
import logging

# NOTE(review): this resolves against a *top-level* `utils` package placed on
# sys.path by the entry point, not this `static/utils` package — confirm.
from utils.verification import evaluate
from datasets import load_bin


def test(rank, batch_size, data_set, executor, test_program, data_feeder,
         fetch_list):
    """Run a verification set through the static test program.

    data_set: (data_list, issame_list) as produced by `load_bin`, where
    data_list holds the normal and the horizontally-flipped images.
    Returns (mean accuracy over folds, std, mean embedding L2 norm).
    """
    data_list = data_set[0]
    issame_list = data_set[1]
    embeddings_list = []

    # data_list[0] for normalize
    # data_list[1] for flip_left_right
    for i in range(len(data_list)):
        data = data_list[i]
        embeddings = None
        ba = 0
        while ba < data.shape[0]:
            bb = min(ba + batch_size, data.shape[0])
            count = bb - ba
            _data = []
            # Always feed a full batch: when fewer than batch_size samples
            # remain, the range start is < ba (possibly negative, wrapping
            # via Python's negative indexing) so earlier/tail samples pad
            # the batch; only the last `count` embeddings are kept below.
            for k in range(bb - batch_size, bb):
                _data.append((data[k], ))
            [_embeddings] = executor.run(test_program,
                                         fetch_list=fetch_list,
                                         feed=data_feeder.feed(_data),
                                         use_program_cache=True)
            if embeddings is None:
                embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
            embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :]
            ba = bb
        embeddings_list.append(embeddings)

    # Average L2 norm of the (unnormalized) embeddings, a health metric.
    xnorm = 0.0
    xnorm_cnt = 0
    for embed in embeddings_list:
        xnorm += np.sqrt((embed * embed).sum(axis=1)).sum(axis=0)
        xnorm_cnt += embed.shape[0]
    xnorm /= xnorm_cnt

    # Sum normal + flipped embeddings, renormalize, then 10-fold evaluate.
    embeddings = embeddings_list[0] + embeddings_list[1]
    embeddings = sklearn.preprocessing.normalize(embeddings)
    _, _, accuracy, val, val_std, far = evaluate(
        embeddings, issame_list, nrof_folds=10)
    acc, std = np.mean(accuracy), np.std(accuracy)
    return acc, std, xnorm


class CallBackVerification(object):
    """Periodically evaluates the backbone on verification .bin sets.

    Calling the instance with the current step runs verification on rank 0
    every `frequent` steps and logs XNorm, flip accuracy and the best
    accuracy seen so far per target.

    NOTE(review): the `List[...]` annotations below are never imported from
    `typing`; they are harmless at runtime only because annotations inside
    function bodies are not evaluated (PEP 526) — consider adding the import.
    """

    def __init__(self,
                 frequent,
                 rank,
                 batch_size,
                 test_program,
                 feed_list,
                 fetch_list,
                 val_targets,
                 rec_prefix,
                 image_size=(112, 112)):
        # Run verification every `frequent` calls (see __call__).
        self.frequent: int = frequent
        self.rank: int = rank
        self.batch_size: int = batch_size

        self.test_program: paddle.static.Program = test_program
        self.feed_list: List[paddle.fluid.framework.Variable] = feed_list
        self.fetch_list: List[paddle.fluid.framework.Variable] = fetch_list

        # Best accuracy observed so far, one slot per validation target.
        self.highest_acc_list: List[float] = [0.0] * len(val_targets)
        self.ver_list: List[object] = []
        self.ver_name_list: List[str] = []
        self.init_dataset(
            val_targets=val_targets,
            data_dir=rec_prefix,
            image_size=image_size)

        # Each process evaluates on its own selected GPU.
        gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
        place = paddle.CUDAPlace(gpu_id)
        self.executor = paddle.static.Executor(place)
        self.data_feeder = paddle.fluid.DataFeeder(
            place=place, feed_list=self.feed_list, program=self.test_program)

    def ver_test(self, global_step: int):
        """Evaluate every loaded verification set and log the results."""
        for i in range(len(self.ver_list)):
            test_start = time.time()
            acc2, std2, xnorm = test(
                self.rank, self.batch_size, self.ver_list[i], self.executor,
                self.test_program, self.data_feeder, self.fetch_list)
            logging.info('[%s][%d]XNorm: %f' %
                         (self.ver_name_list[i], global_step, xnorm))
            logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                         (self.ver_name_list[i], global_step, acc2, std2))
            if acc2 > self.highest_acc_list[i]:
                self.highest_acc_list[i] = acc2
            logging.info('[%s][%d]Accuracy-Highest: %1.5f' % (
                self.ver_name_list[i], global_step, self.highest_acc_list[i]))
            test_end = time.time()
            logging.info("test time: {:.4f}".format(test_end - test_start))

    def init_dataset(self, val_targets, data_dir, image_size):
        """Load each `<data_dir>/<target>.bin` that exists; missing targets
        are silently skipped."""
        for name in val_targets:
            path = os.path.join(data_dir, name + ".bin")
            if os.path.exists(path):
                data_set = load_bin(path, image_size)
                self.ver_list.append(data_set)
                self.ver_name_list.append(name)

    def __call__(self, num_update):
        # Only rank 0 evaluates, and only on non-zero multiples of
        # `frequent`.
        if self.rank == 0 and num_update > 0 and num_update % self.frequent == 0:
            self.ver_test(num_update)
def validation(args):
    """Evaluate a trained backbone on the configured verification sets.

    Builds a static test graph for ``args.backbone``, restores float32
    weights from ``args.checkpoint_dir`` and runs verification once on
    ``args.val_targets``.
    """
    # Static graphs for the evaluation pass.
    eval_prog = paddle.static.Program()
    init_prog = paddle.static.Program()

    backbone_model = StaticModel(
        main_program=eval_prog,
        startup_program=init_prog,
        backbone_class_name=args.backbone,
        embedding_size=args.embedding_size,
        mode='test', )

    # Single-process restore: rank 0 of a world of one.
    ckpt = Checkpoint(
        rank=0,
        world_size=1,
        embedding_size=args.embedding_size,
        num_classes=None,
        checkpoint_dir=args.checkpoint_dir, )

    device_id = int(os.getenv("FLAGS_selected_gpus", 0))
    executor = paddle.static.Executor(paddle.CUDAPlace(device_id))
    executor.run(init_prog)

    ckpt.load(program=eval_prog, for_train=False, dtype='float32')

    # frequent=1 / step=1 so the verification callback fires immediately.
    verifier = CallBackVerification(
        1, 0, args.batch_size, eval_prog,
        list(backbone_model.backbone.input_dict.values()),
        list(backbone_model.backbone.output_dict.values()), args.val_targets,
        args.data_dir)
    verifier(1)
# status_check EXIT_CODE COMMAND LOG_FILE
# Print a colored success/failure line for COMMAND based on EXIT_CODE and
# append it to LOG_FILE. Note: this only reports — it does not re-exit with
# the failing status, so callers must check $1 themselves if they need to.
function status_check(){
    last_status=$1   # the exit code
    run_command=$2
    run_log=$3
    if [ $last_status -eq 0 ]; then
        echo -e "\033[33m Run successfully with command - ${run_command}! \033[0m" | tee -a ${run_log}
    else
        echo -e "\033[33m Run failed with command - ${run_command}! \033[0m" | tee -a ${run_log}
    fi
}
b/insightface/recognition/arcface_paddle/test_tipc/configs/ms1mv2_mobileface/train_infer_python.txt new file mode 100644 index 0000000000000000000000000000000000000000..1034c6959903da74d020e051fd01c2893b1918ba --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/configs/ms1mv2_mobileface/train_infer_python.txt @@ -0,0 +1,51 @@ +===========================train_params=========================== +model_name:MobileFaceNet_128 +python:python3.7 +gpu_list:0 +null:null +null:null +--train_num:lite_train_lite_infer=1 +--output:./output/ +--batch_size:2 +--checkpoint_dir:null +train_model_name:MobileFaceNet_128/0 +train_infer_img_dir:./MS1M_v2/images +null:null +## +trainer:norm_train +norm_train:tools/train.py --config_file=configs/ms1mv2_mobileface.py --is_static=False --fp16=False --embedding_size=128 --fp16=False --dataset=MS1M_v2 --data_dir=MS1M_v2/ --label_file=MS1M_v2/label.txt --num_classes=85742 --log_interval_step=1 +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/validation.py --is_static=False --backbone=MobileFaceNet_128 --embedding_size=128 --data_dir=MS1M_v2 --val_targets=lfw --batch_size=2 +null:null +## +===========================infer_params=========================== +--output_dir:null +--checkpoint_dir:null +norm_export:tools/export.py --is_static=False --export_type=paddle --backbone=MobileFaceNet_128 --embedding_size=128 +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +## +train_model:./inference +infer_export:null +infer_quant:False +inference:tools/inference.py --export_type=paddle --benchmark=True +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--max_batch_size:1 +--use_tensorrt:False|True +--precision:fp32 +--model_dir:null +--image_path:MS1M_v2/images +null:null +--benchmark:null +null:null \ No newline at end of file diff --git 
a/insightface/recognition/arcface_paddle/test_tipc/data/small_dataset.tar b/insightface/recognition/arcface_paddle/test_tipc/data/small_dataset.tar new file mode 100644 index 0000000000000000000000000000000000000000..66f753c87b28df336d23612a117f11be530d43d7 Binary files /dev/null and b/insightface/recognition/arcface_paddle/test_tipc/data/small_dataset.tar differ diff --git a/insightface/recognition/arcface_paddle/test_tipc/data/small_lfw.bin b/insightface/recognition/arcface_paddle/test_tipc/data/small_lfw.bin new file mode 100644 index 0000000000000000000000000000000000000000..51cb7f54f72b46d0a6f059f733a02166009a85d3 --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/data/small_lfw.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ca73282b165bae35fe85d1c1009bfff3a931969b591e36390de4ef7cbf4c6be +size 108463 diff --git a/insightface/recognition/arcface_paddle/test_tipc/docs/guide.png b/insightface/recognition/arcface_paddle/test_tipc/docs/guide.png new file mode 100644 index 0000000000000000000000000000000000000000..a28a2032cec8d879fb00d652fb6303439c0aa917 --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/docs/guide.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dbd7d16bc6f235ea07e0e8c3b57c0f0de319c495e8492c27e1aa0e7aec83aba +size 141626 diff --git a/insightface/recognition/arcface_paddle/test_tipc/docs/install.md b/insightface/recognition/arcface_paddle/test_tipc/docs/install.md new file mode 100644 index 0000000000000000000000000000000000000000..493ab22c73cac470b01ca017d04a470d2bf4e7ee --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/docs/install.md @@ -0,0 +1,121 @@ +## 1. 环境准备 + +本教程适用于TIPC目录下基础功能测试的运行环境搭建。 + +推荐环境: +- CUDA 10.1/10.2 +- CUDNN 7.6/cudnn8.1 +- TensorRT 6.1.0.5 / 7.1 / 7.2 + +环境配置可以选择docker镜像安装,或者在本地环境Python搭建环境。推荐使用docker镜像安装,避免不必要的环境配置。 + +## 2. 
Docker 镜像安装 + +推荐docker镜像安装,按照如下命令创建镜像,当前目录映射到镜像中的`/paddle`目录下 +``` +nvidia-docker run --name paddle -it -v $PWD:/paddle paddlepaddle/paddle:latest-dev-cuda10.1-cudnn7-gcc82 /bin/bash +cd /paddle + +# 安装带TRT的paddle +pip3.7 install https://paddle-wheel.bj.bcebos.com/with-trt/2.1.3/linux-gpu-cuda10.1-cudnn7-mkl-gcc8.2-trt6-avx/paddlepaddle_gpu-2.1.3.post101-cp37-cp37m-linux_x86_64.whl +``` + +## 3 Python 环境构建 + +非docker环境下,环境配置比较灵活,推荐环境组合配置: +- CUDA10.1 + CUDNN7.6 + TensorRT 6 +- CUDA10.2 + CUDNN8.1 + TensorRT 7 +- CUDA11.1 + CUDNN8.1 + TensorRT 7 + +下面以 CUDA10.2 + CUDNN8.1 + TensorRT 7 配置为例,介绍环境配置的流程。 + +### 3.1 安装CUDNN + +如果当前环境满足CUDNN版本的要求,可以跳过此步骤。 + +以CUDNN8.1 安装安装为例,安装步骤如下,首先下载CUDNN,从[Nvidia官网](https://developer.nvidia.com/rdp/cudnn-archive)下载CUDNN8.1版本,下载符合当前系统版本的三个deb文件,分别是: +- cuDNN Runtime Library ,如:libcudnn8_8.1.0.77-1+cuda10.2_amd64.deb +- cuDNN Developer Library ,如:libcudnn8-dev_8.1.0.77-1+cuda10.2_amd64.deb +- cuDNN Code Samples,如:libcudnn8-samples_8.1.0.77-1+cuda10.2_amd64.deb + +deb安装可以参考[官方文档](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-deb),安装方式如下 +``` +# x.x.x表示下载的版本号 +# $HOME为工作目录 +sudo dpkg -i libcudnn8_x.x.x-1+cudax.x_arm64.deb +sudo dpkg -i libcudnn8-dev_8.x.x.x-1+cudax.x_arm64.deb +sudo dpkg -i libcudnn8-samples_8.x.x.x-1+cudax.x_arm64.deb + +# 验证是否正确安装 +cp -r /usr/src/cudnn_samples_v8/ $HOME +cd $HOME/cudnn_samples_v8/mnistCUDNN + +# 编译 +make clean && make +./mnistCUDNN +``` +如果运行mnistCUDNN完后提示运行成功,则表示安装成功。如果运行后出现freeimage相关的报错,需要按照提示安装freeimage库: +``` +sudo apt-get install libfreeimage-dev +sudo apt-get install libfreeimage +``` + +### 3.2 安装TensorRT + +首先,从[Nvidia官网TensorRT板块](https://developer.nvidia.com/tensorrt-getting-started)下载TensorRT,这里选择7.1.3.4版本的TensorRT,注意选择适合自己系统版本和CUDA版本的TensorRT,另外建议下载TAR package的安装包。 + +以Ubuntu16.04+CUDA10.2为例,下载并解压后可以参考[官方文档](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-713/install-guide/index.html#installing-tar)的安装步骤,按照如下步骤安装: +``` +# 以下安装命令中 
'${version}' 为下载的TensorRT版本,如7.1.3.4 +# 设置环境变量, 为解压后的TensorRT的lib目录 +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: + +# 安装TensorRT +cd TensorRT-${version}/python +pip3.7 install tensorrt-*-cp3x-none-linux_x86_64.whl + +# 安装graphsurgeon +cd TensorRT-${version}/graphsurgeon +``` + + +### 3.3 安装PaddlePaddle + +下载支持TensorRT版本的Paddle安装包,注意安装包的TensorRT版本需要与本地TensorRT一致,下载[链接](https://paddleinference.paddlepaddle.org.cn/user_guides/download_lib.html#python) +选择下载 linux-cuda10.2-trt7-gcc8.2 Python3.7版本的Paddle: +``` +# 从下载链接中可以看到是paddle2.1.1-cuda10.2-cudnn8.1版本 +wget https://paddle-wheel.bj.bcebos.com/with-trt/2.1.1-gpu-cuda10.2-cudnn8.1-mkl-gcc8.2/paddlepaddle_gpu-2.1.1-cp37-cp37m-linux_x86_64.whl +pip3.7 install -U paddlepaddle_gpu-2.1.1-cp37-cp37m-linux_x86_64.whl +``` + +## 4. 安装依赖 +``` +# 安装AutoLog +git clone https://github.com/LDOUBLEV/AutoLog +cd AutoLog +pip3.7 install -r requirements.txt +python3.7 setup.py bdist_wheel +pip3.7 install ./dist/auto_log-1.0.0-py3-none-any.whl + +# 下载insightface代码 +cd ../ +git clone https://github.com/deepinsight/insightface + +``` + +安装Arcface依赖: +``` +cd insightface/recognition/arcface_paddle +pip3.7 install -r requirements.txt +``` + +## FAQ : +Q. You are using Paddle compiled with TensorRT, but TensorRT dynamic library is not found. Ignore this if TensorRT is not needed. + +A. 
问题一般是当前安装paddle版本带TRT,但是本地环境找不到TensorRT的预测库,需要下载TensorRT库,解压后设置环境变量LD_LIBRARY_PATH; +如: +``` +export LD_LIBRARY_PATH=/usr/local/python3.7.0/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/paddle/package/TensorRT-6.0.1.5/lib +``` +或者问题是下载的TensorRT版本和当前paddle中编译的TRT版本不匹配,需要下载版本相符的TensorRT重新安装。 diff --git a/insightface/recognition/arcface_paddle/test_tipc/docs/test.png b/insightface/recognition/arcface_paddle/test_tipc/docs/test.png new file mode 100644 index 0000000000000000000000000000000000000000..64d2d2ce9d74267ab693364d956145d62eb7dda3 --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/docs/test.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e40f26f803ec19fea669f62a41279db07e146ff8db0a2496599c6108df7c4 +size 229200 diff --git a/insightface/recognition/arcface_paddle/test_tipc/docs/test_serving.md b/insightface/recognition/arcface_paddle/test_tipc/docs/test_serving.md new file mode 100644 index 0000000000000000000000000000000000000000..2b262d05deef2a8cfcc481063f0165edfa3d15c7 --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/docs/test_serving.md @@ -0,0 +1,53 @@ +# PaddleServing预测功能测试 + +PaddleServing预测功能测试的主程序为`test_serving.sh`,可以测试基于PaddleServing的部署功能。 + +## 1. 测试结论汇总 + +本repo未提供量化训练,因此这里只测试正常模型,对应的PaddleServing预测功能汇总如下: + +| 模型类型 |device | batchsize | tensorrt | mkldnn | cpu多线程 | +| :----: | :----: | :----: | :----: | :----: | :----: | +| 正常模型 | GPU | 1/6 | fp32 | - | - | +| 正常模型 | CPU | 1/6 | - | fp32 | 支持 | + +## 2. 
测试流程 +### 2.1 功能测试 +先运行`prepare.sh`准备数据和模型,然后运行`test_serving.sh`进行测试,最终在```test_tipc/output```目录下生成`serving_infer_*.log`后缀的日志文件。 + +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/ms1mv2_mobileface/model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt "serving_infer" + +# 用法: +bash test_tipc/test_serving.sh ./test_tipc/configs//ms1mv2_mobileface/model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt +``` + +#### 运行结果 + +各测试的运行情况会打印在 `test_tipc/output/results_serving.log` 中: +运行成功时会输出: + +``` +Run successfully with command - python3.7 pipeline_http_client.py --image_dir=./imgs > ../../test_tipc/output/server_infer_cpu_usemkldnn_True_threads_1_batchsize_1.log 2>&1 ! +Run successfully with command - xxxxx +... +``` + +运行失败时会输出: + +``` +Run failed with command - python3.7 pipeline_http_client.py --image_dir=./imgs > ../../test_tipc/output/server_infer_cpu_usemkldnn_True_threads_1_batchsize_1.log 2>&1 ! +Run failed with command - xxxxx +... +``` + +详细的预测结果会存在 test_tipc/output/ 文件夹下,例如`server_infer_gpu_usetrt_True_precision_fp32_batchsize_1.log`中会返回图像的特征值: + +``` +{'err_no': 0, 'err_msg': '', 'key': ['out'], 'value': ['array([[ 1.36603206e-01, -2.12395296e-01, -3.94680113e-01,\n -3.14380080e-01, -9.66617092e-03, 1.87318385e-01,\n -2.97903419e-01, -3.17218006e-01, -2.69029588e-01,\n 1.21175185e-01, -1.90171480e-01, 5.15628010e-02,\n -1.09966584e-01, -2.23269954e-01, 5.43062799e-02,\n -1.33496851e-01, -1.19007424e-01, 2.22256035e-01,\n -2.77910858e-01, 2.71745831e-01, -4.16789412e-01,\n -2.12772295e-01, 7.03845620e-01, -6.93172514e-02,\n 1.72736168e-01, -2.60139287e-01, 3.03129200e-03,\n 2.11665437e-01, -1.58136543e-02, -2.38662288e-02,\n -5.83377741e-02, 5.22087336e-01, 2.94472545e-01,\n 1.68193743e-01, -7.54145905e-02, 1.43897519e-01,\n 1.56238422e-01, -3.39259744e-01, 2.46101081e-01,\n 3.11530419e-02, -5.94105422e-01, -2.72643536e-01,\n 1.21330276e-01, 3.12743425e-01, -1.66200623e-01,\n -6.53145928e-03, -2.84941733e-01, 5.59734181e-05,\n 
-3.21606755e-01, -1.73298046e-01, -1.07766673e-01,\n 9.54522491e-02, 2.46445552e-01, -2.62605727e-01,\n 1.81617990e-01, 6.52089193e-02, -1.01563215e-01,\n 3.59104156e-01, -5.22237360e-01, 1.64726060e-02,\n -3.69388551e-01, -4.39793877e-02, -1.99547961e-01,\n -3.79198231e-03, 3.00050706e-01, -1.49292305e-01,\n -1.96511611e-01, -4.50382173e-01, 4.40837264e-01,\n -2.56556179e-02, -1.36169955e-01, -3.62343282e-01,\n 1.56754032e-02, 7.93581456e-02, 1.90513626e-01,\n -3.41799140e-01, -5.37522621e-02, 2.99514532e-01,\n -1.21103093e-01, 4.06056821e-01, -2.30544969e-01,\n -1.08799607e-01, -1.23380020e-01, -1.04779311e-01,\n -3.59124064e-01, 6.79017082e-02, 3.27649474e-01,\n -1.09562665e-01, 1.78656310e-01, -2.54520983e-01,\n -2.15707019e-01, 2.97523111e-01, -3.25083762e-01,\n 1.63179748e-02, 3.89623255e-01, -7.29642585e-02,\n 5.47815263e-01, -9.16893259e-02, -4.76058573e-01,\n -1.75076187e-01, -1.13026705e-04, 2.48254672e-01,\n 3.72678041e-01, -4.53566402e-01, 6.30904138e-02,\n 5.19643247e-01, -1.70341924e-01, -5.24724603e-01,\n -9.19980407e-02, 5.36089689e-02, 9.92866978e-02,\n 2.93649197e-01, 1.39556557e-01, 4.84964341e-01,\n 3.11437190e-01, 3.61269027e-01, -3.84658389e-02,\n -1.26146287e-01, -2.82240808e-01, -6.71329573e-02,\n -5.25959721e-03, 2.54376501e-01, -2.77014941e-01,\n -4.57646847e-02, -1.97771654e-01, -2.70207506e-02,\n 9.59944800e-02, -1.04830116e-02]], dtype=float32)']} +``` + + +## 3. 
更多教程 + +本文档为功能测试用,更详细的Serving预测使用教程请参考:[ArcFace 服务化部署](../../deploy/pdserving/README_CN.md) diff --git a/insightface/recognition/arcface_paddle/test_tipc/docs/test_train_inference_python.md b/insightface/recognition/arcface_paddle/test_tipc/docs/test_train_inference_python.md new file mode 100644 index 0000000000000000000000000000000000000000..9bfc4338ffb8284ead44214837e1e7d662407a99 --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/docs/test_train_inference_python.md @@ -0,0 +1,78 @@ +# Linux端基础训练预测功能测试 + +Linux端基础训练预测功能测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的模型训练、评估、推理等基本功能。 + +## 1. 测试结论汇总 + +- 训练相关: + +| 算法名称 | 模型名称 | 单机单卡 | 单机多卡 | 多机多卡 | 模型压缩(单机多卡) | +| :----: | :----: | :----: | :----: | :----: | :----: | +| ArcFace | mobileface | 正常训练 | 正常训练 | 正常训练 | - | + + +- 预测相关:预测功能汇总如下, + +| 模型类型 | device | batchsize | tensorrt | mkldnn | cpu多线程 | +| :----: | :----: | :----: | :----: | :----: | :----: | +| 正常模型 | GPU | 1 | fp32 | - | - | +| 正常模型 | CPU | 1 | - | fp32 | 支持 | + + +## 2. 
测试流程 + +运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。 + +### 2.1 安装依赖 +- 安装PaddlePaddle >= 2.2 +- 安装依赖 + ``` + pip3 install -r requirements.txt + ``` +- 安装autolog(规范化日志输出工具) + ``` + pip3 install git+https://github.com/LDOUBLEV/AutoLog --force-reinstall + ``` + +### 2.2 功能测试 +先运行`prepare.sh`准备数据和模型,然后运行`test_train_inference_python.sh`进行测试,最终在```test_tipc/output```目录下生成`python_infer_*.log`格式的日志文件。 + + +`test_train_inference_python.sh`包含5种运行模式,每种模式的运行数据不同,分别用于测试速度和精度,本文档只测试lite_train_lite_infer一种模式: + +- 模式1:lite_train_lite_infer,使用少量数据训练,用于快速验证训练到预测的走通流程,不验证精度和速度; +```shell +bash test_tipc/prepare.sh ./test_tipc/configs/ms1mv2_mobileface/train_infer_python.txt 'lite_train_lite_infer' +bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/ms1mv2_mobileface/train_infer_python.txt 'lite_train_lite_infer' +``` + +运行相应指令后,在`test_tipc/output`文件夹下自动会保存运行日志。如'lite_train_lite_infer'模式下,会运行训练+inference的链条,因此,在`test_tipc/output`文件夹有以下文件: +``` +test_tipc/output/ +|- results_python.log # 运行指令状态的日志 +|- norm_train_gpus_0_autocast_null_fp16_False/ # GPU 0号卡上正常训练的训练日志和模型保存文件夹 +|- norm_train_gpus_0_autocast_null_fp16_Trule/ # GPU 0号卡上fp16训练的训练日志和模型保存文件夹 +...... +|- python_infer_cpu_usemkldnn_True_threads_1_precision_fp32_batchsize_1.log # CPU上开启Mkldnn线程数设置为1,测试batch_size=1条件下的预测运行日志 +|- python_infer_gpu_usetrt_True_precision_fp32_batchsize_1.log # GPU上开启TensorRT,测试batch_size=1的预测日志 +...... +``` + +其中`results_python.log`中包含了每条指令的运行状态,如果运行成功会输出: +``` +Run successfully with command - python3.7 tools/train.py --config_file=configs/ms1mv2_mobileface.py --is_static=False --embedding_size=128 --fp16=False --dataset=MS1M_v2 --data_dir=MS1M_v2/ --label_file=MS1M_v2/label.txt --num_classes=85742 --log_interval_step=1 --output=./test_tipc/output/norm_train_gpus_0_autocast_null_fp16_Trule --train_num=1 --fp16=Trule! 
+Run successfully with command - python3.7 tools/validation.py --is_static=False --backbone=MobileFaceNet_128 --embedding_size=128 --data_dir=MS1M_v2 --val_targets=lfw --batch_size=128 --checkpoint_dir=./test_tipc/output/norm_train_gpus_0_autocast_null_fp16_Trule/MobileFaceNet_128/0 ! +...... +``` +如果运行失败,会输出: +``` +Run failed with command - python3.7 tools/train.py --config_file=configs/ms1mv2_mobileface.py --is_static=False --embedding_size=128 --fp16=False --dataset=MS1M_v2 --data_dir=MS1M_v2/ --label_file=MS1M_v2/label.txt --num_classes=85742 --log_interval_step=1 --output=./test_tipc/output/norm_train_gpus_0_autocast_null_fp16_Trule --train_num=1 --fp16=Trule! +Run failed with command - python3.7 tools/validation.py --is_static=False --backbone=MobileFaceNet_128 --embedding_size=128 --data_dir=MS1M_v2 --val_targets=lfw --batch_size=128 --checkpoint_dir=./test_tipc/output/norm_train_gpus_0_autocast_null_fp16_Trule/MobileFaceNet_128/0 ! +...... +``` +可以很方便的根据`results_python.log`中的内容判定哪一个指令运行错误。 + + +## 3. 
更多教程 +本文档为功能测试用,更丰富的训练预测使用教程请参考: +[模型训练与预测](../../README_cn.md) \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/test_tipc/prepare.sh b/insightface/recognition/arcface_paddle/test_tipc/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..034f8a7a82ca4057bf59c33676c0617774f7eb16 --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/prepare.sh @@ -0,0 +1,42 @@ +#!/bin/bash +source test_tipc/common_func.sh +FILENAME=$1 + +# MODE be one of ['lite_train_infer' 'serving_infer'] + +MODE=$2 + +dataline=$(cat ${FILENAME}) +lines=(${dataline}) + +IFS=$'\n' +# The training params +model_name=$(func_parser_value "${lines[1]}") +trainer_list=$(func_parser_value "${lines[14]}") + +MODE=$2 + +if [ ${MODE} = "lite_train_lite_infer" ];then + rm -rf MS1M_v2; mkdir MS1M_v2 + # pretrain lite train data + tar xf test_tipc/data/small_dataset.tar --strip-components 1 -C MS1M_v2 + + # wget -nc -P ./MS1M_v2/ https://paddle-model-ecology.bj.bcebos.com/whole_chain/insight-face/lfw.bin + cp test_tipc/data/small_lfw.bin MS1M_v2/lfw.bin + +elif [ ${MODE} = "serving_infer" ];then + # prepare serving env + python_name=$(func_parser_value "${lines[2]}") + rm paddle_serving_server_gpu-0.0.0.post101-py3-none-any.whl + wget https://paddle-serving.bj.bcebos.com/chain/paddle_serving_server_gpu-0.0.0.post101-py3-none-any.whl + ${python_name} -m pip install install paddle_serving_server_gpu-0.0.0.post101-py3-none-any.whl + ${python_name} -m pip install paddle_serving_client==0.6.3 + ${python_name} -m pip install paddle-serving-app==0.6.3 + ${python_name} -m pip install werkzeug==2.0.2 + + rm -rf ./inference + + wget -nc -P ./inference https://paddle-model-ecology.bj.bcebos.com/model/insight-face/mobileface_v1.0_infer.tar + tar xf inference/mobileface_v1.0_infer.tar --strip-components 1 -C inference +fi + diff --git a/insightface/recognition/arcface_paddle/test_tipc/readme.md 
b/insightface/recognition/arcface_paddle/test_tipc/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..fba2fb87bd73cfca60ae071b5803b1947b05cb59 --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/readme.md @@ -0,0 +1,68 @@ +# 飞桨训推一体认证 + +## 1. 简介 + +飞桨除了基本的模型训练和预测,还提供了支持多端多平台的高性能推理部署工具。本文档提供了 ArcFace 中所有 PaddlePaddle 模型的飞桨训推一体认证 (Training and Inference Pipeline Certification(TIPC)) 信息和测试工具,方便用户查阅每种模型的训练推理部署打通情况,并可以进行一键测试。 + +
+ +
+ +## 2. 汇总信息 + +打通情况汇总如下,已填写的部分表示可以使用本工具进行一键测试,未填写的表示正在支持中。 + +**字段说明:** +- 基础训练预测:包括模型训练、Paddle Inference Python预测。 +- 更多训练方式:包括多机多卡、混合精度。 +- 模型压缩:包括裁剪、离线/在线量化、蒸馏。 +- 其他预测部署:包括Paddle Inference C++预测、Paddle Serving部署、Paddle-Lite部署等。 + +更详细的mkldnn、Tensorrt等预测加速相关功能的支持情况可以查看各测试工具的[更多教程](#more)。 + +| 算法论文 | 模型名称 | 模型类型 | 基础
训练预测 | 更多
训练方式 | 模型压缩 | 其他预测部署 | +| :---:| :---: | :----: | :--------: | :----: | :----: | :----: | +| ArcFace | ms1mv2_mobileface | 识别 | 支持 | 多机多卡 | - | Paddle Serving: Python | + + + +## 3. 一键测试工具使用 +### 目录介绍 + +```shell +test_tipc/ +├── configs/ # 配置文件目录 + ├── ms1mv2_mobileface # ms1mv2_mobileface 模型的测试配置文件目录 + ├── model_linux_gpu_normal_normal_serving_python_linux_gpu_cpu.txt # 测试Linux上python serving预测的配置文件 + └── train_infer_python.txt # 测试Linux上python训练预测(基础训练预测)的配置文件 + ├── ... +├── data/ # 存放 TIPC 测试数据的目录 + ├── small_dataset.tar # 用于训练的小数据集 (10张图片) + ├── small_lfw.bin # 用于评估的小数据集 (20张图片) +├── docs/ # 存放 TIPC 测试数据的目录 + ├── install.md # 安装 TIPC 所需环境的文档 + ├── test_train_inference_python.md # 测试Linux上python训练预测的文档 + ├── test_serving.md # 测试Linux上python serving预测的文档 +├── prepare.sh # 完成test_*.sh运行所需要的数据和模型下载 +├── test_serving.sh # 测试python训练预测的主程序 +├── test_train_inference_python.sh # 测试python训练预测的主程序 +├── common_func.sh # 通用shell脚本函数 +└── readme.md # TIPC使用文档 +``` + +### 测试流程 +使用本工具,可以测试不同功能的支持情况,以及预测结果是否对齐,测试流程如下: +
+ +
+ +1. 运行prepare.sh准备测试所需数据和模型; +2. 运行要测试的功能对应的测试脚本`test_*.sh`,产出log,由log可以看到不同配置是否运行成功; + +其中,有1个测试主程序,功能如下: +- `test_train_inference_python.sh`:测试基于Python的模型训练、评估、推理等基本功能,包括裁剪、量化、蒸馏。 + + +#### 更多教程 +各功能测试中涉及混合精度、裁剪、量化等训练相关,及mkldnn、Tensorrt等多种预测相关参数配置,请点击下方相应链接了解更多细节和使用教程: +[test_train_inference_python 使用](docs/test_train_inference_python.md) \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/test_tipc/test_serving.sh b/insightface/recognition/arcface_paddle/test_tipc/test_serving.sh new file mode 100644 index 0000000000000000000000000000000000000000..e86d3174b22b24f28c48ba2e453f6f0db62f6987 --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/test_serving.sh @@ -0,0 +1,168 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +dataline=$(awk 'NR==1, NR==18{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# parser serving +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +trans_model_py=$(func_parser_value "${lines[3]}") +infer_model_dir_key=$(func_parser_key "${lines[4]}") +infer_model_dir_value=$(func_parser_value "${lines[4]}") +model_filename_key=$(func_parser_key "${lines[5]}") +model_filename_value=$(func_parser_value "${lines[5]}") +params_filename_key=$(func_parser_key "${lines[6]}") +params_filename_value=$(func_parser_value "${lines[6]}") +serving_server_key=$(func_parser_key "${lines[7]}") +serving_server_value=$(func_parser_value "${lines[7]}") +serving_client_key=$(func_parser_key "${lines[8]}") +serving_client_value=$(func_parser_value "${lines[8]}") +serving_dir_value=$(func_parser_value "${lines[9]}") +web_service_py=$(func_parser_value "${lines[10]}") +web_use_gpu_key=$(func_parser_key "${lines[11]}") +web_use_gpu_list=$(func_parser_value "${lines[11]}") +web_use_mkldnn_key=$(func_parser_key "${lines[12]}") +web_use_mkldnn_list=$(func_parser_value "${lines[12]}") +web_cpu_threads_key=$(func_parser_key "${lines[13]}") 
+web_cpu_threads_list=$(func_parser_value "${lines[13]}") +web_use_trt_key=$(func_parser_key "${lines[14]}") +web_use_trt_list=$(func_parser_value "${lines[14]}") +web_precision_key=$(func_parser_key "${lines[15]}") +web_precision_list=$(func_parser_value "${lines[15]}") +pipeline_py=$(func_parser_value "${lines[16]}") +image_dir_key=$(func_parser_key "${lines[17]}") +image_dir_value=$(func_parser_value "${lines[17]}") + +LOG_PATH="../../test_tipc/output" +mkdir -p ./test_tipc/output +status_log="${LOG_PATH}/results_serving.log" + +function func_serving(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + # pdserving + set_dirname=$(func_set_params "${infer_model_dir_key}" "${infer_model_dir_value}") + set_model_filename=$(func_set_params "${model_filename_key}" "${model_filename_value}") + set_params_filename=$(func_set_params "${params_filename_key}" "${params_filename_value}") + set_serving_server=$(func_set_params "${serving_server_key}" "${serving_server_value}") + set_serving_client=$(func_set_params "${serving_client_key}" "${serving_client_value}") + set_image_dir=$(func_set_params "${image_dir_key}" "${image_dir_value}") + trans_model_cmd="${python} ${trans_model_py} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_serving_server} ${set_serving_client}" + eval $trans_model_cmd + cd ${serving_dir_value} + echo $PWD + unset https_proxy + unset http_proxy + for python in ${python[*]}; do + if [ ${python} = "cpp"]; then + for use_gpu in ${web_use_gpu_list[*]}; do + if [ ${use_gpu} = "null" ]; then + web_service_cpp_cmd="${python} -m paddle_serving_server.serve --model ppocr_det_mobile_2.0_serving/ ppocr_rec_mobile_2.0_serving/ --port 9293" + eval $web_service_cmd + sleep 2s + _save_log_path="${LOG_PATH}/server_infer_cpp_cpu_pipeline_usemkldnn_False_threads_4_batchsize_1.log" + pipeline_cmd="${python} ocr_cpp_client.py ppocr_det_mobile_2.0_client/ ppocr_rec_mobile_2.0_client/" + eval $pipeline_cmd + status_check $last_status 
"${pipeline_cmd}" "${status_log}" + sleep 2s + ps ux | grep -E 'web_service|pipeline' | awk '{print $2}' | xargs kill -s 9 + else + web_service_cpp_cmd="${python} -m paddle_serving_server.serve --model ppocr_det_mobile_2.0_serving/ ppocr_rec_mobile_2.0_serving/ --port 9293 --gpu_id=0" + eval $web_service_cmd + sleep 2s + _save_log_path="${LOG_PATH}/server_infer_cpp_cpu_pipeline_usemkldnn_False_threads_4_batchsize_1.log" + pipeline_cmd="${python} ocr_cpp_client.py ppocr_det_mobile_2.0_client/ ppocr_rec_mobile_2.0_client/" + eval $pipeline_cmd + status_check $last_status "${pipeline_cmd}" "${status_log}" + sleep 2s + ps ux | grep -E 'web_service|pipeline' | awk '{print $2}' | xargs kill -s 9 + fi + done + else + # python serving + for use_gpu in ${web_use_gpu_list[*]}; do + echo ${ues_gpu} + if [ ${use_gpu} = "null" ]; then + for use_mkldnn in ${web_use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ]; then + continue + fi + for threads in ${web_cpu_threads_list[*]}; do + set_cpu_threads=$(func_set_params "${web_cpu_threads_key}" "${threads}") + web_service_cmd="${python} ${web_service_py} ${web_use_gpu_key}=${use_gpu} ${web_use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} &" + echo $web_service_cmd + sleep 2s + for pipeline in ${pipeline_py[*]}; do + _save_log_path="${LOG_PATH}/server_infer_cpu_${pipeline%_client*}_usemkldnn_${use_mkldnn}_threads_${threads}_batchsize_1.log" + pipeline_cmd="${python} ${pipeline} ${set_image_dir} > ${_save_log_path} 2>&1 " + echo $pipeline_cmd + last_status=${PIPESTATUS[0]} + echo "cat ${_save_log_path}" + status_check $last_status "${pipeline_cmd}" "${status_log}" + sleep 2s + done + ps ux | grep -E 'web_service|pipeline' | awk '{print $2}' | xargs kill -s 9 + done + done + elif [ ${use_gpu} = "0" ]; then + for use_trt in ${web_use_trt_list[*]}; do + for precision in ${web_precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} 
=~ "int8" ]] && [ ${use_trt} = "False" ]; then + continue + fi + if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [[ ${_flag_quant} = "True" ]]; then + continue + fi + set_tensorrt=$(func_set_params "${web_use_trt_key}" "${use_trt}") + set_precision=$(func_set_params "${web_precision_key}" "${precision}") + web_service_cmd="${python} ${web_service_py} ${web_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} & " + echo $web_service_cmd + + sleep 2s + for pipeline in ${pipeline_py[*]}; do + _save_log_path="${LOG_PATH}/server_infer_gpu_${pipeline%_client*}_usetrt_${use_trt}_precision_${precision}_batchsize_1.log" + pipeline_cmd="${python} ${pipeline} ${set_image_dir}> ${_save_log_path} 2>&1" + echo $pipeline_cmd + last_status=${PIPESTATUS[0]} + echo "cat ${_save_log_path}" + status_check $last_status "${pipeline_cmd}" "${status_log}" + sleep 2s + done + ps ux | grep -E 'web_service|pipeline' | awk '{print $2}' | xargs kill -s 9 + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" 
+ fi + done + fi + done +} + + +# set cuda device +GPUID=$2 +if [ ${#GPUID} -le 0 ];then + env=" " +else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" +fi +set CUDA_VISIBLE_DEVICES +eval $env + + +echo "################### run test ###################" + +export Count=0 +IFS="|" +func_serving "${web_service_cmd}" \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/test_tipc/test_train_inference_python.sh b/insightface/recognition/arcface_paddle/test_tipc/test_train_inference_python.sh new file mode 100644 index 0000000000000000000000000000000000000000..b60ad31b9dcb4138e3fb54874e606634f3ba6196 --- /dev/null +++ b/insightface/recognition/arcface_paddle/test_tipc/test_train_inference_python.sh @@ -0,0 +1,372 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer'] +MODE=$2 + +dataline=$(awk 'NR==1, NR==51{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +gpu_list=$(func_parser_value "${lines[3]}") +train_use_gpu_key=$(func_parser_key "${lines[4]}") +train_use_gpu_value=$(func_parser_value "${lines[4]}") +autocast_list=$(func_parser_value "${lines[5]}") +autocast_key=$(func_parser_key "${lines[5]}") +epoch_key=$(func_parser_key "${lines[6]}") +epoch_num=$(func_parser_params "${lines[6]}") +save_model_key=$(func_parser_key "${lines[7]}") +train_batch_key=$(func_parser_key "${lines[8]}") +train_batch_value=$(func_parser_params "${lines[8]}") +pretrain_model_key=$(func_parser_key "${lines[9]}") +pretrain_model_value=$(func_parser_value "${lines[9]}") +train_model_name=$(func_parser_value "${lines[10]}") +train_infer_img_dir=$(func_parser_value "${lines[11]}") +train_param_key1=$(func_parser_key "${lines[12]}") +train_param_value1=$(func_parser_value "${lines[12]}") + 
+trainer_list=$(func_parser_value "${lines[14]}") +trainer_norm=$(func_parser_key "${lines[15]}") +norm_trainer=$(func_parser_value "${lines[15]}") +pact_key=$(func_parser_key "${lines[16]}") +pact_trainer=$(func_parser_value "${lines[16]}") +fpgm_key=$(func_parser_key "${lines[17]}") +fpgm_trainer=$(func_parser_value "${lines[17]}") +distill_key=$(func_parser_key "${lines[18]}") +distill_trainer=$(func_parser_value "${lines[18]}") +trainer_key1=$(func_parser_key "${lines[19]}") +trainer_value1=$(func_parser_value "${lines[19]}") +trainer_key2=$(func_parser_key "${lines[20]}") +trainer_value2=$(func_parser_value "${lines[20]}") + +eval_py=$(func_parser_value "${lines[23]}") +eval_key1=$(func_parser_key "${lines[24]}") +eval_value1=$(func_parser_value "${lines[24]}") + +save_infer_key=$(func_parser_key "${lines[27]}") +export_weight=$(func_parser_key "${lines[28]}") +norm_export=$(func_parser_value "${lines[29]}") +pact_export=$(func_parser_value "${lines[30]}") +fpgm_export=$(func_parser_value "${lines[31]}") +distill_export=$(func_parser_value "${lines[32]}") +export_key1=$(func_parser_key "${lines[33]}") +export_value1=$(func_parser_value "${lines[33]}") +export_key2=$(func_parser_key "${lines[34]}") +export_value2=$(func_parser_value "${lines[34]}") +inference_dir=$(func_parser_value "${lines[35]}") + +# parser inference model +infer_model_dir_list=$(func_parser_value "${lines[36]}") +infer_export_list=$(func_parser_value "${lines[37]}") +infer_is_quant=$(func_parser_value "${lines[38]}") +# parser inference +inference_py=$(func_parser_value "${lines[39]}") +use_gpu_key=$(func_parser_key "${lines[40]}") +use_gpu_list=$(func_parser_value "${lines[40]}") +use_mkldnn_key=$(func_parser_key "${lines[41]}") +use_mkldnn_list=$(func_parser_value "${lines[41]}") +cpu_threads_key=$(func_parser_key "${lines[42]}") +cpu_threads_list=$(func_parser_value "${lines[42]}") +batch_size_key=$(func_parser_key "${lines[43]}") +batch_size_list=$(func_parser_value "${lines[43]}") 
+use_trt_key=$(func_parser_key "${lines[44]}") +use_trt_list=$(func_parser_value "${lines[44]}") +precision_key=$(func_parser_key "${lines[45]}") +precision_list=$(func_parser_value "${lines[45]}") +infer_model_key=$(func_parser_key "${lines[46]}") +image_dir_key=$(func_parser_key "${lines[47]}") +infer_img_dir=$(func_parser_value "${lines[47]}") +save_log_key=$(func_parser_key "${lines[48]}") +benchmark_key=$(func_parser_key "${lines[49]}") +benchmark_value=$(func_parser_value "${lines[49]}") +infer_key1=$(func_parser_key "${lines[50]}") +infer_value1=$(func_parser_value "${lines[50]}") + +# parser klquant_infer +if [ ${MODE} = "klquant_whole_infer" ]; then + dataline=$(awk 'NR==85 NR==101{print}' $FILENAME) + lines=(${dataline}) + # parser inference model + infer_model_dir_list=$(func_parser_value "${lines[1]}") + infer_export_list=$(func_parser_value "${lines[2]}") + infer_is_quant=$(func_parser_value "${lines[3]}") + # parser inference + inference_py=$(func_parser_value "${lines[4]}") + use_gpu_key=$(func_parser_key "${lines[5]}") + use_gpu_list=$(func_parser_value "${lines[5]}") + use_mkldnn_key=$(func_parser_key "${lines[6]}") + use_mkldnn_list=$(func_parser_value "${lines[6]}") + cpu_threads_key=$(func_parser_key "${lines[7]}") + cpu_threads_list=$(func_parser_value "${lines[7]}") + batch_size_key=$(func_parser_key "${lines[8]}") + batch_size_list=$(func_parser_value "${lines[8]}") + use_trt_key=$(func_parser_key "${lines[9]}") + use_trt_list=$(func_parser_value "${lines[9]}") + precision_key=$(func_parser_key "${lines[10]}") + precision_list=$(func_parser_value "${lines[10]}") + infer_model_key=$(func_parser_key "${lines[11]}") + image_dir_key=$(func_parser_key "${lines[12]}") + infer_img_dir=$(func_parser_value "${lines[12]}") + save_log_key=$(func_parser_key "${lines[13]}") + benchmark_key=$(func_parser_key "${lines[14]}") + benchmark_value=$(func_parser_value "${lines[14]}") + infer_key1=$(func_parser_key "${lines[15]}") + 
infer_value1=$(func_parser_value "${lines[15]}") +fi + +LOG_PATH="./test_tipc/output" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_python.log" + + +function func_inference(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + _log_path=$4 + _img_dir=$5 + _flag_quant=$6 + # inference + for use_gpu in ${use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + continue + fi + for threads in ${cpu_threads_list[*]}; do + for batch_size in ${batch_size_list[*]}; do + for precision in ${precision_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${precision} = "fp16" ]; then + continue + fi # skip when enable fp16 but disable mkldnn + if [ ${_flag_quant} = "True" ] && [ ${precision} != "int8" ]; then + continue + fi # skip when quant model inference but precision is not int8 + set_precision=$(func_set_params "${precision_key}" "${precision}") + + _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${use_mkldnn_key}=${use_mkldnn} ${set_cpu_threads} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} --model_file=${_model_dir}/${model_name}.pdmodel --params_file=${_model_dir}/${model_name}.pdiparams > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + 
status_check $last_status "${command}" "${status_log}" + done + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for use_trt in ${use_trt_list[*]}; do + for precision in ${precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then + continue + fi + if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then + continue + fi + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}") + set_precision=$(func_set_params "${precision_key}" "${precision}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} --model_file=${_model_dir}/${model_name}.pdmodel --params_file=${_model_dir}/${model_name}.pdiparams > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + + done + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" 
+ fi + done +} + +if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then + GPUID=$3 + if [ ${#GPUID} -le 0 ];then + env=" " + else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" + fi + # set CUDA_VISIBLE_DEVICES + eval $env + export Count=0 + IFS="|" + infer_run_exports=(${infer_export_list}) + infer_quant_flag=(${infer_is_quant}) + for infer_model in ${infer_model_dir_list[*]}; do + # run export + if [ ${infer_run_exports[Count]} != "null" ];then + save_infer_dir=$(dirname $infer_model) + set_export_weight=$(func_set_params "${export_weight}" "${infer_model}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}") + export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key}" + echo ${infer_run_exports[Count]} + echo $export_cmd + eval $export_cmd + status_export=$? + status_check $status_export "${export_cmd}" "${status_log}" + else + save_infer_dir=${infer_model} + fi + #run inference + is_quant=${infer_quant_flag[Count]} + if [ ${MODE} = "klquant_infer" ]; then + is_quant="True" + fi + func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant} + Count=$(($Count + 1)) + done +else + IFS="|" + export Count=0 + USE_GPU_KEY=(${train_use_gpu_value}) + for gpu in ${gpu_list[*]}; do + train_use_gpu=${USE_GPU_KEY[Count]} + Count=$(($Count + 1)) + ips="" + if [ ${gpu} = "-1" ];then + env="" + elif [ ${#gpu} -le 1 ];then + env="export CUDA_VISIBLE_DEVICES=${gpu}" + eval ${env} + elif [ ${#gpu} -le 15 ];then + IFS="," + array=(${gpu}) + env="export CUDA_VISIBLE_DEVICES=${array[0]}" + IFS="|" + else + IFS=";" + array=(${gpu}) + ips=${array[0]} + gpu=${array[1]} + IFS="|" + env=" " + fi + for autocast in ${autocast_list[*]}; do + if [ ${autocast} = "amp" ]; then + set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True" + else + set_amp_config=" " + fi + for trainer in ${trainer_list[*]}; do + 
flag_quant=False + if [ ${trainer} = ${pact_key} ]; then + run_train=${pact_trainer} + run_export=${pact_export} + flag_quant=True + elif [ ${trainer} = "${fpgm_key}" ]; then + run_train=${fpgm_trainer} + run_export=${fpgm_export} + elif [ ${trainer} = "${distill_key}" ]; then + run_train=${distill_trainer} + run_export=${distill_export} + elif [ ${trainer} = ${trainer_key1} ]; then + run_train=${trainer_value1} + run_export=${export_value1} + elif [[ ${trainer} = ${trainer_key2} ]]; then + run_train=${trainer_value2} + run_export=${export_value2} + else + run_train=${norm_trainer} + run_export=${norm_export} + fi + + if [ ${run_train} = "null" ]; then + continue + fi + set_autocast=$(func_set_params "${autocast_key}" "${autocast}") + set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") + set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}") + set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}") + set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") + set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}") + if [ ${#ips} -le 26 ];then + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" + nodes=1 + else + IFS="," + ips_array=(${ips}) + IFS="|" + nodes=${#ips_array[@]} + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" + fi + + # load pretrain from norm training if current trainer is pact or fpgm trainer + if ([ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]) && [ ${nodes} -le 1 ]; then + set_pretrain="${load_norm_train_model}" + fi + + set_save_model=$(func_set_params "${save_model_key}" "${save_log}") + if [ ${#gpu} -le 2 ];then # train with cpu or single gpu + cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} " + elif [ ${#ips} -le 26 ];then # train with multi-gpu + cmd="${python} -m 
paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + else # train with multi-machine + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + fi + # run train + eval "unset CUDA_VISIBLE_DEVICES" + eval $cmd + status_check $? "${cmd}" "${status_log}" + + set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}") + # save norm trained models to set pretrain for pact training and fpgm training + if [ ${trainer} = ${trainer_norm} ] && [ ${nodes} -le 1]; then + load_norm_train_model=${set_eval_pretrain} + fi + # run eval + if [ ${eval_py} != "null" ]; then + set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}") + eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}" + eval $eval_cmd + status_check $? "${eval_cmd}" "${status_log}" + fi + # run export model + if [ ${run_export} != "null" ]; then + # run export model + save_infer_path="${save_log}" + set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${train_model_name}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}") + export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key}" + eval $export_cmd + status_check $? 
"${export_cmd}" "${status_log}" + + #run inference + eval $env + save_infer_path="${save_log}" + if [ ${inference_dir} != "null" ] && [ ${inference_dir} != '##' ]; then + infer_model_dir="${save_infer_path}/${inference_dir}" + else + infer_model_dir=${save_infer_path} + fi + func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" + + eval "unset CUDA_VISIBLE_DEVICES" + fi + done # done with: for trainer in ${trainer_list[*]}; do + done # done with: for autocast in ${autocast_list[*]}; do + done # done with: for gpu in ${gpu_list[*]}; do +fi # end if [ ${MODE} = "infer" ]; then \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/tools/__init__.py b/insightface/recognition/arcface_paddle/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/insightface/recognition/arcface_paddle/tools/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/insightface/recognition/arcface_paddle/tools/benchmark_speed.py b/insightface/recognition/arcface_paddle/tools/benchmark_speed.py new file mode 100644 index 0000000000000000000000000000000000000000..6745f5fbaab53a3ff33f85f60113811b4495a594 --- /dev/null +++ b/insightface/recognition/arcface_paddle/tools/benchmark_speed.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import time +import argparse +from paddle.inference import Config +from paddle.inference import create_predictor + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + # general params + parser = argparse.ArgumentParser() + parser.add_argument("--use_gpu", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=1000) + + # params for predict + parser.add_argument("--model_file", type=str) + parser.add_argument("--params_file", type=str) + parser.add_argument("-b", "--batch_size", type=int, default=1) + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_mkldnn", type=str2bool, default=True) + parser.add_argument("--cpu_num_threads", type=int, default=10) + parser.add_argument("--model", type=str) + + return parser.parse_args() + + +def create_paddle_predictor(args): + config = Config(args.model_file, args.params_file) + + if args.use_gpu: + config.enable_use_gpu(args.gpu_mem, 0) 
+ else: + config.disable_gpu() + + if args.use_mkldnn: + config.enable_mkldnn() + config.set_cpu_math_library_num_threads(args.cpu_num_threads) + config.set_mkldnn_cache_capacity(100) + + config.disable_glog_info() + config.switch_ir_optim(args.ir_optim) # default true + + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + predictor = create_predictor(config) + + return predictor + + +class Predictor(object): + def __init__(self, args): + + self.args = args + + self.paddle_predictor = create_paddle_predictor(args) + input_names = self.paddle_predictor.get_input_names() + self.input_tensor = self.paddle_predictor.get_input_handle(input_names[ + 0]) + + output_names = self.paddle_predictor.get_output_names() + self.output_tensor = self.paddle_predictor.get_output_handle( + output_names[0]) + + def predict(self, batch_input): + self.input_tensor.copy_from_cpu(batch_input) + self.paddle_predictor.run() + batch_output = self.output_tensor.copy_to_cpu() + return batch_output + + def benchmark_predict(self): + test_num = 500 + test_time = 0.0 + for i in range(0, test_num + 10): + inputs = np.random.rand(args.batch_size, 3, 112, + 112).astype(np.float32) + start_time = time.time() + batch_output = self.predict(inputs).flatten() + if i >= 10: + test_time += time.time() - start_time + # time.sleep(0.01) # sleep for T4 GPU + + print("{0}\tbatch size: {1}\ttime(ms): {2}".format( + args.model, args.batch_size, 1000 * test_time / test_num)) + + +if __name__ == "__main__": + args = parse_args() + assert os.path.exists( + args.model_file), "The path of 'model_file' does not exist: {}".format( + args.model_file) + assert os.path.exists( + args.params_file + ), "The path of 'params_file' does not exist: {}".format(args.params_file) + + predictor = Predictor(args) + assert args.model is not None + predictor.benchmark_predict() diff --git a/insightface/recognition/arcface_paddle/tools/convert_image_bin.py 
b/insightface/recognition/arcface_paddle/tools/convert_image_bin.py new file mode 100644 index 0000000000000000000000000000000000000000..589e66562fe70b08b25a91bb5e03d31a863e7ecc --- /dev/null +++ b/insightface/recognition/arcface_paddle/tools/convert_image_bin.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +import argparse +import cv2 +from datasets.kv_helper import read_img_from_bin +from datasets.kv_helper import trans_img_to_bin + + +def get_file_list(img_file, end=('jpg', 'png', 'jpeg', 'JPEG', 'JPG', 'bmp')): + imgs_lists = [] + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in {}".format(img_file)) + + if os.path.isfile(img_file) and img_file.split('.')[-1] in end: + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for single_file in os.listdir(img_file): + if single_file.split('.')[-1] in end: + imgs_lists.append(os.path.join(img_file, single_file)) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + return imgs_lists + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser() + parser.add_argument("--image_path", type=str, default=None) + 
parser.add_argument("--bin_path", type=str, default=None) + parser.add_argument( + "--mode", + type=str, + default="image2bin", + help="conversion mode, image2bin or bin2image") + return parser.parse_args() + + +def main(args): + assert args.mode in ["image2bin", "bin2image"] + os.makedirs(args.image_path, exist_ok=True) + os.makedirs(args.bin_path, exist_ok=True) + assert os.path.isdir(args.image_path) + assert os.path.isdir(args.bin_path) + + if args.mode == "image2bin": + img_list = get_file_list(args.image_path) + for idx, img_fp in enumerate(img_list): + if idx % len(img_list) == 1000: + print("conversion process: [{}]/[{}]".format(idx, + len(img_list))) + img_name = os.path.basename(img_fp) + output_path = os.path.join(args.bin_path, + os.path.splitext(img_name)[0] + ".bin") + trans_img_to_bin(img_fp, output_path) + elif args.mode == "bin2image": + bin_list = get_file_list(args.bin_path, end=("bin", )) + for idx, bin_fp in enumerate(bin_list): + if idx % len(bin_list) == 1000: + print("conversion process: [{}]/[{}]".format(idx, + len(bin_list))) + bin_name = os.path.basename(bin_fp) + output_path = os.path.join(args.image_path, + os.path.splitext(bin_name)[0] + ".jpg") + img = read_img_from_bin(bin_fp) + cv2.imwrite(output_path, img) + + print("ok..") + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/insightface/recognition/arcface_paddle/tools/export.py b/insightface/recognition/arcface_paddle/tools/export.py new file mode 100644 index 0000000000000000000000000000000000000000..ace61aa4acf192e36ef9b1a8e6b537ddc09cdbb0 --- /dev/null +++ b/insightface/recognition/arcface_paddle/tools/export.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os +sys.path.insert(0, os.path.abspath('.')) + +import argparse + + +def str2bool(v): + return str(v).lower() in ("true", "t", "1") + + +def parse_args(): + parser = argparse.ArgumentParser(description='Paddle Face Exporter') + + # Model setting + parser.add_argument( + '--is_static', + type=str2bool, + default='False', + help='whether to use static mode') + parser.add_argument( + '--export_type', + type=str, + default='paddle', + help='export type, paddle or onnx') + parser.add_argument( + '--backbone', + type=str, + default='FresResNet50', + help='backbone network') + parser.add_argument( + '--embedding_size', type=int, default=512, help='embedding size') + parser.add_argument( + '--checkpoint_dir', + type=str, + default='MS1M_v3_arcface/FresResNet50/24/', + help='checkpoint direcotry') + parser.add_argument( + '--output_dir', + type=str, + default='MS1M_v3_arcface/FresResNet50/exported_model', + help='export output direcotry') + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + if args.is_static: + import paddle + paddle.enable_static() + from static.export import export + else: + from dynamic.export import export + + assert args.export_type in ['paddle', 'onnx'] + export(args) diff --git a/insightface/recognition/arcface_paddle/tools/extract_perf_logs.py b/insightface/recognition/arcface_paddle/tools/extract_perf_logs.py new file mode 100644 index 0000000000000000000000000000000000000000..ab91f0432aa2767852f0de4818b8b3085e652d54 --- /dev/null +++ 
b/insightface/recognition/arcface_paddle/tools/extract_perf_logs.py @@ -0,0 +1,153 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import sys +import glob +import json +import argparse +import pprint + +import numpy as np + +pp = pprint.PrettyPrinter(indent=1) + +parser = argparse.ArgumentParser(description="flags for benchmark") +parser.add_argument("--log_dir", type=str, default="./logs/", required=True) +parser.add_argument( + "--output_dir", type=str, default="./logs/", required=False) +parser.add_argument('--warmup_batches', type=int, default=50) +parser.add_argument('--train_batches', type=int, default=150) + +args = parser.parse_args() + + +class AutoVivification(dict): + """Implementation of perl's autovivification feature.""" + + def __getitem__(self, item): + try: + return dict.__getitem__(self, item) + except KeyError: + value = self[item] = type(self)() + return value + + +def compute_median(iter_dict): + speed_list = [i for i in iter_dict.values()] + return round(np.median(speed_list), 2) + + +def compute_average(iter_dict): + i = 0 + total_speed = 0 + for iter in iter_dict: + i += 1 + total_speed += iter_dict[iter] + return round(total_speed / i, 2) + + +def extract_info_from_file(log_file, result_dict, speed_dict): + # extract info from file name + exp_config = log_file.split("/")[-2] + model = exp_config.split("_")[2] + mode = exp_config.split("_")[3] + 
precision = exp_config.split("_")[4] + batch_size_per_device = exp_config.split("_")[6] + run_case = exp_config.split("_")[7] # eg: 1n1g + test_iter = int(exp_config.split("_")[8][2:]) + node_num = int(run_case[0]) + if len(run_case) == 4: + card_num = int(run_case[-2]) + elif len(run_case) == 5: + card_num = int(run_case[-3:-1]) + + avg_speed_list = [] + # extract info from file content + with open(log_file) as f: + lines = f.readlines() + for line in lines: + if "throughput:" in line: + p1 = re.compile(r" ips: ([0-9]+\.[0-9]+)", re.S) + item = re.findall(p1, line) + a = float(item[0].strip()) + avg_speed_list.append(a) + + # compute avg throughoutput + avg_speed = round( + np.mean(avg_speed_list[args.warmup_batches:args.train_batches]), 2) + + speed_dict[mode][model][run_case][precision][batch_size_per_device][ + test_iter] = avg_speed + average_speed = compute_average(speed_dict[mode][model][run_case][ + precision][batch_size_per_device]) + median_speed = compute_median(speed_dict[mode][model][run_case][precision][ + batch_size_per_device]) + + result_dict[mode][model][run_case][precision][batch_size_per_device][ + 'average_speed'] = average_speed + result_dict[mode][model][run_case][precision][batch_size_per_device][ + 'median_speed'] = median_speed + + # print(log_file, speed_dict[mode][model][run_case]) + + +def compute_speedup(result_dict, speed_dict): + mode_list = [key for key in result_dict] # eg. 
['static', 'dynamic'] + for md in mode_list: + model_list = [key for key in result_dict[md]] # eg.['vgg16', 'r50'] + for m in model_list: + run_case = [key for key in result_dict[md][m] + ] # eg.['4n8g', '2n8g', '1n8g', '1n4g', '1n1g'] + for d in run_case: + precision = [key for key in result_dict[md][m][d]] + for p in precision: + batch_size_per_device = [ + key for key in result_dict[md][m][d][p] + ] + for b in batch_size_per_device: + speed_up = 1.0 + if result_dict[md][m]['1n1g'][p][b]['median_speed']: + speed_up = result_dict[md][m][d][p][b][ + 'median_speed'] / result_dict[md][m]['1n1g'][ + p][b]['median_speed'] + result_dict[md][m][d][p][b]['speedup'] = round( + speed_up, 2) + + +def extract_result(): + result_dict = AutoVivification() + speed_dict = AutoVivification() + logs_list = glob.glob(os.path.join(args.log_dir, "*/workerlog.0")) + for l in logs_list: + extract_info_from_file(l, result_dict, speed_dict) + + # compute speedup + compute_speedup(result_dict, speed_dict) + + # print result + pp.pprint(result_dict) + + # write to file as JSON format + os.makedirs(args.output_dir, exist_ok=True) + result_file_name = os.path.join(args.output_dir, + "arcface_paddle_result.json") + print("Saving result to {}".format(result_file_name)) + with open(result_file_name, 'w') as f: + json.dump(result_dict, f) + + +if __name__ == "__main__": + extract_result() diff --git a/insightface/recognition/arcface_paddle/tools/inference.py b/insightface/recognition/arcface_paddle/tools/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..822612a7b81137ee92fb312868a4018a036b1299 --- /dev/null +++ b/insightface/recognition/arcface_paddle/tools/inference.py @@ -0,0 +1,234 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os +import cv2 +import time +import json +import argparse +import numpy as np + +sys.path.insert(0, os.path.abspath('.')) + +def str2bool(v): + return v.lower() in ("True","true", "t", "1") + +def parse_args(): + parser = argparse.ArgumentParser(description='Paddle Face Predictor') + + parser.add_argument( + '--export_type', type=str, help='export type, paddle or onnx') + parser.add_argument( + "--model_file", + type=str, + required=False, + help="paddle save inference model filename") + parser.add_argument( + "--params_file", + type=str, + required=False, + help="paddle save inference parameter filename") + parser.add_argument( + "--onnx_file", type=str, required=False, help="onnx model filename") + parser.add_argument("--image_path", type=str, help="path to test image") + parser.add_argument("--benchmark", type=str2bool, default=False, help="Is benchmark mode") + # params for paddle inferece engine + parser.add_argument("--use_gpu", type=str2bool, default=True) + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--min_subgraph_size", type=int, default=15) + parser.add_argument("--max_batch_size", type=int, default=1) + parser.add_argument("--precision", type=str, default="fp32") + parser.add_argument("--gpu_mem", type=int, default=500) + parser.add_argument("--enable_mkldnn", type=str2bool, default=False) + parser.add_argument("--cpu_threads", type=int, default=10) + args = parser.parse_args() + return args + +def 
get_infer_gpuid(): + cmd = "nvidia-smi" + res = os.popen(cmd).readlines() + if len(res) == 0: + return None + cmd = "env | grep CUDA_VISIBLE_DEVICES" + env_cuda = os.popen(cmd).readlines() + if len(env_cuda) == 0: + return 0 + else: + gpu_id = env_cuda[0].strip().split("=")[1] + return int(gpu_id[0]) + + +def init_paddle_inference_config(args): + import paddle.inference as paddle_infer + config = paddle_infer.Config(args.model_file, args.params_file) + if hasattr(args, 'precision'): + if args.precision == "fp16" and args.use_tensorrt: + precision = paddle_infer.PrecisionType.Half + elif args.precision == "int8": + precision = paddle_infer.PrecisionType.Int8 + else: + precision = paddle_infer.PrecisionType.Float32 + else: + precision = paddle_infer.PrecisionType.Float32 + + if args.use_gpu: + gpu_id = get_infer_gpuid() + if gpu_id is None: + raise ValueError( + "Not found GPU in current device. Please check your device or set args.use_gpu as False" + ) + config.enable_use_gpu(args.gpu_mem, 0) + if args.use_tensorrt: + config.enable_tensorrt_engine( + precision_mode=precision, + max_batch_size=args.max_batch_size, + min_subgraph_size=args.min_subgraph_size) + # skip the minmum trt subgraph + min_input_shape = {"x": [1, 3, 10, 10]} + max_input_shape = {"x": [1, 3, 1000, 1000]} + opt_input_shape = {"x": [1, 3, 112, 112]} + config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, + opt_input_shape) + + else: + config.disable_gpu() + cpu_threads = args.cpu_threads if hasattr(args, "cpu_threads") else 10 + config.set_cpu_math_library_num_threads(cpu_threads) + if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.enable_mkldnn() + config.set_mkldnn_cache_capacity(10) + if args.precision == "fp16": + config.enable_mkldnn_bfloat16() + return config + + +def get_image_file_list(img_file): + import imghdr + imgs_lists = [] + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in 
{}".format(img_file)) + + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'GIF'} + if os.path.isfile(img_file) and imghdr.what(img_file) in img_end: + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for single_file in os.listdir(img_file): + file_path = os.path.join(img_file, single_file) + if os.path.isfile(file_path) and imghdr.what(file_path) in img_end: + imgs_lists.append(file_path) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + imgs_lists = sorted(imgs_lists) + return imgs_lists + +def paddle_inference(args): + import paddle.inference as paddle_infer + + config = init_paddle_inference_config(args) + predictor = paddle_infer.create_predictor(config) + + input_names = predictor.get_input_names() + input_handle = predictor.get_input_handle(input_names[0]) + + if args.benchmark: + import auto_log + pid = os.getpid() + autolog = auto_log.AutoLogger( + model_name="det", + model_precision='fp32', + batch_size=1, + data_shape="dynamic", + save_path="./output/auto_log.log", + inference_config=config, + pids=pid, + process_name=None, + gpu_ids=0, + time_keys=[ + 'preprocess_time', 'inference_time','postprocess_time' + ], + warmup=0) + img = np.random.uniform(0, 255, [1, 3, 112,112]).astype(np.float32) + input_handle.copy_from_cpu(img) + for i in range(10): + predictor.run() + + + img_list = get_image_file_list(args.image_path) + for img_path in img_list: + img = cv2.imread(img_path) + st = time.time() + if args.benchmark: + autolog.times.start() + + # normalize to mean 0.5, std 0.5 + img = (img - 127.5) * 0.00784313725 + # BGR2RGB + img = img[:, :, ::-1] + img = img.transpose((2, 0, 1)) + img = np.expand_dims(img, 0) + img = img.astype('float32') + + if args.benchmark: + autolog.times.stamp() + + input_handle.copy_from_cpu(img) + + predictor.run() + + output_names = predictor.get_output_names() + output_handle = predictor.get_output_handle(output_names[0]) + output_data = 
output_handle.copy_to_cpu() + if args.benchmark: + autolog.times.stamp() + autolog.times.end(stamp=True) + print('{}\t{}'.format(img_path,json.dumps(output_data.tolist()))) + print('paddle inference result: ', output_data.shape) + if args.benchmark: + autolog.report() + +def onnx_inference(args): + import onnxruntime + + ort_sess = onnxruntime.InferenceSession(args.onnx_file) + + img = cv2.imread(args.image_path) + # normalize to mean 0.5, std 0.5 + img = (img - 127.5) * 0.00784313725 + # BGR2RGB + img = img[:, :, ::-1] + img = img.transpose((2, 0, 1)) + img = np.expand_dims(img, 0) + img = img.astype('float32') + + ort_inputs = {ort_sess.get_inputs()[0].name: img} + ort_outs = ort_sess.run(None, ort_inputs) + + print('onnx inference result: ', ort_outs[0].shape) + + +if __name__ == '__main__': + + args = parse_args() + + assert args.export_type in ['paddle', 'onnx'] + if args.export_type == 'onnx': + assert os.path.exists(args.onnx_file) + onnx_inference(args) + else: + assert os.path.exists(args.model_file) + assert os.path.exists(args.params_file) + paddle_inference(args) diff --git a/insightface/recognition/arcface_paddle/tools/mx_recordio_2_images.py b/insightface/recognition/arcface_paddle/tools/mx_recordio_2_images.py new file mode 100644 index 0000000000000000000000000000000000000000..fba25b9b4b25f2345d1cc5bd51738f4dca11054c --- /dev/null +++ b/insightface/recognition/arcface_paddle/tools/mx_recordio_2_images.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import numpy as np +import numbers +import mxnet as mx +import cv2 +import tqdm +import shutil + + +def main(args): + path_imgrec = os.path.join(args.root_dir, 'train.rec') + path_imgidx = os.path.join(args.root_dir, 'train.idx') + imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + s = imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + if header.flag > 0: + header0 = (int(header.label[0]), int(header.label[1])) + imgidx = np.array(range(1, int(header.label[0]))) + else: + imgidx = np.array(list(imgrec.keys)) + + classes = set() + os.makedirs(os.path.join(args.output_dir, 'images'), exist_ok=True) + fp = open(os.path.join(args.output_dir, 'label.txt'), 'w') + for idx in tqdm.tqdm(imgidx): + s = imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + img = mx.image.imdecode(img).asnumpy()[..., ::-1] + label = int(label) + classes.add(label) + + filename = 'images/%08d.jpg' % idx + fp.write('%s\t%d\n' % (filename, label)) + cv2.imwrite( + os.path.join(args.output_dir, filename), img, + [int(cv2.IMWRITE_JPEG_QUALITY), 100]) + fp.close() + shutil.copy( + os.path.join(args.root_dir, 'agedb_30.bin'), + os.path.join(args.output_dir, 'agedb_30.bin')) + shutil.copy( + os.path.join(args.root_dir, 'cfp_fp.bin'), + os.path.join(args.output_dir, 'cfp_fp.bin')) + shutil.copy( + os.path.join(args.root_dir, 'lfw.bin'), + os.path.join(args.output_dir, 'lfw.bin')) + print('num_image: ', len(imgidx), 'num_classes: ', len(classes)) + with open(os.path.join(args.output_dir, 'README.md'), 'w') as f: + f.write('num_image: {}\n'.format(len(imgidx))) + f.write('num_classes: {}\n'.format(len(classes))) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + "--root_dir", + type=str, + help="Root directory 
to mxnet dataset.", ) + parser.add_argument( + "--output_dir", + type=str, + help="Path to output.", ) + args = parser.parse_args() + main(args) diff --git a/insightface/recognition/arcface_paddle/tools/test_recognition.py b/insightface/recognition/arcface_paddle/tools/test_recognition.py new file mode 100644 index 0000000000000000000000000000000000000000..a5972b68bbf18d86acb9d4322399eb3dd59e890b --- /dev/null +++ b/insightface/recognition/arcface_paddle/tools/test_recognition.py @@ -0,0 +1,720 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import argparse +import requests +import logging +import imghdr +import pickle +import tarfile +from functools import partial + +import cv2 +import numpy as np +from sklearn.metrics.pairwise import cosine_similarity +from tqdm import tqdm +from prettytable import PrettyTable +from PIL import Image, ImageDraw, ImageFont +import paddle +from paddle.inference import Config +from paddle.inference import create_predictor + +__all__ = ["InsightFace", "parser"] +BASE_INFERENCE_MODEL_DIR = os.path.expanduser("~/.insightface/ppmodels/") +BASE_DOWNLOAD_URL = "https://paddle-model-ecology.bj.bcebos.com/model/insight-face/{}.tar" + + +def parser(add_help=True): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser(add_help=add_help) + parser.add_argument( + "--det", action="store_true", help="Whether to detect.") + parser.add_argument( + "--rec", action="store_true", help="Whether to recognize.") + + parser.add_argument( + "--det_model", + type=str, + default="BlazeFace", + help="The detection model.") + parser.add_argument( + "--rec_model", + type=str, + default="MobileFace", + help="The recognition model.") + parser.add_argument( + "--use_gpu", + type=str2bool, + default=True, + help="Whether use GPU to predict. Default by True.") + parser.add_argument( + "--enable_mkldnn", + type=str2bool, + default=True, + help="Whether use MKLDNN to predict, valid only when --use_gpu is False. Default by False." + ) + parser.add_argument( + "--cpu_threads", + type=int, + default=1, + help="The num of threads with CPU, valid only when --use_gpu is False. Default by 1." + ) + parser.add_argument( + "--input", + type=str, + help="The path or directory of image(s) or video to be predicted.") + parser.add_argument( + "--output", type=str, default="./output/", help="The directory of prediction result.") + parser.add_argument( + "--det_thresh", + type=float, + default=0.8, + help="The threshold of detection postprocess. 
Default by 0.8.") + parser.add_argument( + "--index", type=str, default=None, help="The path of index file.") + parser.add_argument( + "--cdd_num", + type=int, + default=5, + help="The number of candidates in the recognition retrieval. Default by 5." + ) + parser.add_argument( + "--rec_thresh", + type=float, + default=0.45, + help="The threshold of recognition postprocess. Default by 0.45.") + parser.add_argument( + "--max_batch_size", + type=int, + default=1, + help="The maximum of batch_size to recognize. Default by 1.") + + return parser + + +def print_config(args): + args = vars(args) + table = PrettyTable(['Param', 'Value']) + for param in args: + table.add_row([param, args[param]]) + width = len(str(table).split("\n")[0]) + print("{}".format("-" * width)) + print("PaddleFace".center(width)) + print(table) + print("Powered by PaddlePaddle!".rjust(width)) + print("{}".format("-" * width)) + + +def download_with_progressbar(url, save_path): + """Download from url with progressbar. + """ + if os.path.isfile(save_path): + os.remove(save_path) + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get("content-length", 0)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + with open(save_path, "wb") as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + if total_size_in_bytes == 0 or progress_bar.n != total_size_in_bytes or not os.path.isfile( + save_path): + raise Exception( + f"Something went wrong while downloading model/image from {url}") + + +def check_model_file(model): + """Check the model files exist and download and untar when no exist.
+ """ + model_map = { + "ArcFace": "arcface_iresnet50_v1.0_infer", + "BlazeFace": "blazeface_fpn_ssh_1000e_v1.0_infer", + "MobileFace": "mobileface_v1.0_infer" + } + + if os.path.isdir(model): + model_file_path = os.path.join(model, "inference.pdmodel") + params_file_path = os.path.join(model, "inference.pdiparams") + if not os.path.exists(model_file_path) or not os.path.exists( + params_file_path): + raise Exception( + f"The specifed model directory error. The drectory must include 'inference.pdmodel' and 'inference.pdiparams'." + ) + + elif model in model_map: + storage_directory = partial(os.path.join, BASE_INFERENCE_MODEL_DIR, + model) + url = BASE_DOWNLOAD_URL.format(model_map[model]) + + tar_file_name_list = [ + "inference.pdiparams", "inference.pdiparams.info", + "inference.pdmodel" + ] + model_file_path = storage_directory("inference.pdmodel") + params_file_path = storage_directory("inference.pdiparams") + if not os.path.exists(model_file_path) or not os.path.exists( + params_file_path): + tmp_path = storage_directory(url.split("/")[-1]) + logging.info(f"Download {url} to {tmp_path}") + os.makedirs(storage_directory(), exist_ok=True) + download_with_progressbar(url, tmp_path) + with tarfile.open(tmp_path, "r") as tarObj: + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if tar_file_name in member.name: + filename = tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open(storage_directory(filename), "wb") as f: + f.write(file.read()) + os.remove(tmp_path) + if not os.path.exists(model_file_path) or not os.path.exists( + params_file_path): + raise Exception( + f"Something went wrong while downloading and unzip the model[{model}] files!" + ) + else: + raise Exception( + f"The specifed model name error. Support 'BlazeFace' for detection and 'ArcFace' and 'MobileFace' for recognition. 
And support local directory that include model files ('inference.pdmodel' and 'inference.pdiparams')." + ) + + return model_file_path, params_file_path + + +def normalize_image(img, scale=None, mean=None, std=None, order='chw'): + if isinstance(scale, str): + scale = eval(scale) + scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + mean = np.array(mean).reshape(shape).astype('float32') + std = np.array(std).reshape(shape).astype('float32') + + if isinstance(img, Image.Image): + img = np.array(img) + + assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage" + return (img.astype('float32') * scale - mean) / std + + +def to_CHW_image(img): + if isinstance(img, Image.Image): + img = np.array(img) + return img.transpose((2, 0, 1)) + + +class ColorMap(object): + def __init__(self, num): + super().__init__() + self.get_color_map_list(num) + self.color_map = {} + self.ptr = 0 + + def __getitem__(self, key): + return self.color_map[key] + + def update(self, keys): + for key in keys: + if key not in self.color_map: + i = self.ptr % len(self.color_list) + self.color_map[key] = self.color_list[i] + self.ptr += 1 + + def get_color_map_list(self, num_classes): + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + self.color_list = [ + color_map[i:i + 3] for i in range(0, len(color_map), 3) + ] + + +class ImageReader(object): + def __init__(self, inputs): + super().__init__() + self.idx = 0 + if isinstance(inputs, np.ndarray): + self.image_list = [inputs] + else: + imgtype_list = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff'} + 
self.image_list = [] + if os.path.isfile(inputs): + if imghdr.what(inputs) not in imgtype_list: + raise Exception( + f"Error type of input path, only support: {imgtype_list}" + ) + self.image_list.append(inputs) + elif os.path.isdir(inputs): + tmp_file_list = os.listdir(inputs) + warn_tag = False + for file_name in tmp_file_list: + file_path = os.path.join(inputs, file_name) + if not os.path.isfile(file_path): + warn_tag = True + continue + if imghdr.what(file_path) in imgtype_list: + self.image_list.append(file_path) + else: + warn_tag = True + if warn_tag: + logging.warning( + f"The directory of input contine directory or not supported file type, only support: {imgtype_list}" + ) + else: + raise Exception( + f"The file of input path not exist! Please check input: {inputs}" + ) + + def __iter__(self): + return self + + def __next__(self): + if self.idx >= len(self.image_list): + raise StopIteration + + data = self.image_list[self.idx] + if isinstance(data, np.ndarray): + self.idx += 1 + return data, "tmp.png" + path = data + _, file_name = os.path.split(path) + img = cv2.imread(path) + if img is None: + logging.warning(f"Error in reading image: {path}! Ignored.") + self.idx += 1 + return self.__next__() + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + self.idx += 1 + return img, file_name + + def __len__(self): + return len(self.image_list) + + +class VideoReader(object): + def __init__(self, inputs): + super().__init__() + videotype_list = {"mp4"} + if os.path.splitext(inputs)[-1][1:] not in videotype_list: + raise Exception( + f"The input file is not supported, only support: {videotype_list}" + ) + if not os.path.isfile(inputs): + raise Exception( + f"The file of input path not exist! 
Please check input: {inputs}" + ) + self.capture = cv2.VideoCapture(inputs) + self.file_name = os.path.split(inputs)[-1] + + def get_info(self): + info = {} + width = int(self.capture.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(self.capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fourcc = cv2.VideoWriter_fourcc(* 'mp4v') + info["file_name"] = self.file_name + info["fps"] = 30 + info["shape"] = (width, height) + info["fourcc"] = cv2.VideoWriter_fourcc(* 'mp4v') + return info + + def __iter__(self): + return self + + def __next__(self): + ret, frame = self.capture.read() + if not ret: + raise StopIteration + return frame, self.file_name + + +class ImageWriter(object): + def __init__(self, output_dir): + super().__init__() + if output_dir is None: + raise Exception( + "Please specify the directory of saving prediction results by --output." + ) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + self.output_dir = output_dir + + def write(self, image, file_name): + path = os.path.join(self.output_dir, file_name) + cv2.imwrite(path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)) + + +class VideoWriter(object): + def __init__(self, output_dir, video_info): + super().__init__() + if output_dir is None: + raise Exception( + "Please specify the directory of saving prediction results by --output." 
+ ) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + output_path = os.path.join(output_dir, video_info["file_name"]) + fourcc = cv2.VideoWriter_fourcc(* 'mp4v') + self.writer = cv2.VideoWriter(output_path, video_info["fourcc"], + video_info["fps"], video_info["shape"]) + + def write(self, frame, file_name): + self.writer.write(frame) + + def __del__(self): + if hasattr(self, "writer"): + self.writer.release() + + +class BasePredictor(object): + def __init__(self, predictor_config): + super().__init__() + self.predictor_config = predictor_config + self.predictor, self.input_names, self.output_names = self.load_predictor( + predictor_config["model_file"], predictor_config["params_file"]) + + def load_predictor(self, model_file, params_file): + config = Config(model_file, params_file) + if self.predictor_config["use_gpu"]: + config.enable_use_gpu(200, 0) + config.switch_ir_optim(True) + else: + config.disable_gpu() + config.set_cpu_math_library_num_threads(self.predictor_config[ + "cpu_threads"]) + + if self.predictor_config["enable_mkldnn"]: + try: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + except Exception as e: + logging.error( + "The current environment does not support `mkldnn`, so disable mkldnn." 
+ ) + config.disable_glog_info() + config.enable_memory_optim() + # use zero copy + config.switch_use_feed_fetch_ops(False) + predictor = create_predictor(config) + input_names = predictor.get_input_names() + output_names = predictor.get_output_names() + return predictor, input_names, output_names + + def preprocess(self): + raise NotImplementedError + + def postprocess(self): + raise NotImplementedError + + def predict(self, img): + raise NotImplementedError + + +class Detector(BasePredictor): + def __init__(self, det_config, predictor_config): + super().__init__(predictor_config) + self.det_config = det_config + self.target_size = self.det_config["target_size"] + self.thresh = self.det_config["thresh"] + + def preprocess(self, img): + resize_h, resize_w = self.target_size + img_shape = img.shape + img_scale_x = resize_w / img_shape[1] + img_scale_y = resize_h / img_shape[0] + img = cv2.resize( + img, None, None, fx=img_scale_x, fy=img_scale_y, interpolation=1) + img = normalize_image( + img, + scale=1. 
/ 255., + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + order='hwc') + img_info = {} + img_info["im_shape"] = np.array( + img.shape[:2], dtype=np.float32)[np.newaxis, :] + img_info["scale_factor"] = np.array( + [img_scale_y, img_scale_x], dtype=np.float32)[np.newaxis, :] + + img = img.transpose((2, 0, 1)).copy() + img_info["image"] = img[np.newaxis, :, :, :] + return img_info + + def postprocess(self, np_boxes): + expect_boxes = (np_boxes[:, 1] > self.thresh) & (np_boxes[:, 0] > -1) + return np_boxes[expect_boxes, :] + + def predict(self, img): + inputs = self.preprocess(img) + for input_name in self.input_names: + input_tensor = self.predictor.get_input_handle(input_name) + input_tensor.copy_from_cpu(inputs[input_name]) + self.predictor.run() + output_tensor = self.predictor.get_output_handle(self.output_names[0]) + np_boxes = output_tensor.copy_to_cpu() + # boxes_num = self.detector.get_output_handle(self.detector_output_names[1]) + # np_boxes_num = boxes_num.copy_to_cpu() + box_list = self.postprocess(np_boxes) + return box_list + + +class Recognizer(BasePredictor): + def __init__(self, rec_config, predictor_config): + super().__init__(predictor_config) + if rec_config["index"] is not None: + self.load_index(rec_config["index"]) + self.rec_config = rec_config + self.cdd_num = self.rec_config["cdd_num"] + self.thresh = self.rec_config["thresh"] + self.max_batch_size = self.rec_config["max_batch_size"] + + def preprocess(self, img, box_list=None): + img = normalize_image( + img, + scale=1. 
/ 255., + mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5], + order='hwc') + if box_list is None: + height, width = img.shape[:2] + box_list = [np.array([0, 0, 0, 0, width, height])] + batch = [] + input_batches = [] + cnt = 0 + for idx, box in enumerate(box_list): + box[box < 0] = 0 + xmin, ymin, xmax, ymax = list(map(int, box[2:])) + face_img = img[ymin:ymax, xmin:xmax, :] + face_img = cv2.resize(face_img, (112, 112)).transpose( + (2, 0, 1)).copy() + batch.append(face_img) + cnt += 1 + if cnt % self.max_batch_size == 0 or (idx + 1) == len(box_list): + input_batches.append(np.array(batch)) + batch = [] + return input_batches + + def postprocess(self): + pass + + def retrieval(self, np_feature): + labels = [] + for feature in np_feature: + similarity = cosine_similarity(self.index_feature, + feature).squeeze() + abs_similarity = np.abs(similarity) + candidate_idx = np.argpartition(abs_similarity, + -self.cdd_num)[-self.cdd_num:] + remove_idx = np.where(abs_similarity[candidate_idx] < self.thresh) + candidate_idx = np.delete(candidate_idx, remove_idx) + candidate_label_list = list(np.array(self.label)[candidate_idx]) + if len(candidate_label_list) == 0: + maxlabel = "" + else: + maxlabel = max(candidate_label_list, + key=candidate_label_list.count) + labels.append(maxlabel) + return labels + + def load_index(self, file_path): + with open(file_path, "rb") as f: + index = pickle.load(f) + self.label = index["label"] + self.index_feature = np.array(index["feature"]).squeeze() + + def predict(self, img, box_list=None): + batch_list = self.preprocess(img, box_list) + feature_list = [] + for batch in batch_list: + for input_name in self.input_names: + input_tensor = self.predictor.get_input_handle(input_name) + input_tensor.copy_from_cpu(batch) + self.predictor.run() + output_tensor = self.predictor.get_output_handle(self.output_names[ + 0]) + np_feature = output_tensor.copy_to_cpu() + feature_list.append(np_feature) + return np.array(feature_list) + + +class 
InsightFace(object): + def __init__(self, args, print_info=True): + super().__init__() + if print_info: + print_config(args) + + self.font_path = os.path.join( + os.path.abspath(os.path.dirname(__file__)), + "SourceHanSansCN-Medium.otf") + self.args = args + + predictor_config = { + "use_gpu": args.use_gpu, + "enable_mkldnn": args.enable_mkldnn, + "cpu_threads": args.cpu_threads + } + if args.det: + model_file_path, params_file_path = check_model_file( + args.det_model) + det_config = {"thresh": args.det_thresh, "target_size": [640, 640]} + predictor_config["model_file"] = model_file_path + predictor_config["params_file"] = params_file_path + self.det_predictor = Detector(det_config, predictor_config) + self.color_map = ColorMap(100) + + if args.rec: + model_file_path, params_file_path = check_model_file( + args.rec_model) + rec_config = { + "max_batch_size": args.max_batch_size, + "resize": 112, + "thresh": args.rec_thresh, + "index": args.index, + "cdd_num": args.cdd_num + } + predictor_config["model_file"] = model_file_path + predictor_config["params_file"] = params_file_path + self.rec_predictor = Recognizer(rec_config, predictor_config) + + def preprocess(self, img): + img = img.astype(np.float32, copy=False) + return img + + def draw(self, img, box_list, labels): + self.color_map.update(labels) + im = Image.fromarray(img) + draw = ImageDraw.Draw(im) + + for i, dt in enumerate(box_list): + bbox, score = dt[2:], dt[1] + label = labels[i] + color = tuple(self.color_map[label]) + + xmin, ymin, xmax, ymax = bbox + + font_size = max(int((xmax - xmin) // 6), 10) + font = ImageFont.truetype(self.font_path, font_size) + + text = "{} {:.4f}".format(label, score) + th = sum(font.getmetrics()) + tw = font.getsize(text)[0] + start_y = max(0, ymin - th) + + draw.rectangle( + [(xmin, start_y), (xmin + tw + 1, start_y + th)], fill=color) + draw.text( + (xmin + 1, start_y), + text, + fill=(255, 255, 255), + font=font, + anchor="la") + draw.rectangle( + [(xmin, ymin), (xmax, 
ymax)], width=2, outline=color) + return np.array(im) + + def predict_np_img(self, img): + input_img = self.preprocess(img) + box_list = None + np_feature = None + if hasattr(self, "det_predictor"): + box_list = self.det_predictor.predict(input_img) + if hasattr(self, "rec_predictor"): + np_feature = self.rec_predictor.predict(input_img, box_list) + return box_list, np_feature + + def init_reader_writer(self, input_data): + if isinstance(input_data, np.ndarray): + self.input_reader = ImageReader(input_data) + if hasattr(self, "det_predictor"): + self.output_writer = ImageWriter(self.args.output) + elif isinstance(input_data, str): + if input_data.endswith('mp4'): + self.input_reader = VideoReader(input_data) + info = self.input_reader.get_info() + self.output_writer = VideoWriter(self.args.output, info) + else: + self.input_reader = ImageReader(input_data) + if hasattr(self, "det_predictor"): + self.output_writer = ImageWriter(self.args.output) + else: + raise Exception( + f"The input data error. Only support path of image or video(.mp4) and dirctory that include images." + ) + + def predict(self, input_data, print_info=False): + """Predict input_data. + + Args: + input_data (str | NumPy.array): The path of image, or the derectory including images, or the image data in NumPy.array format. + print_info (bool, optional): Wheather to print the prediction results. Defaults to False. + + Yields: + dict: { + "box_list": The prediction results of detection. + "features": The output of recognition. + "labels": The results of retrieval. + } + """ + self.init_reader_writer(input_data) + for img, file_name in self.input_reader: + if img is None: + logging.warning(f"Error in reading img {file_name}! 
Ignored.") + continue + box_list, np_feature = self.predict_np_img(img) + if np_feature is not None: + labels = self.rec_predictor.retrieval(np_feature) + else: + labels = ["face"] * len(box_list) + if box_list is not None: + result = self.draw(img, box_list, labels=labels) + self.output_writer.write(result, file_name) + if print_info: + logging.info(f"File: {file_name}, predict label(s): {labels}") + yield { + "box_list": box_list, + "features": np_feature, + "labels": labels + } + logging.info(f"Predict complete!") + + +# for CLI +def main(args=None): + logging.basicConfig(level=logging.INFO) + + args = parser().parse_args() + predictor = InsightFace(args) + res = predictor.predict(args.input, print_info=True) + for _ in res: + pass + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/insightface/recognition/arcface_paddle/tools/train.py b/insightface/recognition/arcface_paddle/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..55102e3df62bf5fab9b43c82ea9da6b49db1100d --- /dev/null +++ b/insightface/recognition/arcface_paddle/tools/train.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import os +sys.path.insert(0, os.path.abspath('.')) + +import paddle +from configs import argparser as parser +from utils.logging import init_logging + +if __name__ == '__main__': + args = parser.parse_args() + if args.is_static: + from static.train import train + paddle.enable_static() + else: + from dynamic.train import train + + rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) + os.makedirs(args.output, exist_ok=True) + init_logging(rank, args.output) + parser.print_args(args) + train(args) diff --git a/insightface/recognition/arcface_paddle/tools/validation.py b/insightface/recognition/arcface_paddle/tools/validation.py new file mode 100644 index 0000000000000000000000000000000000000000..1e343724c7570e58bbbc2303f65c4825929ecf15 --- /dev/null +++ b/insightface/recognition/arcface_paddle/tools/validation.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import os +import logging +sys.path.insert(0, os.path.abspath('.')) + +import argparse + + +def str2bool(v): + return str(v).lower() in ("true", "t", "1") + + +def tostrlist(v): + if isinstance(v, list): + return v + elif isinstance(v, str): + return [e.strip() for e in v.split(',')] + + +def parse_args(): + parser = argparse.ArgumentParser(description='Paddle Face Exporter') + + # Model setting + parser.add_argument( + '--is_static', + type=str2bool, + default='False', + help='whether to use static mode') + parser.add_argument( + '--backbone', + type=str, + default='FresResNet50', + help='backbone network') + parser.add_argument( + '--embedding_size', type=int, default=512, help='embedding size') + parser.add_argument( + '--checkpoint_dir', + type=str, + default='MS1M_v3_arcface/FresResNet50/24/', + help='checkpoint direcotry') + parser.add_argument( + '--data_dir', + type=str, + default='./MS1M_v3_bin', + help='train dataset directory') + parser.add_argument( + '--val_targets', + type=tostrlist, + default=["lfw", "cfp_fp", "agedb_30"], + help='val targets, list or str split by comma') + parser.add_argument( + '--batch_size', type=int, default=128, help='test batch size') + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + logging.basicConfig( + level=logging.INFO, format="Validation: %(asctime)s - %(message)s") + + args = parse_args() + if args.is_static: + import paddle + paddle.enable_static() + from static.validation import validation + else: + from dynamic.validation import validation + + validation(args) diff --git a/insightface/recognition/arcface_paddle/utils/__init__.py b/insightface/recognition/arcface_paddle/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/insightface/recognition/arcface_paddle/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/insightface/recognition/arcface_paddle/utils/logging.py b/insightface/recognition/arcface_paddle/utils/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..6dd0f8e1089b2c90e9321bf9046a8155d1c74c13 --- /dev/null +++ b/insightface/recognition/arcface_paddle/utils/logging.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import logging
+import os
+import sys
+import time
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value
+    """
+
+    def __init__(self):
+        self.val = None
+        self.avg = None
+        self.sum = None
+        self.count = None
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def init_logging(rank, models_root):
+    # NOTE(review): use `==` for int comparison; `is 0` relies on CPython
+    # small-int caching and raises SyntaxWarning on Python >= 3.8.
+    if rank == 0:
+        log_root = logging.getLogger()
+        log_root.setLevel(logging.INFO)
+        formatter = logging.Formatter("Training: %(asctime)s - %(message)s")
+        handler_file = logging.FileHandler(
+            os.path.join(models_root, "training.log"))
+        handler_stream = logging.StreamHandler(sys.stdout)
+        handler_file.setFormatter(formatter)
+        handler_stream.setFormatter(formatter)
+        log_root.addHandler(handler_file)
+        log_root.addHandler(handler_stream)
+        log_root.info('rank: %d' % rank)
+
+
+class CallBackLogging(object):
+    def __init__(self,
+                 frequent,
+                 rank,
+                 world_size,
+                 total_step,
+                 batch_size,
+                 writer=None):
+        self.frequent: int = frequent
+        self.rank: int = rank
+        self.world_size: int = world_size
+        self.time_start = time.time()
+        self.total_step: int = total_step
+        self.batch_size: int = batch_size
+        self.writer = writer
+
+        self.tic = time.time()
+
+    def __call__(self, global_step, loss: AverageMeter, epoch: int, lr_value, avg_reader_cost, avg_batch_cost, avg_samples, ips):
+
+        # NOTE(review): `==` instead of `is` (identity test on an int literal).
+        if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0:
+            time_now = (time.time() - self.time_start) / 3600
+            time_total = time_now / ((global_step + 1) / self.total_step)
+            time_for_end = time_total - time_now
+            if self.writer is not None:
+                self.writer.add_scalar('time_for_end', time_for_end,
+                                       global_step)
+                self.writer.add_scalar('loss', loss.avg, global_step)
+            # ips is throughput
+            msg = "loss %.4f, lr: %f, epoch: %d, step: %d, eta: %1.2f hours, avg_reader_cost:
%.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f images/sec" % ( + loss.avg, lr_value, epoch, global_step, time_for_end,avg_reader_cost, avg_batch_cost, avg_samples, ips * self.world_size) + logging.info(msg) + loss.reset() + self.tic = time.time() diff --git a/insightface/recognition/arcface_paddle/utils/losses.py b/insightface/recognition/arcface_paddle/utils/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..297f7e88c20b61e1129cecfa0b1426907fead54f --- /dev/null +++ b/insightface/recognition/arcface_paddle/utils/losses.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+class CosFace(object):
+    def __init__(self, m1=1.0, m2=0.0, m3=0.35, s=64.0):
+        super(CosFace, self).__init__()
+        self.margin1 = m1
+        self.margin2 = m2
+        self.margin3 = m3
+        self.scale = s
+
+
+class ArcFace(object):
+    def __init__(self, m1=1.0, m2=0.5, m3=0.0, s=64.0):
+        super(ArcFace, self).__init__()
+        self.margin1 = m1
+        self.margin2 = m2
+        self.margin3 = m3
+        self.scale = s
+
+
+class SphereFace(object):
+    def __init__(self, m1=1.35, m2=0.0, m3=0.0, s=64.0):
+        super(SphereFace, self).__init__()
+        self.margin1 = m1
+        self.margin2 = m2
+        self.margin3 = m3
+        self.scale = s
diff --git a/insightface/recognition/arcface_paddle/utils/rearrange_weight.py b/insightface/recognition/arcface_paddle/utils/rearrange_weight.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb1d7a77b695209fd48c93a96fd615b5de975124
--- /dev/null
+++ b/insightface/recognition/arcface_paddle/utils/rearrange_weight.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE(review): the body below calls np.concatenate; without this import the
+# module raises NameError at runtime.
+import numpy as np
+
+
+def rearrange_weight(weight_dict, init_num_rank, new_num_rank):
+    """
+    A help function to convert pre-trained distributed fc parameters for
+    inference or fine-tuning. Note that the number of ranks or GPUs for
+    inference or fine-tuning can be different from that for pre-training.
+
+    Args:
+        weight_dict(dict): the dict store distributed parameters,
+            key: eg.
dist@fc@rank@00000.w_0 + value: numpy.ndarray + init_num_rank(int) : pre-trained weight at init_num_rank gpu device. + new_num_rank(int) : want to rearrange weight to new_num_rank gpu device. + + Returns: + dict: rearranged weight for new_num_rank gpu device. + """ + + ret_dict = {} + if init_num_rank == new_num_rank: + return weight_dict + + if len(weight_dict) == 0: + return weight_dict + + # generate name format + name_format = list(weight_dict.keys())[0] + name_format = name_format.split('.') + name_format[0] = name_format[0].split('@') + name_format[0][-1] = '%05d' + name_format[0] = '@'.join(name_format[0]) + name_format = '.'.join(name_format) + + # calculate num class of pretrain shard + # num class of new shard + num_class = sum([ + w.shape[1] if len(w.shape) == 2 else len(w) + for _, w in weight_dict.items() + ]) + init_nshard = (num_class + init_num_rank - 1) // init_num_rank + new_nshard = (num_class + new_num_rank - 1) // new_num_rank + + if new_nshard * (new_num_rank - 1) >= num_class: + raise ValueError( + "num class {} cann't be rationally splited by num rank {}".format( + num_class, new_num_rank)) + + if init_num_rank > new_num_rank: + for new_idx in range(new_num_rank): + start = new_idx * new_nshard + end = min((new_idx + 1) * new_nshard - 1, num_class - 1) + init_shard_idx_start = start // init_nshard + init_shard_idx_end = end // init_nshard + + weight_list = [] + for init_idx in range(init_shard_idx_start, + init_shard_idx_end + 1): + name = name_format % init_idx + init_weight = weight_dict[name] + s = max(start - init_idx * init_nshard, 0) + if init_idx == init_shard_idx_end: + e = min(end - init_idx * init_nshard + 1, init_nshard) + else: + e = init_nshard + if len(init_weight.shape) == 2: + weight_list.append(init_weight[:, s:e]) + else: + weight_list.append(init_weight[s:e]) + + name = name_format % new_idx + # for 2-dimention, we concat at axis=1, + # else for 1-dimention, we concat at axis=0 + ret_dict[name] = np.concatenate( + 
weight_list, axis=len(weight_list[0].shape) - 1) + else: + for new_idx in range(new_num_rank): + start = new_idx * new_nshard + end = min((new_idx + 1) * new_nshard - 1, num_class - 1) + init_shard_idx_start = start // init_nshard + init_shard_idx_end = end // init_nshard + + if init_shard_idx_start == init_shard_idx_end: + name = name_format % init_shard_idx_start + init_weight = weight_dict[name] + init_start = init_shard_idx_start * init_nshard + s = max(start - init_start, 0) + e = min((init_shard_idx_start + 1) * init_nshard, + end) - init_start + 1 + if len(init_weight.shape) == 2: + new_weight = init_weight[:, s:e] + else: + new_weight = init_weight[s:e] + else: + # init_shard_idx_start + 1 == init_shard_idx_end + name = name_format % init_shard_idx_start + init_weight = weight_dict[name] + init_start = init_shard_idx_start * init_nshard + s = max(start - init_start, 0) + if len(init_weight.shape) == 2: + new_weight = init_weight[:, s:] + else: + new_weight = init_weight[s:] + + e = end - (init_shard_idx_end * init_nshard) + 1 + if e > 0: + name = name_format % init_shard_idx_end + init_weight = weight_dict[name] + if len(init_weight.shape) == 2: + new_weight2 = init_weight[:, :e] + else: + new_weight2 = init_weight[:e] + + new_weight = np.concatenate( + [new_weight, new_weight2], + axis=len(new_weight.shape) - 1) + name = name_format % new_idx + ret_dict[name] = new_weight + + return ret_dict diff --git a/insightface/recognition/arcface_paddle/utils/verification.py b/insightface/recognition/arcface_paddle/utils/verification.py new file mode 100644 index 0000000000000000000000000000000000000000..8ac9c43d4b4a6d06c20e11d2910e28714ca21662 --- /dev/null +++ b/insightface/recognition/arcface_paddle/utils/verification.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LFW-style face-verification metrics.

Pairs of embeddings are compared by squared Euclidean distance. K-fold
cross-validation picks a decision threshold on the training folds and
reports accuracy (and VAL at a target FAR) on the held-out fold.
"""

import datetime
import os

import numpy as np
import sklearn
from scipy import interpolate
# Bugfix: `import sklearn` alone does not guarantee the `preprocessing`
# submodule is loaded, so `sklearn.preprocessing.normalize` could raise
# AttributeError. Import the submodule explicitly.
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold


class LFold:
    """KFold wrapper that degenerates to a single train==test split when
    fewer than two folds are requested."""

    def __init__(self, n_splits=2, shuffle=False):
        self.n_splits = n_splits
        if self.n_splits > 1:
            self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)

    def split(self, indices):
        if self.n_splits > 1:
            return self.k_fold.split(indices)
        else:
            # Single "fold": train and test on the same indices.
            return [(indices, indices)]


def calculate_roc(thresholds,
                  embeddings1,
                  embeddings2,
                  actual_issame,
                  nrof_folds=10,
                  pca=0):
    """Compute per-threshold TPR/FPR and per-fold accuracy.

    Args:
        thresholds: 1-D array of distance thresholds to sweep.
        embeddings1, embeddings2: (n_pairs, dim) arrays, one row per pair side.
        actual_issame: boolean array, True where the pair is the same identity.
        nrof_folds: number of cross-validation folds.
        pca: if > 0, fit a PCA of that many components per fold before
            computing distances.

    Returns:
        (tpr, fpr, accuracy): mean TPR/FPR across folds per threshold, and
        per-fold accuracy at each fold's best training threshold.
    """
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = LFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    indices = np.arange(nrof_pairs)

    if pca == 0:
        # Distances can be precomputed once when no per-fold PCA is applied.
        diff = np.subtract(embeddings1, embeddings2)
        dist = np.sum(np.square(diff), 1)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if pca > 0:
            # Fit PCA on the training pairs only, then project and
            # L2-normalise all embeddings before computing distances.
            print('doing pca on', fold_idx)
            embed1_train = embeddings1[train_set]
            embed2_train = embeddings2[train_set]
            _embed_train = np.concatenate((embed1_train, embed2_train), axis=0)
            pca_model = PCA(n_components=pca)
            pca_model.fit(_embed_train)
            embed1 = pca_model.transform(embeddings1)
            embed2 = pca_model.transform(embeddings2)
            embed1 = preprocessing.normalize(embed1)
            embed2 = preprocessing.normalize(embed2)
            diff = np.subtract(embed1, embed2)
            dist = np.sum(np.square(diff), 1)

        # Find the best threshold for the fold (max accuracy on train split).
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(
                threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[
                fold_idx, threshold_idx], _ = calculate_accuracy(
                    threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(
            thresholds[best_threshold_index], dist[test_set],
            actual_issame[test_set])

    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy


def calculate_accuracy(threshold, dist, actual_issame):
    """Return (tpr, fpr, accuracy) of the rule `dist < threshold` => same."""
    predict_issame = np.less(dist, threshold)
    tp = np.sum(np.logical_and(predict_issame, actual_issame))
    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    tn = np.sum(
        np.logical_and(
            np.logical_not(predict_issame), np.logical_not(actual_issame)))
    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))

    # Guard against empty positive/negative sets.
    tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn)
    fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn)
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc


def calculate_val(thresholds,
                  embeddings1,
                  embeddings2,
                  actual_issame,
                  far_target,
                  nrof_folds=10):
    """Estimate VAL (validation rate) at a target FAR via cross-validation.

    Per fold, the threshold achieving `far_target` on the training split is
    found by linear interpolation over `thresholds`, then VAL/FAR are
    measured on the test split.

    Returns:
        (val_mean, val_std, far_mean) across folds.
    """
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = LFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):

        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(
                threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            # Target FAR unreachable on this fold: fall back to threshold 0
            # (accept nothing), yielding val = far = 0 for the fold.
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(
            threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean


def calculate_val_far(threshold, dist, actual_issame):
    """Return (val, far) of the rule `dist < threshold` => accept.

    val = accepted genuine pairs / genuine pairs;
    far = accepted impostor pairs / impostor pairs.
    The 1e-8 terms avoid division by zero on degenerate splits.
    """
    predict_issame = np.less(dist, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(
        np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)
    n_diff = np.sum(np.logical_not(actual_issame))
    val = float(true_accept) / (float(n_same) + 1e-8)
    far = float(false_accept) / (float(n_diff) + 1e-8)
    return val, far


def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0):
    """Full verification protocol over interleaved pair embeddings.

    `embeddings` holds pairs interleaved (even rows = first image of a pair,
    odd rows = second). Returns ROC curves, per-fold accuracy, and
    VAL/FAR at FAR = 1e-3.
    """
    # Calculate evaluation metrics
    thresholds = np.arange(0, 4, 0.01)
    embeddings1 = embeddings[0::2]
    embeddings2 = embeddings[1::2]
    tpr, fpr, accuracy = calculate_roc(
        thresholds,
        embeddings1,
        embeddings2,
        np.asarray(actual_issame),
        nrof_folds=nrof_folds,
        pca=pca)
    # Finer threshold grid for the FAR-targeted sweep.
    thresholds = np.arange(0, 4, 0.001)
    val, val_std, far = calculate_val(
        thresholds,
        embeddings1,
        embeddings2,
        np.asarray(actual_issame),
        1e-3,
        nrof_folds=nrof_folds)
    return tpr, fpr, accuracy, val, val_std, far
0000000000000000000000000000000000000000..ba370cc6d57cafde6b30e49837f6c20a5a21cf41 --- /dev/null +++ b/insightface/recognition/arcface_torch/README.md @@ -0,0 +1,218 @@ +# Distributed Arcface Training in Pytorch + +The "arcface_torch" repository is the official implementation of the ArcFace algorithm. It supports distributed and sparse training with multiple distributed training examples, including several memory-saving techniques such as mixed precision training and gradient checkpointing. It also supports training for ViT models and datasets including WebFace42M and Glint360K, two of the largest open-source datasets. Additionally, the repository comes with a built-in tool for converting to ONNX format, making it easy to submit to MFR evaluation systems. + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/killing-two-birds-with-one-stone-efficient/face-verification-on-ijb-c)](https://paperswithcode.com/sota/face-verification-on-ijb-c?p=killing-two-birds-with-one-stone-efficient) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/killing-two-birds-with-one-stone-efficient/face-verification-on-ijb-b)](https://paperswithcode.com/sota/face-verification-on-ijb-b?p=killing-two-birds-with-one-stone-efficient) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/killing-two-birds-with-one-stone-efficient/face-verification-on-agedb-30)](https://paperswithcode.com/sota/face-verification-on-agedb-30?p=killing-two-birds-with-one-stone-efficient) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/killing-two-birds-with-one-stone-efficient/face-verification-on-cfp-fp)](https://paperswithcode.com/sota/face-verification-on-cfp-fp?p=killing-two-birds-with-one-stone-efficient) + +## Requirements + +To avail the latest features of PyTorch, we have upgraded to version 1.12.0. 
+
+- Install [PyTorch](https://pytorch.org/get-started/previous-versions/) (torch>=1.12.0).
+- (Optional) Install [DALI](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/); see our doc [install_dali.md](docs/install_dali.md).
+- `pip install -r requirement.txt`.
+
+## How to Train
+
+To train a model, execute the `train_v2.py` script with the path to the configuration files. The sample commands provided below demonstrate the process of conducting distributed training.
+
+### 1. To run on one GPU:
+
+```shell
+python train_v2.py configs/ms1mv3_r50_onegpu
+```
+
+Note:
+It is not recommended to use a single GPU for training, as this may result in longer training times and suboptimal performance. For best results, we suggest using multiple GPUs or a GPU cluster.
+
+
+### 2. To run on a machine with 8 GPUs:
+
+```shell
+torchrun --nproc_per_node=8 train_v2.py configs/ms1mv3_r50
+```
+
+### 3. To run on 2 machines with 8 GPUs each:
+
+Node 0:
+
+```shell
+torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr="ip1" --master_port=12581 train_v2.py configs/wf42m_pfc02_16gpus_r100
+```
+
+Node 1:
+
+```shell
+torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr="ip1" --master_port=12581 train_v2.py configs/wf42m_pfc02_16gpus_r100
+```
+
+### 4. 
Run ViT-B on a machine with 24k batchsize: + +```shell +torchrun --nproc_per_node=8 train_v2.py configs/wf42m_pfc03_40epoch_8gpu_vit_b +``` + + +## Download Datasets or Prepare Datasets +- [MS1MV2](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_#ms1m-arcface-85k-ids58m-images-57) (87k IDs, 5.8M images) +- [MS1MV3](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_#ms1m-retinaface) (93k IDs, 5.2M images) +- [Glint360K](https://github.com/deepinsight/insightface/tree/master/recognition/partial_fc#4-download) (360k IDs, 17.1M images) +- [WebFace42M](docs/prepare_webface42m.md) (2M IDs, 42.5M images) +- [Your Dataset, Click Here!](docs/prepare_custom_dataset.md) + +Note: +If you want to use DALI for data reading, please use the script 'scripts/shuffle_rec.py' to shuffle the InsightFace style rec before using it. +Example: + +`python scripts/shuffle_rec.py ms1m-retinaface-t1` + +You will get the "shuffled_ms1m-retinaface-t1" folder, where the samples in the "train.rec" file are shuffled. + + +## Model Zoo + +- The models are available for non-commercial research purposes only. +- All models can be found in here. +- [Baidu Yun Pan](https://pan.baidu.com/s/1CL-l4zWqsI1oDuEEYVhj-g): e8pw +- [OneDrive](https://1drv.ms/u/s!AswpsDO2toNKq0lWY69vN58GR6mw?e=p9Ov5d) + +### Performance on IJB-C and [**ICCV2021-MFR**](https://github.com/deepinsight/insightface/blob/master/challenges/mfr/README.md) + +ICCV2021-MFR testset consists of non-celebrities so we can ensure that it has very few overlap with public available face +recognition training set, such as MS1M and CASIA as they mostly collected from online celebrities. +As the result, we can evaluate the FAIR performance for different algorithms. + +For **ICCV2021-MFR-ALL** set, TAR is measured on all-to-all 1:1 protocal, with FAR less than 0.000001(e-6). The +globalised multi-racial testset contains 242,143 identities and 1,624,305 images. + + +#### 1. 
Training on Single-Host GPU + +| Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | log | +|:---------------|:--------------------|:------------|:------------|:------------|:------------------------------------------------------------------------------------------------------------------------------------| +| MS1MV2 | mobilefacenet-0.45G | 62.07 | 93.61 | 90.28 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv2_mbf/training.log) | +| MS1MV2 | r50 | 75.13 | 95.97 | 94.07 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv2_r50/training.log) | +| MS1MV2 | r100 | 78.12 | 96.37 | 94.27 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv2_r100/training.log) | +| MS1MV3 | mobilefacenet-0.45G | 63.78 | 94.23 | 91.33 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_mbf/training.log) | +| MS1MV3 | r50 | 79.14 | 96.37 | 94.47 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_r50/training.log) | +| MS1MV3 | r100 | 81.97 | 96.85 | 95.02 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/ms1mv3_r100/training.log) | +| Glint360K | mobilefacenet-0.45G | 70.18 | 95.04 | 92.62 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_mbf/training.log) | +| Glint360K | r50 | 86.34 | 97.16 | 95.81 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_r50/training.log) | +| Glint360k | r100 | 89.52 | 97.55 | 96.38 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_r100/training.log) | +| WF4M | r100 | 89.87 | 97.19 | 95.48 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf4m_r100/training.log) | +| WF12M-PFC-0.2 | r100 | 94.75 | 97.60 | 95.90 
| [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf12m_pfc02_r100/training.log) | +| WF12M-PFC-0.3 | r100 | 94.71 | 97.64 | 96.01 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf12m_pfc03_r100/training.log) | +| WF12M | r100 | 94.69 | 97.59 | 95.97 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf12m_r100/training.log) | +| WF42M-PFC-0.2 | r100 | 96.27 | 97.70 | 96.31 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf42m_pfc02_r100/training.log) | +| WF42M-PFC-0.2 | ViT-T-1.5G | 92.04 | 97.27 | 95.68 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/wf42m_pfc02_40epoch_8gpu_vit_t/training.log) | +| WF42M-PFC-0.3 | ViT-B-11G | 97.16 | 97.91 | 97.05 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_b_8gpu/training.log) | + +#### 2. Training on Multi-Host GPU + +| Datasets | Backbone(bs*gpus) | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | Throughout | log | +|:-----------------|:------------------|:------------|:------------|:------------|:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------| +| WF42M-PFC-0.2 | r50(512*8) | 93.83 | 97.53 | 96.16 | ~5900 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r50_bs4k_pfc02/training.log) | +| WF42M-PFC-0.2 | r50(512*16) | 93.96 | 97.46 | 96.12 | ~11000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r50_lr01_pfc02_bs8k_16gpus/training.log) | +| WF42M-PFC-0.2 | r50(128*32) | 94.04 | 97.48 | 95.94 | ~17000 | click me | +| WF42M-PFC-0.2 | r100(128*16) | 96.28 | 97.80 | 96.57 | ~5200 | click me | +| WF42M-PFC-0.2 | r100(256*16) | 96.69 | 97.85 | 96.63 | ~5200 | [click 
me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/webface42m_r100_bs4k_pfc02/training.log) | +| WF42M-PFC-0.0018 | r100(512*32) | 93.08 | 97.51 | 95.88 | ~10000 | click me | +| WF42M-PFC-0.2 | r100(128*32) | 96.57 | 97.83 | 96.50 | ~9800 | click me | + +`r100(128*32)` means backbone is r100, batchsize per gpu is 128, the number of gpus is 32. + + + +#### 3. ViT For Face Recognition + +| Datasets | Backbone(bs) | FLOPs | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | Throughout | log | +|:--------------|:--------------|:------|:------------|:------------|:------------|:-----------|:-----------------------------------------------------------------------------------------------------------------------------| +| WF42M-PFC-0.3 | r18(128*32) | 2.6 | 79.13 | 95.77 | 93.36 | - | click me | +| WF42M-PFC-0.3 | r50(128*32) | 6.3 | 94.03 | 97.48 | 95.94 | - | click me | +| WF42M-PFC-0.3 | r100(128*32) | 12.1 | 96.69 | 97.82 | 96.45 | - | click me | +| WF42M-PFC-0.3 | r200(128*32) | 23.5 | 97.70 | 97.97 | 96.93 | - | click me | +| WF42M-PFC-0.3 | VIT-T(384*64) | 1.5 | 92.24 | 97.31 | 95.97 | ~35000 | click me | +| WF42M-PFC-0.3 | VIT-S(384*64) | 5.7 | 95.87 | 97.73 | 96.57 | ~25000 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_s_64gpu/training.log) | +| WF42M-PFC-0.3 | VIT-B(384*64) | 11.4 | 97.42 | 97.90 | 97.04 | ~13800 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_b_64gpu/training.log) | +| WF42M-PFC-0.3 | VIT-L(384*64) | 25.3 | 97.85 | 98.00 | 97.23 | ~9406 | [click me](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/pfc03_wf42m_vit_l_64gpu/training.log) | + +`WF42M` means WebFace42M, `PFC-0.3` means negivate class centers sample rate is 0.3. + +#### 4. 
Noisy Datasets + +| Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | log | +|:-------------------------|:---------|:------------|:------------|:------------|:---------| +| WF12M-Flip(40%) | r50 | 43.87 | 88.35 | 80.78 | click me | +| WF12M-Flip(40%)-PFC-0.1* | r50 | 80.20 | 96.11 | 93.79 | click me | +| WF12M-Conflict | r50 | 79.93 | 95.30 | 91.56 | click me | +| WF12M-Conflict-PFC-0.3* | r50 | 91.68 | 97.28 | 95.75 | click me | + +`WF12M` means WebFace12M, `+PFC-0.1*` denotes additional abnormal inter-class filtering. + + + +## Speed Benchmark +
+ + +**Arcface-Torch** is an efficient tool for training large-scale face recognition training sets. When the number of classes in the training sets exceeds one million, the partial FC sampling strategy maintains the same accuracy while providing several times faster training performance and lower GPU memory utilization. The partial FC is a sparse variant of the model parallel architecture for large-scale face recognition, utilizing a sparse softmax that dynamically samples a subset of class centers for each training batch. During each iteration, only a sparse portion of the parameters are updated, leading to a significant reduction in GPU memory requirements and computational demands. With the partial FC approach, it is possible to train sets with up to 29 million identities, the largest to date. Furthermore, the partial FC method supports multi-machine distributed training and mixed precision training. + + + +More details see +[speed_benchmark.md](docs/speed_benchmark.md) in docs. + +> 1. Training Speed of Various Parallel Techniques (Samples per Second) on a Tesla V100 32GB x 8 System (Higher is Optimal) + +`-` means training failed because of gpu memory limitations. + +| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 | +|:--------------------------------|:--------------|:---------------|:---------------| +| 125000 | 4681 | 4824 | 5004 | +| 1400000 | **1672** | 3043 | 4738 | +| 5500000 | **-** | **1389** | 3975 | +| 8000000 | **-** | **-** | 3565 | +| 16000000 | **-** | **-** | 2679 | +| 29000000 | **-** | **-** | **1855** | + +> 2. 
GPU Memory Utilization of Various Parallel Techniques (MB per GPU) on a Tesla V100 32GB x 8 System (Lower is Optimal) + +| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 | +|:--------------------------------|:--------------|:---------------|:---------------| +| 125000 | 7358 | 5306 | 4868 | +| 1400000 | 32252 | 11178 | 6056 | +| 5500000 | **-** | 32188 | 9854 | +| 8000000 | **-** | **-** | 12310 | +| 16000000 | **-** | **-** | 19950 | +| 29000000 | **-** | **-** | 32324 | + + +## Citations + +``` +@inproceedings{deng2019arcface, + title={Arcface: Additive angular margin loss for deep face recognition}, + author={Deng, Jiankang and Guo, Jia and Xue, Niannan and Zafeiriou, Stefanos}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={4690--4699}, + year={2019} +} +@inproceedings{An_2022_CVPR, + author={An, Xiang and Deng, Jiankang and Guo, Jia and Feng, Ziyong and Zhu, XuHan and Yang, Jing and Liu, Tongliang}, + title={Killing Two Birds With One Stone: Efficient and Robust Training of Face Recognition CNNs by Partial FC}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month={June}, + year={2022}, + pages={4042-4051} +} +@inproceedings{zhu2021webface260m, + title={Webface260m: A benchmark unveiling the power of million-scale deep face recognition}, + author={Zhu, Zheng and Huang, Guan and Deng, Jiankang and Ye, Yun and Huang, Junjie and Chen, Xinze and Zhu, Jiagang and Yang, Tian and Lu, Jiwen and Du, Dalong and Zhou, Jie}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={10492--10502}, + year={2021} +} +``` diff --git a/insightface/recognition/arcface_torch/backbones/__init__.py b/insightface/recognition/arcface_torch/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b3eaa4945c86efa94fd0cf836cdb053ab7401d64 --- /dev/null +++ 
from .iresnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200
from .mobilefacenet import get_mbf


def get_model(name, **kwargs):
    """Factory for face-recognition backbones.

    Args:
        name: backbone identifier, e.g. "r50", "mbf", "vit_b".
        **kwargs: forwarded to the backbone constructor; "fp16" and
            "num_features" are read explicitly for the mbf/ViT variants
            (num_features defaults to 512).

    Returns:
        A randomly-initialised backbone module.

    Raises:
        ValueError: if `name` does not match any known backbone.
    """
    # resnet
    if name == "r18":
        return iresnet18(False, **kwargs)
    elif name == "r34":
        return iresnet34(False, **kwargs)
    elif name == "r50":
        return iresnet50(False, **kwargs)
    elif name == "r100":
        return iresnet100(False, **kwargs)
    elif name == "r200":
        return iresnet200(False, **kwargs)
    elif name == "r2060":
        # Lazily imported: iresnet2060 asserts on the torch version at import.
        from .iresnet2060 import iresnet2060
        return iresnet2060(False, **kwargs)

    elif name == "mbf":
        fp16 = kwargs.get("fp16", False)
        num_features = kwargs.get("num_features", 512)
        return get_mbf(fp16=fp16, num_features=num_features)

    elif name == "mbf_large":
        from .mobilefacenet import get_mbf_large
        fp16 = kwargs.get("fp16", False)
        num_features = kwargs.get("num_features", 512)
        return get_mbf_large(fp16=fp16, num_features=num_features)

    elif name == "vit_t":
        num_features = kwargs.get("num_features", 512)
        from .vit import VisionTransformer
        return VisionTransformer(
            img_size=112, patch_size=9, num_classes=num_features, embed_dim=256, depth=12,
            num_heads=8, drop_path_rate=0.1, norm_layer="ln", mask_ratio=0.1)

    elif name == "vit_t_dp005_mask0":  # For WebFace42M
        num_features = kwargs.get("num_features", 512)
        from .vit import VisionTransformer
        return VisionTransformer(
            img_size=112, patch_size=9, num_classes=num_features, embed_dim=256, depth=12,
            num_heads=8, drop_path_rate=0.05, norm_layer="ln", mask_ratio=0.0)

    elif name == "vit_s":
        num_features = kwargs.get("num_features", 512)
        from .vit import VisionTransformer
        return VisionTransformer(
            img_size=112, patch_size=9, num_classes=num_features, embed_dim=512, depth=12,
            num_heads=8, drop_path_rate=0.1, norm_layer="ln", mask_ratio=0.1)

    elif name == "vit_s_dp005_mask_0":  # For WebFace42M
        num_features = kwargs.get("num_features", 512)
        from .vit import VisionTransformer
        return VisionTransformer(
            img_size=112, patch_size=9, num_classes=num_features, embed_dim=512, depth=12,
            num_heads=8, drop_path_rate=0.05, norm_layer="ln", mask_ratio=0.0)

    elif name == "vit_b":
        # this is a feature
        num_features = kwargs.get("num_features", 512)
        from .vit import VisionTransformer
        return VisionTransformer(
            img_size=112, patch_size=9, num_classes=num_features, embed_dim=512, depth=24,
            num_heads=8, drop_path_rate=0.1, norm_layer="ln", mask_ratio=0.1, using_checkpoint=True)

    elif name == "vit_b_dp005_mask_005":  # For WebFace42M
        # this is a feature
        num_features = kwargs.get("num_features", 512)
        from .vit import VisionTransformer
        return VisionTransformer(
            img_size=112, patch_size=9, num_classes=num_features, embed_dim=512, depth=24,
            num_heads=8, drop_path_rate=0.05, norm_layer="ln", mask_ratio=0.05, using_checkpoint=True)

    elif name == "vit_l_dp005_mask_005":  # For WebFace42M
        # this is a feature
        num_features = kwargs.get("num_features", 512)
        from .vit import VisionTransformer
        return VisionTransformer(
            img_size=112, patch_size=9, num_classes=num_features, embed_dim=768, depth=24,
            num_heads=8, drop_path_rate=0.05, norm_layer="ln", mask_ratio=0.05, using_checkpoint=True)

    elif name == "vit_h":  # For WebFace42M
        num_features = kwargs.get("num_features", 512)
        from .vit import VisionTransformer
        return VisionTransformer(
            img_size=112, patch_size=9, num_classes=num_features, embed_dim=1024, depth=48,
            num_heads=8, drop_path_rate=0.1, norm_layer="ln", mask_ratio=0, using_checkpoint=True)

    else:
        # Was a bare ValueError() — name the offending argument for debugging.
        raise ValueError(f"unknown backbone name: {name!r}")
"""Improved ResNet ("IResNet") backbones for ArcFace face recognition.

Residual blocks are BN-Conv-BN-PReLU-Conv-BN with an identity (or 1x1-conv
downsample) shortcut and no activation after the addition. The stem is a
stride-1 3x3 conv, and the head is BN2d -> Dropout -> Linear -> BN1d
producing a `num_features`-dimensional embedding.
"""
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

__all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200']

# Module-level switch: when True (and the model is in training mode), each
# residual block recomputes its activations via gradient checkpointing,
# trading extra compute for lower activation memory.
using_ckpt = False


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding (bias-free; BN always follows)."""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=3,
                     stride=stride,
                     padding=dilation,
                     groups=groups,
                     bias=False,
                     dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution (bias-free)."""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=1,
                     stride=stride,
                     bias=False)


class IBasicBlock(nn.Module):
    """Improved-ResNet basic block: BN-Conv-BN-PReLU-Conv-BN + shortcut."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None,
                 groups=1, base_width=64, dilation=1):
        super(IBasicBlock, self).__init__()
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,)
        self.conv1 = conv3x3(inplanes, planes)
        self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,)
        self.prelu = nn.PReLU(planes)
        # Downsampling (if any) happens in the second conv.
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,)
        self.downsample = downsample
        self.stride = stride

    def forward_impl(self, x):
        identity = x
        out = self.bn1(x)
        out = self.conv1(out)
        out = self.bn2(out)
        out = self.prelu(out)
        out = self.conv2(out)
        out = self.bn3(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        return out

    def forward(self, x):
        # Checkpoint only while training — recomputation buys nothing at eval.
        if self.training and using_ckpt:
            return checkpoint(self.forward_impl, x)
        else:
            return self.forward_impl(x)


class IResNet(nn.Module):
    """IResNet backbone producing a `num_features`-dim embedding.

    With `fp16=True` the conv trunk runs under CUDA autocast while the final
    Linear + BN1d always execute in fp32. The final BN1d has a frozen,
    all-ones scale.
    """

    # Spatial area after the four stride-2 stages; assumes 112x112 inputs
    # (112 / 2**4 == 7) — TODO confirm against the training pipeline.
    fc_scale = 7 * 7

    def __init__(self,
                 block, layers, dropout=0, num_features=512, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False):
        super(IResNet, self).__init__()
        self.extra_gflops = 0.0
        self.fp16 = fp16
        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        # Stem: stride-1 3x3 conv (no 7x7/maxpool as in vanilla ResNet), so
        # the stem keeps full input resolution; all downsampling is in stages.
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
        self.prelu = nn.PReLU(self.inplanes)
        self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
        self.layer2 = self._make_layer(block,
                                       128,
                                       layers[1],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block,
                                       256,
                                       layers[2],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block,
                                       512,
                                       layers[3],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,)
        self.dropout = nn.Dropout(p=dropout, inplace=True)
        self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features)
        self.features = nn.BatchNorm1d(num_features, eps=1e-05)
        # Freeze the output BN scale at 1.0; only its bias / running stats learn.
        nn.init.constant_(self.features.weight, 1.0)
        self.features.weight.requires_grad = False

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, 0, 0.1)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        if zero_init_residual:
            # Start each residual branch at zero so blocks begin as identity.
            for m in self.modules():
                if isinstance(m, IBasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Stack `blocks` IBasicBlocks; the first one may downsample."""
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ),
            )
        layers = []
        layers.append(
            block(self.inplanes, planes, stride, downsample, self.groups,
                  self.base_width, previous_dilation))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(
                block(self.inplanes,
                      planes,
                      groups=self.groups,
                      base_width=self.base_width,
                      dilation=self.dilation))

        return nn.Sequential(*layers)

    def forward(self, x):
        # Conv trunk in (optional) mixed precision; embedding head in fp32.
        with torch.cuda.amp.autocast(self.fp16):
            x = self.conv1(x)
            x = self.bn1(x)
            x = self.prelu(x)
            x = self.layer1(x)
            x = self.layer2(x)
            x = self.layer3(x)
            x = self.layer4(x)
            x = self.bn2(x)
            x = torch.flatten(x, 1)
            x = self.dropout(x)
        x = self.fc(x.float() if self.fp16 else x)
        x = self.features(x)
        return x


def _iresnet(arch, block, layers, pretrained, progress, **kwargs):
    """Internal builder shared by the iresnet* factories."""
    model = IResNet(block, layers, **kwargs)
    if pretrained:
        # No pretrained weights are distributed with this module.
        # Was a bare ValueError() — name the architecture for easier debugging.
        raise ValueError("no pretrained weights are available for {}".format(arch))
    return model


def iresnet18(pretrained=False, progress=True, **kwargs):
    """IResNet-18 (stage depths 2/2/2/2)."""
    return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained,
                    progress, **kwargs)


def iresnet34(pretrained=False, progress=True, **kwargs):
    """IResNet-34 (stage depths 3/4/6/3)."""
    return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained,
                    progress, **kwargs)


def iresnet50(pretrained=False, progress=True, **kwargs):
    """IResNet-50 (stage depths 3/4/14/3)."""
    return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained,
                    progress, **kwargs)


def iresnet100(pretrained=False, progress=True, **kwargs):
    """IResNet-100 (stage depths 3/13/30/3)."""
    return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained,
                    progress, **kwargs)


def iresnet200(pretrained=False, progress=True, **kwargs):
    """IResNet-200 (stage depths 6/26/60/6)."""
    return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained,
                    progress, **kwargs)
"""IResNet-2060: an extremely deep IResNet variant whose two huge middle
stages run under segmented gradient checkpointing during training."""
import re

import torch
from torch import nn

# checkpoint_sequential (used below) requires torch >= 1.8.1.
# Bugfix: the original check was `assert torch.__version__ >= "1.8.1"`, a
# lexicographic *string* comparison that wrongly rejects torch 1.10+
# ("1.10.0" < "1.8.1" as strings). Parse the numeric components instead.
_TORCH_VERSION = tuple(int(p) for p in re.findall(r"\d+", torch.__version__)[:3])
assert _TORCH_VERSION >= (1, 8, 1), "iresnet2060 requires torch >= 1.8.1"

from torch.utils.checkpoint import checkpoint_sequential

__all__ = ['iresnet2060']


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding (bias-free; BN always follows)."""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=3,
                     stride=stride,
                     padding=dilation,
                     groups=groups,
                     bias=False,
                     dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution (bias-free)."""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=1,
                     stride=stride,
                     bias=False)


class IBasicBlock(nn.Module):
    """Improved-ResNet basic block: BN-Conv-BN-PReLU-Conv-BN + shortcut."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None,
                 groups=1, base_width=64, dilation=1):
        super(IBasicBlock, self).__init__()
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05, )
        self.conv1 = conv3x3(inplanes, planes)
        self.bn2 = nn.BatchNorm2d(planes, eps=1e-05, )
        self.prelu = nn.PReLU(planes)
        # Downsampling (if any) happens in the second conv.
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn3 = nn.BatchNorm2d(planes, eps=1e-05, )
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x
        out = self.bn1(x)
        out = self.conv1(out)
        out = self.bn2(out)
        out = self.prelu(out)
        out = self.conv2(out)
        out = self.bn3(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        return out


class IResNet(nn.Module):
    """IResNet backbone producing a `num_features`-dim embedding.

    Identical head to the standard IResNet (BN2d -> Dropout -> Linear ->
    BN1d with frozen all-ones scale); the middle stages are executed via
    `checkpoint_sequential` while training to bound activation memory.
    """

    # Spatial area after the four stride-2 stages; assumes 112x112 inputs
    # (112 / 2**4 == 7) — TODO confirm against the training pipeline.
    fc_scale = 7 * 7

    def __init__(self,
                 block, layers, dropout=0, num_features=512, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False):
        super(IResNet, self).__init__()
        self.fp16 = fp16
        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        # Stem: stride-1 3x3 conv; all downsampling happens in the stages.
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
        self.prelu = nn.PReLU(self.inplanes)
        self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
        self.layer2 = self._make_layer(block,
                                       128,
                                       layers[1],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block,
                                       256,
                                       layers[2],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block,
                                       512,
                                       layers[3],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05, )
        self.dropout = nn.Dropout(p=dropout, inplace=True)
        self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features)
        self.features = nn.BatchNorm1d(num_features, eps=1e-05)
        # Freeze the output BN scale at 1.0; only its bias / running stats learn.
        nn.init.constant_(self.features.weight, 1.0)
        self.features.weight.requires_grad = False

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, 0, 0.1)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        if zero_init_residual:
            # Start each residual branch at zero so blocks begin as identity.
            for m in self.modules():
                if isinstance(m, IBasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Stack `blocks` IBasicBlocks; the first one may downsample."""
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ),
            )
        layers = []
        layers.append(
            block(self.inplanes, planes, stride, downsample, self.groups,
                  self.base_width, previous_dilation))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(
                block(self.inplanes,
                      planes,
                      groups=self.groups,
                      base_width=self.base_width,
                      dilation=self.dilation))

        return nn.Sequential(*layers)

    def checkpoint(self, func, num_seg, x):
        """Run `func` as `num_seg` checkpointed segments while training;
        call it directly at eval time (recomputation buys nothing then)."""
        if self.training:
            return checkpoint_sequential(func, num_seg, x)
        else:
            return func(x)

    def forward(self, x):
        # Conv trunk in (optional) mixed precision; embedding head in fp32.
        with torch.cuda.amp.autocast(self.fp16):
            x = self.conv1(x)
            x = self.bn1(x)
            x = self.prelu(x)
            x = self.layer1(x)
            # The two huge middle stages are gradient-checkpointed in 20/100
            # segments to keep training-time activation memory bounded.
            x = self.checkpoint(self.layer2, 20, x)
            x = self.checkpoint(self.layer3, 100, x)
            x = self.layer4(x)
            x = self.bn2(x)
            x = torch.flatten(x, 1)
            x = self.dropout(x)
        x = self.fc(x.float() if self.fp16 else x)
        x = self.features(x)
        return x


def _iresnet(arch, block, layers, pretrained, progress, **kwargs):
    """Internal builder shared by the factory below."""
    model = IResNet(block, layers, **kwargs)
    if pretrained:
        # No pretrained weights are distributed with this module.
        # Was a bare ValueError() — name the architecture for easier debugging.
        raise ValueError("no pretrained weights are available for {}".format(arch))
    return model


def iresnet2060(pretrained=False, progress=True, **kwargs):
    """IResNet-2060 (stage depths 3 / 128 / 896 / 3)."""
    return _iresnet('iresnet2060', IBasicBlock, [3, 128, 1024 - 128, 3], pretrained, progress, **kwargs)
# --- mobilefacenet.py (continued): MobileFaceNet building blocks ---
class Flatten(Module):
    """Flatten all dimensions after the batch dimension."""

    def forward(self, x):
        return x.view(x.size(0), -1)


class ConvBlock(Module):
    """Conv2d -> BatchNorm2d -> PReLU."""

    def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
        super(ConvBlock, self).__init__()
        self.layers = nn.Sequential(
            Conv2d(in_c, out_c, kernel, groups=groups, stride=stride, padding=padding, bias=False),
            BatchNorm2d(num_features=out_c),
            PReLU(num_parameters=out_c)
        )

    def forward(self, x):
        return self.layers(x)


class LinearBlock(Module):
    """Conv2d -> BatchNorm2d with no activation ("linear" bottleneck)."""

    def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
        super(LinearBlock, self).__init__()
        self.layers = nn.Sequential(
            Conv2d(in_c, out_c, kernel, stride, padding, groups=groups, bias=False),
            BatchNorm2d(num_features=out_c)
        )

    def forward(self, x):
        return self.layers(x)


class DepthWise(Module):
    """Inverted residual: 1x1 expand -> depthwise conv -> 1x1 project.

    ``groups`` is the expanded channel count; when ``residual`` is True the
    input is added back (requires in_c == out_c and stride 1).
    """

    def __init__(self, in_c, out_c, residual=False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1):
        super(DepthWise, self).__init__()
        self.residual = residual
        self.layers = nn.Sequential(
            ConvBlock(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)),
            ConvBlock(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride),
            LinearBlock(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
        )

    def forward(self, x):
        short_cut = None
        if self.residual:
            short_cut = x
        x = self.layers(x)
        if self.residual:
            output = short_cut + x
        else:
            output = x
        return output


class Residual(Module):
    """A stack of ``num_block`` stride-1 residual DepthWise blocks."""

    def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)):
        super(Residual, self).__init__()
        modules = []
        for _ in range(num_block):
            modules.append(DepthWise(c, c, True, kernel, stride, padding, groups))
        self.layers = Sequential(*modules)

    def forward(self, x):
        return self.layers(x)


class GDC(Module):
    """Global depthwise convolution head: 7x7 DW conv -> FC -> BN."""

    def __init__(self, embedding_size):
        super(GDC, self).__init__()
        self.layers = nn.Sequential(
            LinearBlock(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)),
            Flatten(),
            Linear(512, embedding_size, bias=False),
            BatchNorm1d(embedding_size))

    def forward(self, x):
        return self.layers(x)


class MobileFaceNet(Module):
    """MobileFaceNet embedding network for 112x112 face crops.

    ``blocks`` gives the residual depth of the four stages and ``scale``
    widens the channel counts (2 for the standard net, 4 for the large one).
    """

    def __init__(self, fp16=False, num_features=512, blocks=(1, 4, 6, 2), scale=2):
        super(MobileFaceNet, self).__init__()
        self.scale = scale
        self.fp16 = fp16
        self.layers = nn.ModuleList()
        # Stem: 112 -> 56.
        self.layers.append(
            ConvBlock(3, 64 * self.scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1))
        )
        if blocks[0] == 1:
            self.layers.append(
                ConvBlock(64 * self.scale, 64 * self.scale, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64)
            )
        else:
            self.layers.append(
                Residual(64 * self.scale, num_block=blocks[0], groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
            )

        # Three downsampling stages: 56 -> 28 -> 14 -> 7.
        self.layers.extend(
            [
                DepthWise(64 * self.scale, 64 * self.scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128),
                Residual(64 * self.scale, num_block=blocks[1], groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
                DepthWise(64 * self.scale, 128 * self.scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256),
                Residual(128 * self.scale, num_block=blocks[2], groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
                DepthWise(128 * self.scale, 128 * self.scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512),
                Residual(128 * self.scale, num_block=blocks[3], groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)),
            ])

        self.conv_sep = ConvBlock(128 * self.scale, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
        self.features = GDC(num_features)
        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming init for convs/linears, unit BN scale, zero biases."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        with torch.cuda.amp.autocast(self.fp16):
            for func in self.layers:
                x = func(x)
        # Head runs in fp32 regardless of autocast.
        x = self.conv_sep(x.float() if self.fp16 else x)
        x = self.features(x)
        return x


def get_mbf(fp16, num_features, blocks=(1, 4, 6, 2), scale=2):
    """Standard MobileFaceNet (scale 2)."""
    return MobileFaceNet(fp16, num_features, blocks, scale=scale)


def get_mbf_large(fp16, num_features, blocks=(2, 8, 12, 4), scale=4):
    """Large MobileFaceNet (deeper stages, scale 4)."""
    return MobileFaceNet(fp16, num_features, blocks, scale=scale)
# --- insightface/recognition/arcface_torch/backbones/vit.py ---
# Vision Transformer backbone for face recognition.  The only timm symbols
# used are DropPath, to_2tuple and trunc_normal_, so fall back to local /
# torch equivalents when timm is not installed (removes a hard dependency).
try:
    from timm.models.layers import DropPath, to_2tuple, trunc_normal_
except ImportError:
    from torch.nn.init import trunc_normal_

    def to_2tuple(value):
        """Return value as a 2-tuple (timm-compatible fallback)."""
        return tuple(value) if isinstance(value, (tuple, list)) else (value, value)

    class DropPath(nn.Module):
        """Per-sample stochastic depth; identity at p == 0 or in eval mode."""

        def __init__(self, drop_prob: float = 0.):
            super().__init__()
            self.drop_prob = drop_prob

        def forward(self, x):
            if self.drop_prob == 0. or not self.training:
                return x
            keep_prob = 1. - self.drop_prob
            mask = x.new_empty((x.shape[0],) + (1,) * (x.ndim - 1)).bernoulli_(keep_prob)
            return x * mask / keep_prob


class Mlp(nn.Module):
    """Transformer feed-forward block: Linear -> act -> Linear with dropout."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU6, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class VITBatchNorm(nn.Module):
    """BatchNorm1d wrapper used as a token-wise norm alternative to LayerNorm."""

    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.bn = nn.BatchNorm1d(num_features=num_features)

    def forward(self, x):
        return self.bn(x)


class Attention(nn.Module):
    """Multi-head self-attention; softmax is computed in fp32 under autocast."""

    def __init__(self,
                 dim: int,
                 num_heads: int = 8,
                 qkv_bias: bool = False,
                 qk_scale: Optional[None] = None,
                 attn_drop: float = 0.,
                 proj_drop: float = 0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):

        with torch.cuda.amp.autocast(True):
            batch_size, num_token, embed_dim = x.shape
            # qkv is [3, batch_size, num_heads, num_token, embed_dim // num_heads]
            qkv = self.qkv(x).reshape(
                batch_size, num_token, 3, self.num_heads, embed_dim // self.num_heads).permute(2, 0, 3, 1, 4)
        # Attention scores/softmax in fp32 for numerical stability.
        with torch.cuda.amp.autocast(False):
            q, k, v = qkv[0].float(), qkv[1].float(), qkv[2].float()
            attn = (q @ k.transpose(-2, -1)) * self.scale
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = (attn @ v).transpose(1, 2).reshape(batch_size, num_token, embed_dim)
        with torch.cuda.amp.autocast(True):
            x = self.proj(x)
            x = self.proj_drop(x)
        return x


class Block(nn.Module):
    """Pre-norm transformer block: x + attn(norm(x)); x + mlp(norm(x))."""

    def __init__(self,
                 dim: int,
                 num_heads: int,
                 num_patches: int,
                 mlp_ratio: float = 4.,
                 qkv_bias: bool = False,
                 qk_scale: Optional[None] = None,
                 drop: float = 0.,
                 attn_drop: float = 0.,
                 drop_path: float = 0.,
                 act_layer: Callable = nn.ReLU6,
                 norm_layer: str = "ln",
                 patch_n: int = 144):
        super().__init__()

        if norm_layer == "bn":
            self.norm1 = VITBatchNorm(num_features=num_patches)
            self.norm2 = VITBatchNorm(num_features=num_patches)
        elif norm_layer == "ln":
            self.norm1 = nn.LayerNorm(dim)
            self.norm2 = nn.LayerNorm(dim)

        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                       act_layer=act_layer, drop=drop)
        # Rough attention FLOP estimate for this block (in GFLOPs).
        self.extra_gflops = (num_heads * patch_n * (dim // num_heads) * patch_n * 2) / (1000 ** 3)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        with torch.cuda.amp.autocast(True):
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """Image-to-patch embedding via a strided convolution."""

    def __init__(self, img_size=108, patch_size=9, in_channels=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * \
            (img_size[0] // patch_size[0])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.proj = nn.Conv2d(in_channels, embed_dim,
                              kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        batch_size, channels, height, width = x.shape
        assert height == self.img_size[0] and width == self.img_size[1], \
            f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        # (B, C, H, W) -> (B, num_patches, embed_dim)
        x = self.proj(x).flatten(2).transpose(1, 2)
        return x


class VisionTransformer(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage.

    Differences from the stock ViT: no CLS token (the feature head consumes
    every patch token), optional MAE-style random token masking during
    training (``mask_ratio``), and optional gradient checkpointing.
    """

    def __init__(self,
                 img_size: int = 112,
                 patch_size: int = 16,
                 in_channels: int = 3,
                 num_classes: int = 1000,
                 embed_dim: int = 768,
                 depth: int = 12,
                 num_heads: int = 12,
                 mlp_ratio: float = 4.,
                 qkv_bias: bool = False,
                 qk_scale: Optional[None] = None,
                 drop_rate: float = 0.,
                 attn_drop_rate: float = 0.,
                 drop_path_rate: float = 0.,
                 hybrid_backbone: Optional[None] = None,
                 norm_layer: str = "ln",
                 mask_ratio=0.1,
                 using_checkpoint=False,
                 ):
        super().__init__()
        self.num_classes = num_classes
        # num_features for consistency with other models
        self.num_features = self.embed_dim = embed_dim

        if hybrid_backbone is not None:
            raise ValueError
        else:
            self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim)
        self.mask_ratio = mask_ratio
        self.using_checkpoint = using_checkpoint
        num_patches = self.patch_embed.num_patches
        self.num_patches = num_patches

        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        patch_n = (img_size // patch_size) ** 2
        self.blocks = nn.ModuleList(
            [
                Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                      drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                      num_patches=num_patches, patch_n=patch_n)
                for i in range(depth)]
        )
        self.extra_gflops = 0.0
        for _block in self.blocks:
            self.extra_gflops += _block.extra_gflops

        if norm_layer == "ln":
            self.norm = nn.LayerNorm(embed_dim)
        elif norm_layer == "bn":
            self.norm = VITBatchNorm(self.num_patches)

        # features head: flatten all patch tokens and project to the embedding.
        self.feature = nn.Sequential(
            nn.Linear(in_features=embed_dim * num_patches, out_features=embed_dim, bias=False),
            nn.BatchNorm1d(num_features=embed_dim, eps=2e-5),
            nn.Linear(in_features=embed_dim, out_features=num_classes, bias=False),
            nn.BatchNorm1d(num_features=num_classes, eps=2e-5)
        )

        self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        torch.nn.init.normal_(self.mask_token, std=.02)
        trunc_normal_(self.pos_embed, std=.02)
        # trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        # NOTE(review): `self.head` is never defined on this class, so this
        # raises AttributeError if called — likely vestigial timm API; kept
        # as-is to avoid changing the interface.
        return self.head

    def random_masking(self, x, mask_ratio=0.1):
        """
        Perform per-sample random masking by per-sample shuffling.
        Per-sample shuffling is done by argsort random noise.
        x: [N, L, D], sequence
        """
        N, L, D = x.size()  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))

        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]

        # sort noise for each sample
        # ascend: small is keep, large is remove
        ids_shuffle = torch.argsort(noise, dim=1)
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(
            x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # generate the binary mask: 0 is keep, 1 is remove
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore

    def forward_features(self, x):
        B = x.shape[0]
        x = self.patch_embed(x)
        x = x + self.pos_embed
        x = self.pos_drop(x)

        if self.training and self.mask_ratio > 0:
            x, _, ids_restore = self.random_masking(x)

        for func in self.blocks:
            if self.using_checkpoint and self.training:
                from torch.utils.checkpoint import checkpoint
                x = checkpoint(func, x)
            else:
                x = func(x)
        x = self.norm(x.float())

        if self.training and self.mask_ratio > 0:
            # Re-insert mask tokens so the flattened feature has a fixed size.
            mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] - x.shape[1], 1)
            x_ = torch.cat([x[:, :, :], mask_tokens], dim=1)  # no cls token
            x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]))  # unshuffle
            x = x_
        return torch.reshape(x, (B, self.num_patches * self.embed_dim))

    def forward(self, x):
        x = self.forward_features(x)
        x = self.feature(x)
        return x
# --- insightface/recognition/arcface_torch/configs/*.py ---
# Reconstructed from a collapsed diff: each section below was originally a
# standalone config module (one file each), and each file began with its own
# `from easydict import EasyDict as edict`.  The sections are mutually
# exclusive alternatives — only one config module is imported per training
# run.  Dataset tip from upstream: mount -t tmpfs -o size=140G tmpfs /train_tmp
from easydict import EasyDict as edict

# === configs/3millions.py — synthetic config for speed testing ===
config = edict()
config.margin_list = (1.0, 0.0, 0.4)
config.network = "mbf"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 0.1
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 512  # total_batch_size = batch_size * num_gpus
config.lr = 0.1  # batch size is 512
config.rec = "synthetic"
config.num_classes = 30 * 10000
config.num_image = 100000
config.num_epoch = 30
config.warmup_epoch = -1
config.val_targets = []

# === configs/__init__.py — empty package marker ===

# === configs/base.py — documented defaults shared by the other configs ===
config = edict()
# Margin Base Softmax
config.margin_list = (1.0, 0.5, 0.0)
config.network = "r50"
config.resume = False
config.save_all_states = False
config.output = "ms1mv3_arcface_r50"
config.embedding_size = 512
# Partial FC
config.sample_rate = 1
config.interclass_filtering_threshold = 0
config.fp16 = False
config.batch_size = 128
# For SGD
config.optimizer = "sgd"
config.lr = 0.1
config.momentum = 0.9
config.weight_decay = 5e-4
# For AdamW
# config.optimizer = "adamw"
# config.lr = 0.001
# config.weight_decay = 0.1
config.verbose = 2000
config.frequent = 10
# For large-scale datasets, such as WebFace42M
config.dali = False
config.dali_aug = False
# Gradient accumulation
config.gradient_acc = 1
# setup seed
config.seed = 2048
# dataloader num_workers
config.num_workers = 2
# WandB Logger
config.wandb_key = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
config.suffix_run_name = None
config.using_wandb = False
config.wandb_entity = "entity"
config.wandb_project = "project"
config.wandb_log_all = True
config.save_artifacts = False
config.wandb_resume = False  # resume the last interrupted wandb run

# === configs/glint360k_mbf.py ===
config = edict()
config.margin_list = (1.0, 0.0, 0.4)
config.network = "mbf"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 1e-4
config.batch_size = 128
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/glint360k"
config.num_classes = 360232
config.num_image = 17091657
config.num_epoch = 20
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/glint360k_r100.py — as glint360k_mbf but ResNet-100 ===
config = edict()
config.margin_list = (1.0, 0.0, 0.4)
config.network = "r100"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 1e-4
config.batch_size = 128
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/glint360k"
config.num_classes = 360232
config.num_image = 17091657
config.num_epoch = 20
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/glint360k_r50.py — as glint360k_mbf but ResNet-50 ===
config = edict()
config.margin_list = (1.0, 0.0, 0.4)
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 1e-4
config.batch_size = 128
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/glint360k"
config.num_classes = 360232
config.num_image = 17091657
config.num_epoch = 20
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/ms1mv2_mbf.py ===
config = edict()
config.margin_list = (1.0, 0.5, 0.0)
config.network = "mbf"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 1e-4
config.batch_size = 128
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/faces_emore"
config.num_classes = 85742
config.num_image = 5822653
config.num_epoch = 40
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/ms1mv2_r100.py ===
config = edict()
config.margin_list = (1.0, 0.5, 0.0)
config.network = "r100"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 128
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/faces_emore"
config.num_classes = 85742
config.num_image = 5822653
config.num_epoch = 20
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/ms1mv2_r50.py ===
config = edict()
config.margin_list = (1.0, 0.5, 0.0)
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 128
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/faces_emore"
config.num_classes = 85742
config.num_image = 5822653
config.num_epoch = 20
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/ms1mv3_mbf.py ===
config = edict()
config.margin_list = (1.0, 0.5, 0.0)
config.network = "mbf"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 1e-4
config.batch_size = 128
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/ms1m-retinaface-t1"
config.num_classes = 93431
config.num_image = 5179510
config.num_epoch = 40
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/ms1mv3_r100.py ===
config = edict()
config.margin_list = (1.0, 0.5, 0.0)
config.network = "r100"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 128
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/ms1m-retinaface-t1"
config.num_classes = 93431
config.num_image = 5179510
config.num_epoch = 20
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/ms1mv3_r50.py ===
config = edict()
config.margin_list = (1.0, 0.5, 0.0)
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 128
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/ms1m-retinaface-t1"
config.num_classes = 93431
config.num_image = 5179510
config.num_epoch = 20
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/ms1mv3_r50_onegpu.py — single-GPU variant (lower lr) ===
config = edict()
config.margin_list = (1.0, 0.5, 0.0)
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.fp16 = True
config.momentum = 0.9
config.weight_decay = 5e-4
config.batch_size = 128
config.lr = 0.02
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/ms1m-retinaface-t1"
config.num_classes = 93431
config.num_image = 5179510
config.num_epoch = 20
config.warmup_epoch = 0
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]

# === configs/wf12m_conflict_r50.py ===
config = edict()
config.margin_list = (1.0, 0.0, 0.4)
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.interclass_filtering_threshold = 0
config.fp16 = True
config.weight_decay = 5e-4
config.batch_size = 128
config.optimizer = "sgd"
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/WebFace12M_Conflict"
config.num_classes = 1017970
config.num_image = 12720066
config.num_epoch = 20
config.warmup_epoch = config.num_epoch // 10
config.val_targets = []

# === configs/wf12m_conflict_r50_pfc03_filter04.py — PFC 0.3 + filtering ===
config = edict()
config.margin_list = (1.0, 0.0, 0.4)
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 0.3
config.interclass_filtering_threshold = 0.4
config.fp16 = True
config.weight_decay = 5e-4
config.batch_size = 128
config.optimizer = "sgd"
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/WebFace12M_Conflict"
config.num_classes = 1017970
config.num_image = 12720066
config.num_epoch = 20
config.warmup_epoch = config.num_epoch // 10
config.val_targets = []

# === configs/wf12m_flip_pfc01_filter04_r50.py — PFC 0.1 + filtering ===
config = edict()
config.margin_list = (1.0, 0.0, 0.4)
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 0.1
config.interclass_filtering_threshold = 0.4
config.fp16 = True
config.weight_decay = 5e-4
config.batch_size = 128
config.optimizer = "sgd"
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/WebFace12M_FLIP40"
config.num_classes = 617970
config.num_image = 12720066
config.num_epoch = 20
config.warmup_epoch = config.num_epoch // 10
config.val_targets = []

# === configs/wf12m_flip_r50.py ===
config = edict()
config.margin_list = (1.0, 0.0, 0.4)
config.network = "r50"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.interclass_filtering_threshold = 0
config.fp16 = True
config.weight_decay = 5e-4
config.batch_size = 128
config.optimizer = "sgd"
config.lr = 0.1
config.verbose = 2000
config.dali = False
config.rec = "/train_tmp/WebFace12M_FLIP40"
config.num_classes = 617970
config.num_image = 12720066
config.num_epoch = 20
config.warmup_epoch = config.num_epoch // 10
config.val_targets = []

# === configs/wf12m_mbf.py — TRUNCATED in this diff chunk ===
# NOTE(review): the source chunk ends mid-file; only the fields visible up to
# the cut are reproduced.  The remainder must be taken from the next chunk.
config = edict()
config.margin_list = (1.0, 0.0, 0.4)
config.network = "mbf"
config.resume = False
config.output = None
config.embedding_size = 512
config.sample_rate = 1.0
config.interclass_filtering_threshold = 0
config.fp16 = True
config.weight_decay = 1e-4
+config.batch_size = 128 +config.optimizer = "sgd" +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace12M" +config.num_classes = 617970 +config.num_image = 12720066 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf12m_pfc02_r100.py b/insightface/recognition/arcface_torch/configs/wf12m_pfc02_r100.py new file mode 100644 index 0000000000000000000000000000000000000000..1062b876e9b17db7b4b24e09d9f6cd3dacebb4d9 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf12m_pfc02_r100.py @@ -0,0 +1,29 @@ + +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.interclass_filtering_threshold = 0 +config.fp16 = True +config.weight_decay = 5e-4 +config.batch_size = 128 +config.optimizer = "sgd" +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace12M" +config.num_classes = 617970 +config.num_image = 12720066 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf12m_r100.py b/insightface/recognition/arcface_torch/configs/wf12m_r100.py new file mode 100644 index 0000000000000000000000000000000000000000..65bfa1be4649f3083be0340efc81df0b9c8f1ba8 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf12m_r100.py @@ -0,0 +1,29 @@ + +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 
+config.interclass_filtering_threshold = 0 +config.fp16 = True +config.weight_decay = 5e-4 +config.batch_size = 128 +config.optimizer = "sgd" +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace12M" +config.num_classes = 617970 +config.num_image = 12720066 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf12m_r50.py b/insightface/recognition/arcface_torch/configs/wf12m_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..2a7284663d6afbe6f205c8c9f10cd454ef1045ca --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf12m_r50.py @@ -0,0 +1,28 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.interclass_filtering_threshold = 0 +config.fp16 = True +config.weight_decay = 5e-4 +config.batch_size = 128 +config.optimizer = "sgd" +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace12M" +config.num_classes = 617970 +config.num_image = 12720066 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc0008_32gpu_r100.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc0008_32gpu_r100.py new file mode 100644 index 0000000000000000000000000000000000000000..2885816cb9b635c526d1d2269c606e93fa54a2e6 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc0008_32gpu_r100.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = 
"r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 512 +config.lr = 0.4 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc02_16gpus_mbf_bs8k.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_16gpus_mbf_bs8k.py new file mode 100644 index 0000000000000000000000000000000000000000..14a6bb79da7eaa3f111e9efedf507e46a953c9aa --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_16gpus_mbf_bs8k.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "mbf" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 1e-4 +config.batch_size = 512 +config.lr = 0.4 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = 2 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc02_16gpus_r100.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_16gpus_r100.py new file mode 100644 index 0000000000000000000000000000000000000000..035684732003b5c7b8fe8ea34e097bd22fbcca37 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_16gpus_r100.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t 
tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 256 +config.lr = 0.3 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = 1 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc02_16gpus_r50_bs8k.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_16gpus_r50_bs8k.py new file mode 100644 index 0000000000000000000000000000000000000000..c02bdf3afe8370086cf64fd112244b00cee35a6f --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_16gpus_r50_bs8k.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 512 +config.lr = 0.6 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = 4 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc02_32gpus_r50_bs4k.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_32gpus_r50_bs4k.py new file mode 100644 index 0000000000000000000000000000000000000000..5e8407943ffef4ae3ee02ddb3f2361a9ac655cbb --- /dev/null +++ 
b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_32gpus_r50_bs4k.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.4 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = 2 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc02_8gpus_r50_bs4k.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_8gpus_r50_bs4k.py new file mode 100644 index 0000000000000000000000000000000000000000..b9f627fa94046d22ab0f0f12a8e339dc2cedfd81 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_8gpus_r50_bs4k.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 512 +config.lr = 0.4 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = 2 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc02_r100.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_r100.py new file mode 100644 
index 0000000000000000000000000000000000000000..5274a52f2607f38e08643e2145a2a837786ed9f1 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_r100.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc02_r100_16gpus.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_r100_16gpus.py new file mode 100644 index 0000000000000000000000000000000000000000..c1e8f199195df647086da21d7e2fa05817c4ca61 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_r100_16gpus.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.2 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git 
a/insightface/recognition/arcface_torch/configs/wf42m_pfc02_r100_32gpus.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_r100_32gpus.py new file mode 100644 index 0000000000000000000000000000000000000000..f7787675df530259ba809b694df8d3e3cc5dd799 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_r100_32gpus.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.4 +config.verbose = 10000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc02_vit_h.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_vit_h.py new file mode 100644 index 0000000000000000000000000000000000000000..b55af948fd65225785a507be68fd134eee147f6a --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc02_vit_h.py @@ -0,0 +1,28 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "vit_h" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.5 +config.fp16 = True +config.weight_decay = 0.1 +config.batch_size = 768 +config.optimizer = "adamw" +config.lr = 0.001 +config.verbose = 2000 +config.dali = True + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 
42474557 +config.num_epoch = 16 +config.warmup_epoch = config.num_epoch // 8 +config.val_targets = [] +config.dali_aug = True diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r100.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r100.py new file mode 100644 index 0000000000000000000000000000000000000000..adf21c97a8c7c0568d0783432b4526ba78138926 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r100.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.3 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.4 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r18.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r18.py new file mode 100644 index 0000000000000000000000000000000000000000..5d35830ba107f27eea9b849abe88b0b4b09bdd0c --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r18.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r18" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.3 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.4 
+config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r200.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r200.py new file mode 100644 index 0000000000000000000000000000000000000000..e34dd1c11f489d9c5c1b23c3677d303aafe46da6 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r200.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r200" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.3 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.4 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r50.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..a44a5d771e17ecbeffe3437f3500e9d0c9dcc105 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_32gpu_r50.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 
+config.sample_rate = 0.3 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.4 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 20 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = ["lfw", "cfp_fp", "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_b.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_b.py new file mode 100644 index 0000000000000000000000000000000000000000..cbe7fe6b1ecde9034cf6b647c0558f96bb1d41c3 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_b.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "vit_b_dp005_mask_005" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.3 +config.fp16 = True +config.weight_decay = 0.1 +config.batch_size = 384 +config.optimizer = "adamw" +config.lr = 0.001 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 40 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_l.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_l.py new file mode 100644 index 0000000000000000000000000000000000000000..45b153aa6a36a9a883153245c49617c2d9e11939 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_l.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t 
tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "vit_l_dp005_mask_005" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.3 +config.fp16 = True +config.weight_decay = 0.1 +config.batch_size = 384 +config.optimizer = "adamw" +config.lr = 0.001 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 40 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_s.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_s.py new file mode 100644 index 0000000000000000000000000000000000000000..f6ce7010d9c297ed0832dcb5639d552078cea95c --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_s.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "vit_s_dp005_mask_0" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.3 +config.fp16 = True +config.weight_decay = 0.1 +config.batch_size = 384 +config.optimizer = "adamw" +config.lr = 0.001 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 40 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_t.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_t.py new file mode 100644 index 0000000000000000000000000000000000000000..8516755b656b21536da177402ef6066e3e1039dd --- /dev/null +++ 
b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_64gpu_vit_t.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "vit_t_dp005_mask0" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.3 +config.fp16 = True +config.weight_decay = 0.1 +config.batch_size = 384 +config.optimizer = "adamw" +config.lr = 0.001 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 40 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_b.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_b.py new file mode 100644 index 0000000000000000000000000000000000000000..37105d4559c9033dfae3aaf7feed9521708e4912 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_b.py @@ -0,0 +1,28 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "vit_b_dp005_mask_005" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.3 +config.fp16 = True +config.weight_decay = 0.1 +config.batch_size = 256 +config.gradient_acc = 12 # total batchsize is 256 * 12 +config.optimizer = "adamw" +config.lr = 0.001 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 40 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = [] diff --git 
a/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_t.py b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_t.py new file mode 100644 index 0000000000000000000000000000000000000000..5bf8c563dab6ce4f45b694efa4837a4d52a98af3 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf42m_pfc03_40epoch_8gpu_vit_t.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "vit_t_dp005_mask0" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 0.3 +config.fp16 = True +config.weight_decay = 0.1 +config.batch_size = 512 +config.optimizer = "adamw" +config.lr = 0.001 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace42M" +config.num_classes = 2059906 +config.num_image = 42474557 +config.num_epoch = 40 +config.warmup_epoch = config.num_epoch // 10 +config.val_targets = [] diff --git a/insightface/recognition/arcface_torch/configs/wf4m_mbf.py b/insightface/recognition/arcface_torch/configs/wf4m_mbf.py new file mode 100644 index 0000000000000000000000000000000000000000..2550f5a633485236beca00eeaeb6e15b8cf8834c --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf4m_mbf.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "mbf" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 1e-4 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace4M" +config.num_classes = 205990 +config.num_image = 4235242 +config.num_epoch = 
20 +config.warmup_epoch = 0 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf4m_r100.py b/insightface/recognition/arcface_torch/configs/wf4m_r100.py new file mode 100644 index 0000000000000000000000000000000000000000..7e95e7833636d013a22cb0e285dbfa9b45a6c620 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf4m_r100.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r100" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace4M" +config.num_classes = 205990 +config.num_image = 4235242 +config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/insightface/recognition/arcface_torch/configs/wf4m_r50.py b/insightface/recognition/arcface_torch/configs/wf4m_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..b3eb0d84c81d508223ed7e7d31c67cbfe4026bc3 --- /dev/null +++ b/insightface/recognition/arcface_torch/configs/wf4m_r50.py @@ -0,0 +1,27 @@ +from easydict import EasyDict as edict + +# make training faster +# our RAM is 256G +# mount -t tmpfs -o size=140G tmpfs /train_tmp + +config = edict() +config.margin_list = (1.0, 0.0, 0.4) +config.network = "r50" +config.resume = False +config.output = None +config.embedding_size = 512 +config.sample_rate = 1.0 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 +config.verbose = 2000 +config.dali = False + +config.rec = "/train_tmp/WebFace4M" +config.num_classes = 205990 +config.num_image = 4235242 
+config.num_epoch = 20 +config.warmup_epoch = 0 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] diff --git a/insightface/recognition/arcface_torch/dataset.py b/insightface/recognition/arcface_torch/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..6cdbdcf056786c46395cc0dd60a8db0c79bc7f5b --- /dev/null +++ b/insightface/recognition/arcface_torch/dataset.py @@ -0,0 +1,283 @@ +import numbers +import os +import queue as Queue +import threading +from typing import Iterable + +import mxnet as mx +import numpy as np +import torch +from functools import partial +from torch import distributed +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms +from torchvision.datasets import ImageFolder +from utils.utils_distributed_sampler import DistributedSampler +from utils.utils_distributed_sampler import get_dist_info, worker_init_fn + + +def get_dataloader( + root_dir, + local_rank, + batch_size, + dali = False, + dali_aug = False, + seed = 2048, + num_workers = 2, + ) -> Iterable: + + rec = os.path.join(root_dir, 'train.rec') + idx = os.path.join(root_dir, 'train.idx') + train_set = None + + # Synthetic + if root_dir == "synthetic": + train_set = SyntheticDataset() + dali = False + + # Mxnet RecordIO + elif os.path.exists(rec) and os.path.exists(idx): + train_set = MXFaceDataset(root_dir=root_dir, local_rank=local_rank) + + # Image Folder + else: + transform = transforms.Compose([ + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ]) + train_set = ImageFolder(root_dir, transform) + + # DALI + if dali: + return dali_data_iter( + batch_size=batch_size, rec_file=rec, idx_file=idx, + num_threads=2, local_rank=local_rank, dali_aug=dali_aug) + + rank, world_size = get_dist_info() + train_sampler = DistributedSampler( + train_set, num_replicas=world_size, rank=rank, shuffle=True, seed=seed) + + if seed is None: + init_fn = None + else: + 
init_fn = partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) + + train_loader = DataLoaderX( + local_rank=local_rank, + dataset=train_set, + batch_size=batch_size, + sampler=train_sampler, + num_workers=num_workers, + pin_memory=True, + drop_last=True, + worker_init_fn=init_fn, + ) + + return train_loader + +class BackgroundGenerator(threading.Thread): + def __init__(self, generator, local_rank, max_prefetch=6): + super(BackgroundGenerator, self).__init__() + self.queue = Queue.Queue(max_prefetch) + self.generator = generator + self.local_rank = local_rank + self.daemon = True + self.start() + + def run(self): + torch.cuda.set_device(self.local_rank) + for item in self.generator: + self.queue.put(item) + self.queue.put(None) + + def next(self): + next_item = self.queue.get() + if next_item is None: + raise StopIteration + return next_item + + def __next__(self): + return self.next() + + def __iter__(self): + return self + + +class DataLoaderX(DataLoader): + + def __init__(self, local_rank, **kwargs): + super(DataLoaderX, self).__init__(**kwargs) + self.stream = torch.cuda.Stream(local_rank) + self.local_rank = local_rank + + def __iter__(self): + self.iter = super(DataLoaderX, self).__iter__() + self.iter = BackgroundGenerator(self.iter, self.local_rank) + self.preload() + return self + + def preload(self): + self.batch = next(self.iter, None) + if self.batch is None: + return None + with torch.cuda.stream(self.stream): + for k in range(len(self.batch)): + self.batch[k] = self.batch[k].to(device=self.local_rank, non_blocking=True) + + def __next__(self): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + if batch is None: + raise StopIteration + self.preload() + return batch + + +class MXFaceDataset(Dataset): + def __init__(self, root_dir, local_rank): + super(MXFaceDataset, self).__init__() + self.transform = transforms.Compose( + [transforms.ToPILImage(), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), 
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ]) + self.root_dir = root_dir + self.local_rank = local_rank + path_imgrec = os.path.join(root_dir, 'train.rec') + path_imgidx = os.path.join(root_dir, 'train.idx') + self.imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + s = self.imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + if header.flag > 0: + self.header0 = (int(header.label[0]), int(header.label[1])) + self.imgidx = np.array(range(1, int(header.label[0]))) + else: + self.imgidx = np.array(list(self.imgrec.keys)) + + def __getitem__(self, index): + idx = self.imgidx[index] + s = self.imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + label = torch.tensor(label, dtype=torch.long) + sample = mx.image.imdecode(img).asnumpy() + if self.transform is not None: + sample = self.transform(sample) + return sample, label + + def __len__(self): + return len(self.imgidx) + + +class SyntheticDataset(Dataset): + def __init__(self): + super(SyntheticDataset, self).__init__() + img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.int32) + img = np.transpose(img, (2, 0, 1)) + img = torch.from_numpy(img).squeeze(0).float() + img = ((img / 255) - 0.5) / 0.5 + self.img = img + self.label = 1 + + def __getitem__(self, index): + return self.img, self.label + + def __len__(self): + return 1000000 + + +def dali_data_iter( + batch_size: int, rec_file: str, idx_file: str, num_threads: int, + initial_fill=32768, random_shuffle=True, + prefetch_queue_depth=1, local_rank=0, name="reader", + mean=(127.5, 127.5, 127.5), + std=(127.5, 127.5, 127.5), + dali_aug=False + ): + """ + Parameters: + ---------- + initial_fill: int + Size of the buffer that is used for shuffling. If random_shuffle is False, this parameter is ignored. 
+ + """ + rank: int = distributed.get_rank() + world_size: int = distributed.get_world_size() + import nvidia.dali.fn as fn + import nvidia.dali.types as types + from nvidia.dali.pipeline import Pipeline + from nvidia.dali.plugin.pytorch import DALIClassificationIterator + + def dali_random_resize(img, resize_size, image_size=112): + img = fn.resize(img, resize_x=resize_size, resize_y=resize_size) + img = fn.resize(img, size=(image_size, image_size)) + return img + def dali_random_gaussian_blur(img, window_size): + img = fn.gaussian_blur(img, window_size=window_size * 2 + 1) + return img + def dali_random_gray(img, prob_gray): + saturate = fn.random.coin_flip(probability=1 - prob_gray) + saturate = fn.cast(saturate, dtype=types.FLOAT) + img = fn.hsv(img, saturation=saturate) + return img + def dali_random_hsv(img, hue, saturation): + img = fn.hsv(img, hue=hue, saturation=saturation) + return img + def multiplexing(condition, true_case, false_case): + neg_condition = condition ^ True + return condition * true_case + neg_condition * false_case + + condition_resize = fn.random.coin_flip(probability=0.1) + size_resize = fn.random.uniform(range=(int(112 * 0.5), int(112 * 0.8)), dtype=types.FLOAT) + condition_blur = fn.random.coin_flip(probability=0.2) + window_size_blur = fn.random.uniform(range=(1, 2), dtype=types.INT32) + condition_flip = fn.random.coin_flip(probability=0.5) + condition_hsv = fn.random.coin_flip(probability=0.2) + hsv_hue = fn.random.uniform(range=(0., 20.), dtype=types.FLOAT) + hsv_saturation = fn.random.uniform(range=(1., 1.2), dtype=types.FLOAT) + + pipe = Pipeline( + batch_size=batch_size, num_threads=num_threads, + device_id=local_rank, prefetch_queue_depth=prefetch_queue_depth, ) + condition_flip = fn.random.coin_flip(probability=0.5) + with pipe: + jpegs, labels = fn.readers.mxnet( + path=rec_file, index_path=idx_file, initial_fill=initial_fill, + num_shards=world_size, shard_id=rank, + random_shuffle=random_shuffle, pad_last_batch=False, 
name=name) + images = fn.decoders.image(jpegs, device="mixed", output_type=types.RGB) + if dali_aug: + images = fn.cast(images, dtype=types.UINT8) + images = multiplexing(condition_resize, dali_random_resize(images, size_resize, image_size=112), images) + images = multiplexing(condition_blur, dali_random_gaussian_blur(images, window_size_blur), images) + images = multiplexing(condition_hsv, dali_random_hsv(images, hsv_hue, hsv_saturation), images) + images = dali_random_gray(images, 0.1) + + images = fn.crop_mirror_normalize( + images, dtype=types.FLOAT, mean=mean, std=std, mirror=condition_flip) + pipe.set_outputs(images, labels) + pipe.build() + return DALIWarper(DALIClassificationIterator(pipelines=[pipe], reader_name=name, )) + + +@torch.no_grad() +class DALIWarper(object): + def __init__(self, dali_iter): + self.iter = dali_iter + + def __next__(self): + data_dict = self.iter.__next__()[0] + tensor_data = data_dict['data'].cuda() + tensor_label: torch.Tensor = data_dict['label'].cuda().long() + tensor_label.squeeze_() + return tensor_data, tensor_label + + def __iter__(self): + return self + + def reset(self): + self.iter.reset() diff --git a/insightface/recognition/arcface_torch/dist.sh b/insightface/recognition/arcface_torch/dist.sh new file mode 100644 index 0000000000000000000000000000000000000000..a1abae88c2b21d480a29a80817bdcd498232ba8a --- /dev/null +++ b/insightface/recognition/arcface_torch/dist.sh @@ -0,0 +1,14 @@ +ip_list=("ip1" "ip2" "ip3" "ip4" "ip5") +config=wf42m_pfc02_vit_h.py + +for((node_rank=0;node_rank<${#ip_list[*]};node_rank++)); +do + ssh root@${ip_list[node_rank]} "cd `pwd`;PATH=$PATH \ + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + torchrun \ + --nproc_per_node=8 \ + --nnodes=${#ip_list[*]} \ + --node_rank=$node_rank \ + --master_addr=${ip_list[0]} \ + --master_port=22345 train_v2.py configs/$config" & +done diff --git a/insightface/recognition/arcface_torch/docs/eval.md b/insightface/recognition/arcface_torch/docs/eval.md new file mode 
100644 index 0000000000000000000000000000000000000000..9ce1621357c03ee8a25c004e5f01850990df1628 --- /dev/null +++ b/insightface/recognition/arcface_torch/docs/eval.md @@ -0,0 +1,43 @@ +## Eval on ICCV2021-MFR + +coming soon. + + +## Eval IJBC +You can eval ijbc with pytorch or onnx. + + +1. Eval IJBC With Onnx +```shell +CUDA_VISIBLE_DEVICES=0 python onnx_ijbc.py --model-root ms1mv3_arcface_r50 --image-path IJB_release/IJBC --result-dir ms1mv3_arcface_r50 +``` + +2. Eval IJBC With Pytorch +```shell +CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \ +--model-prefix ms1mv3_arcface_r50/backbone.pth \ +--image-path IJB_release/IJBC \ +--result-dir ms1mv3_arcface_r50 \ +--batch-size 128 \ +--job ms1mv3_arcface_r50 \ +--target IJBC \ +--network iresnet50 +``` + + +## Inference + +```shell +python inference.py --weight ms1mv3_arcface_r50/backbone.pth --network r50 +``` + + +## Result + +| Datasets | Backbone | **MFR-ALL** | IJB-C(1E-4) | IJB-C(1E-5) | +|:---------------|:--------------------|:------------|:------------|:------------| +| WF12M-PFC-0.05 | r100 | 94.05 | 97.51 | 95.75 | +| WF12M-PFC-0.1 | r100 | 94.49 | 97.56 | 95.92 | +| WF12M-PFC-0.2 | r100 | 94.75 | 97.60 | 95.90 | +| WF12M-PFC-0.3 | r100 | 94.71 | 97.64 | 96.01 | +| WF12M | r100 | 94.69 | 97.59 | 95.97 | \ No newline at end of file diff --git a/insightface/recognition/arcface_torch/docs/install.md b/insightface/recognition/arcface_torch/docs/install.md new file mode 100644 index 0000000000000000000000000000000000000000..8824e7e3108adc76cee514a3e66a50f933c9c91f --- /dev/null +++ b/insightface/recognition/arcface_torch/docs/install.md @@ -0,0 +1,27 @@ +# Installation + +### [Torch v1.11.0](https://pytorch.org/get-started/previous-versions/#v1110) +#### Linux and Windows +- CUDA 11.3 +```shell + +pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 +``` + +- CUDA 10.2 +```shell +pip install torch==1.11.0+cu102 
torchvision==0.12.0+cu102 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu102 +``` + +### [Torch v1.9.0](https://pytorch.org/get-started/previous-versions/#v190) +#### Linux and Windows + +- CUDA 11.1 +```shell +pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html +``` + +- CUDA 10.2 +```shell +pip install torch==1.9.0+cu102 torchvision==0.10.0+cu102 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html +``` diff --git a/insightface/recognition/arcface_torch/docs/install_dali.md b/insightface/recognition/arcface_torch/docs/install_dali.md new file mode 100644 index 0000000000000000000000000000000000000000..48743644d0dac8885efaecfbb7821d5639a4f732 --- /dev/null +++ b/insightface/recognition/arcface_torch/docs/install_dali.md @@ -0,0 +1,103 @@ +# Installation +## Prerequisites + +1. Linux x64. +2. NVIDIA Driver supporting CUDA 10.0 or later (i.e., 410.48 or later driver releases). +3. (Optional) One or more of the following deep learning frameworks: + + * [MXNet 1.3](http://mxnet.incubator.apache.org/) `mxnet-cu100` or later. + * [PyTorch 0.4](https://pytorch.org/) or later. + * [TensorFlow 1.7](https://www.tensorflow.org/) or later. + +## DALI in NGC Containers +DALI is preinstalled in the TensorFlow, PyTorch, and MXNet containers in versions 18.07 and later on NVIDIA GPU Cloud. + +## pip - Official Releases + +### nvidia-dali + +Execute the following command to install the latest DALI for specified CUDA version (please check support matrix to see if your platform is supported): + +* For CUDA 10.2: + + ```bash + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda102 + ``` + +* For CUDA 11.0: + + ```bash + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda110 + ``` + + +> Note: CUDA 11.0 build uses CUDA toolkit enhanced compatibility. 
It is built with the latest CUDA 11.x toolkit while it can run on the latest, stable CUDA 11.0 capable drivers (450.80 or later). Using the latest driver may enable additional functionality. More details can be found in [enhanced CUDA compatibility guide](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#enhanced-compat-minor-releases). + +> Note: Please always use the latest version of pip available (at least >= 19.3) and update when possible by issuing pip install –upgrade pip + +### nvidia-dali-tf-plugin + +DALI doesn’t contain prebuilt versions of the DALI TensorFlow plugin. It needs to be installed as a separate package which will be built against the currently installed version of TensorFlow: + +* For CUDA 10.2: + + ```bash + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-tf-plugin-cuda102 + ``` + +* For CUDA 11.0: + + ```bash + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-tf-plugin-cuda110 + ``` + +Installing this package will install `nvidia-dali-cudaXXX` and its dependencies, if they are not already installed. The package `tensorflow-gpu` must be installed before attempting to install `nvidia-dali-tf-plugin-cudaXXX`. + +> Note: The packages `nvidia-dali-tf-plugin-cudaXXX` and `nvidia-dali-cudaXXX` should be in exactly the same version. Therefore, installing the latest `nvidia-dali-tf-plugin-cudaXXX`, will replace any older `nvidia-dali-cudaXXX` version already installed. To work with older versions of DALI, provide the version explicitly to the `pip install` command. + +### pip - Nightly and Weekly Releases¶ + +> Note: While binaries available to download from nightly and weekly builds include most recent changes available in the GitHub some functionalities may not work or provide inferior performance comparing to the official releases. 
Those builds are meant for the early adopters seeking for the most recent version available and being ready to boldly go where no man has gone before. + +> Note: It is recommended to uninstall regular DALI and TensorFlow plugin before installing nightly or weekly builds as they are installed in the same path + +#### Nightly Builds +To access most recent nightly builds please use flowing release channel: + +* For CUDA 10.2: + + ```bash + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-nightly-cuda102 + ``` + + ``` + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-tf-plugin-nightly-cuda102 + ``` + +* For CUDA 11.0: + + ```bash + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-nightly-cuda110 + ``` + + ```bash + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-tf-plugin-nightly-cuda110 + ``` + + +#### Weekly Builds + +Also, there is a weekly release channel with more thorough testing. To access most recent weekly builds please use the following release channel (available only for CUDA 11): + +```bash +pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/weekly --upgrade nvidia-dali-weekly-cuda110 +``` + +```bash +pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/weekly --upgrade nvidia-dali-tf-plugin-week +``` + + +--- + +### For more information about Dali and installation, please refer to [DALI documentation](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html). 
diff --git a/insightface/recognition/arcface_torch/docs/modelzoo.md b/insightface/recognition/arcface_torch/docs/modelzoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/insightface/recognition/arcface_torch/docs/prepare_custom_dataset.md b/insightface/recognition/arcface_torch/docs/prepare_custom_dataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bddb5638b03dece3710c067b2d0323afa30882b
--- /dev/null
+++ b/insightface/recognition/arcface_torch/docs/prepare_custom_dataset.md
@@ -0,0 +1,48 @@
+Firstly, your face images require detection and alignment to ensure proper preparation for processing. Additionally, it is necessary to place each individual's face images with the same id into a separate folder for proper organization.
+
+
+```shell
+# directories and files for your datasets
+/image_folder
+├── 0_0_0000000
+│   ├── 0_0.jpg
+│   ├── 0_1.jpg
+│   ├── 0_2.jpg
+│   ├── 0_3.jpg
+│   └── 0_4.jpg
+├── 0_0_0000001
+│   ├── 0_5.jpg
+│   ├── 0_6.jpg
+│   ├── 0_7.jpg
+│   ├── 0_8.jpg
+│   └── 0_9.jpg
+├── 0_0_0000002
+│   ├── 0_10.jpg
+│   ├── 0_11.jpg
+│   ├── 0_12.jpg
+│   ├── 0_13.jpg
+│   ├── 0_14.jpg
+│   ├── 0_15.jpg
+│   ├── 0_16.jpg
+│   └── 0_17.jpg
+├── 0_0_0000003
+│   ├── 0_18.jpg
+│   ├── 0_19.jpg
+│   └── 0_20.jpg
+├── 0_0_0000004
+
+
+# 0) Dependencies installation
+pip install opencv-python
+apt-get update
+apt-get install ffmpeg libsm6 libxext6 -y
+
+
+# 1) create train.lst using the following command
+python -m mxnet.tools.im2rec --list --recursive train image_folder
+
+# 2) create train.rec and train.idx using train.lst using the following command
+python -m mxnet.tools.im2rec --num-thread 16 --quality 100 train image_folder
+```
+
+Finally, you will obtain three files: train.lst, train.rec, and train.idx, where train.idx and train.rec are utilized for training.
diff --git a/insightface/recognition/arcface_torch/docs/prepare_webface42m.md b/insightface/recognition/arcface_torch/docs/prepare_webface42m.md new file mode 100644 index 0000000000000000000000000000000000000000..e799ba74e04f911593a704e64810c1e9936307ff --- /dev/null +++ b/insightface/recognition/arcface_torch/docs/prepare_webface42m.md @@ -0,0 +1,58 @@ + + + +## 1. Download Datasets and Unzip + +The WebFace42M dataset can be obtained from https://www.face-benchmark.org/download.html. +Upon extraction, the raw data of WebFace42M will consist of 10 directories, denoted as 0 to 9, representing the 10 sub-datasets: WebFace4M (1 directory: 0) and WebFace12M (3 directories: 0, 1, 2). + +## 2. Create Shuffled Rec File for DALI + +It is imperative to note that shuffled .rec files are crucial for DALI and the absence of shuffling in .rec files can result in decreased performance. Original .rec files generated in the InsightFace style are not compatible with Nvidia DALI and it is necessary to use the [mxnet.tools.im2rec](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py) command to generate a shuffled .rec file. 
+
+
+```shell
+# directories and files for your datasets
+/WebFace42M_Root
+├── 0_0_0000000
+│   ├── 0_0.jpg
+│   ├── 0_1.jpg
+│   ├── 0_2.jpg
+│   ├── 0_3.jpg
+│   └── 0_4.jpg
+├── 0_0_0000001
+│   ├── 0_5.jpg
+│   ├── 0_6.jpg
+│   ├── 0_7.jpg
+│   ├── 0_8.jpg
+│   └── 0_9.jpg
+├── 0_0_0000002
+│   ├── 0_10.jpg
+│   ├── 0_11.jpg
+│   ├── 0_12.jpg
+│   ├── 0_13.jpg
+│   ├── 0_14.jpg
+│   ├── 0_15.jpg
+│   ├── 0_16.jpg
+│   └── 0_17.jpg
+├── 0_0_0000003
+│   ├── 0_18.jpg
+│   ├── 0_19.jpg
+│   └── 0_20.jpg
+├── 0_0_0000004
+
+
+# 0) Dependencies installation
+pip install opencv-python
+apt-get update
+apt-get install ffmpeg libsm6 libxext6 -y
+
+
+# 1) create train.lst using the following command
+python -m mxnet.tools.im2rec --list --recursive train WebFace42M_Root
+
+# 2) create train.rec and train.idx using train.lst using the following command
+python -m mxnet.tools.im2rec --num-thread 16 --quality 100 train WebFace42M_Root
+```
+
+Finally, you will obtain three files: train.lst, train.rec, and train.idx, where train.idx and train.rec are utilized for training.
diff --git a/insightface/recognition/arcface_torch/docs/speed_benchmark.md b/insightface/recognition/arcface_torch/docs/speed_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..055aee0defe2c43a523ced48260242f0f99b7cea
--- /dev/null
+++ b/insightface/recognition/arcface_torch/docs/speed_benchmark.md
@@ -0,0 +1,93 @@
+## Test Training Speed
+
+- Test Commands
+
+You need to use the following two commands to test the Partial FC training performance.
+The number of identities is **3 million** (synthetic data), turn mixed precision training on, backbone is resnet50,
+batch size is 1024.
+```shell +# Model Parallel +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/3millions +# Partial FC 0.1 +python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py configs/3millions_pfc +``` + +- GPU Memory + +``` +# (Model Parallel) gpustat -i +[0] Tesla V100-SXM2-32GB | 64'C, 94 % | 30338 / 32510 MB +[1] Tesla V100-SXM2-32GB | 60'C, 99 % | 28876 / 32510 MB +[2] Tesla V100-SXM2-32GB | 60'C, 99 % | 28872 / 32510 MB +[3] Tesla V100-SXM2-32GB | 69'C, 99 % | 28872 / 32510 MB +[4] Tesla V100-SXM2-32GB | 66'C, 99 % | 28888 / 32510 MB +[5] Tesla V100-SXM2-32GB | 60'C, 99 % | 28932 / 32510 MB +[6] Tesla V100-SXM2-32GB | 68'C, 100 % | 28916 / 32510 MB +[7] Tesla V100-SXM2-32GB | 65'C, 99 % | 28860 / 32510 MB + +# (Partial FC 0.1) gpustat -i +[0] Tesla V100-SXM2-32GB | 60'C, 95 % | 10488 / 32510 MB │······················· +[1] Tesla V100-SXM2-32GB | 60'C, 97 % | 10344 / 32510 MB │······················· +[2] Tesla V100-SXM2-32GB | 61'C, 95 % | 10340 / 32510 MB │······················· +[3] Tesla V100-SXM2-32GB | 66'C, 95 % | 10340 / 32510 MB │······················· +[4] Tesla V100-SXM2-32GB | 65'C, 94 % | 10356 / 32510 MB │······················· +[5] Tesla V100-SXM2-32GB | 61'C, 95 % | 10400 / 32510 MB │······················· +[6] Tesla V100-SXM2-32GB | 68'C, 96 % | 10384 / 32510 MB │······················· +[7] Tesla V100-SXM2-32GB | 64'C, 95 % | 10328 / 32510 MB │······················· +``` + +- Training Speed + +```python +# (Model Parallel) trainging.log +Training: Speed 2271.33 samples/sec Loss 1.1624 LearningRate 0.2000 Epoch: 0 Global Step: 100 +Training: Speed 2269.94 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 150 +Training: Speed 2272.67 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 200 +Training: Speed 2266.55 samples/sec Loss 0.0000 LearningRate 
0.2000 Epoch: 0 Global Step: 250
+Training: Speed 2272.54 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 300
+
+# (Partial FC 0.1) training.log
+Training: Speed 5299.56 samples/sec Loss 1.0965 LearningRate 0.2000 Epoch: 0 Global Step: 100
+Training: Speed 5296.37 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 150
+Training: Speed 5304.37 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 200
+Training: Speed 5274.43 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 250
+Training: Speed 5300.10 samples/sec Loss 0.0000 LearningRate 0.2000 Epoch: 0 Global Step: 300
+```
+
+In this test case, Partial FC 0.1 only uses 1/3 of the GPU memory of the model parallel,
+and the training speed is 2.5 times faster than the model parallel.
+
+
+## Speed Benchmark
+
+1. Training speed of different parallel methods (samples/second), Tesla V100 32GB * 8. (Larger is better)
+
+| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 |
+| :--- | :--- | :--- | :--- |
+|125000 | 4681 | 4824 | 5004 |
+|250000 | 4047 | 4521 | 4976 |
+|500000 | 3087 | 4013 | 4900 |
+|1000000 | 2090 | 3449 | 4803 |
+|1400000 | 1672 | 3043 | 4738 |
+|2000000 | - | 2593 | 4626 |
+|4000000 | - | 1748 | 4208 |
+|5500000 | - | 1389 | 3975 |
+|8000000 | - | - | 3565 |
+|16000000 | - | - | 2679 |
+|29000000 | - | - | 1855 |
+
+2. GPU memory cost of different parallel methods (GB per GPU), Tesla V100 32GB * 8.
(Smaller is better) + +| Number of Identities in Dataset | Data Parallel | Model Parallel | Partial FC 0.1 | +| :--- | :--- | :--- | :--- | +|125000 | 7358 | 5306 | 4868 | +|250000 | 9940 | 5826 | 5004 | +|500000 | 14220 | 7114 | 5202 | +|1000000 | 23708 | 9966 | 5620 | +|1400000 | 32252 | 11178 | 6056 | +|2000000 | - | 13978 | 6472 | +|4000000 | - | 23238 | 8284 | +|5500000 | - | 32188 | 9854 | +|8000000 | - | - | 12310 | +|16000000 | - | - | 19950 | +|29000000 | - | - | 32324 | diff --git a/insightface/recognition/arcface_torch/eval/__init__.py b/insightface/recognition/arcface_torch/eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/arcface_torch/eval/verification.py b/insightface/recognition/arcface_torch/eval/verification.py new file mode 100644 index 0000000000000000000000000000000000000000..edacf8d8136bc2dadb3d24d37fd2a812d0a443ee --- /dev/null +++ b/insightface/recognition/arcface_torch/eval/verification.py @@ -0,0 +1,409 @@ +"""Helper for evaluation on the Labeled Faces in the Wild dataset +""" + +# MIT License +# +# Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import datetime
import os
import pickle

import mxnet as mx
import numpy as np
import sklearn
import torch
from mxnet import ndarray as nd
from scipy import interpolate
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold


class LFold:
    """KFold wrapper that degenerates to a single train==test split when n_splits == 1."""

    def __init__(self, n_splits=2, shuffle=False):
        self.n_splits = n_splits
        if self.n_splits > 1:
            self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)

    def split(self, indices):
        if self.n_splits > 1:
            return self.k_fold.split(indices)
        else:
            # Degenerate case: evaluate on the same indices used for "training".
            return [(indices, indices)]


def calculate_roc(thresholds,
                  embeddings1,
                  embeddings2,
                  actual_issame,
                  nrof_folds=10,
                  pca=0):
    """Compute mean TPR/FPR over thresholds and per-fold accuracy with k-fold CV.

    embeddings1[i] / embeddings2[i] form the i-th pair; actual_issame[i] marks
    same-identity pairs. Distance is squared Euclidean. When pca > 0, a PCA
    with that many components is fit on each fold's training split first and
    embeddings are re-normalized before computing distances.

    Returns (tpr, fpr, accuracy): mean TPR/FPR arrays over folds and the
    per-fold accuracy at each fold's best training threshold.
    """
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = LFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    indices = np.arange(nrof_pairs)

    if pca == 0:
        # Squared Euclidean distance per pair (no per-fold PCA needed).
        diff = np.subtract(embeddings1, embeddings2)
        dist = np.sum(np.square(diff), 1)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if pca > 0:
            print('doing pca on', fold_idx)
            embed1_train = embeddings1[train_set]
            embed2_train = embeddings2[train_set]
            _embed_train = np.concatenate((embed1_train, embed2_train), axis=0)
            pca_model = PCA(n_components=pca)
            pca_model.fit(_embed_train)
            embed1 = pca_model.transform(embeddings1)
            embed2 = pca_model.transform(embeddings2)
            embed1 = sklearn.preprocessing.normalize(embed1)
            embed2 = sklearn.preprocessing.normalize(embed2)
            diff = np.subtract(embed1, embed2)
            dist = np.sum(np.square(diff), 1)

        # Find the best threshold for the fold on the training split.
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(
                threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(
                threshold, dist[test_set],
                actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(
            thresholds[best_threshold_index], dist[test_set],
            actual_issame[test_set])

    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy


def calculate_accuracy(threshold, dist, actual_issame):
    """Return (tpr, fpr, accuracy) for predicting "same" when dist < threshold."""
    predict_issame = np.less(dist, threshold)
    tp = np.sum(np.logical_and(predict_issame, actual_issame))
    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    tn = np.sum(
        np.logical_and(np.logical_not(predict_issame),
                       np.logical_not(actual_issame)))
    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))

    # Guard the degenerate folds with no positives / no negatives.
    tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn)
    fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn)
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc


def calculate_val(thresholds,
                  embeddings1,
                  embeddings2,
                  actual_issame,
                  far_target,
                  nrof_folds=10):
    """Estimate validation rate (VAL) at a fixed false-accept rate (FAR).

    Per fold, the threshold achieving FAR == far_target is interpolated on the
    training split, then VAL/FAR are measured on the test split.

    Returns (val_mean, val_std, far_mean) across folds.
    """
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = LFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):

        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(
                threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            # Target FAR unreachable on this fold; fall back to threshold 0.
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(
            threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean


def calculate_val_far(threshold, dist, actual_issame):
    """Return (val, far): accept rates on positive and negative pairs at threshold.

    NOTE(review): raises ZeroDivisionError when there are no positive or no
    negative pairs (n_same == 0 or n_diff == 0); callers always pass mixed
    pair sets — confirm before hardening.
    """
    predict_issame = np.less(dist, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(
        np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)
    n_diff = np.sum(np.logical_not(actual_issame))
    # print(true_accept, false_accept)
    # print(n_same, n_diff)
    val = float(true_accept) / float(n_same)
    far = float(false_accept) / float(n_diff)
    return val, far


def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0):
    """Run the full verification protocol on interleaved pair embeddings.

    embeddings[0::2] / embeddings[1::2] are the two members of each pair.
    Returns (tpr, fpr, accuracy, val, val_std, far); VAL is measured at
    FAR = 1e-3.
    """
    # Calculate evaluation metrics
    thresholds = np.arange(0, 4, 0.01)
    embeddings1 = embeddings[0::2]
    embeddings2 = embeddings[1::2]
    tpr, fpr, accuracy = calculate_roc(thresholds,
                                       embeddings1,
                                       embeddings2,
                                       np.asarray(actual_issame),
                                       nrof_folds=nrof_folds,
                                       pca=pca)
    thresholds = np.arange(0, 4, 0.001)
    val, val_std, far = calculate_val(thresholds,
                                      embeddings1,
                                      embeddings2,
                                      np.asarray(actual_issame),
                                      1e-3,
                                      nrof_folds=nrof_folds)
    return tpr, fpr, accuracy, val, val_std, far


@torch.no_grad()
def load_bin(path, image_size):
    """Load a pickled verification .bin set into (data_list, issame_list).

    data_list holds two tensors of shape (2*pairs, 3, H, W): index 0 is the
    original images, index 1 their horizontal flips.
    """
    try:
        with open(path, 'rb') as f:
            bins, issame_list = pickle.load(f)  # py2
    except UnicodeDecodeError:
        # Pickles written by Python 2 need an explicit bytes encoding.
        with open(path, 'rb') as f:
            bins, issame_list = pickle.load(f, encoding='bytes')  # py3
    data_list = []
    for flip in [0, 1]:
        data = torch.empty((len(issame_list) * 2, 3, image_size[0], image_size[1]))
        data_list.append(data)
    for idx in range(len(issame_list) * 2):
        _bin = bins[idx]
        img = mx.image.imdecode(_bin)
        if img.shape[1] != image_size[0]:
            img = mx.image.resize_short(img, image_size[0])
        img = nd.transpose(img, axes=(2, 0, 1))  # HWC -> CHW
        # Inner `flip` deliberately reuses the loop name above; `flip == 1`
        # stores the mirrored copy into data_list[1].
        for flip in [0, 1]:
            if flip == 1:
                img = mx.ndarray.flip(data=img, axis=2)
            data_list[flip][idx][:] = torch.from_numpy(img.asnumpy())
        if idx % 1000 == 0:
            print('loading bin', idx)
    print(data_list[0].shape)
    return data_list, issame_list


@torch.no_grad()
def test(data_set, backbone, batch_size, nfolds=10):
    """Embed a verification set with `backbone` and score it.

    Returns (acc1, std1, acc2, std2, xnorm, embeddings_list); acc1/std1 are
    unused placeholders that are always 0.0, acc2/std2 come from `evaluate`
    on the flip-summed, normalized embeddings, xnorm is the mean embedding norm.
    """
    print('testing verification..')
    data_list = data_set[0]
    issame_list = data_set[1]
    embeddings_list = []
    time_consumed = 0.0
    for i in range(len(data_list)):
        data = data_list[i]
        embeddings = None
        ba = 0
        while ba < data.shape[0]:
            bb = min(ba + batch_size, data.shape[0])
            count = bb - ba
            # Always feed a full batch; for the final partial batch this
            # re-reads earlier rows, and only the last `count` outputs are kept.
            _data = data[bb - batch_size: bb]
            time0 = datetime.datetime.now()
            img = ((_data / 255) - 0.5) / 0.5  # normalize to [-1, 1]
            net_out: torch.Tensor = backbone(img)
            _embeddings = net_out.detach().cpu().numpy()
            time_now = datetime.datetime.now()
            diff = time_now - time0
            time_consumed += diff.total_seconds()
            if embeddings is None:
                embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
            embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :]
            ba = bb
        embeddings_list.append(embeddings)

    _xnorm = 0.0
    _xnorm_cnt = 0
    for embed in embeddings_list:
        for i in range(embed.shape[0]):
            _em = embed[i]
            _norm = np.linalg.norm(_em)
            _xnorm += _norm
            _xnorm_cnt += 1
    _xnorm /= _xnorm_cnt

    embeddings = embeddings_list[0].copy()
    embeddings = sklearn.preprocessing.normalize(embeddings)
    acc1 = 0.0  # placeholder kept for the historical return signature
    std1 = 0.0
    # Flip augmentation: sum original and mirrored embeddings, then normalize.
    embeddings = embeddings_list[0] + embeddings_list[1]
    embeddings = sklearn.preprocessing.normalize(embeddings)
    print(embeddings.shape)
    print('infer time', time_consumed)
    _, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=nfolds)
    acc2, std2 = np.mean(accuracy), np.std(accuracy)
    return acc1, std1, acc2, std2, _xnorm, embeddings_list


def dumpR(data_set,
          backbone,
          batch_size,
          name='',
          data_extra=None,
          label_shape=None):
    """Embed a verification set and pickle (embeddings, issame_list) to temp.bin.

    NOTE(review): this function is broken as written — `_label`, `model` and
    `_data_extra` are never defined (it appears carried over from the MXNet
    implementation) and calling it raises NameError. Left unchanged to avoid
    inventing semantics; confirm intent before repairing or removing.
    """
    print('dump verification embedding..')
    data_list = data_set[0]
    issame_list = data_set[1]
    embeddings_list = []
    time_consumed = 0.0
    for i in range(len(data_list)):
        data = data_list[i]
        embeddings = None
        ba = 0
        while ba < data.shape[0]:
            bb = min(ba + batch_size, data.shape[0])
            count = bb - ba

            _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb)
            time0 = datetime.datetime.now()
            if data_extra is None:
                db = mx.io.DataBatch(data=(_data,), label=(_label,))
            else:
                db = mx.io.DataBatch(data=(_data, _data_extra),
                                     label=(_label,))
            model.forward(db, is_train=False)
            net_out = model.get_outputs()
            _embeddings = net_out[0].asnumpy()
            time_now = datetime.datetime.now()
            diff = time_now - time0
            time_consumed += diff.total_seconds()
            if embeddings is None:
                embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
            embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :]
            ba = bb
        embeddings_list.append(embeddings)
    embeddings = embeddings_list[0] + embeddings_list[1]
    embeddings = sklearn.preprocessing.normalize(embeddings)
    actual_issame = np.asarray(issame_list)
    outname = os.path.join('temp.bin')
    with open(outname, 'wb') as f:
        pickle.dump((embeddings, issame_list),
                    f,
                    protocol=pickle.HIGHEST_PROTOCOL)


# if __name__ == '__main__':
#
#     parser = argparse.ArgumentParser(description='do verification')
#     # general
#     parser.add_argument('--data-dir', default='', help='')
#     parser.add_argument('--model',
#                         default='../model/softmax,50',
#                         help='path to load model.')
#     parser.add_argument('--target',
#                         default='lfw,cfp_ff,cfp_fp,agedb_30',
#                         help='test targets.')
#     parser.add_argument('--gpu', default=0, type=int, help='gpu id')
#     parser.add_argument('--batch-size', default=32, type=int, help='')
#     parser.add_argument('--max', default='', type=str, help='')
#     parser.add_argument('--mode', default=0, type=int, help='')
#     parser.add_argument('--nfolds', default=10, type=int, help='')
#     args = parser.parse_args()
#     image_size = [112, 112]
#     print('image_size', image_size)
#     ctx = mx.gpu(args.gpu)
#     nets = []
#     vec = args.model.split(',')
#     prefix = args.model.split(',')[0]
#     epochs = []
#     if len(vec) == 1:
#         pdir = os.path.dirname(prefix)
#         for fname in os.listdir(pdir):
#             if not fname.endswith('.params'):
#                 continue
#             _file = os.path.join(pdir, fname)
#             if _file.startswith(prefix):
#                 epoch = int(fname.split('.')[0].split('-')[1])
#                 epochs.append(epoch)
#         epochs = sorted(epochs, reverse=True)
#         if len(args.max) > 0:
#             _max = [int(x) for x in args.max.split(',')]
#             assert len(_max) == 2
#             if len(epochs) > _max[1]:
#                 epochs = epochs[_max[0]:_max[1]]
#
#     else:
#         epochs = [int(x) for x in vec[1].split('|')]
#     print('model number', len(epochs))
#     time0 = datetime.datetime.now()
#     for epoch in epochs:
#         print('loading', prefix, epoch)
#         sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
#         # arg_params, aux_params = ch_dev(arg_params, aux_params, ctx)
#         all_layers = sym.get_internals()
#         sym = all_layers['fc1_output']
#         model = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
#         # model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))])
#         model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0],
#                                           image_size[1]))])
#         model.set_params(arg_params, aux_params)
#         nets.append(model)
#     time_now = datetime.datetime.now()
#     diff = time_now - time0
#     print('model loading time', diff.total_seconds())
#
#     ver_list
= [] +# ver_name_list = [] +# for name in args.target.split(','): +# path = os.path.join(args.data_dir, name + ".bin") +# if os.path.exists(path): +# print('loading.. ', name) +# data_set = load_bin(path, image_size) +# ver_list.append(data_set) +# ver_name_list.append(name) +# +# if args.mode == 0: +# for i in range(len(ver_list)): +# results = [] +# for model in nets: +# acc1, std1, acc2, std2, xnorm, embeddings_list = test( +# ver_list[i], model, args.batch_size, args.nfolds) +# print('[%s]XNorm: %f' % (ver_name_list[i], xnorm)) +# print('[%s]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], acc1, std1)) +# print('[%s]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], acc2, std2)) +# results.append(acc2) +# print('Max of [%s] is %1.5f' % (ver_name_list[i], np.max(results))) +# elif args.mode == 1: +# raise ValueError +# else: +# model = nets[0] +# dumpR(ver_list[0], model, args.batch_size, args.target) diff --git a/insightface/recognition/arcface_torch/eval_ijbc.py b/insightface/recognition/arcface_torch/eval_ijbc.py new file mode 100644 index 0000000000000000000000000000000000000000..9c5a650d486d18eb02d6f60d448fc3b315261f5d --- /dev/null +++ b/insightface/recognition/arcface_torch/eval_ijbc.py @@ -0,0 +1,483 @@ +# coding: utf-8 + +import os +import pickle + +import matplotlib +import pandas as pd + +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import timeit +import sklearn +import argparse +import cv2 +import numpy as np +import torch +from skimage import transform as trans +from backbones import get_model +from sklearn.metrics import roc_curve, auc + +from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap +from prettytable import PrettyTable +from pathlib import Path + +import sys +import warnings + +sys.path.insert(0, "../") +warnings.filterwarnings("ignore") + +parser = argparse.ArgumentParser(description='do ijb test') +# general +parser.add_argument('--model-prefix', default='', help='path to load model.') 
+parser.add_argument('--image-path', default='', type=str, help='') +parser.add_argument('--result-dir', default='.', type=str, help='') +parser.add_argument('--batch-size', default=128, type=int, help='') +parser.add_argument('--network', default='iresnet50', type=str, help='') +parser.add_argument('--job', default='insightface', type=str, help='job name') +parser.add_argument('--target', default='IJBC', type=str, help='target, set to IJBC or IJBB') +args = parser.parse_args() + +target = args.target +model_path = args.model_prefix +image_path = args.image_path +result_dir = args.result_dir +gpu_id = None +use_norm_score = True # if True, TestMode(N1) +use_detector_score = True # if True, TestMode(D1) +use_flip_test = True # if True, TestMode(F1) +job = args.job +batch_size = args.batch_size + + +class Embedding(object): + def __init__(self, prefix, data_shape, batch_size=1): + image_size = (112, 112) + self.image_size = image_size + weight = torch.load(prefix) + resnet = get_model(args.network, dropout=0, fp16=False).cuda() + resnet.load_state_dict(weight) + model = torch.nn.DataParallel(resnet) + self.model = model + self.model.eval() + src = np.array([ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 92.2041]], dtype=np.float32) + src[:, 0] += 8.0 + self.src = src + self.batch_size = batch_size + self.data_shape = data_shape + + def get(self, rimg, landmark): + + assert landmark.shape[0] == 68 or landmark.shape[0] == 5 + assert landmark.shape[1] == 2 + if landmark.shape[0] == 68: + landmark5 = np.zeros((5, 2), dtype=np.float32) + landmark5[0] = (landmark[36] + landmark[39]) / 2 + landmark5[1] = (landmark[42] + landmark[45]) / 2 + landmark5[2] = landmark[30] + landmark5[3] = landmark[48] + landmark5[4] = landmark[54] + else: + landmark5 = landmark + tform = trans.SimilarityTransform() + tform.estimate(landmark5, self.src) + M = tform.params[0:2, :] + img = cv2.warpAffine(rimg, + M, (self.image_size[1],
self.image_size[0]), + borderValue=0.0) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img_flip = np.fliplr(img) + img = np.transpose(img, (2, 0, 1)) # 3*112*112, RGB + img_flip = np.transpose(img_flip, (2, 0, 1)) + input_blob = np.zeros((2, 3, self.image_size[1], self.image_size[0]), dtype=np.uint8) + input_blob[0] = img + input_blob[1] = img_flip + return input_blob + + @torch.no_grad() + def forward_db(self, batch_data): + imgs = torch.Tensor(batch_data).cuda() + imgs.div_(255).sub_(0.5).div_(0.5) + feat = self.model(imgs) + feat = feat.reshape([self.batch_size, 2 * feat.shape[1]]) + return feat.cpu().numpy() + + +# 将一个list尽量均分成n份,限制len(list)==n,份数大于原list内元素个数则分配空list[] +def divideIntoNstrand(listTemp, n): + twoList = [[] for i in range(n)] + for i, e in enumerate(listTemp): + twoList[i % n].append(e) + return twoList + + +def read_template_media_list(path): + # ijb_meta = np.loadtxt(path, dtype=str) + ijb_meta = pd.read_csv(path, sep=' ', header=None).values + templates = ijb_meta[:, 1].astype(np.int) + medias = ijb_meta[:, 2].astype(np.int) + return templates, medias + + +# In[ ]: + + +def read_template_pair_list(path): + # pairs = np.loadtxt(path, dtype=str) + pairs = pd.read_csv(path, sep=' ', header=None).values + # print(pairs.shape) + # print(pairs[:, 0].astype(np.int)) + t1 = pairs[:, 0].astype(np.int) + t2 = pairs[:, 1].astype(np.int) + label = pairs[:, 2].astype(np.int) + return t1, t2, label + + +# In[ ]: + + +def read_image_feature(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +# In[ ]: + + +def get_image_feature(img_path, files_list, model_path, epoch, gpu_id): + batch_size = args.batch_size + data_shape = (3, 112, 112) + + files = files_list + print('files:', len(files)) + rare_size = len(files) % batch_size + faceness_scores = [] + batch = 0 + img_feats = np.empty((len(files), 1024), dtype=np.float32) + + batch_data = np.empty((2 * batch_size, 3, 112, 112)) + embedding = Embedding(model_path, 
data_shape, batch_size) + for img_index, each_line in enumerate(files[:len(files) - rare_size]): + name_lmk_score = each_line.strip().split(' ') + img_name = os.path.join(img_path, name_lmk_score[0]) + img = cv2.imread(img_name) + lmk = np.array([float(x) for x in name_lmk_score[1:-1]], + dtype=np.float32) + lmk = lmk.reshape((5, 2)) + input_blob = embedding.get(img, lmk) + + batch_data[2 * (img_index - batch * batch_size)][:] = input_blob[0] + batch_data[2 * (img_index - batch * batch_size) + 1][:] = input_blob[1] + if (img_index + 1) % batch_size == 0: + print('batch', batch) + img_feats[batch * batch_size:batch * batch_size + + batch_size][:] = embedding.forward_db(batch_data) + batch += 1 + faceness_scores.append(name_lmk_score[-1]) + + batch_data = np.empty((2 * rare_size, 3, 112, 112)) + embedding = Embedding(model_path, data_shape, rare_size) + for img_index, each_line in enumerate(files[len(files) - rare_size:]): + name_lmk_score = each_line.strip().split(' ') + img_name = os.path.join(img_path, name_lmk_score[0]) + img = cv2.imread(img_name) + lmk = np.array([float(x) for x in name_lmk_score[1:-1]], + dtype=np.float32) + lmk = lmk.reshape((5, 2)) + input_blob = embedding.get(img, lmk) + batch_data[2 * img_index][:] = input_blob[0] + batch_data[2 * img_index + 1][:] = input_blob[1] + if (img_index + 1) % rare_size == 0: + print('batch', batch) + img_feats[len(files) - + rare_size:][:] = embedding.forward_db(batch_data) + batch += 1 + faceness_scores.append(name_lmk_score[-1]) + faceness_scores = np.array(faceness_scores).astype(np.float32) + # img_feats = np.ones( (len(files), 1024), dtype=np.float32) * 0.01 + # faceness_scores = np.ones( (len(files), ), dtype=np.float32 ) + return img_feats, faceness_scores + + +# In[ ]: + + +def image2template_feature(img_feats=None, templates=None, medias=None): + # ========================================================== + # 1. face image feature l2 normalization. img_feats:[number_image x feats_dim] + # 2. 
compute media feature. + # 3. compute template feature. + # ========================================================== + unique_templates = np.unique(templates) + template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) + + for count_template, uqt in enumerate(unique_templates): + + (ind_t,) = np.where(templates == uqt) + face_norm_feats = img_feats[ind_t] + face_medias = medias[ind_t] + unique_medias, unique_media_counts = np.unique(face_medias, + return_counts=True) + media_norm_feats = [] + for u, ct in zip(unique_medias, unique_media_counts): + (ind_m,) = np.where(face_medias == u) + if ct == 1: + media_norm_feats += [face_norm_feats[ind_m]] + else: # image features from the same video will be aggregated into one feature + media_norm_feats += [ + np.mean(face_norm_feats[ind_m], axis=0, keepdims=True) + ] + media_norm_feats = np.array(media_norm_feats) + # media_norm_feats = media_norm_feats / np.sqrt(np.sum(media_norm_feats ** 2, -1, keepdims=True)) + template_feats[count_template] = np.sum(media_norm_feats, axis=0) + if count_template % 2000 == 0: + print('Finish Calculating {} template features.'.format( + count_template)) + # template_norm_feats = template_feats / np.sqrt(np.sum(template_feats ** 2, -1, keepdims=True)) + template_norm_feats = sklearn.preprocessing.normalize(template_feats) + # print(template_norm_feats.shape) + return template_norm_feats, unique_templates + + +# In[ ]: + + +def verification(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + # ========================================================== + # Compute set-to-set Similarity Score. 
+ # ========================================================== + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + + score = np.zeros((len(p1),)) # save cosine distance between pairs + + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limitation + sublists = [ + total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize) + ] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +# In[ ]: +def verification2(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) # save cosine distance between pairs + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limitation + sublists = [ + total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize) + ] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def read_score(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +# # Step1: Load Meta Data + +# In[ ]: + +assert target == 'IJBC' or target == 'IJBB' + +#
============================================================= +# load image and template relationships for template feature embedding +# tid --> template id, mid --> media id +# format: +# image_name tid mid +# ============================================================= +start = timeit.default_timer() +templates, medias = read_template_media_list( + os.path.join('%s/meta' % image_path, + '%s_face_tid_mid.txt' % target.lower())) +stop = timeit.default_timer() +print('Time: %.2f s. ' % (stop - start)) + +# In[ ]: + +# ============================================================= +# load template pairs for template-to-template verification +# tid : template id, label : 1/0 +# format: +# tid_1 tid_2 label +# ============================================================= +start = timeit.default_timer() +p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % image_path, + '%s_template_pair_label.txt' % target.lower())) +stop = timeit.default_timer() +print('Time: %.2f s. ' % (stop - start)) + +# # Step 2: Get Image Features + +# In[ ]: + +# ============================================================= +# load image features +# format: +# img_feats: [image_num x feats_dim] (227630, 512) +# ============================================================= +start = timeit.default_timer() +img_path = '%s/loose_crop' % image_path +img_list_path = '%s/meta/%s_name_5pts_score.txt' % (image_path, target.lower()) +img_list = open(img_list_path) +files = img_list.readlines() +# files_list = divideIntoNstrand(files, rank_size) +files_list = files + +# img_feats +# for i in range(rank_size): +img_feats, faceness_scores = get_image_feature(img_path, files_list, + model_path, 0, gpu_id) +stop = timeit.default_timer() +print('Time: %.2f s. 
' % (stop - start)) +print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0], + img_feats.shape[1])) + +# # Step3: Get Template Features + +# In[ ]: + +# ============================================================= +# compute template features from image features. +# ============================================================= +start = timeit.default_timer() +# ========================================================== +# Norm feature before aggregation into template feature? +# Feature norm from embedding network and faceness score are able to decrease weights for noise samples (not face). +# ========================================================== +# 1. FaceScore (Feature Norm) +# 2. FaceScore (Detector) + +if use_flip_test: + # concat --- F1 + # img_input_feats = img_feats + # add --- F2 + img_input_feats = img_feats[:, 0:img_feats.shape[1] // + 2] + img_feats[:, img_feats.shape[1] // 2:] +else: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + +if use_norm_score: + img_input_feats = img_input_feats +else: + # normalise features to remove norm information + img_input_feats = img_input_feats / np.sqrt( + np.sum(img_input_feats ** 2, -1, keepdims=True)) + +if use_detector_score: + print(img_input_feats.shape, faceness_scores.shape) + img_input_feats = img_input_feats * faceness_scores[:, np.newaxis] +else: + img_input_feats = img_input_feats + +template_norm_feats, unique_templates = image2template_feature( + img_input_feats, templates, medias) +stop = timeit.default_timer() +print('Time: %.2f s. ' % (stop - start)) + +# # Step 4: Get Template Similarity Scores + +# In[ ]: + +# ============================================================= +# compute verification scores between template pairs. +# ============================================================= +start = timeit.default_timer() +score = verification(template_norm_feats, unique_templates, p1, p2) +stop = timeit.default_timer() +print('Time: %.2f s. 
' % (stop - start)) + +# In[ ]: +save_path = os.path.join(result_dir, args.job) +# save_path = result_dir + '/%s_result' % target + +if not os.path.exists(save_path): + os.makedirs(save_path) + +score_save_file = os.path.join(save_path, "%s.npy" % target.lower()) +np.save(score_save_file, score) + +# # Step 5: Get ROC Curves and TPR@FPR Table + +# In[ ]: + +files = [score_save_file] +methods = [] +scores = [] +for file in files: + methods.append(Path(file).stem) + scores.append(np.load(file)) + +methods = np.array(methods) +scores = dict(zip(methods, scores)) +colours = dict( + zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2'))) +x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] +tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels]) +fig = plt.figure() +for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + roc_auc = auc(fpr, tpr) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) # select largest tpr at same fpr + plt.plot(fpr, + tpr, + color=colours[method], + lw=1, + label=('[%s (AUC = %0.4f %%)]' % + (method.split('-')[-1], roc_auc * 100))) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, target)) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) +plt.xlim([10 ** -6, 0.1]) +plt.ylim([0.3, 1.0]) +plt.grid(linestyle='--', linewidth=1) +plt.xticks(x_labels) +plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True)) +plt.xscale('log') +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('ROC on IJB') +plt.legend(loc="lower right") +fig.savefig(os.path.join(save_path, '%s.pdf' % target.lower())) +print(tpr_fpr_table) diff --git a/insightface/recognition/arcface_torch/flops.py b/insightface/recognition/arcface_torch/flops.py new file mode 100644 index 
0000000000000000000000000000000000000000..e704b7b584a27d85fa51623d70828d0d42cfa853 --- /dev/null +++ b/insightface/recognition/arcface_torch/flops.py @@ -0,0 +1,20 @@ +from ptflops import get_model_complexity_info +from backbones import get_model +import argparse + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='') + parser.add_argument('n', type=str, default="r100") + args = parser.parse_args() + net = get_model(args.n) + macs, params = get_model_complexity_info( + net, (3, 112, 112), as_strings=False, + print_per_layer_stat=True, verbose=True) + gmacs = macs / (1000**3) + print("%.3f GFLOPs"%gmacs) + print("%.3f Mparams"%(params/(1000**2))) + + if hasattr(net, "extra_gflops"): + print("%.3f Extra-GFLOPs"%net.extra_gflops) + print("%.3f Total-GFLOPs"%(gmacs+net.extra_gflops)) + diff --git a/insightface/recognition/arcface_torch/inference.py b/insightface/recognition/arcface_torch/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..3e5156e8d649954837e397c2ff15ec29995e7502 --- /dev/null +++ b/insightface/recognition/arcface_torch/inference.py @@ -0,0 +1,35 @@ +import argparse + +import cv2 +import numpy as np +import torch + +from backbones import get_model + + +@torch.no_grad() +def inference(weight, name, img): + if img is None: + img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.uint8) + else: + img = cv2.imread(img) + img = cv2.resize(img, (112, 112)) + + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = np.transpose(img, (2, 0, 1)) + img = torch.from_numpy(img).unsqueeze(0).float() + img.div_(255).sub_(0.5).div_(0.5) + net = get_model(name, fp16=False) + net.load_state_dict(torch.load(weight)) + net.eval() + feat = net(img).numpy() + print(feat) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='PyTorch ArcFace Training') + parser.add_argument('--network', type=str, default='r50', help='backbone network') + parser.add_argument('--weight', type=str, default='') + 
parser.add_argument('--img', type=str, default=None) + args = parser.parse_args() + inference(args.weight, args.network, args.img) diff --git a/insightface/recognition/arcface_torch/losses.py b/insightface/recognition/arcface_torch/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..7d54414c77271dd6455d77b0ab06883d797fcd3a --- /dev/null +++ b/insightface/recognition/arcface_torch/losses.py @@ -0,0 +1,100 @@ +import torch +import math + + +class CombinedMarginLoss(torch.nn.Module): + def __init__(self, + s, + m1, + m2, + m3, + interclass_filtering_threshold=0): + super().__init__() + self.s = s + self.m1 = m1 + self.m2 = m2 + self.m3 = m3 + self.interclass_filtering_threshold = interclass_filtering_threshold + + # For ArcFace + self.cos_m = math.cos(self.m2) + self.sin_m = math.sin(self.m2) + self.theta = math.cos(math.pi - self.m2) + self.sinmm = math.sin(math.pi - self.m2) * self.m2 + self.easy_margin = False + + + def forward(self, logits, labels): + index_positive = torch.where(labels != -1)[0] + + if self.interclass_filtering_threshold > 0: + with torch.no_grad(): + dirty = logits > self.interclass_filtering_threshold + dirty = dirty.float() + mask = torch.ones([index_positive.size(0), logits.size(1)], device=logits.device) + mask.scatter_(1, labels[index_positive], 0) + dirty[index_positive] *= mask + tensor_mul = 1 - dirty + logits = tensor_mul * logits + + target_logit = logits[index_positive, labels[index_positive].view(-1)] + + if self.m1 == 1.0 and self.m3 == 0.0: + with torch.no_grad(): + target_logit.arccos_() + logits.arccos_() + final_target_logit = target_logit + self.m2 + logits[index_positive, labels[index_positive].view(-1)] = final_target_logit + logits.cos_() + logits = logits * self.s + + elif self.m3 > 0: + final_target_logit = target_logit - self.m3 + logits[index_positive, labels[index_positive].view(-1)] = final_target_logit + logits = logits * self.s + else: + raise + + return logits + +class 
ArcFace(torch.nn.Module): + """ ArcFace (https://arxiv.org/pdf/1801.07698v1.pdf): + """ + def __init__(self, s=64.0, margin=0.5): + super(ArcFace, self).__init__() + self.s = s + self.margin = margin + self.cos_m = math.cos(margin) + self.sin_m = math.sin(margin) + self.theta = math.cos(math.pi - margin) + self.sinmm = math.sin(math.pi - margin) * margin + self.easy_margin = False + + + def forward(self, logits: torch.Tensor, labels: torch.Tensor): + index = torch.where(labels != -1)[0] + target_logit = logits[index, labels[index].view(-1)] + + with torch.no_grad(): + target_logit.arccos_() + logits.arccos_() + final_target_logit = target_logit + self.margin + logits[index, labels[index].view(-1)] = final_target_logit + logits.cos_() + logits = logits * self.s + return logits + + +class CosFace(torch.nn.Module): + def __init__(self, s=64.0, m=0.40): + super(CosFace, self).__init__() + self.s = s + self.m = m + + def forward(self, logits: torch.Tensor, labels: torch.Tensor): + index = torch.where(labels != -1)[0] + target_logit = logits[index, labels[index].view(-1)] + final_target_logit = target_logit - self.m + logits[index, labels[index].view(-1)] = final_target_logit + logits = logits * self.s + return logits diff --git a/insightface/recognition/arcface_torch/lr_scheduler.py b/insightface/recognition/arcface_torch/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..6f3cda31d500b54b8a1ad9b36abfeff9cc754ef6 --- /dev/null +++ b/insightface/recognition/arcface_torch/lr_scheduler.py @@ -0,0 +1,86 @@ +from torch.optim.lr_scheduler import _LRScheduler +from torch.optim import SGD +import torch +import warnings + +class PolynomialLRWarmup(_LRScheduler): + def __init__(self, optimizer, warmup_iters, total_iters=5, power=1.0, last_epoch=-1, verbose=False): + super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose) + self.total_iters = total_iters + self.power = power + self.warmup_iters = warmup_iters + + + def get_lr(self): + 
if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", UserWarning) + + if self.last_epoch == 0 or self.last_epoch > self.total_iters: + return [group["lr"] for group in self.optimizer.param_groups] + + if self.last_epoch <= self.warmup_iters: + return [base_lr * self.last_epoch / self.warmup_iters for base_lr in self.base_lrs] + else: + l = self.last_epoch + w = self.warmup_iters + t = self.total_iters + decay_factor = ((1.0 - (l - w) / (t - w)) / (1.0 - (l - 1 - w) / (t - w))) ** self.power + return [group["lr"] * decay_factor for group in self.optimizer.param_groups] + + def _get_closed_form_lr(self): + + if self.last_epoch <= self.warmup_iters: + return [ + base_lr * self.last_epoch / self.warmup_iters for base_lr in self.base_lrs] + else: + return [ + ( + base_lr * (1.0 - (min(self.total_iters, self.last_epoch) - self.warmup_iters) / (self.total_iters - self.warmup_iters)) ** self.power + ) + for base_lr in self.base_lrs + ] + + +if __name__ == "__main__": + + class TestModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(32, 32) + + def forward(self, x): + return self.linear(x) + + test_module = TestModule() + test_module_pfc = TestModule() + lr_pfc_weight = 1 / 3 + base_lr = 10 + total_steps = 1000 + + sgd = SGD([ + {"params": test_module.parameters(), "lr": base_lr}, + {"params": test_module_pfc.parameters(), "lr": base_lr * lr_pfc_weight} + ], base_lr) + + scheduler = PolynomialLRWarmup(sgd, total_steps//10, total_steps, power=2) + + x = [] + y = [] + y_pfc = [] + for i in range(total_steps): + scheduler.step() + lr = scheduler.get_last_lr()[0] + lr_pfc = scheduler.get_last_lr()[1] + x.append(i) + y.append(lr) + y_pfc.append(lr_pfc) + + import matplotlib.pyplot as plt + fontsize=15 + plt.figure(figsize=(6, 6)) + plt.plot(x, y, linestyle='-', linewidth=2, ) + plt.plot(x, y_pfc, linestyle='-', linewidth=2, ) + 
plt.xlabel('Iterations') # x_label + plt.ylabel("Lr") # y_label + plt.savefig("tmp.png", dpi=600, bbox_inches='tight') diff --git a/insightface/recognition/arcface_torch/onnx_helper.py b/insightface/recognition/arcface_torch/onnx_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..ca922ca6d410655029e459cf8fd1c323d276c34c --- /dev/null +++ b/insightface/recognition/arcface_torch/onnx_helper.py @@ -0,0 +1,250 @@ +from __future__ import division +import datetime +import os +import os.path as osp +import glob +import numpy as np +import cv2 +import sys +import onnxruntime +import onnx +import argparse +from onnx import numpy_helper +from insightface.data import get_image + +class ArcFaceORT: + def __init__(self, model_path, cpu=False): + self.model_path = model_path + # providers = None will use available provider, for onnxruntime-gpu it will be "CUDAExecutionProvider" + self.providers = ['CPUExecutionProvider'] if cpu else None + + #input_size is (w,h), return error message, return None if success + def check(self, track='cfat', test_img = None): + #default is cfat + max_model_size_mb=1024 + max_feat_dim=512 + max_time_cost=15 + if track.startswith('ms1m'): + max_model_size_mb=1024 + max_feat_dim=512 + max_time_cost=10 + elif track.startswith('glint'): + max_model_size_mb=1024 + max_feat_dim=1024 + max_time_cost=20 + elif track.startswith('cfat'): + max_model_size_mb = 1024 + max_feat_dim = 512 + max_time_cost = 15 + elif track.startswith('unconstrained'): + max_model_size_mb=1024 + max_feat_dim=1024 + max_time_cost=30 + else: + return "track not found" + + if not os.path.exists(self.model_path): + return "model_path not exists" + if not os.path.isdir(self.model_path): + return "model_path should be directory" + onnx_files = [] + for _file in os.listdir(self.model_path): + if _file.endswith('.onnx'): + onnx_files.append(osp.join(self.model_path, _file)) + if len(onnx_files)==0: + return "do not have onnx files" + self.model_file = 
sorted(onnx_files)[-1] + print('use onnx-model:', self.model_file) + try: + session = onnxruntime.InferenceSession(self.model_file, providers=self.providers) + except: + return "load onnx failed" + input_cfg = session.get_inputs()[0] + input_shape = input_cfg.shape + print('input-shape:', input_shape) + if len(input_shape)!=4: + return "length of input_shape should be 4" + if not isinstance(input_shape[0], str): + #return "input_shape[0] should be str to support batch-inference" + print('reset input-shape[0] to None') + model = onnx.load(self.model_file) + model.graph.input[0].type.tensor_type.shape.dim[0].dim_param = 'None' + new_model_file = osp.join(self.model_path, 'zzzzrefined.onnx') + onnx.save(model, new_model_file) + self.model_file = new_model_file + print('use new onnx-model:', self.model_file) + try: + session = onnxruntime.InferenceSession(self.model_file, providers=self.providers) + except: + return "load onnx failed" + input_cfg = session.get_inputs()[0] + input_shape = input_cfg.shape + print('new-input-shape:', input_shape) + + self.image_size = tuple(input_shape[2:4][::-1]) + #print('image_size:', self.image_size) + input_name = input_cfg.name + outputs = session.get_outputs() + output_names = [] + for o in outputs: + output_names.append(o.name) + #print(o.name, o.shape) + if len(output_names)!=1: + return "number of output nodes should be 1" + self.session = session + self.input_name = input_name + self.output_names = output_names + #print(self.output_names) + model = onnx.load(self.model_file) + graph = model.graph + if len(graph.node)<8: + return "too small onnx graph" + + input_size = (112,112) + self.crop = None + if track=='cfat': + crop_file = osp.join(self.model_path, 'crop.txt') + if osp.exists(crop_file): + lines = open(crop_file,'r').readlines() + if len(lines)!=6: + return "crop.txt should contain 6 lines" + lines = [int(x) for x in lines] + self.crop = lines[:4] + input_size = tuple(lines[4:6]) + if input_size!=self.image_size: + 
return "input-size is inconsistant with onnx model input, %s vs %s"%(input_size, self.image_size) + + self.model_size_mb = os.path.getsize(self.model_file) / float(1024*1024) + if self.model_size_mb > max_model_size_mb: + return "max model size exceed, given %.3f-MB"%self.model_size_mb + + input_mean = None + input_std = None + if track=='cfat': + pn_file = osp.join(self.model_path, 'pixel_norm.txt') + if osp.exists(pn_file): + lines = open(pn_file,'r').readlines() + if len(lines)!=2: + return "pixel_norm.txt should contain 2 lines" + input_mean = float(lines[0]) + input_std = float(lines[1]) + if input_mean is not None or input_std is not None: + if input_mean is None or input_std is None: + return "please set input_mean and input_std simultaneously" + else: + find_sub = False + find_mul = False + for nid, node in enumerate(graph.node[:8]): + print(nid, node.name) + if node.name.startswith('Sub') or node.name.startswith('_minus'): + find_sub = True + if node.name.startswith('Mul') or node.name.startswith('_mul') or node.name.startswith('Div'): + find_mul = True + if find_sub and find_mul: + print("find sub and mul") + #mxnet arcface model + input_mean = 0.0 + input_std = 1.0 + else: + input_mean = 127.5 + input_std = 127.5 + self.input_mean = input_mean + self.input_std = input_std + for initn in graph.initializer: + weight_array = numpy_helper.to_array(initn) + dt = weight_array.dtype + if dt.itemsize<4: + return 'invalid weight type - (%s:%s)' % (initn.name, dt.name) + if test_img is None: + test_img = get_image('Tom_Hanks_54745') + test_img = cv2.resize(test_img, self.image_size) + else: + test_img = cv2.resize(test_img, self.image_size) + feat, cost = self.benchmark(test_img) + batch_result = self.check_batch(test_img) + batch_result_sum = float(np.sum(batch_result)) + if batch_result_sum in [float('inf'), -float('inf')] or batch_result_sum != batch_result_sum: + print(batch_result) + print(batch_result_sum) + return "batch result output contains NaN!" 
+ + if len(feat.shape) < 2: + return "the shape of the feature must be two, but get {}".format(str(feat.shape)) + + if feat.shape[1] > max_feat_dim: + return "max feat dim exceed, given %d"%feat.shape[1] + self.feat_dim = feat.shape[1] + cost_ms = cost*1000 + if cost_ms>max_time_cost: + return "max time cost exceed, given %.4f"%cost_ms + self.cost_ms = cost_ms + print('check stat:, model-size-mb: %.4f, feat-dim: %d, time-cost-ms: %.4f, input-mean: %.3f, input-std: %.3f'%(self.model_size_mb, self.feat_dim, self.cost_ms, self.input_mean, self.input_std)) + return None + + def check_batch(self, img): + if not isinstance(img, list): + imgs = [img, ] * 32 + if self.crop is not None: + nimgs = [] + for img in imgs: + nimg = img[self.crop[1]:self.crop[3], self.crop[0]:self.crop[2], :] + if nimg.shape[0] != self.image_size[1] or nimg.shape[1] != self.image_size[0]: + nimg = cv2.resize(nimg, self.image_size) + nimgs.append(nimg) + imgs = nimgs + blob = cv2.dnn.blobFromImages( + images=imgs, scalefactor=1.0 / self.input_std, size=self.image_size, + mean=(self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] + return net_out + + + def meta_info(self): + return {'model-size-mb':self.model_size_mb, 'feature-dim':self.feat_dim, 'infer': self.cost_ms} + + + def forward(self, imgs): + if not isinstance(imgs, list): + imgs = [imgs] + input_size = self.image_size + if self.crop is not None: + nimgs = [] + for img in imgs: + nimg = img[self.crop[1]:self.crop[3],self.crop[0]:self.crop[2],:] + if nimg.shape[0]!=input_size[1] or nimg.shape[1]!=input_size[0]: + nimg = cv2.resize(nimg, input_size) + nimgs.append(nimg) + imgs = nimgs + blob = cv2.dnn.blobFromImages(imgs, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_out = self.session.run(self.output_names, {self.input_name : blob})[0] + return net_out + + def benchmark(self, img): + input_size = 
self.image_size + if self.crop is not None: + nimg = img[self.crop[1]:self.crop[3],self.crop[0]:self.crop[2],:] + if nimg.shape[0]!=input_size[1] or nimg.shape[1]!=input_size[0]: + nimg = cv2.resize(nimg, input_size) + img = nimg + blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + costs = [] + for _ in range(50): + ta = datetime.datetime.now() + net_out = self.session.run(self.output_names, {self.input_name : blob})[0] + tb = datetime.datetime.now() + cost = (tb-ta).total_seconds() + costs.append(cost) + costs = sorted(costs) + cost = costs[5] + return net_out, cost + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='') + # general + parser.add_argument('workdir', help='submitted work dir', type=str) + parser.add_argument('--track', help='track name, for different challenge', type=str, default='cfat') + args = parser.parse_args() + handler = ArcFaceORT(args.workdir) + err = handler.check(args.track) + print('err:', err) diff --git a/insightface/recognition/arcface_torch/onnx_ijbc.py b/insightface/recognition/arcface_torch/onnx_ijbc.py new file mode 100644 index 0000000000000000000000000000000000000000..31c491b1be03da5f778d7a2ebad4fa63b99d2f9a --- /dev/null +++ b/insightface/recognition/arcface_torch/onnx_ijbc.py @@ -0,0 +1,269 @@ +import argparse +import os +import pickle +import timeit + +import cv2 +import mxnet as mx +import numpy as np +import pandas as pd +import prettytable +import skimage.transform +import torch +from sklearn.metrics import roc_curve +from sklearn.preprocessing import normalize +from torch.utils.data import DataLoader +from onnx_helper import ArcFaceORT + +SRC = np.array( + [ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 92.2041]] + , dtype=np.float32) +SRC[:, 0] += 8.0 + + +@torch.no_grad() +class AlignedDataSet(mx.gluon.data.Dataset): + def __init__(self, root, lines, 
align=True): + self.lines = lines + self.root = root + self.align = align + + def __len__(self): + return len(self.lines) + + def __getitem__(self, idx): + each_line = self.lines[idx] + name_lmk_score = each_line.strip().split(' ') + name = os.path.join(self.root, name_lmk_score[0]) + img = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB) + landmark5 = np.array([float(x) for x in name_lmk_score[1:-1]], dtype=np.float32).reshape((5, 2)) + st = skimage.transform.SimilarityTransform() + st.estimate(landmark5, SRC) + img = cv2.warpAffine(img, st.params[0:2, :], (112, 112), borderValue=0.0) + img_1 = np.expand_dims(img, 0) + img_2 = np.expand_dims(np.fliplr(img), 0) + output = np.concatenate((img_1, img_2), axis=0).astype(np.float32) + output = np.transpose(output, (0, 3, 1, 2)) + return torch.from_numpy(output) + + +@torch.no_grad() +def extract(model_root, dataset): + model = ArcFaceORT(model_path=model_root) + model.check() + feat_mat = np.zeros(shape=(len(dataset), 2 * model.feat_dim)) + + def collate_fn(data): + return torch.cat(data, dim=0) + + data_loader = DataLoader( + dataset, batch_size=128, drop_last=False, num_workers=4, collate_fn=collate_fn, ) + num_iter = 0 + for batch in data_loader: + batch = batch.numpy() + batch = (batch - model.input_mean) / model.input_std + feat = model.session.run(model.output_names, {model.input_name: batch})[0] + feat = np.reshape(feat, (-1, model.feat_dim * 2)) + feat_mat[128 * num_iter: 128 * num_iter + feat.shape[0], :] = feat + num_iter += 1 + if num_iter % 50 == 0: + print(num_iter) + return feat_mat + + +def read_template_media_list(path): + ijb_meta = pd.read_csv(path, sep=' ', header=None).values + templates = ijb_meta[:, 1].astype(int) + medias = ijb_meta[:, 2].astype(int) + return templates, medias + + +def read_template_pair_list(path): + pairs = pd.read_csv(path, sep=' ', header=None).values + t1 = pairs[:, 0].astype(int) + t2 = pairs[:, 1].astype(int) + label = pairs[:, 2].astype(int) + return t1, 
t2, label + + +def read_image_feature(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +def image2template_feature(img_feats=None, + templates=None, + medias=None): + unique_templates = np.unique(templates) + template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) + for count_template, uqt in enumerate(unique_templates): + (ind_t,) = np.where(templates == uqt) + face_norm_feats = img_feats[ind_t] + face_medias = medias[ind_t] + unique_medias, unique_media_counts = np.unique(face_medias, return_counts=True) + media_norm_feats = [] + for u, ct in zip(unique_medias, unique_media_counts): + (ind_m,) = np.where(face_medias == u) + if ct == 1: + media_norm_feats += [face_norm_feats[ind_m]] + else: # image features from the same video will be aggregated into one feature + media_norm_feats += [np.mean(face_norm_feats[ind_m], axis=0, keepdims=True), ] + media_norm_feats = np.array(media_norm_feats) + template_feats[count_template] = np.sum(media_norm_feats, axis=0) + if count_template % 2000 == 0: + print('Finish Calculating {} template features.'.format( + count_template)) + template_norm_feats = normalize(template_feats) + return template_norm_feats, unique_templates + + +def verification(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) + total_pairs = np.array(range(len(p1))) + batchsize = 100000 + sublists = [total_pairs[i: i + batchsize] for i in range(0, len(p1), batchsize)] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, 
total_sublists)) + return score + + +def verification2(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) # save cosine distance between pairs + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation + sublists = [total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize)] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def main(args): + use_norm_score = True # if Ture, TestMode(N1) + use_detector_score = True # if Ture, TestMode(D1) + use_flip_test = True # if Ture, TestMode(F1) + assert args.target == 'IJBC' or args.target == 'IJBB' + + start = timeit.default_timer() + templates, medias = read_template_media_list( + os.path.join('%s/meta' % args.image_path, '%s_face_tid_mid.txt' % args.target.lower())) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + + start = timeit.default_timer() + p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % args.image_path, + '%s_template_pair_label.txt' % args.target.lower())) + stop = timeit.default_timer() + print('Time: %.2f s. 
' % (stop - start)) + + start = timeit.default_timer() + img_path = '%s/loose_crop' % args.image_path + img_list_path = '%s/meta/%s_name_5pts_score.txt' % (args.image_path, args.target.lower()) + img_list = open(img_list_path) + files = img_list.readlines() + dataset = AlignedDataSet(root=img_path, lines=files, align=True) + img_feats = extract(args.model_root, dataset) + + faceness_scores = [] + for each_line in files: + name_lmk_score = each_line.split() + faceness_scores.append(name_lmk_score[-1]) + faceness_scores = np.array(faceness_scores).astype(np.float32) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0], img_feats.shape[1])) + start = timeit.default_timer() + + if use_flip_test: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + img_feats[:, img_feats.shape[1] // 2:] + else: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + + if use_norm_score: + img_input_feats = img_input_feats + else: + img_input_feats = img_input_feats / np.sqrt(np.sum(img_input_feats ** 2, -1, keepdims=True)) + + if use_detector_score: + print(img_input_feats.shape, faceness_scores.shape) + img_input_feats = img_input_feats * faceness_scores[:, np.newaxis] + else: + img_input_feats = img_input_feats + + template_norm_feats, unique_templates = image2template_feature( + img_input_feats, templates, medias) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + + start = timeit.default_timer() + score = verification(template_norm_feats, unique_templates, p1, p2) + stop = timeit.default_timer() + print('Time: %.2f s. 
' % (stop - start)) + result_dir = args.model_root + + save_path = os.path.join(result_dir, "{}_result".format(args.target)) + if not os.path.exists(save_path): + os.makedirs(save_path) + score_save_file = os.path.join(save_path, "{}.npy".format(args.target)) + np.save(score_save_file, score) + files = [score_save_file] + methods = [] + scores = [] + for file in files: + methods.append(os.path.basename(file)) + scores.append(np.load(file)) + methods = np.array(methods) + scores = dict(zip(methods, scores)) + x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] + tpr_fpr_table = prettytable.PrettyTable(['Methods'] + [str(x) for x in x_labels]) + for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, args.target)) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) + print(tpr_fpr_table) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='do ijb test') + # general + parser.add_argument('--model-root', default='', help='path to load model.') + parser.add_argument('--image-path', default='/train_tmp/IJB_release/IJBC', type=str, help='') + parser.add_argument('--target', default='IJBC', type=str, help='target, set to IJBC or IJBB') + main(parser.parse_args()) diff --git a/insightface/recognition/arcface_torch/partial_fc_v2.py b/insightface/recognition/arcface_torch/partial_fc_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..0752554ca1a99c35347dce6cccd121b5cd69f9c6 --- /dev/null +++ b/insightface/recognition/arcface_torch/partial_fc_v2.py @@ -0,0 +1,260 @@ + +import math +from typing import Callable + +import torch +from torch import distributed +from torch.nn.functional import linear, normalize + + +class 
PartialFC_V2(torch.nn.Module): + """ + https://arxiv.org/abs/2203.15565 + A distributed sparsely updating variant of the FC layer, named Partial FC (PFC). + When sample rate less than 1, in each iteration, positive class centers and a random subset of + negative class centers are selected to compute the margin-based softmax loss, all class + centers are still maintained throughout the whole training process, but only a subset is + selected and updated in each iteration. + .. note:: + When sample rate equal to 1, Partial FC is equal to model parallelism(default sample rate is 1). + Example: + -------- + >>> module_pfc = PartialFC(embedding_size=512, num_classes=8000000, sample_rate=0.2) + >>> for img, labels in data_loader: + >>> embeddings = net(img) + >>> loss = module_pfc(embeddings, labels) + >>> loss.backward() + >>> optimizer.step() + """ + _version = 2 + + def __init__( + self, + margin_loss: Callable, + embedding_size: int, + num_classes: int, + sample_rate: float = 1.0, + fp16: bool = False, + ): + """ + Paramenters: + ----------- + embedding_size: int + The dimension of embedding, required + num_classes: int + Total number of classes, required + sample_rate: float + The rate of negative centers participating in the calculation, default is 1.0. 
+ """ + super(PartialFC_V2, self).__init__() + assert ( + distributed.is_initialized() + ), "must initialize distributed before create this" + self.rank = distributed.get_rank() + self.world_size = distributed.get_world_size() + + self.dist_cross_entropy = DistCrossEntropy() + self.embedding_size = embedding_size + self.sample_rate: float = sample_rate + self.fp16 = fp16 + self.num_local: int = num_classes // self.world_size + int( + self.rank < num_classes % self.world_size + ) + self.class_start: int = num_classes // self.world_size * self.rank + min( + self.rank, num_classes % self.world_size + ) + self.num_sample: int = int(self.sample_rate * self.num_local) + self.last_batch_size: int = 0 + + self.is_updated: bool = True + self.init_weight_update: bool = True + self.weight = torch.nn.Parameter(torch.normal(0, 0.01, (self.num_local, embedding_size))) + + # margin_loss + if isinstance(margin_loss, Callable): + self.margin_softmax = margin_loss + else: + raise + + def sample(self, labels, index_positive): + """ + This functions will change the value of labels + Parameters: + ----------- + labels: torch.Tensor + pass + index_positive: torch.Tensor + pass + optimizer: torch.optim.Optimizer + pass + """ + with torch.no_grad(): + positive = torch.unique(labels[index_positive], sorted=True).cuda() + if self.num_sample - positive.size(0) >= 0: + perm = torch.rand(size=[self.num_local]).cuda() + perm[positive] = 2.0 + index = torch.topk(perm, k=self.num_sample)[1].cuda() + index = index.sort()[0].cuda() + else: + index = positive + self.weight_index = index + + labels[index_positive] = torch.searchsorted(index, labels[index_positive]) + + return self.weight[self.weight_index] + + def forward( + self, + local_embeddings: torch.Tensor, + local_labels: torch.Tensor, + ): + """ + Parameters: + ---------- + local_embeddings: torch.Tensor + feature embeddings on each GPU(Rank). + local_labels: torch.Tensor + labels on each GPU(Rank). 
+ Returns: + ------- + loss: torch.Tensor + pass + """ + local_labels.squeeze_() + local_labels = local_labels.long() + + batch_size = local_embeddings.size(0) + if self.last_batch_size == 0: + self.last_batch_size = batch_size + assert self.last_batch_size == batch_size, ( + f"last batch size do not equal current batch size: {self.last_batch_size} vs {batch_size}") + + _gather_embeddings = [ + torch.zeros((batch_size, self.embedding_size)).cuda() + for _ in range(self.world_size) + ] + _gather_labels = [ + torch.zeros(batch_size).long().cuda() for _ in range(self.world_size) + ] + _list_embeddings = AllGather(local_embeddings, *_gather_embeddings) + distributed.all_gather(_gather_labels, local_labels) + + embeddings = torch.cat(_list_embeddings) + labels = torch.cat(_gather_labels) + + labels = labels.view(-1, 1) + index_positive = (self.class_start <= labels) & ( + labels < self.class_start + self.num_local + ) + labels[~index_positive] = -1 + labels[index_positive] -= self.class_start + + if self.sample_rate < 1: + weight = self.sample(labels, index_positive) + else: + weight = self.weight + + with torch.cuda.amp.autocast(self.fp16): + norm_embeddings = normalize(embeddings) + norm_weight_activated = normalize(weight) + logits = linear(norm_embeddings, norm_weight_activated) + if self.fp16: + logits = logits.float() + logits = logits.clamp(-1, 1) + + logits = self.margin_softmax(logits, labels) + loss = self.dist_cross_entropy(logits, labels) + return loss + + +class DistCrossEntropyFunc(torch.autograd.Function): + """ + CrossEntropy loss is calculated in parallel, allreduce denominator into single gpu and calculate softmax. 
+ Implemented of ArcFace (https://arxiv.org/pdf/1801.07698v1.pdf): + """ + + @staticmethod + def forward(ctx, logits: torch.Tensor, label: torch.Tensor): + """ """ + batch_size = logits.size(0) + # for numerical stability + max_logits, _ = torch.max(logits, dim=1, keepdim=True) + # local to global + distributed.all_reduce(max_logits, distributed.ReduceOp.MAX) + logits.sub_(max_logits) + logits.exp_() + sum_logits_exp = torch.sum(logits, dim=1, keepdim=True) + # local to global + distributed.all_reduce(sum_logits_exp, distributed.ReduceOp.SUM) + logits.div_(sum_logits_exp) + index = torch.where(label != -1)[0] + # loss + loss = torch.zeros(batch_size, 1, device=logits.device) + loss[index] = logits[index].gather(1, label[index]) + distributed.all_reduce(loss, distributed.ReduceOp.SUM) + ctx.save_for_backward(index, logits, label) + return loss.clamp_min_(1e-30).log_().mean() * (-1) + + @staticmethod + def backward(ctx, loss_gradient): + """ + Args: + loss_grad (torch.Tensor): gradient backward by last layer + Returns: + gradients for each input in forward function + `None` gradients for one-hot label + """ + ( + index, + logits, + label, + ) = ctx.saved_tensors + batch_size = logits.size(0) + one_hot = torch.zeros( + size=[index.size(0), logits.size(1)], device=logits.device + ) + one_hot.scatter_(1, label[index], 1) + logits[index] -= one_hot + logits.div_(batch_size) + return logits * loss_gradient.item(), None + + +class DistCrossEntropy(torch.nn.Module): + def __init__(self): + super(DistCrossEntropy, self).__init__() + + def forward(self, logit_part, label_part): + return DistCrossEntropyFunc.apply(logit_part, label_part) + + +class AllGatherFunc(torch.autograd.Function): + """AllGather op with gradient backward""" + + @staticmethod + def forward(ctx, tensor, *gather_list): + gather_list = list(gather_list) + distributed.all_gather(gather_list, tensor) + return tuple(gather_list) + + @staticmethod + def backward(ctx, *grads): + grad_list = list(grads) + rank = 
distributed.get_rank() + grad_out = grad_list[rank] + + dist_ops = [ + distributed.reduce(grad_out, rank, distributed.ReduceOp.SUM, async_op=True) + if i == rank + else distributed.reduce( + grad_list[i], i, distributed.ReduceOp.SUM, async_op=True + ) + for i in range(distributed.get_world_size()) + ] + for _op in dist_ops: + _op.wait() + + grad_out *= len(grad_list) # cooperate with distributed loss function + return (grad_out, *[None for _ in range(len(grad_list))]) + + +AllGather = AllGatherFunc.apply diff --git a/insightface/recognition/arcface_torch/requirement.txt b/insightface/recognition/arcface_torch/requirement.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1a431ef9c39b258b676411f1081ed9006a8b817 --- /dev/null +++ b/insightface/recognition/arcface_torch/requirement.txt @@ -0,0 +1,6 @@ +tensorboard +easydict +mxnet +onnx +scikit-learn +opencv-python \ No newline at end of file diff --git a/insightface/recognition/arcface_torch/run.sh b/insightface/recognition/arcface_torch/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..6eacdf8e814d7bd68650c7eda8f72687ee74db16 --- /dev/null +++ b/insightface/recognition/arcface_torch/run.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 train_v2.py $@ diff --git a/insightface/recognition/arcface_torch/scripts/shuffle_rec.py b/insightface/recognition/arcface_torch/scripts/shuffle_rec.py new file mode 100644 index 0000000000000000000000000000000000000000..f3b68e938f17aaf98fae269c44119eaef299b1a2 --- /dev/null +++ b/insightface/recognition/arcface_torch/scripts/shuffle_rec.py @@ -0,0 +1,81 @@ +import argparse +import multiprocessing +import os +import time + +import mxnet as mx +import numpy as np + + +def read_worker(args, q_in): + path_imgidx = os.path.join(args.input, "train.idx") + path_imgrec = os.path.join(args.input, "train.rec") + imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "r") + + s = imgrec.read_idx(0) + header, 
_ = mx.recordio.unpack(s) + assert header.flag > 0 + + imgidx = np.array(range(1, int(header.label[0]))) + np.random.shuffle(imgidx) + + for idx in imgidx: + item = imgrec.read_idx(idx) + q_in.put(item) + + q_in.put(None) + imgrec.close() + + +def write_worker(args, q_out): + pre_time = time.time() + + if args.input[-1] == '/': + args.input = args.input[:-1] + dirname = os.path.dirname(args.input) + basename = os.path.basename(args.input) + output = os.path.join(dirname, f"shuffled_{basename}") + os.makedirs(output, exist_ok=True) + + path_imgidx = os.path.join(output, "train.idx") + path_imgrec = os.path.join(output, "train.rec") + save_record = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "w") + more = True + count = 0 + while more: + deq = q_out.get() + if deq is None: + more = False + else: + header, jpeg = mx.recordio.unpack(deq) + # TODO it is currently not fully developed + if isinstance(header.label, float): + label = header.label + else: + label = header.label[0] + + header = mx.recordio.IRHeader(flag=header.flag, label=label, id=header.id, id2=header.id2) + save_record.write_idx(count, mx.recordio.pack(header, jpeg)) + count += 1 + if count % 10000 == 0: + cur_time = time.time() + print('save time:', cur_time - pre_time, ' count:', count) + pre_time = cur_time + print(count) + save_record.close() + + +def main(args): + queue = multiprocessing.Queue(10240) + read_process = multiprocessing.Process(target=read_worker, args=(args, queue)) + read_process.daemon = True + read_process.start() + write_process = multiprocessing.Process(target=write_worker, args=(args, queue)) + write_process.start() + write_process.join() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('input', help='path to source rec.') + main(parser.parse_args()) diff --git a/insightface/recognition/arcface_torch/torch2onnx.py b/insightface/recognition/arcface_torch/torch2onnx.py new file mode 100644 index 
0000000000000000000000000000000000000000..f6055d1fe7d20cbf02812d95b509511c943766de --- /dev/null +++ b/insightface/recognition/arcface_torch/torch2onnx.py @@ -0,0 +1,53 @@ +import numpy as np +import onnx +import torch + + +def convert_onnx(net, path_module, output, opset=11, simplify=False): + assert isinstance(net, torch.nn.Module) + img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.int32) + img = img.astype(float) + img = (img / 255. - 0.5) / 0.5 # torch style norm + img = img.transpose((2, 0, 1)) + img = torch.from_numpy(img).unsqueeze(0).float() + + weight = torch.load(path_module) + net.load_state_dict(weight, strict=True) + net.eval() + torch.onnx.export(net, img, output, input_names=["data"], keep_initializers_as_inputs=False, verbose=False, opset_version=opset) + model = onnx.load(output) + graph = model.graph + graph.input[0].type.tensor_type.shape.dim[0].dim_param = 'None' + if simplify: + from onnxsim import simplify + model, check = simplify(model) + assert check, "Simplified ONNX model could not be validated" + onnx.save(model, output) + + +if __name__ == '__main__': + import os + import argparse + from backbones import get_model + + parser = argparse.ArgumentParser(description='ArcFace PyTorch to onnx') + parser.add_argument('input', type=str, help='input backbone.pth file or path') + parser.add_argument('--output', type=str, default=None, help='output onnx path') + parser.add_argument('--network', type=str, default=None, help='backbone network') + parser.add_argument('--simplify', type=bool, default=False, help='onnx simplify') + args = parser.parse_args() + input_file = args.input + if os.path.isdir(input_file): + input_file = os.path.join(input_file, "model.pt") + assert os.path.exists(input_file) + # model_name = os.path.basename(os.path.dirname(input_file)).lower() + # params = model_name.split("_") + # if len(params) >= 3 and params[1] in ('arcface', 'cosface'): + # if args.network is None: + # args.network = params[2] + assert 
args.network is not None + print(args) + backbone_onnx = get_model(args.network, dropout=0.0, fp16=False, num_features=512) + if args.output is None: + args.output = os.path.join(os.path.dirname(args.input), "model.onnx") + convert_onnx(backbone_onnx, input_file, args.output, simplify=args.simplify) diff --git a/insightface/recognition/arcface_torch/train_v2.py b/insightface/recognition/arcface_torch/train_v2.py new file mode 100755 index 0000000000000000000000000000000000000000..9563c6533208392ce1a81cbfaae4bcde78ade704 --- /dev/null +++ b/insightface/recognition/arcface_torch/train_v2.py @@ -0,0 +1,257 @@ +import argparse +import logging +import os +from datetime import datetime + +import numpy as np +import torch +from backbones import get_model +from dataset import get_dataloader +from losses import CombinedMarginLoss +from lr_scheduler import PolynomialLRWarmup +from partial_fc_v2 import PartialFC_V2 +from torch import distributed +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from utils.utils_callbacks import CallBackLogging, CallBackVerification +from utils.utils_config import get_config +from utils.utils_distributed_sampler import setup_seed +from utils.utils_logging import AverageMeter, init_logging +from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import fp16_compress_hook + +assert torch.__version__ >= "1.12.0", "In order to enjoy the features of the new torch, \ +we have upgraded the torch to 1.12.0. torch before than 1.12.0 may not work in the future." 
+ +try: + rank = int(os.environ["RANK"]) + local_rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + distributed.init_process_group("nccl") +except KeyError: + rank = 0 + local_rank = 0 + world_size = 1 + distributed.init_process_group( + backend="nccl", + init_method="tcp://127.0.0.1:12584", + rank=rank, + world_size=world_size, + ) + + +def main(args): + + # get config + cfg = get_config(args.config) + # global control random seed + setup_seed(seed=cfg.seed, cuda_deterministic=False) + + torch.cuda.set_device(local_rank) + + os.makedirs(cfg.output, exist_ok=True) + init_logging(rank, cfg.output) + + summary_writer = ( + SummaryWriter(log_dir=os.path.join(cfg.output, "tensorboard")) + if rank == 0 + else None + ) + + wandb_logger = None + if cfg.using_wandb: + import wandb + # Sign in to wandb + try: + wandb.login(key=cfg.wandb_key) + except Exception as e: + print("WandB Key must be provided in config file (base.py).") + print(f"Config Error: {e}") + # Initialize wandb + run_name = datetime.now().strftime("%y%m%d_%H%M") + f"_GPU{rank}" + run_name = run_name if cfg.suffix_run_name is None else run_name + f"_{cfg.suffix_run_name}" + try: + wandb_logger = wandb.init( + entity = cfg.wandb_entity, + project = cfg.wandb_project, + sync_tensorboard = True, + resume=cfg.wandb_resume, + name = run_name, + notes = cfg.notes) if rank == 0 or cfg.wandb_log_all else None + if wandb_logger: + wandb_logger.config.update(cfg) + except Exception as e: + print("WandB Data (Entity and Project name) must be provided in config file (base.py).") + print(f"Config Error: {e}") + train_loader = get_dataloader( + cfg.rec, + local_rank, + cfg.batch_size, + cfg.dali, + cfg.dali_aug, + cfg.seed, + cfg.num_workers + ) + + backbone = get_model( + cfg.network, dropout=0.0, fp16=cfg.fp16, num_features=cfg.embedding_size).cuda() + + backbone = torch.nn.parallel.DistributedDataParallel( + module=backbone, broadcast_buffers=False, device_ids=[local_rank], 
bucket_cap_mb=16, + find_unused_parameters=True) + backbone.register_comm_hook(None, fp16_compress_hook) + + backbone.train() + # FIXME using gradient checkpoint if there are some unused parameters will cause error + backbone._set_static_graph() + + margin_loss = CombinedMarginLoss( + 64, + cfg.margin_list[0], + cfg.margin_list[1], + cfg.margin_list[2], + cfg.interclass_filtering_threshold + ) + + if cfg.optimizer == "sgd": + module_partial_fc = PartialFC_V2( + margin_loss, cfg.embedding_size, cfg.num_classes, + cfg.sample_rate, False) + module_partial_fc.train().cuda() + # TODO the params of partial fc must be last in the params list + opt = torch.optim.SGD( + params=[{"params": backbone.parameters()}, {"params": module_partial_fc.parameters()}], + lr=cfg.lr, momentum=0.9, weight_decay=cfg.weight_decay) + + elif cfg.optimizer == "adamw": + module_partial_fc = PartialFC_V2( + margin_loss, cfg.embedding_size, cfg.num_classes, + cfg.sample_rate, False) + module_partial_fc.train().cuda() + opt = torch.optim.AdamW( + params=[{"params": backbone.parameters()}, {"params": module_partial_fc.parameters()}], + lr=cfg.lr, weight_decay=cfg.weight_decay) + else: + raise + + cfg.total_batch_size = cfg.batch_size * world_size + cfg.warmup_step = cfg.num_image // cfg.total_batch_size * cfg.warmup_epoch + cfg.total_step = cfg.num_image // cfg.total_batch_size * cfg.num_epoch + + lr_scheduler = PolynomialLRWarmup( + optimizer=opt, + warmup_iters=cfg.warmup_step, + total_iters=cfg.total_step) + + start_epoch = 0 + global_step = 0 + if cfg.resume: + dict_checkpoint = torch.load(os.path.join(cfg.output, f"checkpoint_gpu_{rank}.pt")) + start_epoch = dict_checkpoint["epoch"] + global_step = dict_checkpoint["global_step"] + backbone.module.load_state_dict(dict_checkpoint["state_dict_backbone"]) + module_partial_fc.load_state_dict(dict_checkpoint["state_dict_softmax_fc"]) + opt.load_state_dict(dict_checkpoint["state_optimizer"]) + 
lr_scheduler.load_state_dict(dict_checkpoint["state_lr_scheduler"]) + del dict_checkpoint + + for key, value in cfg.items(): + num_space = 25 - len(key) + logging.info(": " + key + " " * num_space + str(value)) + + callback_verification = CallBackVerification( + val_targets=cfg.val_targets, rec_prefix=cfg.rec, + summary_writer=summary_writer, wandb_logger = wandb_logger + ) + callback_logging = CallBackLogging( + frequent=cfg.frequent, + total_step=cfg.total_step, + batch_size=cfg.batch_size, + start_step = global_step, + writer=summary_writer + ) + + loss_am = AverageMeter() + amp = torch.cuda.amp.grad_scaler.GradScaler(growth_interval=100) + + for epoch in range(start_epoch, cfg.num_epoch): + + if isinstance(train_loader, DataLoader): + train_loader.sampler.set_epoch(epoch) + for _, (img, local_labels) in enumerate(train_loader): + global_step += 1 + local_embeddings = backbone(img) + loss: torch.Tensor = module_partial_fc(local_embeddings, local_labels) + + if cfg.fp16: + amp.scale(loss).backward() + if global_step % cfg.gradient_acc == 0: + amp.unscale_(opt) + torch.nn.utils.clip_grad_norm_(backbone.parameters(), 5) + amp.step(opt) + amp.update() + opt.zero_grad() + else: + loss.backward() + if global_step % cfg.gradient_acc == 0: + torch.nn.utils.clip_grad_norm_(backbone.parameters(), 5) + opt.step() + opt.zero_grad() + lr_scheduler.step() + + with torch.no_grad(): + if wandb_logger: + wandb_logger.log({ + 'Loss/Step Loss': loss.item(), + 'Loss/Train Loss': loss_am.avg, + 'Process/Step': global_step, + 'Process/Epoch': epoch + }) + + loss_am.update(loss.item(), 1) + callback_logging(global_step, loss_am, epoch, cfg.fp16, lr_scheduler.get_last_lr()[0], amp) + + if global_step % cfg.verbose == 0 and global_step > 0: + callback_verification(global_step, backbone) + + if cfg.save_all_states: + checkpoint = { + "epoch": epoch + 1, + "global_step": global_step, + "state_dict_backbone": backbone.module.state_dict(), + "state_dict_softmax_fc": 
module_partial_fc.state_dict(), + "state_optimizer": opt.state_dict(), + "state_lr_scheduler": lr_scheduler.state_dict() + } + torch.save(checkpoint, os.path.join(cfg.output, f"checkpoint_gpu_{rank}.pt")) + + if rank == 0: + path_module = os.path.join(cfg.output, "model.pt") + torch.save(backbone.module.state_dict(), path_module) + + if wandb_logger and cfg.save_artifacts: + artifact_name = f"{run_name}_E{epoch}" + model = wandb.Artifact(artifact_name, type='model') + model.add_file(path_module) + wandb_logger.log_artifact(model) + + if cfg.dali: + train_loader.reset() + + if rank == 0: + path_module = os.path.join(cfg.output, "model.pt") + torch.save(backbone.module.state_dict(), path_module) + + if wandb_logger and cfg.save_artifacts: + artifact_name = f"{run_name}_Final" + model = wandb.Artifact(artifact_name, type='model') + model.add_file(path_module) + wandb_logger.log_artifact(model) + + + +if __name__ == "__main__": + torch.backends.cudnn.benchmark = True + parser = argparse.ArgumentParser( + description="Distributed Arcface Training in Pytorch") + parser.add_argument("config", type=str, help="py config file") + main(parser.parse_args()) diff --git a/insightface/recognition/arcface_torch/utils/__init__.py b/insightface/recognition/arcface_torch/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/arcface_torch/utils/plot.py b/insightface/recognition/arcface_torch/utils/plot.py new file mode 100644 index 0000000000000000000000000000000000000000..7f1d39da640769baae524709937aad192b71bd30 --- /dev/null +++ b/insightface/recognition/arcface_torch/utils/plot.py @@ -0,0 +1,71 @@ +import os +import sys + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap +from prettytable import PrettyTable +from sklearn.metrics import roc_curve, auc + +with open(sys.argv[1], 
"r") as f: + files = f.readlines() + +files = [x.strip() for x in files] +image_path = "/train_tmp/IJB_release/IJBC" + + +def read_template_pair_list(path): + pairs = pd.read_csv(path, sep=' ', header=None).values + t1 = pairs[:, 0].astype(int) + t2 = pairs[:, 1].astype(int) + label = pairs[:, 2].astype(int) + return t1, t2, label + + +p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % image_path, + '%s_template_pair_label.txt' % 'ijbc')) + +methods = [] +scores = [] +for file in files: + methods.append(file) + scores.append(np.load(file)) + +methods = np.array(methods) +scores = dict(zip(methods, scores)) +colours = dict( + zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2'))) +x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] +tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels]) +fig = plt.figure() +for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + roc_auc = auc(fpr, tpr) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) # select largest tpr at same fpr + plt.plot(fpr, + tpr, + color=colours[method], + lw=1, + label=('[%s (AUC = %0.4f %%)]' % + (method.split('-')[-1], roc_auc * 100))) + tpr_fpr_row = [] + tpr_fpr_row.append(method) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) +plt.xlim([10 ** -6, 0.1]) +plt.ylim([0.3, 1.0]) +plt.grid(linestyle='--', linewidth=1) +plt.xticks(x_labels) +plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True)) +plt.xscale('log') +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('ROC on IJB') +plt.legend(loc="lower right") +print(tpr_fpr_table) diff --git a/insightface/recognition/arcface_torch/utils/utils_callbacks.py b/insightface/recognition/arcface_torch/utils/utils_callbacks.py new file mode 100755 index 
0000000000000000000000000000000000000000..d9368073f8bc091b28e7325a9099881dfc5f54cd --- /dev/null +++ b/insightface/recognition/arcface_torch/utils/utils_callbacks.py @@ -0,0 +1,125 @@ +import logging +import os +import time +from typing import List + +import torch + +from eval import verification +from utils.utils_logging import AverageMeter +from torch.utils.tensorboard import SummaryWriter +from torch import distributed + + +class CallBackVerification(object): + + def __init__(self, val_targets, rec_prefix, summary_writer=None, image_size=(112, 112), wandb_logger=None): + self.rank: int = distributed.get_rank() + self.highest_acc: float = 0.0 + self.highest_acc_list: List[float] = [0.0] * len(val_targets) + self.ver_list: List[object] = [] + self.ver_name_list: List[str] = [] + if self.rank == 0: + self.init_dataset(val_targets=val_targets, data_dir=rec_prefix, image_size=image_size) + + self.summary_writer = summary_writer + self.wandb_logger = wandb_logger + + def ver_test(self, backbone: torch.nn.Module, global_step: int): + results = [] + for i in range(len(self.ver_list)): + acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test( + self.ver_list[i], backbone, 10, 10) + logging.info('[%s][%d]XNorm: %f' % (self.ver_name_list[i], global_step, xnorm)) + logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (self.ver_name_list[i], global_step, acc2, std2)) + + self.summary_writer: SummaryWriter + self.summary_writer.add_scalar(tag=self.ver_name_list[i], scalar_value=acc2, global_step=global_step, ) + if self.wandb_logger: + import wandb + self.wandb_logger.log({ + f'Acc/val-Acc1 {self.ver_name_list[i]}': acc1, + f'Acc/val-Acc2 {self.ver_name_list[i]}': acc2, + # f'Acc/val-std1 {self.ver_name_list[i]}': std1, + # f'Acc/val-std2 {self.ver_name_list[i]}': acc2, + }) + + if acc2 > self.highest_acc_list[i]: + self.highest_acc_list[i] = acc2 + logging.info( + '[%s][%d]Accuracy-Highest: %1.5f' % (self.ver_name_list[i], global_step, self.highest_acc_list[i])) 
+ results.append(acc2) + + def init_dataset(self, val_targets, data_dir, image_size): + for name in val_targets: + path = os.path.join(data_dir, name + ".bin") + if os.path.exists(path): + data_set = verification.load_bin(path, image_size) + self.ver_list.append(data_set) + self.ver_name_list.append(name) + + def __call__(self, num_update, backbone: torch.nn.Module): + if self.rank == 0 and num_update > 0: + backbone.eval() + self.ver_test(backbone, num_update) + backbone.train() + + +class CallBackLogging(object): + def __init__(self, frequent, total_step, batch_size, start_step=0,writer=None): + self.frequent: int = frequent + self.rank: int = distributed.get_rank() + self.world_size: int = distributed.get_world_size() + self.time_start = time.time() + self.total_step: int = total_step + self.start_step: int = start_step + self.batch_size: int = batch_size + self.writer = writer + + self.init = False + self.tic = 0 + + def __call__(self, + global_step: int, + loss: AverageMeter, + epoch: int, + fp16: bool, + learning_rate: float, + grad_scaler: torch.cuda.amp.GradScaler): + if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0: + if self.init: + try: + speed: float = self.frequent * self.batch_size / (time.time() - self.tic) + speed_total = speed * self.world_size + except ZeroDivisionError: + speed_total = float('inf') + + #time_now = (time.time() - self.time_start) / 3600 + #time_total = time_now / ((global_step + 1) / self.total_step) + #time_for_end = time_total - time_now + time_now = time.time() + time_sec = int(time_now - self.time_start) + time_sec_avg = time_sec / (global_step - self.start_step + 1) + eta_sec = time_sec_avg * (self.total_step - global_step - 1) + time_for_end = eta_sec/3600 + if self.writer is not None: + self.writer.add_scalar('time_for_end', time_for_end, global_step) + self.writer.add_scalar('learning_rate', learning_rate, global_step) + self.writer.add_scalar('loss', loss.avg, global_step) + if fp16: + msg = 
"Speed %.2f samples/sec Loss %.4f LearningRate %.6f Epoch: %d Global Step: %d " \ + "Fp16 Grad Scale: %2.f Required: %1.f hours" % ( + speed_total, loss.avg, learning_rate, epoch, global_step, + grad_scaler.get_scale(), time_for_end + ) + else: + msg = "Speed %.2f samples/sec Loss %.4f LearningRate %.6f Epoch: %d Global Step: %d " \ + "Required: %1.f hours" % ( + speed_total, loss.avg, learning_rate, epoch, global_step, time_for_end + ) + logging.info(msg) + loss.reset() + self.tic = time.time() + else: + self.init = True + self.tic = time.time() diff --git a/insightface/recognition/arcface_torch/utils/utils_config.py b/insightface/recognition/arcface_torch/utils/utils_config.py new file mode 100644 index 0000000000000000000000000000000000000000..0c02eaf70fc0140aca7925f621c29a496f491cae --- /dev/null +++ b/insightface/recognition/arcface_torch/utils/utils_config.py @@ -0,0 +1,16 @@ +import importlib +import os.path as osp + + +def get_config(config_file): + assert config_file.startswith('configs/'), 'config file setting must start with configs/' + temp_config_name = osp.basename(config_file) + temp_module_name = osp.splitext(temp_config_name)[0] + config = importlib.import_module("configs.base") + cfg = config.config + config = importlib.import_module("configs.%s" % temp_module_name) + job_cfg = config.config + cfg.update(job_cfg) + if cfg.output is None: + cfg.output = osp.join('work_dirs', temp_module_name) + return cfg \ No newline at end of file diff --git a/insightface/recognition/arcface_torch/utils/utils_distributed_sampler.py b/insightface/recognition/arcface_torch/utils/utils_distributed_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..cea6703965bff81f8b789ffd933f9b2f889cb680 --- /dev/null +++ b/insightface/recognition/arcface_torch/utils/utils_distributed_sampler.py @@ -0,0 +1,126 @@ +import math +import os +import random + +import numpy as np +import torch +import torch.distributed as dist +from torch.utils.data import 
DistributedSampler as _DistributedSampler + + +def setup_seed(seed, cuda_deterministic=True): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + os.environ["PYTHONHASHSEED"] = str(seed) + if cuda_deterministic: # slower, more reproducible + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + else: # faster, less reproducible + torch.backends.cudnn.deterministic = False + torch.backends.cudnn.benchmark = True + + +def worker_init_fn(worker_id, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + torch.manual_seed(worker_seed) + + +def get_dist_info(): + if dist.is_available() and dist.is_initialized(): + rank = dist.get_rank() + world_size = dist.get_world_size() + else: + rank = 0 + world_size = 1 + + return rank, world_size + + +def sync_random_seed(seed=None, device="cuda"): + """Make sure different ranks share the same seed. + All workers must call this function, otherwise it will deadlock. + This method is generally used in `DistributedSampler`, + because the seed should be identical across all processes + in the distributed group. + In distributed sampling, different ranks should sample non-overlapped + data in the dataset. Therefore, this function is used to make sure that + each rank shuffles the data indices in the same order based + on the same seed. Then different ranks could use different indices + to select non-overlapped data from the same data list. + Args: + seed (int, Optional): The seed. Default to None. + device (str): The device where the seed will be put on. + Default to 'cuda'. + Returns: + int: Seed to be used. 
+ """ + if seed is None: + seed = np.random.randint(2**31) + assert isinstance(seed, int) + + rank, world_size = get_dist_info() + + if world_size == 1: + return seed + + if rank == 0: + random_num = torch.tensor(seed, dtype=torch.int32, device=device) + else: + random_num = torch.tensor(0, dtype=torch.int32, device=device) + + dist.broadcast(random_num, src=0) + + return random_num.item() + + +class DistributedSampler(_DistributedSampler): + def __init__( + self, + dataset, + num_replicas=None, # world_size + rank=None, # local_rank + shuffle=True, + seed=0, + ): + + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. + self.seed = sync_random_seed(seed) + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + g = torch.Generator() + # When :attr:`shuffle=True`, this ensures all replicas + # use a different random ordering for each epoch. + # Otherwise, the next iteration of this sampler will + # yield the same ordering. 
+ g.manual_seed(self.epoch + self.seed) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + # in case that indices is shorter than half of total_size + indices = (indices * math.ceil(self.total_size / len(indices)))[ + : self.total_size + ] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) diff --git a/insightface/recognition/arcface_torch/utils/utils_logging.py b/insightface/recognition/arcface_torch/utils/utils_logging.py new file mode 100644 index 0000000000000000000000000000000000000000..c787b6aae7cd037a4718df44d672b8ffa9e5c249 --- /dev/null +++ b/insightface/recognition/arcface_torch/utils/utils_logging.py @@ -0,0 +1,41 @@ +import logging +import os +import sys + + +class AverageMeter(object): + """Computes and stores the average and current value + """ + + def __init__(self): + self.val = None + self.avg = None + self.sum = None + self.count = None + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def init_logging(rank, models_root): + if rank == 0: + log_root = logging.getLogger() + log_root.setLevel(logging.INFO) + formatter = logging.Formatter("Training: %(asctime)s-%(message)s") + handler_file = logging.FileHandler(os.path.join(models_root, "training.log")) + handler_stream = logging.StreamHandler(sys.stdout) + handler_file.setFormatter(formatter) + handler_stream.setFormatter(formatter) + log_root.addHandler(handler_file) + log_root.addHandler(handler_stream) + log_root.info('rank_id: %d' % rank) diff --git a/insightface/recognition/idmmd/README.md b/insightface/recognition/idmmd/README.md new file mode 100644 
index 0000000000000000000000000000000000000000..4fce9b9103977288845415d8649694df5f6b5545 --- /dev/null +++ b/insightface/recognition/idmmd/README.md @@ -0,0 +1,55 @@ +# Physical-based Rendering for NIR-VIS Face Recognition + +by [Yunqi Miao*](https://yoqim.github.io/), [Alexandros Lattas*](https://alexlattas.com/), [Jiankang Deng](https://jiankangdeng.github.io/), [Jungong Han](https://jungonghan.github.io/), and [Stefanos Zafeiriou](https://wp.doc.ic.ac.uk/szafeiri/). + + +For more information, please check our + +**[[Arxiv]](https://arxiv.org/abs/2211.06408)** +**[[NeurIPS 2022 Paper]](https://openreview.net/pdf?id=NN_TpS5dpo5)** + + +If you find this project useful in your research, please consider citing: + +``` +@inproceedings{miao2022physically, + title={Physically-Based Face Rendering for NIR-VIS Face Recognition}, + author={Miao, Yunqi and Lattas, Alexandros and Deng, Jiankang and Han, Jungong and Zafeiriou, Stefanos}, + booktitle={NeurIPS 2022}, + year={2022} +} +``` + +# Overview +![poster](https://github.com/yoqim/PR-HFR/blob/main/pics/Poster.png) + +# Training + +For this project, we used python 3.7.10. + +## How to run? + +```shell +sh run.sh +``` + + +# Testing +## Preparation +- Downloading data (112 x 112) from [[Google drive]](https://drive.google.com/file/d/1Smd-Bdwj4tCbNugmoa66vxnJAU613bCo/view?usp=sharing) + - Put data to `data/$dataset_name` + - Copy data list from [here](https://github.com/yoqim/PR-HFR/tree/main/data) + +>Note that: casia(fold_1) is provided for research purposes only. For the rest data, please refer to the original publications. + + +- Downloading models from [[Google drive]](https://drive.google.com/file/d/1XjlnvbXmRD5xLJo7lLTy8LyQbMYRoz8C/view?usp=sharing) + - Put pretrain model at `models/pretrain/` + - Put finetune model at `models/finetune/$dataset/` + + +## How to run? 
+ +```shell +sh eval.sh +``` diff --git a/insightface/recognition/idmmd/dataset_mix.py b/insightface/recognition/idmmd/dataset_mix.py new file mode 100644 index 0000000000000000000000000000000000000000..9f6ebb73ab7a28f83ae5ebaf544ddeec717b0ffb --- /dev/null +++ b/insightface/recognition/idmmd/dataset_mix.py @@ -0,0 +1,348 @@ +from PIL import Image +import os +import torch.utils.data as data +import torchvision.transforms as transforms + + +class Real_Dataset_112(data.Dataset): + def __init__(self, args): + super(Real_Dataset_112, self).__init__() + + self.img_root = args.img_root_R + self.img_list, self.num_classes = self.list_reader(args.train_list_R) + self.input_mode = args.input_mode + + self.transform = transforms.Compose([ + # transforms.RandomCrop(112), + transforms.ToTensor() + ]) + + def __getitem__(self, index): + img_name, label = self.img_list[index] + + img = self.get_img_from_path(img_name) + return {'img': img, 'label': int(label)} + + def __len__(self): + return len(self.img_list) + + def get_img_from_path(self, img_name): + img_path = os.path.join(self.img_root, img_name) + + if self.input_mode == 'grey': + img = Image.open(img_path).convert('L') + elif self.input_mode == 'red': + img = Image.open(img_path) + img = img.split()[0] + + img = self.transform(img) + return img + + def list_reader(self, list_file): + img_list = [] + with open(list_file, 'r') as f: + lines = f.readlines() + + pid_container = set() + for line in lines: + pid = int(line.strip().split(' ')[1]) + pid_container.add(pid) + pid2label = {pid:label for label, pid in enumerate(pid_container)} + + for line in lines: + img_name, pid = line.strip().split(' ') + if not os.path.exists(os.path.join(self.img_root, img_name)): + continue + label = pid2label[int(pid)] + img_list.append((img_name, label)) + + return img_list, len(pid_container) + + +class Real_Dataset_112_paired(data.Dataset): + def __init__(self, args): + super(Real_Dataset_112_paired, self).__init__() + + self.img_root = 
args.img_root_R + self.img_list, self.num_classes = self.list_reader(args.train_list_R) + self.input_mode = args.input_mode + + self.transform = transforms.Compose([ + # transforms.RandomCrop(112), + transforms.ToTensor() + ]) + + self.vir_list = [(a,b,c) for (a,b,c) in self.img_list if c==0] + self.nir_list = [(a,b,c) for (a,b,c) in self.img_list if c==1] + + self.vis_labels = np.array([p[1] for p in self.vir_list]) + self.nir_labels = np.array([p[1] for p in self.nir_list]) + + self.visIndex = None + self.nirIndex = None + + def __getitem__(self, index): + vis_img_name, vis_label, vis_domain = self.vir_list[self.visIndex[index]] + nir_img_name, nir_label, nir_domain = self.nir_list[self.nirIndex[index]] + + assert vis_domain == 0 and nir_domain == 1 + + vis_img = self.get_img_from_path(vis_img_name) + nir_img = self.get_img_from_path(nir_img_name) + + return vis_img, nir_img, vis_label, nir_label + + def __len__(self): + return len(self.img_list) + + def get_img_from_path(self, img_name): + img_path = os.path.join(self.img_root, img_name) + + if self.input_mode == 'grey': + img = Image.open(img_path).convert('L') + elif self.input_mode == 'red': + img = Image.open(img_path) + img = img.split()[0] + + img = self.transform(img) + return img + + + def list_reader(self, list_file): + img_list = [] + with open(list_file, 'r') as f: + lines = f.readlines() + + pid_container = set() + for line in lines: + pid = int(line.strip().split(' ')[1]) + pid_container.add(pid) + pid2label = {pid:label for label, pid in enumerate(pid_container)} + + for line in lines: + img_name, pid = line.strip().split(' ') + label = pid2label[int(pid)] + + domain = 0 if 'VIS' in img_name else 1 + img_list.append((img_name, label, domain)) + + return img_list, len(pid_container) + +class Mix_Dataset_112(data.Dataset): + def __init__(self, args): + super(Mix_Dataset_112, self).__init__() + + self.img_root_R = args.img_root_R + self.img_root_F = args.img_root_F + self.img_list, self.num_classes = 
self.list_reader(args.train_list_R, args.train_list_F) + self.input_mode = args.input_mode + + self.transform = transforms.Compose([ + # transforms.RandomCrop(112), + transforms.ToTensor() + ]) + + def __getitem__(self, index): + img_path, label = self.img_list[index] + + img = self.get_img_from_path(img_path) + return {'img': img, 'label': int(label)} + + def __len__(self): + return len(self.img_list) + + def get_img_from_path(self, img_path): + + if self.input_mode == 'grey': + img = Image.open(img_path).convert('L') + elif self.input_mode == 'red': + img = Image.open(img_path) + img = img.split()[0] + + img = self.transform(img) + return img + + def list_reader(self, list_file_real, list_file_fake): + with open(list_file_real, 'r') as f: + lines_real = f.readlines() + with open(list_file_fake, 'r') as f: + lines_fake = f.readlines() + + fake_label_start = max([int(l.strip().split(' ')[-1]) for l in lines_real]) + 1 + lines_fake = ["{} {}".format(l.strip().split(' ')[0], int(l.strip().split(' ')[1]) + fake_label_start) for l in lines_fake] + + lines = lines_real + lines_fake + + pid_container = set() + for line in lines: + pid = int(line.strip().split(' ')[1]) + pid_container.add(pid) + pid2label = {pid:label for label, pid in enumerate(pid_container)} + + img_list_R = [] + for line in lines_real: + img_name, pid = line.strip().split(' ') + label = pid2label[int(pid)] + img_list_R.append((os.path.join(self.img_root_R + img_name), label)) + + img_list_F = [] + for line in lines_fake: + img_name, pid = line.strip().split(' ') + label = pid2label[int(pid)] + img_list_F.append((os.path.join(self.img_root_F + img_name), label)) + + img_list = img_list_R + img_list_F + + return img_list, len(pid_container) + + +class Mix_Dataset_112_paired(data.Dataset): + def __init__(self, args): + super(Mix_Dataset_112_paired, self).__init__() + + self.img_root_R = args.img_root_R + self.img_root_F = args.img_root_F + self.img_list, self.num_classes = 
self.list_reader(args.train_list_R, args.train_list_F) + self.input_mode = args.input_mode + + self.transform = transforms.Compose([ + # transforms.RandomCrop(112), + transforms.ToTensor() + ]) + + self.vir_list = [(a,b,c) for (a,b,c) in self.img_list if c==0] + self.nir_list = [(a,b,c) for (a,b,c) in self.img_list if c==1] + + self.vis_labels = np.array([p[1] for p in self.vir_list]) + self.nir_labels = np.array([p[1] for p in self.nir_list]) + + self.visIndex = None + self.nirIndex = None + + def __getitem__(self, index): + vis_img_name, vis_label, vis_domain = self.vir_list[self.visIndex[index]] + nir_img_name, nir_label, nir_domain = self.nir_list[self.nirIndex[index]] + + assert vis_domain == 0 and nir_domain == 1 + + vis_img = self.get_img_from_path(vis_img_name) + nir_img = self.get_img_from_path(nir_img_name) + + return vis_img, nir_img, vis_label, nir_label + + def __len__(self): + return len(self.img_list) + + def get_img_from_path(self, img_path): + if self.input_mode == 'grey': + img = Image.open(img_path).convert('L') + elif self.input_mode == 'red': + img = Image.open(img_path) + img = img.split()[0] + + img = self.transform(img) + return img + + + def list_reader(self, list_file_real, list_file_fake): + with open(list_file_real, 'r') as f: + lines_real = f.readlines() + with open(list_file_fake, 'r') as f: + lines_fake = f.readlines() + + fake_label_start = max([int(l.strip().split(' ')[-1]) for l in lines_real]) + 1 + lines_fake = ["{} {}".format(l.strip().split(' ')[0], int(l.strip().split(' ')[1]) + fake_label_start) for l in lines_fake] + + lines = lines_real + lines_fake + + pid_container = set() + for line in lines: + pid = int(line.strip().split(' ')[1]) + pid_container.add(pid) + pid2label = {pid:label for label, pid in enumerate(pid_container)} + + img_list_R = [] + for line in lines_real: + img_name, pid = line.strip().split(' ') + label = pid2label[int(pid)] + domain = 0 if 'VIS' in img_name else 1 + 
img_list_R.append((os.path.join(self.img_root_R + img_name), label, domain)) + + img_list_F = [] + for line in lines_fake: + img_name, pid = line.strip().split(' ') + label = pid2label[int(pid)] + + # if label in [8192,1984,2110,6344,8566,8589,9362]: # only with single image pair + # print(img_name) + domain = 0 if 'VIS' in img_name else 1 + img_list_F.append((os.path.join(self.img_root_F + img_name), label, domain)) + + img_list = img_list_R + img_list_F + + return img_list, len(pid_container) + + +from torch.utils.data.sampler import Sampler +import numpy as np + +def GenIdx(train_vis_label, train_nir_label): + def get_idx_from_label(train_label): + pos = [] + unique_train_label = np.unique(train_label) + for ul in unique_train_label: + tmp = np.argwhere(train_label == ul).squeeze().tolist() + if isinstance(tmp,int): + tmp = [tmp] + pos.append(tmp) + return pos + + vis_pos = get_idx_from_label(train_vis_label) + nir_pos = get_idx_from_label(train_nir_label) + + return vis_pos, nir_pos + + +class IdentitySampler(Sampler): + """Sample person identities evenly in each batch. 
+ Args: + train_color_label, train_thermal_label: labels of two modalities + color_pos, thermal_pos: positions of each identity + batchSize: batch size + """ + + def __init__(self, train_color_label, train_thermal_label, color_pos, thermal_pos, batchSize, num_img_per_id = 4): + uni_label = np.unique(train_color_label) + self.n_classes = len(uni_label) + + sample_color = np.arange(batchSize) + sample_thermal = np.arange(batchSize) + N = np.maximum(len(train_color_label), len(train_thermal_label)) + + num_id_per_batch = batchSize / num_img_per_id + + for j in range(N//batchSize+1): + batch_idx = np.random.choice(uni_label, int(num_id_per_batch), replace=False) + + for s, i in enumerate(range(0, batchSize, num_img_per_id)): + sample_flag = True if len(color_pos[batch_idx[s]]) < num_img_per_id or len(thermal_pos[batch_idx[s]]) < num_img_per_id else False + + sample_color[i:i+num_img_per_id] = np.random.choice(color_pos[batch_idx[s]], num_img_per_id, replace=sample_flag) + sample_thermal[i:i+num_img_per_id] = np.random.choice(thermal_pos[batch_idx[s]], num_img_per_id, replace=sample_flag) + + if j ==0: + index1= sample_color + index2= sample_thermal + else: + index1 = np.hstack((index1, sample_color)) + index2 = np.hstack((index2, sample_thermal)) + + self.visIndex = index1 + self.nirIndex = index2 + self.N = N + + def __iter__(self): + return iter(np.arange(len(self.visIndex))) + + def __len__(self): + return self.N \ No newline at end of file diff --git a/insightface/recognition/idmmd/eval.sh b/insightface/recognition/idmmd/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..49f438731f3f4e9c7ff84a07c934b4fede12c929 --- /dev/null +++ b/insightface/recognition/idmmd/eval.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +#### Parameters +# datasets options: "casia", "lamp", "buaa", "oulu" +# test_fold_id = -1 if testing 10 fold (casia & lamp) else test_fold_id = i (fold id) +# test_mode: "pretrain" or "finetune" + +dataset='oulu' +# img_root="path to data 
folder" +img_root="/storage/local/local/Oulu_CASIA_NIR_VIS/crops112_3/" +input_mode='grey' +model_mode='29' +test_mode='pretrain' +test_fold_id=-1 +model_name='L29.pth.tar' # pretrain model +# model_name=$dataset'_fold'$test_fold_id'_final.pth.tar' # finetune: 'casia_fold1_final.pth.tar' + + +CUDA_VISIBLE_DEVICES=6 python ./evaluate/eval_${dataset}_112.py --test_fold_id $test_fold_id --input_mode $input_mode --model_mode $model_mode --model_name $model_name --img_root $img_root --test_mode $test_mode | tee test.log \ No newline at end of file diff --git a/insightface/recognition/idmmd/evaluate/__init__.py b/insightface/recognition/idmmd/evaluate/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0ee108bc829c034e095f80fa8935422e95d425d5 --- /dev/null +++ b/insightface/recognition/idmmd/evaluate/__init__.py @@ -0,0 +1,2 @@ + +from evaluate.eval_ops import * diff --git a/insightface/recognition/idmmd/evaluate/eval_buaa_112.py b/insightface/recognition/idmmd/evaluate/eval_buaa_112.py new file mode 100644 index 0000000000000000000000000000000000000000..4800a7826925a9f2a29aead9fe91548174993d01 --- /dev/null +++ b/insightface/recognition/idmmd/evaluate/eval_buaa_112.py @@ -0,0 +1,135 @@ +import numpy as np +import os,sys +sys.path.append(os.getcwd()) +import argparse +import torch + +from PIL import Image +from network.lightcnn112 import LightCNN_29Layers +from evaluate import evaluate2 + +fars = [10 ** -4, 10 ** -3, 10 ** -2] + +parser = argparse.ArgumentParser() +parser.add_argument('--test_fold_id', default=1, type=int) +parser.add_argument('--input_mode', default='grey', choices=['grey'], type=str) +parser.add_argument('--model_mode', default='29', choices=['29'], type=str) +parser.add_argument('--model_name', default='', type=str) +parser.add_argument('--img_root', default='', type=str) +parser.add_argument('--test_mode', default='pretrain', type=str) + +args = parser.parse_args() + +INPUT_MODE = args.input_mode +MODEL_MODE = 
args.model_mode +model_name = args.model_name +test_mode = args.test_mode +img_root = args.img_root + +num_classes = 725 +test_list_dir = './data/buaa/' +model_dir = f'./models/{test_mode}/' +model_path = os.path.join(model_dir, model_name) + +def load_model(model, pretrained): + weights = torch.load(pretrained) + weights = weights['state_dict'] + + model_dict = model.state_dict() + + weights = {k.replace('module.',''): v for k, v in weights.items() if k.replace('module.','') in model_dict.keys() and 'fc2' not in k} + print("==> len of weights to be loaded: {}. \n".format(len(weights))) + model.load_state_dict(weights, strict=False) + model.eval() + + +def get_vis_nir_info(): + + vis = np.loadtxt(test_list_dir + 'test_vis_paths.txt', dtype=str) + vis_labels = [int(s.split('/')[0]) for s in vis] + vis = [(p,l) for (p,l) in zip(vis, vis_labels)] + + nir = np.loadtxt(test_list_dir + 'test_nir_paths.txt', dtype=str) + nir_labels = [int(s.split('/')[0]) for s in nir] + nir = [(p,l) for (p,l) in zip(nir, nir_labels)] + + return vis,nir + + +class Embedding: + def __init__(self, root, model): + self.model = model + self.root = root + + self.image_size = (112, 112) + self.batch_size = 1 + + def get(self, img): + img_flip = np.fliplr(img) + img = np.transpose(img, (2, 0, 1)) # 1*112*112 + img_flip = np.transpose(img_flip, (2, 0, 1)) + input_blob = np.zeros((2, 1, self.image_size[1], self.image_size[0]), + dtype=np.uint8) + input_blob[0] = img + input_blob[1] = img_flip + return input_blob + + @torch.no_grad() + def forward_db(self, batch_data): + imgs = torch.Tensor(batch_data).cuda() + imgs.div_(255) + feat = self.model(imgs) + feat = feat.reshape([self.batch_size, 2 * feat.shape[1]]) + return feat.cpu().numpy() + + def extract_feats_labels(self, data_list): + img_feats = [] + pids = [] + for (imgPath, pid) in data_list: + + img = Image.open(os.path.join(self.root, imgPath)).convert('L') + img = np.array(img) + img = img[..., np.newaxis] + + 
img_feats.append(self.forward_db(self.get(img)).flatten()) + pids.append(pid) + + img_feats = np.array(img_feats).astype(np.float32) + img_input_feats = img_feats[:, 0:img_feats.shape[1] //2] + img_feats[:, img_feats.shape[1] // 2:] + img_input_feats = img_input_feats / np.sqrt(np.sum(img_input_feats ** 2, -1, keepdims=True)) + + pids = np.array(pids) + + return img_input_feats, pids + + + +if MODEL_MODE == '29': + model = LightCNN_29Layers(num_classes=num_classes) +model.cuda() + +embedding = Embedding(img_root, model) + +if not os.path.exists(model_path): + print("cannot find model ",model_path) + sys.exit() + +load_model(embedding.model, model_path) +vis, nir= get_vis_nir_info() + +feat_vis, label_vis = embedding.extract_feats_labels(vis) +feat_nir, label_nir = embedding.extract_feats_labels(nir) + +labels = np.equal.outer(label_vis, label_nir).astype(np.float32) +print("*" * 16) +print("INPUT_MODE: ", INPUT_MODE) +print("MODEL_MODE: ", MODEL_MODE) +print("model name: ", model_name) +print("*" * 16) +print("[query] feat_nir.shape ",feat_nir.shape) +print("[gallery] feat_vis.shape ",feat_vis.shape) +print("*" * 16) + +acc, tarfar = evaluate2(feat_vis, feat_nir, labels, fars=fars) + + diff --git a/insightface/recognition/idmmd/evaluate/eval_casia_112.py b/insightface/recognition/idmmd/evaluate/eval_casia_112.py new file mode 100644 index 0000000000000000000000000000000000000000..9578163f1711111030b94f31625356096c8a8df1 --- /dev/null +++ b/insightface/recognition/idmmd/evaluate/eval_casia_112.py @@ -0,0 +1,189 @@ +import numpy as np +import pandas as pd +import os,sys +sys.path.append(os.getcwd()) +import argparse + +from PIL import Image +import torch + +from network.lightcnn112 import LightCNN_29Layers +from evaluate import evaluate2 + +fars = [10 ** -4, 10 ** -3, 10 ** -2] + +parser = argparse.ArgumentParser() +parser.add_argument('--test_fold_id', default=1, type=int) +parser.add_argument('--input_mode', default='grey', choices=['grey'], type=str) 
parser.add_argument('--model_mode', default='29', choices=['29'], type=str)
parser.add_argument('--model_name', default='', type=str)
parser.add_argument('--img_root', default='', type=str)
parser.add_argument('--test_mode', default='pretrain', type=str)

args = parser.parse_args()

INPUT_MODE = args.input_mode
MODEL_MODE = args.model_mode
model_name = args.model_name
test_mode = args.test_mode
img_root = args.img_root

print("*" * 16)
print("INPUT_MODE: ", INPUT_MODE)
print("MODEL_MODE: ", MODEL_MODE)
print("model name: ", model_name)
print("*" * 16)

tfi = args.test_fold_id
num_classes = 725
test_list_dir = 'data/casia/'
model_dir = f'./models/{test_mode}/'
model_path = os.path.join(model_dir, model_name)


def load_model(model, pretrained):
    """Load checkpoint weights into `model` in place and switch it to eval mode.

    Strips the DataParallel 'module.' prefix and drops the classifier head
    ('fc2'), which is not needed at test time. Returns None.
    """
    weights = torch.load(pretrained)['state_dict']
    model_dict = model.state_dict()
    weights = {k.replace('module.', ''): v for k, v in weights.items()
               if k.replace('module.', '') in model_dict.keys() and 'fc2' not in k}
    print("==> len of weights to be loaded: {}. \n".format(len(weights)))
    model.load_state_dict(weights, strict=False)
    model.eval()


class Embedding:
    """Extract flip-augmented LightCNN features for single grey-scale images."""

    def __init__(self, root, model):
        self.model = model
        self.root = root
        self.image_size = (112, 112)
        self.batch_size = 1

    def get(self, img):
        # Stack the image and its horizontal flip into one (2, 1, H, W) blob.
        img_flip = np.fliplr(img)
        img = np.transpose(img, (2, 0, 1))  # 1*112*112
        img_flip = np.transpose(img_flip, (2, 0, 1))
        input_blob = np.zeros((2, 1, self.image_size[1], self.image_size[0]),
                              dtype=np.uint8)
        input_blob[0] = img
        input_blob[1] = img_flip
        return input_blob

    @torch.no_grad()
    def forward_db(self, batch_data):
        imgs = torch.Tensor(batch_data).cuda()
        imgs.div_(255)
        feat = self.model(imgs)
        # concatenate original + flipped features -> (batch, 2 * feat_dim)
        feat = feat.reshape([self.batch_size, 2 * feat.shape[1]])
        return feat.cpu().numpy()

    def extract_feats_labels(self, data_list):
        """Return (L2-normalised summed features, person ids) for (path, pid) pairs."""
        img_feats = []
        pids = []
        for (imgPath, pid) in data_list:
            img = Image.open(os.path.join(self.root, imgPath)).convert('L')
            img = np.array(img)
            img = img[..., np.newaxis]
            img_feats.append(self.forward_db(self.get(img)).flatten())
            pids.append(pid)

        img_feats = np.array(img_feats).astype(np.float32)
        # sum original and flipped halves, then L2-normalise
        img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + img_feats[:, img_feats.shape[1] // 2:]
        img_input_feats = img_input_feats / np.sqrt(np.sum(img_input_feats ** 2, -1, keepdims=True))
        pids = np.array(pids)
        return img_input_feats, pids


def get_vis_nir_info(test_fold_id):
    """Return [(path, label)] lists for the VIS gallery and NIR probe of one fold."""
    vis = pd.read_csv(os.path.join(test_list_dir, 'vis_gallery_%d.txt' % test_fold_id), header=None, sep=' ')
    vis_labels = [int(s.split('\\')[-2]) for s in vis[0]]
    vis = vis[0].apply(lambda s: rename_path(s)).tolist()

    nir = pd.read_csv(os.path.join(test_list_dir, 'nir_probe_%d.txt' % test_fold_id), header=None, sep=' ')
    nir_labels = [int(s.split('\\')[-2]) for s in nir[0]]
    nir = nir[0].apply(lambda s: rename_path(s)).tolist()

    vis = [(p, l) for (p, l) in zip(vis, vis_labels)]
    nir = [(p, l) for (p, l) in zip(nir, nir_labels)]
    return vis, nir


def rename_path(s):
    """messy path names, inconsistency between 10-folds and how data are actually saved"""
    s = s.split(".")[0]
    gr, mod, id, img = s.split("\\")
    ext = 'jpg' if (mod == 'VIS') else 'bmp'
    return "%s/%s_%s_%s_%s.%s" % (mod, gr, mod, id, img, ext)


if MODEL_MODE == '29':
    model = LightCNN_29Layers(num_classes=num_classes)
model.cuda()
embedding = Embedding(img_root, model)

############### test pre-trained models
if tfi == -1:
    n_fold = 10
    acc_ = []
    # BUGFIX: the row width must match len(fars) (3). The old hard-coded 4
    # made `tarfar_[tf, ...] = np.array(tarfar)` fail to broadcast.
    tarfar_ = np.zeros((n_fold, len(fars)))
    for tf in range(n_fold):
        load_model(embedding.model, model_path)
        vis, nir = get_vis_nir_info(tf + 1)

        feat_vis, label_vis = embedding.extract_feats_labels(vis)
        feat_nir, label_nir = embedding.extract_feats_labels(nir)

        labels = np.equal.outer(label_vis, label_nir).astype(np.float32)

        print("*" * 16)
        print("Fold id ", tf + 1)
        print("Model: ", model_path)
        print("[query] feat_nir.shape ", feat_nir.shape)
        print("[gallery] feat_vis.shape ", feat_vis.shape)
        print("*" * 16)

        acc, tarfar = evaluate2(feat_vis, feat_nir, labels, fars=fars)

        acc_.append(acc[0])
        tarfar_[tf, ...] = np.array(tarfar)

    print('\n')
    print("*" * 16)
    print("MEAN")
    print("*" * 16)

    print("Rank 1 = {:.3%} +- {:.2%}".format(np.mean(acc_), np.std(acc_)))
    var_mean = tarfar_.mean(0)
    var_std = tarfar_.std(0)
    for fpr_iter in np.arange(len(fars)):
        print("TAR {:.3%} +- {:.2%} @ FAR {:.4%}".format(var_mean[fpr_iter], var_std[fpr_iter], fars[fpr_iter]))

else:
    if not os.path.exists(model_path):
        print("cannot find model ", model_path)
        sys.exit()

    # BUGFIX: load_model() mutates the model in place and returns None;
    # the old code rebound `model = load_model(...)`, setting it to None.
    load_model(embedding.model, model_path)

    vis, nir = get_vis_nir_info(tfi)

    feat_vis, label_vis = embedding.extract_feats_labels(vis)
    feat_nir, label_nir = embedding.extract_feats_labels(nir)

    labels = np.equal.outer(label_vis, label_nir).astype(np.float32)

    print("*" * 16)
    print("Fold id ", tfi)
    print("[query] feat_nir.shape ", feat_nir.shape)
    print("[gallery] feat_vis.shape ", feat_vis.shape)
    print("*" * 16)

    acc, tarfar = evaluate2(feat_vis, feat_nir, labels, fars=fars)
args = parser.parse_args()

INPUT_MODE = args.input_mode
MODEL_MODE = args.model_mode
model_name = args.model_name
test_mode = args.test_mode
img_root = args.img_root

tfi = args.test_fold_id
num_classes = 725
test_list_dir = './data/lamp/'
model_dir = f'./models/{test_mode}/'
model_path = os.path.join(model_dir, model_name)


def load_model(model, pretrained):
    """Load checkpoint weights into `model` in place and set eval mode.

    Strips the DataParallel 'module.' prefix; unlike the sibling eval
    scripts, this variant keeps every matching key (no 'fc2' filter).
    """
    weights = torch.load(pretrained)['state_dict']
    model_dict = model.state_dict()
    weights = {k.replace('module.', ''): v for k, v in weights.items()
               if k.replace('module.', '') in model_dict.keys()}
    print("==> len of weights to be loaded: {}. \n".format(len(weights)))

    model.load_state_dict(weights, strict=False)
    model.eval()


def get_vis_nir_info(test_fold_id):
    """Return VIS gallery and NIR probe path arrays for one LAMP-HQ fold."""
    def get_data(test_fold_id, mode='vis'):
        name = 'gallery_vis%d.txt' % (test_fold_id) if mode == 'vis' else 'probe_nir%d.txt' % (test_fold_id)
        file_data = np.genfromtxt(test_list_dir + name, usecols=(0, 1), skip_header=1, dtype=str)
        paths = file_data[:, 0]
        return paths

    vis = get_data(test_fold_id, mode='vis')
    nir = get_data(test_fold_id, mode='nir')
    return vis, nir


class Embedding:
    """Extract flip-augmented LightCNN features for single grey-scale images."""

    def __init__(self, root, model):
        self.model = model
        self.root = root
        self.image_size = (112, 112)
        self.batch_size = 1

    def get(self, img):
        # Stack the image and its horizontal flip into one (2, 1, H, W) blob.
        img_flip = np.fliplr(img)
        img = np.transpose(img, (2, 0, 1))  # 1*112*112
        img_flip = np.transpose(img_flip, (2, 0, 1))
        input_blob = np.zeros((2, 1, self.image_size[1], self.image_size[0]),
                              dtype=np.uint8)
        input_blob[0] = img
        input_blob[1] = img_flip
        return input_blob

    @torch.no_grad()
    def forward_db(self, batch_data):
        imgs = torch.Tensor(batch_data).cuda()
        imgs.div_(255)
        feat = self.model(imgs)
        # concatenate original + flipped features -> (batch, 2 * feat_dim)
        feat = feat.reshape([self.batch_size, 2 * feat.shape[1]])
        return feat.cpu().numpy()

    def extract_feats_labels(self, data_list):
        """Return L2-normalised features for a list of image paths (no labels here)."""
        img_feats = []
        for imgPath in data_list:
            img = Image.open(os.path.join(self.root, imgPath)).convert('L')
            img = np.array(img)
            img = img[..., np.newaxis]
            img_feats.append(self.forward_db(self.get(img)).flatten())

        img_feats = np.array(img_feats).astype(np.float32)
        # sum original and flipped halves, then L2-normalise
        img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + img_feats[:, img_feats.shape[1] // 2:]
        img_input_feats = img_input_feats / np.sqrt(np.sum(img_input_feats ** 2, -1, keepdims=True))
        return img_input_feats


if MODEL_MODE == '29':
    model = LightCNN_29Layers(num_classes=num_classes)
model.cuda()

embedding = Embedding(img_root, model)

############### test pre-trained models
if tfi == -1:
    n_fold = 10
    acc_ = []
    # BUGFIX: width must be len(fars) (3), not the hard-coded 4, otherwise
    # the per-fold row assignment below fails to broadcast.
    tarfar_ = np.zeros((n_fold, len(fars)))
    for tf in range(n_fold):
        load_model(embedding.model, model_path)
        vis, nir = get_vis_nir_info(tf + 1)

        feat_vis = embedding.extract_feats_labels(vis)
        feat_nir = embedding.extract_feats_labels(nir)

        label_matrix = np.load(test_list_dir + 'binary_lable_matrix_%d.npy' % (tf + 1))
        label_matrix = label_matrix.T

        print("*" * 16)
        print("Fold id ", tf + 1)
        print("Model: ", model_path)
        print("[query] feat_nir.shape ", feat_nir.shape)
        print("[gallery] feat_vis.shape ", feat_vis.shape)
        print("*" * 16)

        acc, tarfar = evaluate2(feat_vis, feat_nir, label_matrix, fars=fars)

        acc_.append(acc[0])
        tarfar_[tf, ...] = np.array(tarfar)

    print('\n')
    print("*" * 16)
    print("MEAN")
    print("*" * 16)

    print("Rank 1 = {:.3%} +- {:.2%}".format(np.mean(acc_), np.std(acc_)))
    var_mean = tarfar_.mean(0)
    var_std = tarfar_.std(0)
    for fpr_iter in np.arange(len(fars)):
        print("TAR {:.3%} +- {:.2%} @ FAR {:.4%}".format(var_mean[fpr_iter], var_std[fpr_iter], fars[fpr_iter]))

else:
    load_model(embedding.model, model_path)

    vis, nir = get_vis_nir_info(tfi)

    feat_vis = embedding.extract_feats_labels(vis)
    feat_nir = embedding.extract_feats_labels(nir)

    label_matrix = np.load(test_list_dir + 'binary_lable_matrix_%d.npy' % (tfi))
    label_matrix = label_matrix.T

    print("*" * 16)
    print("Fold id ", tfi)
    print("[query] feat_nir.shape ", feat_nir.shape)
    print("[gallery] feat_vis.shape ", feat_vis.shape)
    print("*" * 16)

    acc, tarfar = evaluate2(feat_vis, feat_nir, label_matrix, fars=fars)


# ---------------- eval_ops.py ----------------
import numpy as np
from sklearn.metrics import roc_curve


def evaluate2(gallery_feat, query_feat, labels, fars=[10**-5, 10**-4, 10**-3, 10**-2]):
    """Rank-1 accuracy and TPR at the requested FAR operating points.

    `labels` is a (gallery, query) binary match matrix; it is transposed so
    that rows index queries, matching `similarity`.
    """
    query_num = query_feat.shape[0]

    similarity = np.dot(query_feat, gallery_feat.T)
    top_inds = np.argsort(-similarity)
    labels = labels.T

    # calculate top1
    correct_num = 0
    for i in range(query_num):
        j = top_inds[i, 0]
        if labels[i, j] == 1:
            correct_num += 1
    top1 = correct_num / query_num
    print("top1 = {:.2%}".format(top1))
+ + # # calculate 10 + # correct_num = 0 + # for i in range(query_num): + # j = top_inds[i, :10] + # if any(labels[i, j] == 1.0): + # correct_num += 1 + # # else: + # # print(i,j) + # top10 = correct_num / query_num + # print("top10 = {:.4%}".format(top10)) + + labels_ = labels.flatten() + similarity_ = similarity.flatten() + fpr, tpr, _ = roc_curve(labels_, similarity_) + + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) + tpr_fpr_row = [] + for far in fars: + _, min_index = min(list(zip(abs(fpr - far), range(len(fpr))))) + tpr_fpr_row.append(tpr[min_index]) + print("TPR {:.2%} @ FAR {:.4%}".format(tpr[min_index], far)) + + return [top1], tpr_fpr_row diff --git a/insightface/recognition/idmmd/evaluate/eval_oulu_112.py b/insightface/recognition/idmmd/evaluate/eval_oulu_112.py new file mode 100644 index 0000000000000000000000000000000000000000..cb3b0aadb91931a90d966e3b029b3cf89a47666c --- /dev/null +++ b/insightface/recognition/idmmd/evaluate/eval_oulu_112.py @@ -0,0 +1,160 @@ +import numpy as np +import pandas as pd +import os,sys +sys.path.append(os.getcwd()) +print(sys.path) +import argparse +import torch + +from PIL import Image + +from network.lightcnn112 import LightCNN_29Layers +from evaluate import evaluate2 + +fars = [10 ** -4, 10 ** -3, 10 ** -2] + +parser = argparse.ArgumentParser() +parser.add_argument('--test_fold_id', default=1, type=int) +parser.add_argument('--input_mode', default='grey', choices=['grey'], type=str) +parser.add_argument('--model_mode', default='29', choices=['29'], type=str) +parser.add_argument('--model_name', default='', type=str) +parser.add_argument('--img_root', default='', type=str) +parser.add_argument('--test_mode', default='pretrain', type=str) + +args = parser.parse_args() + +INPUT_MODE = args.input_mode +MODEL_MODE = args.model_mode +model_name = args.model_name +test_mode = args.test_mode +img_root = args.img_root + +num_classes = 725 +test_list_dir = './data/oulu/' +model_dir = f'./models/{test_mode}/' +model_path = 
os.path.join(model_dir, model_name) + +def load_model(model, pretrained): + weights = torch.load(pretrained) + weights = weights['state_dict'] + + model_dict = model.state_dict() + + weights = {k.replace('module.',''): v for k, v in weights.items() if k.replace('module.','') in model_dict.keys() and 'fc2' not in k} + + print("==> len of weights to be loaded: {}. \n".format(len(weights))) + model.load_state_dict(weights, strict=False) + model.eval() + +class Embedding: + def __init__(self, root, model): + self.model = model + self.root = root + + self.image_size = (112, 112) + self.batch_size = 1 + + def get(self, img): + img_flip = np.fliplr(img) + img = np.transpose(img, (2, 0, 1)) # 1*112*112 + img_flip = np.transpose(img_flip, (2, 0, 1)) + input_blob = np.zeros((2, 1, self.image_size[1], self.image_size[0]), + dtype=np.uint8) + input_blob[0] = img + input_blob[1] = img_flip + return input_blob + + @torch.no_grad() + def forward_db(self, batch_data): + imgs = torch.Tensor(batch_data).cuda() + imgs.div_(255) + feat = self.model(imgs) + feat = feat.reshape([self.batch_size, 2 * feat.shape[1]]) + return feat.cpu().numpy() + + def extract_feats_labels(self, data_list): + img_feats = [] + pids = [] + for (imgPath, pid) in data_list: + + img = Image.open(os.path.join(self.root, imgPath)).convert('L') + img = np.array(img) + img = img[..., np.newaxis] + + img_feats.append(self.forward_db(self.get(img)).flatten()) + pids.append(pid) + + img_feats = np.array(img_feats).astype(np.float32) + img_input_feats = img_feats[:, 0:img_feats.shape[1] //2] + img_feats[:, img_feats.shape[1] // 2:] + img_input_feats = img_input_feats / np.sqrt(np.sum(img_input_feats ** 2, -1, keepdims=True)) + pids = np.array(pids) + + return img_input_feats, pids + + +def get_vis_nir_info_csv(): + vis = pd.read_csv(test_list_dir + 'vis_test_paths.csv', header=None, sep=' ') + vis_labels = [int(s.strip().split(',')[-1].split('P')[-1]) for s in vis[0]] + vis = [s.strip().split(',')[0] for s in vis[0]] 
+ + nir = pd.read_csv(test_list_dir + 'nir_test_paths.csv', header=None, sep=' ') + nir_labels = [int(s.strip().split(',')[-1].split('P')[-1]) for s in nir[0]] + nir = [s.strip().split(',')[0] for s in nir[0]] + + vis = [(p,l) for (p,l) in zip(vis, vis_labels)] + nir = [(p,l) for (p,l) in zip(nir, nir_labels)] + + return vis,nir + +def get_vis_nir_info_txt(): + def read_file(file_name): + with open(test_list_dir + file_name, 'r') as f: + lines = f.readlines() + paths = [s.strip().split(' ')[0] for s in lines] + labels = [int(s.strip().split(' ')[1]) for s in lines] + info = [(p,l) for (p,l) in zip(paths, labels)] + + return info + + vis = read_file('test_vis_paths.txt') + nir = read_file('test_nir_paths.txt') + + return vis, nir + + +### Testing pretrain/finetune model +if test_mode == 'pretrain': + vis, nir = get_vis_nir_info_csv() +elif test_mode == "finetune": + vis, nir = get_vis_nir_info_txt() +else: + print("Wrong test_mode!!!") + +if MODEL_MODE == '29': + model = LightCNN_29Layers(num_classes=num_classes) + +model.cuda() + +embedding = Embedding(img_root, model) + +if not os.path.exists(model_path): + print("cannot find model ",model_path) + sys.exit() + +load_model(embedding.model, model_path) + +feat_vis, label_vis = embedding.extract_feats_labels(vis) +feat_nir, label_nir = embedding.extract_feats_labels(nir) + +labels = np.equal.outer(label_vis, label_nir).astype(np.float32) + +print("*" * 16) +print("INPUT_MODE: ", INPUT_MODE) +print("MODEL_MODE: ", MODEL_MODE) +print("model path: ", model_path) +print("*" * 16) +print("[query] feat_nir.shape ",feat_nir.shape) +print("[gallery] feat_vis.shape ",feat_vis.shape) +print("*" * 16) + +acc, tarfar = evaluate2(feat_vis, feat_nir, labels, fars=fars) \ No newline at end of file diff --git a/insightface/recognition/idmmd/losses.py b/insightface/recognition/idmmd/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..b09fe0186eb80f02176dceb8bb058cdc6eaf77a8 --- /dev/null +++ 
import torch
from torch import nn
import torch.nn.functional as F


class IDMMD(nn.Module):
    """Identity-aware MMD loss between per-identity VIS and NIR feature centers.

    For each identity in the batch, the mean VIS feature and the mean NIR
    feature are computed; the loss is the MMD between the two sets of centers
    using either a linear or a multi-bandwidth RBF kernel.
    """

    def __init__(self, kernel_type='rbf', kernel_mul=2.0, kernel_num=5):
        super(IDMMD, self).__init__()
        self.kernel_num = kernel_num
        self.kernel_mul = kernel_mul
        self.fix_sigma = None
        self.kernel_type = kernel_type

    def get_centers_by_id(self, x_rgb, x_ir, targets):
        """Mean feature per identity, per modality.

        Identities are enumerated in the same order for both modalities, so
        row i of both outputs refers to the same identity.
        """
        centers_rgb = []
        centers_ir = []

        batch_y_set = set(targets.data.cpu().numpy())

        for l in batch_y_set:
            centers_rgb.append(x_rgb[targets == l].mean(dim=0, keepdim=True))
            centers_ir.append(x_ir[targets == l].mean(dim=0, keepdim=True))

        # BUGFIX/generalization: stay on the inputs' device instead of forcing
        # .cuda() — a no-op change on GPU, and the loss now also runs on CPU.
        return torch.cat(centers_rgb, 0), torch.cat(centers_ir, 0)

    def forward(self, x_rgb, x_ir, targets):
        """Return the MMD between per-identity centers of the two modalities."""
        centers_rgb, centers_ir = self.get_centers_by_id(x_rgb, x_ir, targets)

        if self.kernel_type == 'linear':
            loss = self.linear_mmd(centers_rgb, centers_ir)  # domain-level loss
        elif self.kernel_type == 'rbf':
            B = centers_rgb.size(0)
            kernels = self.guassian_kernel(centers_rgb, centers_ir)

            XX = kernels[:B, :B]
            YY = kernels[B:, B:]
            XY = kernels[:B, B:]
            YX = kernels[B:, :B]

            loss = (XX + YY - XY - YX).mean()
        else:
            # BUGFIX: previously fell through with `loss` unbound
            # (UnboundLocalError) for any other kernel_type.
            raise ValueError("unknown kernel_type: %r" % self.kernel_type)

        return loss

    def linear_mmd(self, center_rgb, center_ir):
        """Mean squared distance between matched identity centers."""
        def compute_dist_(x_rgb, x_ir):
            n = x_rgb.size(0)
            dist1 = torch.pow(x_rgb, 2).sum(dim=1, keepdim=True).expand(n, n)
            dist2 = torch.pow(x_ir, 2).sum(dim=1, keepdim=True).expand(n, n)

            dist = dist1 + dist2.t()
            dist.addmm_(mat1=x_rgb, mat2=x_ir.t(), beta=1, alpha=-2)
            dist = dist.clamp(min=1e-12)  # for numerical stability
            return dist

        matrix = compute_dist_(center_rgb, center_ir)
        # diagonal = squared distances of same-identity center pairs
        loss = matrix.diag()
        return loss.mean()

    def guassian_kernel(self, x_rgb, x_ir):
        """Sum of `kernel_num` RBF kernels over the concatenated centers.

        (Method name kept, misspelling included, for interface compatibility.)
        """
        total = torch.cat([x_rgb, x_ir], dim=0)
        N = total.size(0)

        total0 = total.unsqueeze(0).expand(N, N, total.size(1))
        total1 = total.unsqueeze(1).expand(N, N, total.size(1))
        dists = ((total0 - total1) ** 2).sum(2)

        if self.fix_sigma:
            bandwidth = self.fix_sigma
        else:
            # heuristic base bandwidth: mean pairwise distance off the diagonal
            bandwidth = torch.sum(dists.data) / (N ** 2 - N)

        bandwidth /= self.kernel_mul ** (self.kernel_num // 2)
        bandwidth_list = [bandwidth * (self.kernel_mul ** i)
                          for i in range(self.kernel_num)]
        kernel_val = [torch.exp(-dists / bandwidth_temp)
                      for bandwidth_temp in bandwidth_list]
        return sum(kernel_val)


class CosFace(torch.nn.Module):
    """CosFace margin: subtract margin m from the target-class logit, scale by s."""

    def __init__(self, s=64.0, m=0.40):
        super(CosFace, self).__init__()
        self.s = s
        self.m = m

    def forward(self, logits, labels):
        # BUGFIX/generalization: scatter_ already produces the one-hot on the
        # logits' device; the old trailing .cuda() crashed on CPU-only runs.
        one_hot = torch.zeros_like(logits).scatter_(1, labels.view(-1, 1), 1.0)
        phi = logits - self.m
        output = torch.where(one_hot == 1, phi, logits)
        output *= self.s
        return output
import torch
import torch.nn as nn
import torch.nn.functional as F
# hoisted to the top with the other imports (was buried mid-module)
from torch.nn import Parameter


class mfm(nn.Module):
    """Max-Feature-Map unit.

    Produces 2*out_channels activations (convolutional when type == 1,
    fully-connected otherwise) and returns the elementwise max of the two
    halves. (`type` shadows the builtin, but the keyword is part of the
    call interface used below, so it is kept.)
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, type=1):
        super(mfm, self).__init__()
        self.out_channels = out_channels
        if type == 1:
            self.filter = nn.Conv2d(in_channels, 2 * out_channels, kernel_size=kernel_size, stride=stride,
                                    padding=padding)
        else:
            self.filter = nn.Linear(in_channels, 2 * out_channels)

    def forward(self, x):
        x = self.filter(x)
        # split channel-wise into two halves and take the elementwise max
        out = torch.split(x, self.out_channels, 1)
        return torch.max(out[0], out[1])


class group(nn.Module):
    """A 1x1 MFM bottleneck followed by a kxk MFM convolution."""

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(group, self).__init__()
        self.conv_a = mfm(in_channels, in_channels, 1, 1, 0)
        self.conv = mfm(in_channels, out_channels, kernel_size, stride, padding)

    def forward(self, x):
        x = self.conv_a(x)
        x = self.conv(x)
        return x


class resblock(nn.Module):
    """Two 3x3 MFM convolutions with an identity skip connection."""

    def __init__(self, in_channels, out_channels):
        super(resblock, self).__init__()
        self.conv1 = mfm(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.conv2 = mfm(out_channels, out_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        res = x
        out = self.conv1(x)
        out = self.conv2(out)
        out = out + res
        return out


class network_29layers(nn.Module):
    """LightCNN-29 backbone for 1-channel 112x112 inputs.

    Training mode returns (class logits, L2-normalised 256-D feature);
    eval mode returns only the normalised feature.
    """

    def __init__(self, block, layers, num_classes=79077):
        super(network_29layers, self).__init__()
        self.conv1 = mfm(1, 48, 5, 1, 2)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
        self.block1 = self._make_layer(block, layers[0], 48, 48)
        self.group1 = group(48, 96, 3, 1, 1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
        self.block2 = self._make_layer(block, layers[1], 96, 96)
        self.group2 = group(96, 192, 3, 1, 1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
        self.block3 = self._make_layer(block, layers[2], 192, 192)
        self.group3 = group(192, 128, 3, 1, 1)
        self.block4 = self._make_layer(block, layers[3], 128, 128)
        self.group4 = group(128, 128, 3, 1, 1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
        # 112 -> 4 pools of stride 2 -> 7x7 spatial map with 128 channels
        self.fc = mfm(7 * 7 * 128, 256, type=0)
        self.fc2 = nn.Linear(256, num_classes)

    def _make_layer(self, block, num_blocks, in_channels, out_channels):
        layers = []
        for i in range(0, num_blocks):
            layers.append(block(in_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.pool1(x)

        x = self.block1(x)
        x = self.group1(x)
        x = self.pool2(x)

        x = self.block2(x)
        x = self.group2(x)
        x = self.pool3(x)

        x = self.block3(x)
        x = self.group3(x)
        x = self.block4(x)
        x = self.group4(x)
        x = self.pool4(x)

        x = x.view(x.size(0), -1)
        fc = self.fc(x)

        if self.training:
            x = F.dropout(fc, training=self.training)
            out = self.fc2(x)
            return out, F.normalize(fc, p=2, dim=1)
        return F.normalize(fc, p=2, dim=1)


################################
## cosface nets
################################


class network_29layers_cosface(nn.Module):
    """LightCNN-29 backbone with a CosFace-style cosine-similarity head.

    Training mode returns (cosine logits, L2-normalised 256-D feature);
    eval mode returns only the normalised feature.
    """

    def __init__(self, block, layers, num_classes=79077):
        super(network_29layers_cosface, self).__init__()
        self.conv1 = mfm(1, 48, 5, 1, 2)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
        self.block1 = self._make_layer(block, layers[0], 48, 48)
        self.group1 = group(48, 96, 3, 1, 1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
        self.block2 = self._make_layer(block, layers[1], 96, 96)
        self.group2 = group(96, 192, 3, 1, 1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
        self.block3 = self._make_layer(block, layers[2], 192, 192)
        self.group3 = group(192, 128, 3, 1, 1)
        self.block4 = self._make_layer(block, layers[3], 128, 128)
        self.group4 = group(128, 128, 3, 1, 1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
        self.fc = mfm(7 * 7 * 128, 256, type=0)

        # class weight matrix for the cosine classifier
        self.weight = Parameter(torch.Tensor(num_classes, 256))
        nn.init.xavier_uniform_(self.weight)

    def _make_layer(self, block, num_blocks, in_channels, out_channels):
        layers = []
        for i in range(0, num_blocks):
            layers.append(block(in_channels, out_channels))
        return nn.Sequential(*layers)

    def cosine_sim(self, x1, x2, dim=1, eps=1e-8):
        """Pairwise cosine similarity between row vectors of x1 and x2."""
        ip = torch.mm(x1, x2.t())
        w1 = torch.norm(x1, 2, dim)
        w2 = torch.norm(x2, 2, dim)
        return ip / torch.ger(w1, w2).clamp(min=eps)

    def forward(self, x):
        x = self.conv1(x)
        x = self.pool1(x)

        x = self.block1(x)
        x = self.group1(x)
        x = self.pool2(x)

        x = self.block2(x)
        x = self.group2(x)
        x = self.pool3(x)

        x = self.block3(x)
        x = self.group3(x)
        x = self.block4(x)
        x = self.group4(x)
        x = self.pool4(x)

        x = x.view(x.size(0), -1)
        fc = self.fc(x)

        if self.training:
            x = F.dropout(fc, training=self.training)
            out = self.cosine_sim(x, self.weight)
            return out, F.normalize(fc, p=2, dim=1)
        return F.normalize(fc, p=2, dim=1)


def LightCNN_29Layers(**kwargs):
    """Factory: 29-layer LightCNN with a plain linear classifier head."""
    model = network_29layers(resblock, [1, 2, 3, 4], **kwargs)
    return model


def LightCNN_29Layers_cosface(**kwargs):
    """Factory: 29-layer LightCNN with a cosine-similarity (CosFace) head."""
    model = network_29layers_cosface(resblock, [1, 2, 3, 4], **kwargs)
    return model
0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/insightface/recognition/idmmd/pics/readme.txt @@ -0,0 +1 @@ + diff --git a/insightface/recognition/idmmd/run.sh b/insightface/recognition/idmmd/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9cdf43b80c222b5c3a6cf19692cf95a8e561e120 --- /dev/null +++ b/insightface/recognition/idmmd/run.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# run : bash run_train_lightcnn_112.sh + +echo train lightcnn 112 + +gpu_ids='0,1,2,3,4,5,6,7' +workers=8 +epochs=10 +batch_size=64 +lr=5e-3 +print_iter=40 +train_fold_id=10 +input_mode='grey' +model_mode='29' +weights_lightcnn='./models/pretrain/L29.pth.tar' + +#! LAMP-HQ +# dataset='lamp' +# img_root_R='' +# train_list_R='' + +#! CASIA +dataset='CASIA' +img_root_R='' # path to real data +train_list_R='' # name list + +#! Oulu +# dataset='oulu' +# img_root_R='' +# train_list_R='' + +#! Buaa +# dataset='buaa' +# img_root_R='' +# train_list_R='' + + +#! 
finetune 112_cos models +prefix='train' +python train.py --gpu_ids $gpu_ids --dataset $dataset --workers $workers \ + --epochs $epochs --batch_size $batch_size --lr $lr --save_name $prefix --input_mode $input_mode \ + --print_iter $print_iter --weights_lightcnn $weights_lightcnn \ + --img_root_R $img_root_R --train_list_R $train_list_R \ + --model_mode $model_mode \ No newline at end of file diff --git a/insightface/recognition/idmmd/train.py b/insightface/recognition/idmmd/train.py new file mode 100644 index 0000000000000000000000000000000000000000..37fc16aad93cecfcdb33e63308b36710bcd1e8e3 --- /dev/null +++ b/insightface/recognition/idmmd/train.py @@ -0,0 +1,199 @@ +import os +import argparse +import random +import numpy as np + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn + +from utils import * +from network.lightcnn112 import LightCNN_29Layers_cosface +from losses import IDMMD, CosFace +from dataset_mix import Real_Dataset_112_paired, IdentitySampler, GenIdx + +parser = argparse.ArgumentParser() +parser.add_argument('--gpu_ids', default='0,1', type=str) +parser.add_argument('--workers', default=8, type=int) +parser.add_argument('--epochs', default=15, type=int) +parser.add_argument('--pre_epoch', default=0, type=int) +parser.add_argument('--batch_size', default=64, type=int) +parser.add_argument('--lr', default=0.001, type=float) +parser.add_argument('--momentum', default=0.9, type=float) +parser.add_argument('--weight_decay', default=2e-4) +parser.add_argument('--step_size', default=5, type=int) +parser.add_argument('--print_iter', default=5, type=int) +parser.add_argument('--save_name', default='', type=str) +parser.add_argument('--seed', default=1000, type=int) +parser.add_argument('--weights_lightcnn', default='', type=str) +parser.add_argument('--dataset', default='CASIA', type=str) + +parser.add_argument('--img_root_R', default='', type=str) +parser.add_argument('--train_list_R', default='', type=str) + 
# NOTE(review): default 'red' is not in choices=['grey']. argparse does not
# validate defaults, so the default silently bypasses the choices check —
# confirm whether 'red' is a legal mode for the dataset code downstream.
parser.add_argument('--input_mode', default='red', choices=['grey'], type=str)
parser.add_argument('--model_mode', default='9',choices=['9','29'], type=str)


def main():
    """Two-stage NIR-VIS finetuning of LightCNN-29 with CosFace + ID-MMD.

    Stage I warms up only the classifier weight ('module.weight') for 4
    epochs; Stage II finetunes the full network with the ID-MMD loss added.
    """
    global args
    args = parser.parse_args()
    print(args)

    # Pin visible GPUs and enable cudnn autotuning for fixed-size batches.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
    cudnn.benchmark = True
    cudnn.enabled = True

    # Seed every RNG source for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # Checkpoint subdirectory name derived from the dataset name.
    dataset = 'lamp' if args.dataset == 'LAMP-HQ' else args.dataset.lower()

    # train loader of real data: paired VIS/NIR samples drawn with an
    # identity-balanced sampler (4 instances per identity).
    real_dataset_paired = Real_Dataset_112_paired(args)
    vis_pos, nir_pos = GenIdx(real_dataset_paired.vis_labels, real_dataset_paired.nir_labels)
    sampler = IdentitySampler(real_dataset_paired.vis_labels, real_dataset_paired.nir_labels, vis_pos, nir_pos, args.batch_size, 4)

    # The dataset indexes into the sampler's shuffled order.
    real_dataset_paired.visIndex = sampler.visIndex
    real_dataset_paired.nirIndex = sampler.nirIndex

    train_loader_real_paired = torch.utils.data.DataLoader(
        real_dataset_paired, batch_size=args.batch_size, sampler=sampler, num_workers=args.workers, pin_memory=True)

    num_classes = real_dataset_paired.num_classes

    model = LightCNN_29Layers_cosface(num_classes=num_classes)

    model = torch.nn.DataParallel(model).cuda()

    # load pre trained model: either resume a mid-run checkpoint or start
    # from the published LightCNN weights (classifier head excluded).
    if args.pre_epoch:
        print('load pretrained model of epoch %d' % args.pre_epoch)
        load_model(model, "./model/lightCNN_epoch_%d.pth.tar" % args.pre_epoch)
    else:
        print("=> loading pretrained lightcnn '{}'".format(args.weights_lightcnn))
        load_model_train_lightcnn(model, args.weights_lightcnn)

    # criterion: CE over CosFace-margined logits, plus the ID-MMD domain loss.
    criterion = nn.CrossEntropyLoss().cuda()
    criterion_idmmd = IDMMD().cuda()
    margin_softmax = CosFace(s=64.0, m=0.4).cuda()

    '''
    Stage I: model pretrained for last fc2 parameters
    '''
    # Collect only the classifier weight; everything else stays frozen by
    # simply not being given to the optimizer.
    params_pretrain = []
    for name, value in model.named_parameters():
        if name == "module.weight":
            params_pretrain += [{"params": value, "lr": 1 * args.lr}]

    print("Stage I: trainable params ", len(params_pretrain))
    assert len(params_pretrain) > 0

    # optimizer
    # NOTE(review): --weight_decay is declared without type=float upstream, so
    # a CLI-supplied value arrives as str and would break SGD — confirm.
    optimizer_pretrain = torch.optim.SGD(params_pretrain, args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # Fixed 4-epoch warm-up of the classifier head.
    for epoch in range(1, 5):
        pre_train_pair(train_loader_real_paired, model, criterion, margin_softmax, optimizer_pretrain, epoch)
        # save_checkpoint(model, epoch, args.save_name+"_pretrain", dataset)

    '''
    Stage II: model finetune for full network
    '''
    # optimizer over all parameters this time.
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    start_epoch = args.pre_epoch + 1
    for epoch in range(start_epoch, args.epochs + 1):
        adjust_learning_rate(args.lr, args.step_size, optimizer, epoch)
        train(train_loader_real_paired, model, criterion, criterion_idmmd, margin_softmax, optimizer, epoch)
        # Checkpoint at the end of training and every 10th epoch.
        if epoch == args.epochs or epoch % 10 == 0:
            save_checkpoint(model, epoch, args.save_name, dataset)


def pre_train_pair(train_loader, model, criterion, margin_softmax, optimizer, epoch):
    """Stage-I epoch: CosFace cross-entropy on concatenated VIS+NIR batches.

    Only the parameters handed to *optimizer* (the classifier weight) are
    updated; gradients elsewhere are computed but never applied.
    """
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.train()
    for i, (vis_img, nir_img, vis_label, nir_label) in enumerate(train_loader):

        # Stack VIS first, then NIR, along the batch dimension.
        input = torch.cat((vis_img, nir_img), 0).cuda(non_blocking=True)
        label = torch.cat((vis_label, nir_label), 0).cuda(non_blocking=True)
        batch_size = input.size(0)

        # Skip ragged tail batches so every step sees a full paired batch.
        if batch_size < 2*args.batch_size:
            continue

        # forward
        output = model(input)[0]
        output = margin_softmax(output, label)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, label.data, topk=(1, 5))
        top1.update(prec1.item(), batch_size)
        top5.update(prec5.item(), batch_size)

        # print log
        if i % args.print_iter == 0:
            info = "====> Epoch: [{:0>3d}][{:3d}/{:3d}] | ".format(epoch, i, len(train_loader))
            info += "Loss: ce: {:4.3f} | ".format(loss.item())
            info += "Prec@1: {:4.2f} ({:4.2f}) Prec@5: {:4.2f} ({:4.2f})".format(top1.val, top1.avg, top5.val, top5.avg)
            print(info)
def train(train_loader, model, criterion, criterion_idmmd, margin_softmax, optimizer, epoch, beta=100):
    """Stage-II epoch: CosFace cross-entropy on the joint VIS+NIR batch plus
    a *beta*-weighted ID-MMD loss between the VIS and NIR embedding halves.

    Parameters mirror ``pre_train_pair`` with two additions:
        criterion_idmmd: the ID-MMD module applied to the split embeddings.
        beta: weight of the ID-MMD term in the total loss (default 100).
    """
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.train()
    for i, (vis_img, nir_img, vis_label, nir_label) in enumerate(train_loader):

        # VIS samples occupy the first half of the batch, NIR the second.
        input = torch.cat((vis_img, nir_img), 0).cuda(non_blocking=True)
        label = torch.cat((vis_label, nir_label), 0).cuda(non_blocking=True)
        batch_size = input.size(0)

        # Skip ragged tail batches: the split below assumes full pairing.
        if batch_size < 2 * args.batch_size:
            continue

        # Forward pass: logits for classification, fc for the domain loss.
        output, fc = model(input)
        output = margin_softmax(output, label)
        loss_ce = criterion(output, label)

        num_vis = vis_img.size(0)
        num_nir = nir_img.size(0)
        fc_vis, fc_nir = torch.split(fc, [num_vis, num_nir], dim=0)

        # The first num_vis labels belong to the VIS half (cat order above).
        loss_idmmd = criterion_idmmd(fc_vis, fc_nir, label[:num_vis])

        loss = loss_ce + beta * loss_idmmd

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Measure accuracy and record loss.
        prec1, prec5 = accuracy(output.data, label.data, topk=(1, 5))
        top1.update(prec1.item(), batch_size)
        top5.update(prec5.item(), batch_size)

        # Print log. Use .item() to extract Python scalars: formatting a
        # Tensor through the deprecated .data attribute is fragile.
        if i % args.print_iter == 0:
            info = "====> Epoch: [{:0>3d}][{:3d}/{:3d}] | ".format(epoch, i, len(train_loader))
            info += "Loss_ce: {:4.3f} | ".format(loss_ce.item())
            info += "loss_idmmd: {:4.3f} | ".format(loss_idmmd.item())
            info += "Loss_all: {:4.3f} | ".format(loss.item())
            info += "Prec@1: {:4.2f} ({:4.2f}) Prec@5: {:4.2f} ({:4.2f})".format(top1.val, top1.avg, top5.val, top5.avg)
            print(info)


if __name__ == "__main__":
    main()
def ort_loss(x, y):
    """Mean absolute row-wise dot product of *x* and *y*.

    Minimizing this pushes corresponding rows towards orthogonality.
    """
    loss = torch.abs((x * y).sum(dim=1)).sum()
    return loss / float(x.size(0))


def ang_loss(x, y):
    """Mean row-wise dot product of *x* and *y* (cosine agreement when the
    rows are L2-normalized)."""
    loss = (x * y).sum(dim=1).sum()
    return loss / float(x.size(0))


def MMD_Loss(fc_nir, fc_vis):
    """MSE between the batch-mean NIR and VIS embeddings (first-moment MMD)."""
    mean_fc_nir = torch.mean(fc_nir, 0)
    mean_fc_vis = torch.mean(fc_vis, 0)
    return F.mse_loss(mean_fc_nir, mean_fc_vis)


def rgb2gray(img):
    """Convert an NCHW RGB tensor to single-channel luma (ITU-R BT.601
    weights 0.299/0.587/0.114)."""
    r, g, b = torch.split(img, 1, dim=1)
    return torch.mul(r, 0.299) + torch.mul(g, 0.587) + torch.mul(b, 0.114)


def save_checkpoint(model, epoch, name="", dataset=''):
    """Save epoch number and model weights to model/<dataset>/<name>_e<epoch>.pth.tar."""
    out_dir = "model/{}/".format(dataset)
    # exist_ok avoids the exists()/makedirs race of the original check-then-create.
    os.makedirs(out_dir, exist_ok=True)
    model_path = out_dir + name + "_e{}.pth.tar".format(epoch)
    state = {"epoch": epoch, "state_dict": model.state_dict()}
    torch.save(state, model_path)
    print("checkpoint saved to {}".format(model_path))


def load_model(model, pretrained):
    """Non-strictly load a checkpoint into *model* and return its epoch.

    LightCNN checkpoints ('LightCNN' in the path) may carry a DataParallel
    'module.' prefix, which is stripped before matching keys. For all other
    checkpoints, keys are filtered to those present in *model* and the
    classifier weight 'module.weight' is skipped so the head can be rebuilt
    for a different number of classes.
    """
    weights = torch.load(pretrained)
    pretrained_dict = weights["state_dict"]
    model_dict = model.state_dict()

    if 'LightCNN' in pretrained:
        keys = list(pretrained_dict)
        if "module." in keys[0]:
            pretrained_dict = {k.replace('module.', ''): v
                               for k, v in pretrained_dict.items()
                               if k.replace('module.', '') in model_dict}
    else:
        pretrained_dict = {k: v for k, v in pretrained_dict.items()
                           if k in model_dict and k != 'module.weight'}

    print("len of params to be loaded: ", len(pretrained_dict))
    model.load_state_dict(pretrained_dict, strict=False)
    return weights['epoch']


def load_model_train_lightcnn(model, pretrained):
    """Non-strictly load LightCNN backbone weights, always excluding the
    classifier weight 'module.weight'. Returns the checkpoint epoch."""
    weights = torch.load(pretrained)
    pretrained_dict = weights["state_dict"]
    model_dict = model.state_dict()

    pretrained_dict = {k: v for k, v in pretrained_dict.items()
                       if k in model_dict and 'module.weight' not in k}

    print("len of params to be loaded: ", len(pretrained_dict))
    model.load_state_dict(pretrained_dict, strict=False)
    return weights['epoch']


def set_requires_grad(nets, requires_grad=False):
    """Enable/disable gradients for a network or a list of networks."""
    if not isinstance(nets, list):
        nets = [nets]
    for net in nets:
        if net is not None:
            for param in net.parameters():
                param.requires_grad = requires_grad


# assign adain_params to AdaIN layers
def assign_adain_params(adain_params, model):
    """Distribute a flat (batch, N) parameter tensor over the model's
    AdaptiveInstanceNorm2d layers: the first num_features columns are the
    bias (mean), the next num_features the weight (std), consumed in layer
    order."""
    for m in model.modules():
        if m.__class__.__name__ == "AdaptiveInstanceNorm2d":
            mean = adain_params[:, :m.num_features]
            std = adain_params[:, m.num_features:2 * m.num_features]
            m.bias = mean.contiguous().view(-1)
            m.weight = std.contiguous().view(-1)
            # Advance past the columns consumed by this layer.
            if adain_params.size(1) > 2 * m.num_features:
                adain_params = adain_params[:, 2 * m.num_features:]


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Returns a list of 1-element tensors (percentages), one per k.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()

    correct = pred.eq(target.unsqueeze(0).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def adjust_learning_rate(lr, step, optimizer, epoch):
    """Decay every param group's lr by a fixed factor once every *step*
    epochs. The printed value lr * scale**(epoch // step) matches the lr
    the groups hold after the in-place update below."""
    scale = 0.457305051927326
    lr = lr * (scale ** (epoch // step))
    print('lr: {}'.format(lr))
    if epoch != 0 and epoch % step == 0:
        print('Change lr')
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * scale


class AverageMeter(object):
    """Tracks the latest value and the running (weighted) average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        # n is the weight (typically the batch size) of this observation.
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
IFRT, IJB-C and Megaface) are as follows: + +### 1. Evaluation on IFRT +**`r`** denotes the sampling rate of negative class centers. +| Backbone | Dataset | African | Caucasian | Indian | Asian | ALL | +| ------------ | ----------- | ----- | ----- | ------ | ----- | ----- | +| R50 | MS1M-V3 | 76.24 | 86.21 | 84.44 | 37.43 | 71.02 | +| R124 | MS1M-V3 | 81.08 | 89.06 | 87.53 | 38.40 | 74.76 | +| R100 | **Glint360k**(r=1.0) | 89.50 | 94.23 | 93.54 | **65.07** | **88.67** | +| R100 | **Glint360k**(r=0.1) | **90.45** | **94.60** | **93.96** | 63.91 | 88.23 | + +### 2. Evaluation on IJB-C and Megaface +We employ ResNet100 as the backbone and CosFace (m=0.4) as the loss function. +TAR@FAR=1e-4 is reported on the IJB-C datasets, and TAR@FAR=1e-6 is reported on the Megaface dataset. +|Test Dataset | IJB-C | Megaface_Id | Megaface_Ver | +| :--- | :---: | :---: | :---: | +| MS1MV2 | 96.4 | 98.3 | 98.6 | +|**Glint360k** | **97.3** | **99.1** | **99.1** | + +### 3. License + +The Glint360K dataset (and the models trained with this dataset) are available for non-commercial research purposes only. + +### 4. Download +- [x] [**Baidu Drive**](https://pan.baidu.com/s/1GsYqTTt7_Dn8BfxxsLFN0w) (code:o3az) +- [x] **Magnet URI**: `magnet:?xt=urn:btih:E5F46EE502B9E76DA8CC3A0E4F7C17E4000C7B1E&dn=glint360k` + +Refer to the following command to unzip. +``` +cat glint360k_* | tar -xzvf - + +# Don't forget the last '-'! + +# cf7433cbb915ac422230ba33176f4625 glint360k_00 +# 589a5ea3ab59f283d2b5dd3242bc027a glint360k_01 +# 8d54fdd5b1e4cd55e1b9a714d76d1075 glint360k_02 +# cd7f008579dbed9c5af4d1275915d95e glint360k_03 +# 64666b324911b47334cc824f5f836d4c glint360k_04 +# a318e4d32493dd5be6b94dd48f9943ac glint360k_05 +# c3ae1dcbecea360d2ec2a43a7b6f1d94 glint360k_06 +# md5sum: +# 5d9cd9f262ec87a5ca2eac5e703f7cdf train.idx +# 8483be5af6f9906e19f85dee49132f8e train.rec +``` +Use [unpack_glint360k.py](./unpack_glint360k.py) to unpack. + +### 5. 
Pretrain models +- [x] [**Baidu Drive**](https://pan.baidu.com/s/1sd9ZRsV2c_dWHW84kz1P1Q) (code:befi) +- [x] [**Google Drive**](https://drive.google.com/drive/folders/1WLjDzEs1wC1K1jxDHNJ7dhEmQ3rOOILl?usp=sharing) + +| Framework | backbone | negative class centers sample_rate | IJBC@e4 | IFRT@e6 | +| :--- | :--- | :--- | :--- | :--- | +| mxnet | [R100](https://drive.google.com/drive/folders/1YPqIkOZWrmbli4GWfMJO2b0yiiZ7UCsP?usp=sharing) |1.0|97.3|-| +| mxnet | [R100](https://drive.google.com/drive/folders/1-gF5sDwNoRcjwmpPSTNLpaZJi5N91BvL?usp=sharing) |0.1|97.3|-| +| pytorch | [R50](https://drive.google.com/drive/folders/16hjOGRJpwsJCRjIBbO13z3SrSgvPTaMV?usp=sharing) |1.0|97.0|-| +| pytorch | [R100](https://drive.google.com/drive/folders/19EHffHN0Yn8DjYm5ofrgVOf_xfkrVgqc?usp=sharing) |1.0|97.4|-| + +## Docker +Make sure you have installed the NVIDIA driver and Docker engine for your Linux distribution Note that you do not need to +install the CUDA Toolkit and other independence on the host system, but the NVIDIA driver needs to be installed. +Because the CUDA version used in the image is 10.1, +the graphics driver version on the physical machine must be greater than 418. + +### 1. Docker Getting Started +You can use dockerhub or offline docker.tar to get the image of the Partial-fc. +1. dockerhub +```shell +docker pull insightface/partial_fc:v1 +``` + +2. offline images +coming soon! + +### 2. Getting Started +```shell +sudo docker run -it -v /train_tmp:/train_tmp --net=host --privileged --gpus 8 --shm-size=1g insightface/partial_fc:v1 /bin/bash +``` + +`/train_tmp` is where you put your training set (if you have enough RAM memory, +you can turn it into `tmpfs` first). + +## Benchmark +### 1. 
Train Glint360K Using MXNET + +| Backbone | GPU | FP16 | BatchSize / it | Throughput img / sec | +| :--- | :--- | :--- | :--- | :--- | +| R100 | 8 * Tesla V100-SXM2-32GB | False | 64 | 1748 | +| R100 | 8 * Tesla V100-SXM2-32GB | True | 64 | 3357 | +| R100 | 8 * Tesla V100-SXM2-32GB | False | 128 | 1847 | +| R100 | 8 * Tesla V100-SXM2-32GB | True | 128 | 3867 | +| R50 | 8 * Tesla V100-SXM2-32GB | False | 64 | 2921 | +| R50 | 8 * Tesla V100-SXM2-32GB | True | 64 | 5428 | +| R50 | 8 * Tesla V100-SXM2-32GB | False | 128 | 3045 | +| R50 | 8 * Tesla V100-SXM2-32GB | True | 128 | 6112 | + + +### 2. Performance On Million Identities +We neglect the influence of IO. All experiments use mixed-precision training, and the backbone is ResNet50. +#### 1 Million Identities On 8 RTX2080Ti + +|Method | GPUs | BatchSize | Memory/M | Throughput img/sec | W | +| :--- | :---: | :---: | :---: | :---: | :---: | +| Model Parallel | 8 | 1024 | 10408 | 2390 | GPU | +| **Partial FC(Ours)** | **8** | **1024** | **8100** | **2780** | GPU | +#### 10 Million Identities On 64 RTX2080Ti + +|Method | GPUs | BatchSize | Memory/M | Throughput img/sec | W | +| :--- | :---: | :---: | :---: | :---: | :---: | +| Model Parallel | 64 | 2048 | 9684 | 4483 | GPU | +| **Partial FC(Ours)** | **64** | **4096** | **6722** | **12600** | GPU | + + +## FAQ +#### Glint360K's Face Alignment Settings? +We use a same alignment setting with MS1MV2, code is [here](https://github.com/deepinsight/insightface/issues/1286). + +#### Why update Glint360K, is there a bug in the previous version? +In the previous version of Glint360K, there is no bug when using softmax training, but there is a bug in triplet training. +In the latest Glint360k, this bug has been fixed. + +#### Dataset in Google Drive or Dropbox? +The torrent has been released. 
+ + +## Citation +If you find Partial-FC or Glint360K useful in your research, please consider to cite the following related paper: + +[Partial FC](https://arxiv.org/abs/2203.15565) +``` +@inproceedings{an2022pfc, + title={Killing Two Birds with One Stone: Efficient and Robust Training of Face Recognition CNNs by Partial FC}, + author={An, Xiang and Deng, Jiangkang and Guo, Jia and Feng, Ziyong and Zhu, Xuhan and Jing, Yang and Tongliang, Liu}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + year={2022} +} + +``` + + + + diff --git a/insightface/recognition/partial_fc/docs/installtion.md b/insightface/recognition/partial_fc/docs/installtion.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/partial_fc/docs/installtion_ch.md b/insightface/recognition/partial_fc/docs/installtion_ch.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/partial_fc/mxnet/README.md b/insightface/recognition/partial_fc/mxnet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..930c8c210e55c97f0c2b0131da61e427acaf594c --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/README.md @@ -0,0 +1,119 @@ +## Training +### 1.Requirements +python==3.6 +cuda==10.1 +cudnn==765 +mxnet-cu101==1.6.0.post0 +pip install easydict mxboard opencv-python tqdm +[nccl](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html) +[openmpi](mxnet/setup-utils/install-mpi.sh)==4.0.0 +[horovod](mxnet/setup-utils/install-horovod.sh)==0.19.2 + +### 2.Run with horovodrun +Typically one GPU will be allocated per process, so if a server has 8 GPUs, you will run 8 processes. +In horovodrun, the number of processes is specified with the -np flag. 
+ +To run on a machine with 8 GPUs: +```shell script +horovodrun -np 8 -H localhost:8 bash config.sh +``` + +To run on two machine with 16 GPUs: +```shell script +horovodrun -np 16 -H ip1:8,ip2:8 bash config.sh +``` + +### 3.Run with mpi +```shell script +bash run.sh +``` + +### Failures due to SSH issues +The host where horovodrun is executed must be able to SSH to all other hosts without any prompts. + + + + +## Troubleshooting + +### 1. Horovod installed successfully? + +Run `horovodrun --check` to check the installation of horovod. +```shell script +# Horovod v0.19.2: +# +# Available Frameworks: +# [ ] TensorFlow +# [X] PyTorch +# [X] MXNet +# +# Available Controllers: +# [X] MPI +# [X] Gloo +# +# Available Tensor Operations: +# [X] NCCL +# [ ] DDL +# [ ] CCL +# [X] MPI +# [X] Gloo +``` + +### 2. Mxnet Version! +Some versions of mxnet with horovod have bug. +It is recommended to try version **1.5 or 1.6**. + +**The community has found that mxnet1.5.1 cannot install horovod.** + +### 3. Check CUDA version! +```shell script +# Make sure your cuda version is same as mxnet, such as mxnet-cu101 (CUDA 10.1) + +/usr/local/cuda/bin/nvcc -V +# nvcc: NVIDIA (R) Cuda compiler driver +# Copyright (c) 2005-2019 NVIDIA Corporation +# Built on Wed_Apr_24_19:10:27_PDT_2019 +# Cuda compilation tools, release 10.1, V10.1.168 +``` + +### 4. Block IO +You can turn on the debug mode to check whether your slow training speed is the cause of IO. + +### 5. Training Speed. +If you find that your training speed is the io bottleneck, you can mount dataset to RAM, +using the following command. +```shell script +# If your RAM has 256G +sudo mkdir /train_tmp +mount -t tmpfs -o size=140G tmpfs /train_tmp +``` + +## Our Method +![Image text](https://github.com/nttstar/insightface-resources/blob/master/images/partial_fc.png) + +### 1. The classification layer model is parallel +Class centers are evenly distributed across different GPUs. 
It only takes three communications to complete +loss-free Softmax calculations. + +#### 1. Synchronization of features +Make sure each GPU has all the GPU features on it, as is shown in `AllGather(x_i)`. + +#### 2. Synchronization of denominator of the softmax function +We can first calculate the local sum of each GPU, and then compute the global sum through communication, as is shown +in `Allreduce(sum(exp(logits_i)))` + +#### 3. Synchronization the gradients of feature +The gradient of logits can be calculated independently, so is the gradient of the feature. finally, we collect all the +gradients on GPU and send them back to backbone, as is shown in `Allreduce(deta(X))` + +### 2. Softmax approximate + +Just a subset of class centers can approximate softmax's computation(positive class centers must in these class centers), +this can be done with the following code: +```python +centers_p = func_positive(label) # select the positive class centers by the label of the sample +centers_n = func_negative(centers_p) # negative class centers are randomly sampled after excluding positive classes +centers_final = concat(centers_n, centers_p) # class centers that participate in softmax calculations +``` + + diff --git a/insightface/recognition/partial_fc/mxnet/README_CN.md b/insightface/recognition/partial_fc/mxnet/README_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..8c63462eaee14a89646fbf443e8087033018cdea --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/README_CN.md @@ -0,0 +1,124 @@ +## 目录 +## Contents +[Partial FC](https://arxiv.org/abs/2203.15565) +- [如何安装](#如何安装) +- [如何运行](#如何运行) +- [错误排查](#错误排查) + + +## 如何安装 + +### 1. python依赖 +使用以下命令 +```shell script +pip install easydict mxboard opencv-python tqdm +``` + +### 2. 安装nccl +nccl可以不用装,但是装上速度更快,nccl安装需要对应cuda版本,安装方法参考下边链接: +[**NCCL**](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html) + +### 3. 
安装openmpi +openmpi必须安装,必须采纳我的脚本编译源码安装: +[**OpenMPI**](setup-utils/install-mpi.sh) + +### 4. 安装horovod, mxnet +有些版本的mxnet的horovod无法安装,参考下方表格,强烈建议使用**mxnet==1.6.0**和**cuda==10.1** + +| mxnet |horovod | cuda | +| :---: | :--- | :---: | +| 1.4.0 | x | x | +| 1.5.0 | 可以安装 | cuda10.0 | +| 1.5.1 | x | x | +| 1.6.0.post0 | 可以安装 | cuda10.1 | +| 1.7.0 | x | x | + +horovod 安装方法如下: +[**Horovod**](setup-utils/install-horovod.sh) + +horovod 安装完成后使用下面的命令检查horovod是否安装成功,(nccl有没有都可以,有nccl会更快): +```shell script +# Horovod v0.19.2: +# Available Frameworks: +# [ ] TensorFlow +# [ ] PyTorch +# [X] MXNet +# +# Available Controllers: +# [X] MPI +# [X] Gloo +# +# Available Tensor Operations: +# [X] NCCL +# [ ] DDL +# [ ] CCL +# [X] MPI +# [X] Gloo +``` + + +### 5. ssh无密登录 + +使用多机分布式训练的时候,每台机器都需要设置无密登录,包括自己与自己,无密码登录具体可见: +这里推荐一个简单的命令: +```shell script +ssh-copy-id user@ip +``` + +## 如何运行 +`horovod`底层调用的还是`mpi`,mpi的概念是,你有多少块GPU,就要启动多少个进程,有两种方法启动训练,使用`horovodrun`或者`mpirun`。 +### 1. 使用 horovodrun 运行 + +运行8卡(单机): +```shell script +horovodrun -np 8 -H localhost:8 bash config.sh +``` + +运行16卡(两台机器) +```shell script +horovodrun -np 16 -H ip1:8,ip2:8 bash config.sh +``` + +### 2. 使用 mpirun 运行 + +```shell script +bash run.sh +``` + +## 错误排查 + +QQ群:711302608 + +### 检查Horovod是否安装成功? + +运行这个命令 `horovodrun --check` 来检查horovod是否安装成功。 + +### 检查你的CUDA版本是否与mxnet匹配,比如mxnet-cu101需要的cuda版本为CUDA10.1 + +```shell script +# Make sure your cuda version is same as mxnet, such as mxnet-cu101 (CUDA 10.1) + +/usr/local/cuda/bin/nvcc -V +# nvcc: NVIDIA (R) Cuda compiler driver +# Copyright (c) 2005-2019 NVIDIA Corporation +# Built on Wed_Apr_24_19:10:27_PDT_2019 +# Cuda compilation tools, release 10.1, V10.1.168 +``` + +### 屏蔽IO对训练速度的影响? 
class MetricNdarray(object):
    """Running (weighted) mean of an mx.nd.NDArray-valued metric.

    The accumulator is allocated lazily on the first ``update`` so it takes
    the shape and context of the values it receives.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Drop the accumulator and start a fresh average."""
        self.sum = None
        self.count = 0

    def update(self, val, n=1):
        """Fold *val* into the running sum with weight *n*."""
        assert isinstance(val, mx.nd.NDArray), type(val)
        if self.sum is None:
            # First observation: allocate a zero accumulator matching val.
            self.sum = mx.nd.zeros_like(val)
        self.sum = self.sum + val * n
        self.count = self.count + n

    def get(self):
        """Return the weighted mean as a Python scalar."""
        return (self.sum / self.count).asscalar()
logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % + (self.ver_name_list[i], num_update, acc2, std2)) + if acc2 > self.highest_acc_list[i]: + self.highest_acc_list[i] = acc2 + logging.info( + '[%s][%d]Accuracy-Highest: %1.5f' % + (self.ver_name_list[i], num_update, self.highest_acc_list[i])) + results.append(acc2) + + def init_dataset(self, val_targets, data_dir, image_size): + for name in val_targets: + path = os.path.join(data_dir, name + ".bin") + if os.path.exists(path): + data_set = verification.load_bin(path, image_size) + self.ver_list.append(data_set) + self.ver_name_list.append(name) + + def __call__(self, param): + # + num_update = param.num_update + # + if num_update > 0 and num_update % self.verbose == 0: # debug in mbatches in 100 and 200 + # accuracy list + self.ver_test(num_update) + + +class CallBackCenterSave(object): + def __init__(self, memory_bank, save_interval=10000): + self.save_interval = save_interval + self.memory_bank = memory_bank + + def __call__(self, param): + if param.num_update % self.save_interval == 0: + self.memory_bank.save() + + +class CallBackModelSave(object): + def __init__(self, symbol, model, prefix, rank): + self.symbol = symbol + self.model = model + self.prefix = prefix + self.max_step = config.max_update + self.rank = rank + + def __call__(self, param): + num_update = param.num_update + + if num_update in [ + self.max_step - 10, + ] or (num_update % 10000 == 0 and num_update > 0): + + # params + arg, aux = self.model.get_export_params() + # symbol + _sym = self.symbol + # save + + # average all aux + new_arg, new_aux = {}, {} + for key, tensor in aux.items(): + new_aux[key] = hvd.allreduce(tensor, average=True) + for key, tensor in arg.items(): + new_arg[key] = hvd.allreduce(tensor, average=True) + + if self.rank == 0: + mx.model.save_checkpoint(prefix=self.prefix + "_average", + epoch=0, + symbol=_sym, + arg_params=new_arg, + aux_params=new_aux) + mx.model.save_checkpoint(prefix=self.prefix, + epoch=0, + 
symbol=_sym, + arg_params=arg, + aux_params=aux) + + # training is over + if num_update > self.max_step > 0: + logging.info('Training is over!') + sys.exit(0) + + +class MetricCallBack(object): + def __init__(self, batch_size, rank, size, prefix_dir, frequent): + self.batch_size = batch_size + self.rank = rank + self.size = size + self.prefix_dir = prefix_dir + self.frequent = frequent + self.init = False + self.tic = 0 + self.last_count = 0 + self.loss_metric_list = MetricNdarray() + t = time.localtime() + + self.summary_writer = SummaryWriter( + logdir=os.path.join(self.prefix_dir, 'log_tensorboard', str(t.tm_mon) + '_' + str(t.tm_mday) \ + + '_' + str(t.tm_hour)), + verbose=False) + pass + + +class CallBackLogging(object): + def __init__(self, rank, size, prefix_dir): + self.batch_size = config.batch_size + self.rank = rank + self.size = size + self.prefix_dir = prefix_dir + self.frequent = config.frequent + self.init = False + self.tic = 0 + self.last_count = 0 + self.loss_metric = MetricNdarray() + t = time.localtime() + + if self.rank == 0: + self.summary_writer = SummaryWriter(logdir=os.path.join( + self.prefix_dir, "log_tensorboard", + "%s_%s_%s" % (str(t.tm_mon), str(t.tm_mday), str(t.tm_hour))), + verbose=False) + else: + time.sleep(2) + + def __call__(self, param): + """Callback to Show speed + """ + count = param.num_update + + if self.last_count > count: + self.init = False + self.last_count = count + + self.loss_metric.update(param.loss[0]) + + if self.init: + if count % self.frequent == 0: + nd.waitall() + try: + speed = self.frequent * self.batch_size / (time.time() - self.tic) + speed_total = speed * self.size + except ZeroDivisionError: + speed = float('inf') + speed_total = float('inf') + + # summary loss + loss_scalar = self.loss_metric.get() + + if self.rank == 0: + self.summary_writer.add_scalar(tag="loss", value=loss_scalar, global_step=param.num_update) + loss_str_format = "[%d][%s]:%.2f " % (param.num_epoch, "loss", + loss_scalar) + 
self.loss_metric.reset() + + if self.rank == 0: + self.summary_writer.add_scalar(tag="speed", value=speed, global_step=param.num_update) + self.summary_writer.flush() + logging.info( + "Iter:%d Rank:%.2f it/sec Total:%.2f it/sec %s", + param.num_update, speed, speed_total, loss_str_format) + + self.tic = time.time() + else: + self.init = True + self.tic = time.time() diff --git a/insightface/recognition/partial_fc/mxnet/config.sh b/insightface/recognition/partial_fc/mxnet/config.sh new file mode 100644 index 0000000000000000000000000000000000000000..17e58fba0b901ddf7ceb42a58b01e0e57ea1cf32 --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/config.sh @@ -0,0 +1,14 @@ +export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' +export HOROVOD_GPU_ALLREDUCE=NCCL +export HOROVOD_GPU_ALLGATHER=NCCL +export HOROVOD_GPU_BROADCAST=NCLL +export MXNET_CPU_WORKER_NTHREADS=3 + +# use `which python` to get the absolute path of your python interpreter +# +PYTHON_EXEC=/usr/bin/python +${PYTHON_EXEC} train_memory.py \ +--dataset glint360k_8GPU \ +--loss cosface \ +--network r100 \ +--models-root /data/anxiang/opensource/glint360k_8GPU_r100FC_1.0_fp32_cosface diff --git a/insightface/recognition/partial_fc/mxnet/default.py b/insightface/recognition/partial_fc/mxnet/default.py new file mode 100644 index 0000000000000000000000000000000000000000..10b01f74138823f308d33a93645d27e3f2416c5d --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/default.py @@ -0,0 +1,129 @@ +from easydict import EasyDict as edict + +config = edict() +# loss +config.embedding_size = 512 +config.bn_mom = 0.9 +config.workspace = 256 +config.net_se = 0 +config.net_act = 'prelu' +config.net_unit = 3 +config.net_input = 1 +config.net_output = 'FC' +config.frequent = 20 +config.verbose = 2000 +config.image_size = 112 +config.memonger = False + +config.debug = 0 +config.fp16 = False +config.batch_size = 64 +config.backbone_lr = 0.1 +config.memory_bank_lr = config.backbone_lr +config.sample_ratio = 1.0 + + +def 
generate_config(loss_name, dataset, network): + + # loss + if loss_name == 'arcface': + config.loss_s = 64.0 + config.loss_m1 = 1.0 + config.loss_m2 = 0.5 + config.loss_m3 = 0.0 + elif loss_name == 'cosface': + config.loss_s = 64.0 + config.loss_m1 = 1.0 + config.loss_m2 = 0.0 + config.loss_m3 = 0.4 + + # dataset + if dataset == 'webface': + config.lr_steps = '20000,28000' + config.val_targets = ['lfw', 'cfp_fp', 'agedb_30'] + config.rec = '/anxiang/datasets/webface/train.rec' + config.rec = '/train_tmp/webface/train.rec' + config.num_classes = 10575 + config.max_update = 32000 + + # glint360k 17091657 + # md5sum: + # 5d9cd9f262ec87a5ca2eac5e703f7cdf train.idx + # 8483be5af6f9906e19f85dee49132f8e train.rec + + # make training faster + # our RAM is 256G + # mount -t tmpfs -o size=140G tmpfs /train_tmp + + elif dataset == 'glint360k_8GPU': + config.lr_steps = '200000,400000,500000,550000' + config.val_targets = [ + 'agedb_30', 'calfw', 'cfp_ff', 'cfp_fp', 'cplfw', 'lfw', 'vgg2_fp' + ] + config.rec = '/train_tmp/glint360k/train.rec' + config.num_classes = 360232 + config.batch_size = 64 + config.max_update = 600000 + + elif dataset == 'glint360k_16GPU': + config.lr_steps = '200000,280000,360000' + config.val_targets = ['agedb_30', 'cfp_fp', 'lfw'] + config.rec = '/train_tmp/glint360k/train.rec' + config.num_classes = 360232 + config.max_update = 400000 + + elif dataset == 'emore': + config.lr_steps = '100000,160000' + config.val_targets = ['agedb_30', 'cfp_fp', 'lfw'] + config.rec = '/anxiang/datasets/faces_emore/train.rec' + config.rec = '/train_tmp/faces_emore/train.rec' + config.num_classes = 85742 + config.batch_size = 64 + config.max_update = 180000 + + elif dataset == '100w': + config.debug = 1 + config.num_classes = 100 * 10000 + config.lr_steps = '20000,28000' + config.max_update = 32000 + + elif dataset == '1000w': + config.debug = 1 + config.num_classes = 1000 * 10000 + config.lr_steps = '20000,28000' + config.max_update = 32000 + + elif dataset == '2000w': 
import os

import cv2
import numpy as np
from skimage import transform as trans

# Reference 5-point facial-landmark template for ArcFace-style 112x112
# alignment (left eye, right eye, nose tip, left/right mouth corners).
src = np.array([[30.2946, 51.6963], [65.5318, 51.5014], [48.0252, 71.7366],
                [33.5493, 92.3655], [62.7299, 92.2041]],
               dtype=np.float32)
# The canonical template is defined on a 96x112 crop; shift x by 8 px to
# centre it in a 112x112 crop.
src[:, 0] += 8.0

img_path = '/data/anxiang/datasets/IJB_release/IJBC/loose_crop'
img_path_align = '/data/anxiang/datasets/IJB_release/IJBC/loose_crop_align'

img_list_path = '/data/anxiang/datasets/IJB_release/IJBC/meta/ijbc_name_5pts_score.txt'
# FIX: read the list inside a context manager so the file handle is closed.
with open(img_list_path) as img_list:
    files = img_list.readlines()

for img_index, each_line in enumerate(files):
    if img_index % 500 == 0:
        print('processing', img_index)
    # Line format: "<name> <x1> <y1> ... <xK> <yK> <score>"
    name_lmk_score = each_line.strip().split(' ')
    img_name = os.path.join(img_path, name_lmk_score[0])
    img = cv2.imread(img_name)
    landmark = np.array([float(x) for x in name_lmk_score[1:-1]],
                        dtype=np.float32)
    # FIX: reshape to (-1, 2) instead of (5, 2).  Reshaping to (5, 2)
    # before the 68-point check made that branch unreachable: a 68-point
    # annotation (136 values) would raise ValueError inside reshape.
    landmark = landmark.reshape((-1, 2))

    if landmark.shape[0] == 68:
        # Reduce a 68-point annotation to the 5-point template.
        landmark5 = np.zeros((5, 2), dtype=np.float32)
        landmark5[0] = (landmark[36] + landmark[39]) / 2  # left eye centre
        landmark5[1] = (landmark[42] + landmark[45]) / 2  # right eye centre
        landmark5[2] = landmark[30]                       # nose tip
        landmark5[3] = landmark[48]                       # left mouth corner
        landmark5[4] = landmark[54]                       # right mouth corner
    else:
        landmark5 = landmark
    # Similarity transform mapping detected landmarks onto the template.
    tform = trans.SimilarityTransform()
    tform.estimate(landmark5, src)
    M = tform.params[0:2, :]
    img = cv2.warpAffine(img, M, (112, 112), borderValue=0.0)
    cv2.imwrite(os.path.join(img_path_align, name_lmk_score[0]), img)
class DatasetIJB(Dataset):
    """MXNet Dataset over IJB 'loose_crop' face images.

    Each element of *lines* is "name x1 y1 ... xK yK score".  __getitem__
    returns a (2, 3, 112, 112) uint8 NDArray holding the aligned RGB image
    and its horizontal flip, ready for flip-test feature extraction.
    """
    def __init__(self, root, lines, align=True):
        # Reference 5-point landmark template for 112x112 alignment.
        self.src = np.array(
            [[30.2946, 51.6963], [65.5318, 51.5014], [48.0252, 71.7366],
             [33.5493, 92.3655], [62.7299, 92.2041]],
            dtype=np.float32)
        # Template is defined on a 96x112 crop; shift x to centre it in a
        # 112x112 crop.
        self.src[:, 0] += 8.0
        self.lines = lines
        self.img_root = root
        self.align = align

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        each_line = self.lines[idx]
        name_lmk_score = each_line.strip().split(' ')  # "name lmk score"
        img_name = os.path.join(self.img_root, name_lmk_score[0])
        img = cv2.imread(img_name)

        if self.align:
            landmark = np.array([float(x) for x in name_lmk_score[1:-1]],
                                dtype=np.float32)
            # FIX: reshape to (-1, 2) rather than (5, 2).  With (5, 2) a
            # 68-point annotation (136 values) raised ValueError inside
            # reshape, so the 68-point branch below -- and half of the
            # assert -- was unreachable dead code.
            landmark = landmark.reshape((-1, 2))
            assert landmark.shape[0] == 68 or landmark.shape[0] == 5
            assert landmark.shape[1] == 2
            if landmark.shape[0] == 68:
                # Reduce a 68-point annotation to the 5-point template.
                landmark5 = np.zeros((5, 2), dtype=np.float32)
                landmark5[0] = (landmark[36] + landmark[39]) / 2
                landmark5[1] = (landmark[42] + landmark[45]) / 2
                landmark5[2] = landmark[30]
                landmark5[3] = landmark[48]
                landmark5[4] = landmark[54]
            else:
                landmark5 = landmark
            # Similarity transform from detected landmarks to the template.
            tform = trans.SimilarityTransform()
            tform.estimate(landmark5, self.src)
            M = tform.params[0:2, :]
            img = cv2.warpAffine(img, M, (112, 112), borderValue=0.0)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_flip = np.fliplr(img)
        img = np.transpose(img, (2, 0, 1))  # 3*112*112, RGB
        img_flip = np.transpose(img_flip, (2, 0, 1))
        # Stack original + flipped into one blob for flip-test extraction.
        input_blob = np.zeros((2, 3, 112, 112), dtype=np.uint8)
        input_blob[0] = img
        input_blob[1] = img_flip
        return mx.nd.array(input_blob)
def divideIntoNstrand(listTemp, n):
    """Split *listTemp* round-robin into exactly *n* sublists.

    If n exceeds len(listTemp), the surplus sublists are empty.
    """
    twoList = [[] for _ in range(n)]
    for i, e in enumerate(listTemp):
        twoList[i % n].append(e)
    return twoList


def read_template_media_list(path):
    """Read "image_name tid mid" triples; return (templates, medias) arrays."""
    ijb_meta = pd.read_csv(path, sep=' ', header=None).values
    # FIX: np.int was removed in NumPy 1.24 (AttributeError on modern
    # NumPy); the builtin int is what the alias always meant.
    templates = ijb_meta[:, 1].astype(int)
    medias = ijb_meta[:, 2].astype(int)
    return templates, medias


def read_template_pair_list(path):
    """Read "tid_1 tid_2 label" rows; return (t1, t2, label) int arrays."""
    pairs = pd.read_csv(path, sep=' ', header=None).values
    # FIX: np.int removed in NumPy 1.24 -- use builtin int.
    t1 = pairs[:, 0].astype(int)
    t2 = pairs[:, 1].astype(int)
    label = pairs[:, 2].astype(int)
    return t1, t2, label


def read_image_feature(path):
    """Load a pickled image-feature matrix from *path*."""
    with open(path, 'rb') as fid:
        img_feats = pickle.load(fid)
    return img_feats
def image2template_feature(img_feats=None, templates=None, medias=None):
    """Aggregate per-image features into one L2-normalised row per template.

    1. group image features by template id;
    2. within a template, average features coming from the same media
       (video frames), so a long video does not dominate the template;
    3. sum the per-media features and L2-normalise the result.

    Returns (template_norm_feats, unique_templates).
    """
    unique_templates = np.unique(templates)
    template_feats = np.zeros((len(unique_templates), img_feats.shape[1]))

    for count_template, uqt in enumerate(unique_templates):

        (ind_t, ) = np.where(templates == uqt)
        face_norm_feats = img_feats[ind_t]
        face_medias = medias[ind_t]
        unique_medias, unique_media_counts = np.unique(face_medias,
                                                       return_counts=True)
        media_norm_feats = []
        for u, ct in zip(unique_medias, unique_media_counts):
            (ind_m, ) = np.where(face_medias == u)
            if ct == 1:
                media_norm_feats += [face_norm_feats[ind_m]]
            else:  # image features from the same video are averaged into one
                media_norm_feats += [
                    np.mean(face_norm_feats[ind_m], axis=0, keepdims=True)
                ]
        media_norm_feats = np.array(media_norm_feats)
        template_feats[count_template] = np.sum(media_norm_feats, axis=0)
        if count_template % 2000 == 0:
            print('Finish Calculating {} template features.'.format(
                count_template))
    # FIX: row-wise L2 normalisation in plain NumPy instead of calling
    # sklearn.preprocessing.normalize -- identical output (zero rows are
    # left untouched, matching sklearn) without a sklearn round-trip.
    norms = np.linalg.norm(template_feats, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    template_norm_feats = template_feats / norms
    return template_norm_feats, unique_templates
# In[ ]:
def verification2(template_norm_feats=None,
                  unique_templates=None,
                  p1=None,
                  p2=None):
    """Compute the cosine similarity for every template pair (p1[i], p2[i]).

    template_norm_feats rows are assumed L2-normalised, so a plain dot
    product is the cosine similarity.
    """
    # Dense lookup table: template id -> row index in template_norm_feats.
    template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int)
    for row, template_id in enumerate(unique_templates):
        template2id[template_id] = row

    score = np.zeros((len(p1), ))  # save cosine distance between pairs

    # Process the pairs in fixed-size chunks to bound peak memory usage.
    batchsize = 100000
    total_pairs = np.array(range(len(p1)))
    sublists = [
        total_pairs[start:start + batchsize]
        for start in range(0, len(p1), batchsize)
    ]
    total_sublists = len(sublists)
    for c, chunk in enumerate(sublists):
        feat1 = template_norm_feats[template2id[p1[chunk]]]
        feat2 = template_norm_feats[template2id[p2[chunk]]]
        score[chunk] = np.sum(feat1 * feat2, -1).flatten()
        if c % 10 == 0:
            print('Finish {}/{} pairs.'.format(c, total_sublists))
    return score


def read_score(path):
    """Load pickled scores/features from *path*."""
    with open(path, 'rb') as fid:
        return pickle.load(fid)
# =============================================================
# load image and template relationships for template feature embedding
# tid --> template id,  mid --> media id
# format:
#     image_name tid mid
# =============================================================
start = timeit.default_timer()
templates, medias = read_template_media_list(
    os.path.join('%s/meta' % image_path,
                 '%s_face_tid_mid.txt' % target.lower()))
stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))

# =============================================================
# load template pairs for template-to-template verification
# tid : template id,  label : 1/0
# format:
#     tid_1 tid_2 label
# =============================================================
start = timeit.default_timer()
p1, p2, label = read_template_pair_list(
    os.path.join('%s/meta' % image_path,
                 '%s_template_pair_label.txt' % target.lower()))
stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))

# # Step 2: Get Image Features

# =============================================================
# extract image features
# format:
#     img_feats: [image_num x feats_dim] (227630, 512)
# (each row is the model feature of the image concatenated with the
# feature of its horizontal flip, hence 2 * emb_size columns)
# =============================================================
start = timeit.default_timer()
img_path = '%s/loose_crop' % image_path
img_list_path = '%s/meta/%s_name_5pts_score.txt' % (image_path, target.lower())
img_list = open(img_list_path)
files = img_list.readlines()
dataset = DatasetIJB(root=img_path, lines=files, align=True)
img_feats = extract_parallel(args.model_prefix,
                             args.model_epoch,
                             dataset,
                             args.batch_size,
                             size=args.emb_size)

# The last column of each list line is the detector (faceness) score.
faceness_scores = []
for each_line in files:
    name_lmk_score = each_line.split()
    faceness_scores.append(name_lmk_score[-1])

faceness_scores = np.array(faceness_scores).astype(np.float32)

stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))
print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0],
                                          img_feats.shape[1]))

# # Step3: Get Template Features

# In[ ]:

# =============================================================
# compute template features from image features.
# =============================================================
start = timeit.default_timer()
# ==========================================================
# Norm feature before aggregation into template feature?
# Feature norm from embedding network and faceness score are able to
# decrease weights for noise samples (not face).
# ==========================================================
# 1. FaceScore (Feature Norm)
# 2. FaceScore (Detector)

if use_flip_test:
    # concat --- F1
    # img_input_feats = img_feats
    # add --- F2: sum the original and flipped halves of each feature row
    img_input_feats = img_feats[:, 0:img_feats.shape[1] //
                                2] + img_feats[:, img_feats.shape[1] // 2:]
else:
    # use only the non-flipped half
    img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2]

if use_norm_score:
    img_input_feats = img_input_feats
else:
    # normalise features to remove norm information
    img_input_feats = img_input_feats / np.sqrt(
        np.sum(img_input_feats**2, -1, keepdims=True))

if use_detector_score:
    print(img_input_feats.shape, faceness_scores.shape)
    # weight each image feature by its detector (faceness) score
    # img_input_feats = img_input_feats * np.matlib.repmat(faceness_scores[:,np.newaxis], 1, img_input_feats.shape[1])
    img_input_feats = img_input_feats * faceness_scores[:, np.newaxis]
else:
    img_input_feats = img_input_feats

template_norm_feats, unique_templates = image2template_feature(
    img_input_feats, templates, medias)
stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))

# # Step 4: Get Template Similarity Scores

# In[ ]:

# =============================================================
# compute verification scores between template pairs.
# =============================================================
start = timeit.default_timer()
score = verification(template_norm_feats, unique_templates, p1, p2)
stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))

# In[ ]:

save_path = result_dir + '/%s_result' % target

if not os.path.exists(save_path):
    os.makedirs(save_path)

score_save_file = os.path.join(save_path, "%s.npy" % job)
np.save(score_save_file, score)

# # Step 5: Get ROC Curves and TPR@FPR Table

# In[ ]:

files = [score_save_file]
methods = []
scores = []
for file in files:
    methods.append(Path(file).stem)
    scores.append(np.load(file))

methods = np.array(methods)
scores = dict(zip(methods, scores))
colours = dict(
    zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2')))
# x_labels = [1/(10**x) for x in np.linspace(6, 0, 6)]
x_labels = [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1]
tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels])
fig = plt.figure()
for method in methods:
    fpr, tpr, _ = roc_curve(label, scores[method])
    roc_auc = auc(fpr, tpr)
    fpr = np.flipud(fpr)
    tpr = np.flipud(tpr)  # select largest tpr at same fpr
    plt.plot(fpr,
             tpr,
             color=colours[method],
             lw=1,
             label=('[%s (AUC = %0.4f %%)]' %
                    (method.split('-')[-1], roc_auc * 100)))
    tpr_fpr_row = []
    tpr_fpr_row.append("%s-%s" % (method, target))
    for fpr_iter in np.arange(len(x_labels)):
        # pick the ROC point whose FPR is closest to the target FPR
        _, min_index = min(
            list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr)))))
        # tpr_fpr_row.append('%.4f' % tpr[min_index])
        tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100))
    tpr_fpr_table.add_row(tpr_fpr_row)
plt.xlim([10**-6, 0.1])
plt.ylim([0.3, 1.0])
plt.grid(linestyle='--', linewidth=1)
plt.xticks(x_labels)
plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True))
plt.xscale('log')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC on IJB')
plt.legend(loc="lower right")
# plt.show()
+fig.savefig(os.path.join(save_path, '%s.pdf' % job)) +print(tpr_fpr_table) diff --git a/insightface/recognition/partial_fc/mxnet/evaluation/lfw.py b/insightface/recognition/partial_fc/mxnet/evaluation/lfw.py new file mode 100644 index 0000000000000000000000000000000000000000..d900d7909bd3201cab388ab535dce6a0bcf863d2 --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/evaluation/lfw.py @@ -0,0 +1,325 @@ +"""Helper for evaluation on the Labeled Faces in the Wild dataset +""" + +# MIT License +# +# Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import numpy as np +from scipy import misc +from sklearn.model_selection import KFold +from scipy import interpolate +import sklearn +from sklearn.decomposition import PCA +import mxnet as mx +from mxnet import ndarray as nd + + +def calculate_roc(thresholds, + embeddings1, + embeddings2, + actual_issame, + nrof_folds=10, + pca=0): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = KFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + #print('pca', pca) + + if pca == 0: + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + #print('train_set', train_set) + #print('test_set', test_set) + if pca > 0: + print('doing pca on', fold_idx) + embed1_train = embeddings1[train_set] + embed2_train = embeddings2[train_set] + _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) + #print(_embed_train.shape) + pca_model = PCA(n_components=pca) + pca_model.fit(_embed_train) + embed1 = pca_model.transform(embeddings1) + embed2 = pca_model.transform(embeddings2) + embed1 = sklearn.preprocessing.normalize(embed1) + embed2 = sklearn.preprocessing.normalize(embed2) + #print(embed1.shape, embed2.shape) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, dist[train_set], 
def calculate_accuracy(threshold, dist, actual_issame):
    """Classify pairs as 'same' when dist < threshold and score the result.

    Returns (tpr, fpr, accuracy) computed from the confusion counts of the
    thresholded distances against the ground-truth labels.
    """
    predicted_same = np.less(dist, threshold)
    actual_same = np.asarray(actual_issame)

    tp = np.sum(predicted_same & actual_same)
    fp = np.sum(predicted_same & ~actual_same)
    tn = np.sum(~predicted_same & ~actual_same)
    fn = np.sum(~predicted_same & actual_same)

    # Guard the degenerate folds where one class is absent.
    tpr = float(tp) / float(tp + fn) if tp + fn else 0
    fpr = float(fp) / float(fp + tn) if fp + tn else 0
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc
def calculate_val_far(threshold, dist, actual_issame):
    """Return (VAL, FAR) at the given distance threshold.

    VAL is the fraction of genuine pairs accepted; FAR is the fraction of
    impostor pairs accepted.  Raises ZeroDivisionError when either class
    is empty, as the original did.
    """
    accepted = np.less(dist, threshold)
    genuine = np.asarray(actual_issame)
    true_accept = np.sum(accepted & genuine)
    false_accept = np.sum(accepted & ~genuine)
    n_same = np.sum(genuine)
    n_diff = np.sum(~genuine)
    val = float(true_accept) / float(n_same)
    far = float(false_accept) / float(n_diff)
    return val, far


def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0):
    """Run LFW-style verification: k-fold ROC plus VAL@FAR=1e-3.

    Even rows of *embeddings* pair with the following odd rows.
    """
    labels = np.asarray(actual_issame)
    emb_a = embeddings[0::2]
    emb_b = embeddings[1::2]
    tpr, fpr, accuracy = calculate_roc(np.arange(0, 4, 0.01),
                                       emb_a,
                                       emb_b,
                                       labels,
                                       nrof_folds=nrof_folds,
                                       pca=pca)
    val, val_std, far = calculate_val(np.arange(0, 4, 0.001),
                                      emb_a,
                                      emb_b,
                                      labels,
                                      1e-3,
                                      nrof_folds=nrof_folds)
    return tpr, fpr, accuracy, val, val_std, far
def read_pairs(pairs_filename):
    """Parse an LFW pairs.txt file, skipping the header line.

    Each row has 3 fields (genuine pair) or 4 fields (impostor pair).
    """
    pairs = []
    with open(pairs_filename, 'r') as f:
        for line in f.readlines()[1:]:
            pair = line.strip().split()
            pairs.append(pair)
    # FIX: rows have mixed lengths (3 or 4 fields); np.array() on such a
    # ragged list raises ValueError on NumPy >= 1.24.  dtype=object keeps
    # indexing/len behaviour for callers (get_paths) and stays valid.
    return np.array(pairs, dtype=object)


def load_dataset(lfw_dir, image_size):
    """Load the LFW verification set into memory.

    Returns (lfw_data_list, issame_list): lfw_data_list holds two NDArrays
    -- index 0 the original images, index 1 their horizontal flips -- each
    of shape (n_images, 3, H, W).
    """
    lfw_pairs = read_pairs(os.path.join(lfw_dir, 'pairs.txt'))
    lfw_paths, issame_list = get_paths(lfw_dir, lfw_pairs, 'jpg')
    lfw_data_list = []
    for flip in [0, 1]:
        lfw_data = nd.empty((len(lfw_paths), 3, image_size[0], image_size[1]))
        lfw_data_list.append(lfw_data)
    i = 0
    for path in lfw_paths:
        with open(path, 'rb') as fin:
            _bin = fin.read()
            img = mx.image.imdecode(_bin)
            img = nd.transpose(img, axes=(2, 0, 1))  # HWC -> CHW
            for flip in [0, 1]:
                if flip == 1:
                    img = mx.ndarray.flip(data=img, axis=2)
                lfw_data_list[flip][i][:] = img
            i += 1
            if i % 1000 == 0:
                print('loading lfw', i)
    print(lfw_data_list[0].shape)
    print(lfw_data_list[1].shape)
    return (lfw_data_list, issame_list)
# __arg[k] = v.as_in_context(_ctx) + #_arg = __arg + #_arg["data"] = _data.as_in_context(_ctx) + #_arg["softmax_label"] = _label.as_in_context(_ctx) + #for k,v in _arg.iteritems(): + # print(k,v.context) + #exe = sym.bind(_ctx, _arg ,args_grad=None, grad_req="null", aux_states=_aux) + #exe.forward(is_train=False) + #net_out = exe.outputs + _embeddings = net_out[0].asnumpy() + #print(_embeddings.shape) + if embeddings is None: + embeddings = np.zeros( + (lfw_data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings + ba = bb + embeddings_list.append(embeddings) + + _xnorm = 0.0 + _xnorm_cnt = 0 + for embed in embeddings_list: + for i in range(embed.shape[0]): + _em = embed[i] + _norm = np.linalg.norm(_em) + #print(_em.shape, _norm) + _xnorm += _norm + _xnorm_cnt += 1 + _xnorm /= _xnorm_cnt + + embeddings = embeddings_list[0].copy() + embeddings = sklearn.preprocessing.normalize(embeddings) + _, _, accuracy, val, val_std, far = evaluate(embeddings, + issame_list, + nrof_folds=10) + acc1, std1 = np.mean(accuracy), np.std(accuracy) + #print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far)) + #embeddings = np.concatenate(embeddings_list, axis=1) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + print(embeddings.shape) + _, _, accuracy, val, val_std, far = evaluate(embeddings, + issame_list, + nrof_folds=10) + acc2, std2 = np.mean(accuracy), np.std(accuracy) + return acc1, std1, acc2, std2, _xnorm, embeddings_list diff --git a/insightface/recognition/partial_fc/mxnet/evaluation/verification.py b/insightface/recognition/partial_fc/mxnet/evaluation/verification.py new file mode 100644 index 0000000000000000000000000000000000000000..324b1b3b71a7d2d8efca180d24dad5cfa26e0d72 --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/evaluation/verification.py @@ -0,0 +1,679 @@ +"""Helper for evaluation on the Labeled Faces in the Wild dataset +""" + +# MIT License +# +# 
Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
class LFold:
    """KFold wrapper that degrades gracefully to a single fold.

    With n_splits > 1 this defers to sklearn's KFold; with n_splits <= 1
    split() yields a single (train, test) pair where both are the full
    index set, so the evaluation loops still work on tiny datasets.
    """
    def __init__(self, n_splits=2, shuffle=False):
        self.n_splits = n_splits
        if self.n_splits > 1:
            # Only instantiate KFold when it will actually be used.
            self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)

    def split(self, indices):
        if self.n_splits > 1:
            return self.k_fold.split(indices)
        # Degenerate case: train and test on everything.
        return [(indices, indices)]
sklearn.preprocessing.normalize(embed2) + # print(embed1.shape, embed2.shape) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, dist[train_set], actual_issame[train_set]) + best_threshold_index = np.argmax(acc_train) + # print('threshold', thresholds[best_threshold_index]) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, + threshold_idx], fprs[fold_idx, + threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], + actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + return tpr, fpr, accuracy + + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum( + np.logical_and(np.logical_not(predict_issame), + np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) + fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn) + acc = float(tp + tn) / dist.size + return tpr, fpr, acc + + +def calculate_val(thresholds, + embeddings1, + embeddings2, + actual_issame, + far_target, + nrof_folds=10): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + val = np.zeros(nrof_folds) + far = np.zeros(nrof_folds) + + diff = np.subtract(embeddings1, 
embeddings2) + dist = np.sum(np.square(diff), 1) + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the threshold that gives FAR = far_target + far_train = np.zeros(nrof_thresholds) + for threshold_idx, threshold in enumerate(thresholds): + _, far_train[threshold_idx] = calculate_val_far( + threshold, dist[train_set], actual_issame[train_set]) + if np.max(far_train) >= far_target: + f = interpolate.interp1d(far_train, thresholds, kind='slinear') + threshold = f(far_target) + else: + threshold = 0.0 + + val[fold_idx], far[fold_idx] = calculate_val_far( + threshold, dist[test_set], actual_issame[test_set]) + + val_mean = np.mean(val) + far_mean = np.mean(far) + val_std = np.std(val) + return val_mean, val_std, far_mean + + +def calculate_val_far(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) + false_accept = np.sum( + np.logical_and(predict_issame, np.logical_not(actual_issame))) + n_same = np.sum(actual_issame) + n_diff = np.sum(np.logical_not(actual_issame)) + # print(true_accept, false_accept) + # print(n_same, n_diff) + val = float(true_accept) / float(n_same) + far = float(false_accept) / float(n_diff) + return val, far + + +def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + tpr, fpr, accuracy = calculate_roc(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + nrof_folds=nrof_folds, + pca=pca) + thresholds = np.arange(0, 4, 0.001) + val, val_std, far = calculate_val(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + 1e-3, + nrof_folds=nrof_folds) + return tpr, fpr, accuracy, val, val_std, far + + +def load_bin(path, image_size): + # try: + # with open(path, 'rb') as f: + # bins, issame_list = 
pickle.load(f) # py2 + # except UnicodeDecodeError as e: + with open(path, 'rb') as f: + bins, issame_list = pickle.load(f, encoding='bytes') # py3 + data_list = [] + for flip in [0, 1]: + data = nd.empty( + (len(issame_list) * 2, 3, image_size[0], image_size[1])) + data_list.append(data) + for i in range(len(issame_list) * 2): + _bin = bins[i] + img = mx.image.imdecode(_bin) + if img.shape[1] != image_size[0]: + img = mx.image.resize_short(img, image_size[0]) + img = nd.transpose(img, axes=(2, 0, 1)) + for flip in [0, 1]: + if flip == 1: + img = mx.ndarray.flip(data=img, axis=2) + data_list[flip][i][:] = img + print('test bin loaded done:', data_list[0].shape) + return (data_list, issame_list) + + +def test(data_set, + mx_model, + batch_size, + nfolds=10, + data_extra=None, + label_shape=None): + print('testing verification..') + data_list = data_set[0] + issame_list = data_set[1] + model = mx_model + embeddings_list = [] + if data_extra is not None: + _data_extra = nd.array(data_extra) + time_consumed = 0.0 + if label_shape is None: + _label = nd.ones((batch_size, )) + else: + _label = nd.ones(label_shape) + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + # print(_data.shape, _label.shape) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data, ), label=(_label, )) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), + label=(_label, )) + + model.forward(db, is_train=False) + net_out = model.get_outputs() + # _arg, _aux = model.get_params() + # __arg = {} + # for k,v in _arg.iteritems(): + # __arg[k] = v.as_in_context(_ctx) + # _arg = __arg + # _arg["data"] = _data.as_in_context(_ctx) + # _arg["softmax_label"] = _label.as_in_context(_ctx) + # for k,v in _arg.iteritems(): + # print(k,v.context) + # exe = sym.bind(_ctx, _arg 
,args_grad=None, grad_req="null", aux_states=_aux) + # exe.forward(is_train=False) + # net_out = exe.outputs + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + # print(_embeddings.shape) + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + + _xnorm = 0.0 + _xnorm_cnt = 0 + for embed in embeddings_list: + for i in range(embed.shape[0]): + _em = embed[i] + _norm = np.linalg.norm(_em) + # print(_em.shape, _norm) + _xnorm += _norm + _xnorm_cnt += 1 + _xnorm /= _xnorm_cnt + + # embeddings = embeddings_list[0].copy() + # embeddings = sklearn.preprocessing.normalize(embeddings) + acc1 = 0.0 + std1 = 0.0 + # _, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=10) + # acc1, std1 = np.mean(accuracy), np.std(accuracy) + + # print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far)) + # embeddings = np.concatenate(embeddings_list, axis=1) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + print(embeddings.shape) + print('infer time', time_consumed) + _, _, accuracy, val, val_std, far = evaluate(embeddings, + issame_list, + nrof_folds=nfolds) + acc2, std2 = np.mean(accuracy), np.std(accuracy) + return acc1, std1, acc2, std2, _xnorm, embeddings_list + + +def test_badcase(data_set, + mx_model, + batch_size, + name='', + data_extra=None, + label_shape=None): + print('testing verification badcase..') + data_list = data_set[0] + issame_list = data_set[1] + model = mx_model + embeddings_list = [] + if data_extra is not None: + _data_extra = nd.array(data_extra) + time_consumed = 0.0 + if label_shape is None: + _label = nd.ones((batch_size, )) + else: + _label = nd.ones(label_shape) + for i in range(len(data_list)): + data = data_list[i] + 
embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + # print(_data.shape, _label.shape) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data, ), label=(_label, )) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), + label=(_label, )) + model.forward(db, is_train=False) + net_out = model.get_outputs() + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + thresholds = np.arange(0, 4, 0.01) + actual_issame = np.asarray(issame_list) + nrof_folds = 10 + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + data = data_list[0] + + pouts = [] + nouts = [] + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + # print(train_set) + # print(train_set.__class__) + for threshold_idx, threshold in enumerate(thresholds): + p2 = dist[train_set] + p3 = actual_issame[train_set] + _, _, 
acc_train[threshold_idx] = calculate_accuracy( + threshold, p2, p3) + best_threshold_index = np.argmax(acc_train) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, + threshold_idx], fprs[fold_idx, + threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], + actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + best_threshold = thresholds[best_threshold_index] + for iid in test_set: + ida = iid * 2 + idb = ida + 1 + asame = actual_issame[iid] + _dist = dist[iid] + violate = _dist - best_threshold + if not asame: + violate *= -1.0 + if violate > 0.0: + imga = data[ida].asnumpy().transpose( + (1, 2, 0))[..., ::-1] # to bgr + imgb = data[idb].asnumpy().transpose((1, 2, 0))[..., ::-1] + # print(imga.shape, imgb.shape, violate, asame, _dist) + if asame: + pouts.append((imga, imgb, _dist, best_threshold, ida)) + else: + nouts.append((imga, imgb, _dist, best_threshold, ida)) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + acc = np.mean(accuracy) + pouts = sorted(pouts, key=lambda x: x[2], reverse=True) + nouts = sorted(nouts, key=lambda x: x[2], reverse=False) + print(len(pouts), len(nouts)) + print('acc', acc) + gap = 10 + image_shape = (112, 224, 3) + out_dir = "./badcases" + if not os.path.exists(out_dir): + os.makedirs(out_dir) + if len(nouts) > 0: + threshold = nouts[0][3] + else: + threshold = pouts[-1][3] + + for item in [(pouts, 'positive(false_negative).png'), + (nouts, 'negative(false_positive).png')]: + cols = 4 + rows = 8000 + outs = item[0] + if len(outs) == 0: + continue + # if len(outs)==9: + # cols = 3 + # rows = 3 + + _rows = int(math.ceil(len(outs) / cols)) + rows = min(rows, _rows) + hack = {} + + if name.startswith('cfp') and item[1].startswith('pos'): + hack = { + 0: 'manual/238_13.jpg.jpg', + 6: 'manual/088_14.jpg.jpg', + 10: 'manual/470_14.jpg.jpg', + 25: 'manual/238_13.jpg.jpg', + 28: 'manual/143_11.jpg.jpg' 
+ } + + filename = item[1] + if len(name) > 0: + filename = name + "_" + filename + filename = os.path.join(out_dir, filename) + img = np.zeros((image_shape[0] * rows + 20, image_shape[1] * cols + + (cols - 1) * gap, 3), + dtype=np.uint8) + img[:, :, :] = 255 + text_color = (0, 0, 153) + text_color = (255, 178, 102) + text_color = (153, 255, 51) + for outi, out in enumerate(outs): + row = outi // cols + col = outi % cols + if row == rows: + break + imga = out[0].copy() + imgb = out[1].copy() + if outi in hack: + idx = out[4] + print('noise idx', idx) + aa = hack[outi] + imgb = cv2.imread(aa) + # if aa==1: + # imgb = cv2.transpose(imgb) + # imgb = cv2.flip(imgb, 1) + # elif aa==3: + # imgb = cv2.transpose(imgb) + # imgb = cv2.flip(imgb, 0) + # else: + # for ii in range(2): + # imgb = cv2.transpose(imgb) + # imgb = cv2.flip(imgb, 1) + dist = out[2] + _img = np.concatenate((imga, imgb), axis=1) + k = "%.3f" % dist + # print(k) + font = cv2.FONT_HERSHEY_SIMPLEX + cv2.putText(_img, k, (80, image_shape[0] // 2 + 7), font, 0.6, + text_color, 2) + # _filename = filename+"_%d.png"%outi + # cv2.imwrite(_filename, _img) + img[row * image_shape[0]:(row + 1) * image_shape[0], + (col * image_shape[1] + + gap * col):((col + 1) * image_shape[1] + gap * col), :] = _img + # threshold = outs[0][3] + font = cv2.FONT_HERSHEY_SIMPLEX + k = "threshold: %.3f" % threshold + cv2.putText(img, k, (img.shape[1] // 2 - 70, img.shape[0] - 5), font, + 0.6, text_color, 2) + cv2.imwrite(filename, img) + + +def dumpR(data_set, + mx_model, + batch_size, + name='', + data_extra=None, + label_shape=None): + print('dump verification embedding..') + data_list = data_set[0] + issame_list = data_set[1] + model = mx_model + embeddings_list = [] + if data_extra is not None: + _data_extra = nd.array(data_extra) + time_consumed = 0.0 + if label_shape is None: + _label = nd.ones((batch_size, )) + else: + _label = nd.ones(label_shape) + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + 
ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + # print(_data.shape, _label.shape) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data, ), label=(_label, )) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), + label=(_label, )) + model.forward(db, is_train=False) + net_out = model.get_outputs() + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + actual_issame = np.asarray(issame_list) + outname = os.path.join('temp.bin') + with open(outname, 'wb') as f: + pickle.dump((embeddings, issame_list), + f, + protocol=pickle.HIGHEST_PROTOCOL) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='do verification') + # general + parser.add_argument('--data-dir', default='', help='') + parser.add_argument('--model', + default='../model/softmax,50', + help='path to load model.') + parser.add_argument('--target', + default='lfw,cfp_ff,cfp_fp,agedb_30', + help='test targets.') + parser.add_argument('--gpu', default=0, type=int, help='gpu id') + parser.add_argument('--batch-size', default=32, type=int, help='') + parser.add_argument('--max', default='', type=str, help='') + parser.add_argument('--mode', default=0, type=int, help='') + parser.add_argument('--nfolds', default=10, type=int, help='') + args = parser.parse_args() + # sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common')) + # import face_image + # prop = face_image.load_property(args.data_dir) + # image_size = 
prop.image_size + image_size = [112, 112] + print('image_size', image_size) + ctx = mx.gpu(args.gpu) + nets = [] + vec = args.model.split(',') + prefix = args.model.split(',')[0] + epochs = [] + if len(vec) == 1: + pdir = os.path.dirname(prefix) + for fname in os.listdir(pdir): + if not fname.endswith('.params'): + continue + _file = os.path.join(pdir, fname) + if _file.startswith(prefix): + epoch = int(fname.split('.')[0].split('-')[1]) + epochs.append(epoch) + epochs = sorted(epochs, reverse=True) + if len(args.max) > 0: + _max = [int(x) for x in args.max.split(',')] + assert len(_max) == 2 + if len(epochs) > _max[1]: + epochs = epochs[_max[0]:_max[1]] + + else: + epochs = [int(x) for x in vec[1].split('|')] + print('model number', len(epochs)) + time0 = datetime.datetime.now() + for epoch in epochs: + print('loading', prefix, epoch) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + # arg_params, aux_params = ch_dev(arg_params, aux_params, ctx) + all_layers = sym.get_internals() + sym = all_layers['fc1_output'] + model = mx.mod.Module(symbol=sym, context=ctx, label_names=None) + # model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))]) + model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], + image_size[1]))]) + model.set_params(arg_params, aux_params) + nets.append(model) + time_now = datetime.datetime.now() + diff = time_now - time0 + print('model loading time', diff.total_seconds()) + + ver_list = [] + ver_name_list = [] + for name in args.target.split(','): + path = os.path.join(args.data_dir, name + ".bin") + if os.path.exists(path): + print('loading.. 
', name) + data_set = load_bin(path, image_size) + ver_list.append(data_set) + ver_name_list.append(name) + + if args.mode == 0: + for i in range(len(ver_list)): + results = [] + for model in nets: + acc1, std1, acc2, std2, xnorm, embeddings_list = test( + ver_list[i], model, args.batch_size, args.nfolds) + print('[%s]XNorm: %f' % (ver_name_list[i], xnorm)) + print('[%s]Accuracy: %1.5f+-%1.5f' % + (ver_name_list[i], acc1, std1)) + print('[%s]Accuracy-Flip: %1.5f+-%1.5f' % + (ver_name_list[i], acc2, std2)) + results.append(acc2) + print('Max of [%s] is %1.5f' % (ver_name_list[i], np.max(results))) + elif args.mode == 1: + model = nets[0] + test_badcase(ver_list[0], model, args.batch_size, args.target) + else: + model = nets[0] + dumpR(ver_list[0], model, args.batch_size, args.target) diff --git a/insightface/recognition/partial_fc/mxnet/hosts/host_16 b/insightface/recognition/partial_fc/mxnet/hosts/host_16 new file mode 100644 index 0000000000000000000000000000000000000000..cb485433b7522558b18b4bf945cfea2e1cc13667 --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/hosts/host_16 @@ -0,0 +1,2 @@ +10.55.0.182 slots=8 +10.55.0.37 slots=8 \ No newline at end of file diff --git a/insightface/recognition/partial_fc/mxnet/hosts/host_8 b/insightface/recognition/partial_fc/mxnet/hosts/host_8 new file mode 100644 index 0000000000000000000000000000000000000000..11ce82de152d2795a9665c3b9d044ae3b09b9b8b --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/hosts/host_8 @@ -0,0 +1 @@ +localhost slots=8 \ No newline at end of file diff --git a/insightface/recognition/partial_fc/mxnet/image_iter.py b/insightface/recognition/partial_fc/mxnet/image_iter.py new file mode 100644 index 0000000000000000000000000000000000000000..c3a18717dfd4b5376b97586eadcfd1e66bb54ec4 --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/image_iter.py @@ -0,0 +1,348 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + 
+import os +import random +import logging +import sys +import numbers +import math +import datetime +import numpy as np +import cv2 + +import mxnet as mx +from mxnet import ndarray as nd +from mxnet import io +from mxnet import recordio + +logger = logging.getLogger() + + +class FaceImageIter(io.DataIter): + def __init__(self, + batch_size, + data_shape, + path_imgrec=None, + shuffle=False, + aug_list=None, + mean=None, + rand_mirror=False, + cutoff=0, + color_jittering=0, + images_filter=0, + data_name='data', + label_name='softmax_label', + context=0, + context_num=1, + **kwargs): + super(FaceImageIter, self).__init__() + assert path_imgrec + self.context = context + self.context_num = context_num + if path_imgrec: + logging.info('loading recordio %s...', path_imgrec) + path_imgidx = path_imgrec[0:-4] + ".idx" + self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, + 'r') + s = self.imgrec.read_idx(0) + header, _ = recordio.unpack(s) + if header.flag > 0: + self.header0 = (int(header.label[0]), int(header.label[1])) + self.imgidx = [] + self.id2range = {} + self.seq_identity = range(int(header.label[0]), + int(header.label[1])) + for identity in self.seq_identity: + s = self.imgrec.read_idx(identity) + header, _ = recordio.unpack(s) + a, b = int(header.label[0]), int(header.label[1]) + count = b - a + if count < images_filter: + continue + self.id2range[identity] = (a, b) + self.imgidx += range(a, b) + self_data_lenth = len(self.imgidx) + else: + self.imgidx = list(self.imgrec.keys) + if shuffle: + self.seq = self.imgidx + self.oseq = self.imgidx + else: + self.seq = None + + self.mean = mean + self.nd_mean = None + self.epoch = 0 + + if self.mean: + self.mean = np.array(self.mean, dtype=np.float32).reshape(1, 1, 3) + self.nd_mean = mx.nd.array(self.mean).reshape((1, 1, 3)) + + self.check_data_shape(data_shape) + self.provide_data = [(data_name, (batch_size, ) + data_shape)] + self.batch_size = batch_size + self.data_shape = data_shape + self.shuffle 
= shuffle + self.image_size = '%d,%d' % (data_shape[1], data_shape[2]) + self.rand_mirror = rand_mirror + + self.cutoff = cutoff + self.color_jittering = color_jittering + self.CJA = mx.image.ColorJitterAug(0.125, 0.125, 0.125) + self.provide_label = [(label_name, (batch_size, ))] + + self.cur = 0 + self.nbatch = 0 + self.is_init = False + self.num_samples_per_gpu = int( + math.floor(len(self.seq) * 1.0 / self.context_num)) + + def reset(self): + """Resets the iterator to the beginning of the data.""" + self.epoch += 1 + self.cur = 0 + if self.shuffle: + random.seed(self.epoch) + random.shuffle(self.seq) + if self.seq is None and self.imgrec is not None: + self.imgrec.reset() + + def num_samples(self): + return len(self.seq) + + def next_sample(self): + if self.seq is not None: + while True: + if self.cur >= self.num_samples_per_gpu: + raise StopIteration + idx = self.seq[self.num_samples_per_gpu * self.context + + self.cur] + self.cur += 1 + if self.imgrec is not None: + s = self.imgrec.read_idx(idx) + header, img = recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + return int(label), img, None, None + else: + label, fname, bbox, landmark = self.imglist[idx] + return label, self.read_image(fname), bbox, landmark + else: + s = self.imgrec.read() + if s is None: + raise StopIteration + header, img = recordio.unpack(s) + return header.label, img, None, None + + def brightness_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + src *= alpha + return src + + def contrast_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = nd.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = (3.0 * (1.0 - alpha) / gray.size) * nd.sum(gray) + src *= alpha + src += gray + return src + + def saturation_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = nd.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = nd.sum(gray, axis=2, keepdims=True) + gray *= (1.0 - alpha) + src *= 
alpha + src += gray + return src + + def color_aug(self, img, x): + return self.CJA(img) + + def mirror_aug(self, img): + _rd = random.randint(0, 1) + if _rd == 1: + for c in range(img.shape[2]): + img[:, :, c] = np.fliplr(img[:, :, c]) + return img + + def compress_aug(self, img): + from PIL import Image + from io import BytesIO + buf = BytesIO() + img = Image.fromarray(img.asnumpy(), 'RGB') + q = random.randint(2, 20) + img.save(buf, format='JPEG', quality=q) + buf = buf.getvalue() + img = Image.open(BytesIO(buf)) + return nd.array(np.asarray(img, 'float32')) + + def next(self): + if not self.is_init: + self.reset() + self.is_init = True + """Returns the next batch of data.""" + # print('in next', self.cur, self.labelcur) + self.nbatch += 1 + + batch_size = self.batch_size + c, h, w = self.data_shape + batch_data = nd.empty((batch_size, c, h, w)) + if self.provide_label is not None: + batch_label = nd.empty(self.provide_label[0][1]) + i = 0 + try: + while i < batch_size: + label, s, bbox, landmark = self.next_sample() + _data = self.imdecode(s) + if _data.shape[0] != self.data_shape[1]: + _data = mx.image.resize_short(_data, self.data_shape[1]) + if self.rand_mirror: + _rd = random.randint(0, 1) + if _rd == 1: + _data = mx.ndarray.flip(data=_data, axis=1) + if self.color_jittering > 0: + if self.color_jittering > 1: + _rd = random.randint(0, 1) + if _rd == 1: + _data = self.compress_aug(_data) + # print('do color aug') + _data = _data.astype('float32', copy=False) + # print(_data.__class__) + _data = self.color_aug(_data, 0.125) + if self.nd_mean is not None: + _data = _data.astype('float32', copy=False) + _data -= self.nd_mean + _data *= 0.0078125 + if self.cutoff > 0: + _rd = random.randint(0, 1) + if _rd == 1: + # print('do cutoff aug', self.cutoff) + centerh = random.randint(0, _data.shape[0] - 1) + centerw = random.randint(0, _data.shape[1] - 1) + half = self.cutoff // 2 + starth = max(0, centerh - half) + endh = min(_data.shape[0], centerh + half) + startw 
= max(0, centerw - half) + endw = min(_data.shape[1], centerw + half) + # print(starth, endh, startw, endw, _data.shape) + _data[starth:endh, startw:endw, :] = 128 + data = [_data] + try: + self.check_valid_image(data) + except RuntimeError as e: + logging.debug('Invalid image, skipping: %s', str(e)) + continue + # print('aa',data[0].shape) + # data = self.augmentation_transform(data) + # print('bb',data[0].shape) + for datum in data: + assert i < batch_size, 'Batch size must be multiples of augmenter output length' + # print(datum.shape) + batch_data[i][:] = self.postprocess_data(datum) + batch_label[i][:] = label + i += 1 + except StopIteration: + if i < batch_size: + raise StopIteration + + return io.DataBatch([batch_data], [batch_label], batch_size - i) + + def check_data_shape(self, data_shape): + """Checks if the input data shape is valid""" + if not len(data_shape) == 3: + raise ValueError( + 'data_shape should have length 3, with dimensions CxHxW') + if not data_shape[0] == 3: + raise ValueError( + 'This iterator expects inputs to have 3 channels.') + + def check_valid_image(self, data): + """Checks if the input data is valid""" + if len(data[0].shape) == 0: + raise RuntimeError('Data shape is wrong') + + def imdecode(self, s): + """Decodes a string or byte string to an NDArray. + See mx.img.imdecode for more details.""" + img = mx.image.imdecode(s) # mx.ndarray + return img + + def read_image(self, fname): + """Reads an input image `fname` and returns the decoded raw bytes. + + Example usage: + ---------- + >>> dataIter.read_image('Face.jpg') # returns decoded raw bytes. 
+ """ + with open(os.path.join(self.path_root, fname), 'rb') as fin: + img = fin.read() + return img + + def augmentation_transform(self, data): + """Transforms input data with specified augmentation.""" + for aug in self.auglist: + data = [ret for src in data for ret in aug(src)] + return data + + def postprocess_data(self, datum): + """Final postprocessing step before image is loaded into the batch.""" + return nd.transpose(datum, axes=(2, 0, 1)) + + +class FaceImageIterList(io.DataIter): + def __init__(self, iter_list): + assert len(iter_list) > 0 + self.provide_data = iter_list[0].provide_data + self.provide_label = iter_list[0].provide_label + self.iter_list = iter_list + self.cur_iter = None + + def reset(self): + self.cur_iter.reset() + + def next(self): + self.cur_iter = random.choice(self.iter_list) + while True: + try: + ret = self.cur_iter.next() + except StopIteration: + self.cur_iter.reset() + continue + return ret + + +# dummy +class DummyIter(mx.io.DataIter): + def __init__(self, + batch_size, + data_shape, + batches=1000, + mode='', + dtype='float32'): + super(DummyIter, self).__init__(batch_size) + self.data_shape = (batch_size, ) + data_shape + self.label_shape = (batch_size, ) + self.provide_data = [('data', self.data_shape)] + self.provide_label = [('softmax_label', self.label_shape)] + # self.provide_label = [('label', self.label_shape)] + # if mode == 'perseus': + # self.provide_label = [] + self.batch = mx.io.DataBatch( + data=[mx.nd.zeros(self.data_shape, dtype=dtype)], + label=[mx.nd.zeros(self.label_shape, dtype=dtype)]) + self._batches = 0 + self.batches = batches + + def next(self): + if self._batches < self.batches: + self._batches += 1 + return self.batch + else: + self._batches = 0 + raise StopIteration diff --git a/insightface/recognition/partial_fc/mxnet/memory_bank.py b/insightface/recognition/partial_fc/mxnet/memory_bank.py new file mode 100644 index 
0000000000000000000000000000000000000000..d24a2122170b1ffd220fdcf1f5905bf3f6569c16 --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/memory_bank.py @@ -0,0 +1,117 @@ +import os + +import numpy as np +from mxnet import nd +import mxnet as mx + +from memory_samplers import WeightIndexSampler + + +class MemoryBank(object): + def __init__(self, + num_sample, + num_local, + rank, + local_rank, + embedding_size, + prefix, + gpu=True): + """ + Parameters + ---------- + num_sample: int + The number of sampled class center. + num_local: int + The number of class center storage in this rank(CPU/GPU). + rank: int + Unique process(GPU) ID from 0 to size - 1. + local_rank: int + Unique process(GPU) ID within the server from 0 to 7. + embedding_size: int + The feature dimension. + prefix_dir: str + Path prefix of model dir. + gpu: bool + If True, class center and class center mom will storage in GPU. + """ + self.num_sample = num_sample + self.num_local = num_local + self.rank = rank + self.embedding_size = embedding_size + self.gpu = gpu + self.prefix = prefix + + if self.gpu: + context = mx.gpu(local_rank) + else: + context = mx.cpu() + + # In order to apply update, weight and momentum should be storage. + self.weight = nd.random_normal(loc=0, + scale=0.01, + shape=(self.num_local, + self.embedding_size), + ctx=context) + self.weight_mom = nd.zeros_like(self.weight) + + # Sampler object + self.weight_index_sampler = WeightIndexSampler(num_sample, num_local, + rank) + + def sample(self, global_label): + """ + Parameters + ---------- + global_label: NDArray + Global label (after gathers label from all rank) + Returns + ------- + index: ndarray(numpy) + Local index for memory bank to sample, start from 0 to num_local, length is num_sample. + global_label: ndarray(numpy) + Global label after sort and unique. 
+ """ + assert isinstance(global_label, nd.NDArray) + global_label = global_label.asnumpy() + global_label = np.unique(global_label) + global_label.sort() + index = self.weight_index_sampler(global_label) + index.sort() + return index, global_label + + def get(self, index): + """ + Get sampled class centers and their momentum. + + Parameters + ---------- + index: NDArray + Local index for memory bank to sample, start from 0 to num_local. + """ + return self.weight[index], self.weight_mom[index] + + def set(self, index, updated_weight, updated_weight_mom=None): + """ + Update sampled class to memory bank, make the class center stored + in the memory bank the latest. + + Parameters + ---------- + index: NDArray + Local index for memory bank to sample, start from 0 to num_local. + updated_weight: NDArray + Class center which has been applied gradients. + updated_weight_mom: NDArray + Class center momentum which has been moved average. + """ + + self.weight[index] = updated_weight + self.weight_mom[index] = updated_weight_mom + + def save(self): + nd.save(fname=os.path.join(self.prefix, + "%d_centers.param" % self.rank), + data=self.weight) + nd.save(fname=os.path.join(self.prefix, + "%d_centers_mom.param" % self.rank), + data=self.weight_mom) diff --git a/insightface/recognition/partial_fc/mxnet/memory_module.py b/insightface/recognition/partial_fc/mxnet/memory_module.py new file mode 100644 index 0000000000000000000000000000000000000000..acd0b6b9e28d5d1ced3a53f6a5be984ddd11c52d --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/memory_module.py @@ -0,0 +1,508 @@ +import logging +import warnings +from collections import namedtuple + +import horovod.mxnet as hvd +import mxnet as mx +import mxnet.ndarray as nd +import numpy as np + +from default import config +from optimizer import DistributedOptimizer + + +class SampleDistributeModule(object): + """ + Large-scale distributed sampling face recognition training Module, of course sampling is an option, + There 
    will be no loss of accuracy in sampling in large-scale identities training tasks, uses only
    8 NVIDIA RTX2080Ti to complete classification tasks with 10 millions of identities, 64 NVIDIA
    RTX2080Ti can complete classification tasks with 100 million of identities.

    See the original paper:
    https://arxiv.org/abs/2203.15565

    Parameters
    ----------
    symbol: Symbol
        Backbone symbol.
    fc7_model: Object
        Object of margin loss (produces the fc7 logits, e.g. ArcFace).
    memory_bank: MemoryBank object.
        Maintains the local class centers and their momentum.
    memory_optimizer: Optimizer object.
        The updater of the memory bank; by default an SGD optimizer.
    logger:
        Logger; defaults to the ``logging`` module.
    """
    def __init__(
        self,
        symbol,
        fc7_model,
        memory_bank,
        memory_optimizer,
        logger=logging,
    ):
        self.size = hvd.size()
        self.rank = hvd.rank()
        self.local_rank = hvd.local_rank()
        self.gpu = mx.gpu(self.local_rank)
        self.cpu = mx.cpu()  # `device_id` is not needed for CPU.
        # Cache of named NDArray buffers, keyed by "name_context" (see get_ndarray*).
        self.nd_cache = {}
        self.embedding_size = config.embedding_size
        self.batch_size = config.batch_size
        self.num_update = 0
        self.batch_end_param = namedtuple('batch_end_param',
                                          ['loss', 'num_epoch', 'num_update'])

        self.fc7_model = fc7_model
        self.symbol = symbol
        self.logger = logger
        # Backbone runs as a regular (data-parallel) MXNet Module on this GPU.
        self.backbone_module = mx.module.Module(self.symbol, ['data'],
                                                ['softmax_label'],
                                                logger=self.logger,
                                                context=self.gpu)

        self.memory_bank = memory_bank
        self.memory_optimizer = memory_optimizer
        self.memory_lr = None
        self.loss_cache = None
        self.grad_cache = None

    def forward_backward(self, data_batch):
        """A convenient function that calls both ``forward`` and ``backward``."""
        total_feature, total_label = self.forward(data_batch, is_train=True)
        self.backward_all(total_feature, total_label)

    @staticmethod
    def broadcast_parameters(params):
        """Broadcast every tensor in *params* from rank 0 to all ranks.

        :param params: dict of name -> NDArray
        :return: dict of name -> NDArray holding rank 0's values
        """

        rank_0_dict = {}

        # Run broadcasts.
        for key, tensor in params.items():
            rank_0_dict[key] = hvd.broadcast(tensor, 0, key)
        return rank_0_dict

    def fit(self,
            train_data,
            optimizer_params,
            batch_end_callback,
            initializer,
            arg_params=None,
            aux_params=None):
        """Train until the data iterable is exhausted (loops over epochs forever).

        NOTE(review): there is no stopping condition in the outer loop; the
        caller is expected to terminate training (e.g. via the callback).
        """
        # Bind -> Init_params -> Init_optimizers
        self.bind(train_data.provide_data, train_data.provide_label, True)
        self.init_params(initializer, arg_params, aux_params, False)
        self.init_optimizer(optimizer_params=optimizer_params)

        # Sync init: make every rank start from rank 0's backbone parameters.
        _arg_params, _aux_params = self.backbone_module.get_params()
        _arg_params_rank_0 = self.broadcast_parameters(_arg_params)
        _aux_params_rank_0 = self.broadcast_parameters(_aux_params)
        self.backbone_module.set_params(_arg_params_rank_0, _aux_params_rank_0)

        # Training loop
        num_epoch = 0
        while True:
            data_iter = iter(train_data)
            end_of_batch = False
            next_data_batch = next(data_iter)

            while not end_of_batch:
                data_batch = next_data_batch
                self.forward_backward(data_batch)
                self.update()
                try:
                    # pre fetch next batch
                    next_data_batch = next(data_iter)
                    self.prepare(next_data_batch, sparse_row_id_fn=None)
                except StopIteration:
                    num_epoch += 1
                    end_of_batch = True
                    logging.info('reset dataset')
                    train_data.reset()

                if batch_end_callback is not None:
                    batch_end_params = self.batch_end_param(
                        loss=self.loss_cache,
                        num_epoch=num_epoch,
                        num_update=self.num_update)
                    batch_end_callback(batch_end_params)

    def get_export_params(self):
        """Return copies of the backbone's (arg, aux) parameter dicts."""
        _g, _x = self.backbone_module.get_params()
        g = _g.copy()
        x = _x.copy()
        return g, x

    def get_ndarray2(self, context, name, arr):
        """Copy *arr* into a cached buffer keyed by (name, context) and return it."""
        key = "%s_%s" % (name, context)
        if key not in self.nd_cache:
            v = nd.zeros(shape=arr.shape, ctx=context, dtype=arr.dtype)
            self.nd_cache[key] = v
        else:
            v = self.nd_cache[key]
        arr.copyto(v)
        return v

    def get_ndarray(self, context, name, shape, dtype='float32'):
        """Return a cached zero-initialized buffer keyed by (name, context).

        NOTE(review): the buffer is zeroed only on first creation; callers that
        need a clean buffer reset it themselves (see ``backward_all``).
        """
        key = "%s_%s" % (name, context)
        if key not in self.nd_cache:
            v = nd.zeros(shape=shape, ctx=context, dtype=dtype)
            self.nd_cache[key] = v
        else:
            v = self.nd_cache[key]
        return v

    def init_params(self,
                    initializer,
                    arg_params=None,
                    aux_params=None,
                    allow_missing=False,
                    force_init=False,
                    allow_extra=False):
        """Initializes the parameters and auxiliary states.

        Parameters
        ----------
        initializer : Initializer
            Called to initialize parameters if needed.
        arg_params : dict
            If not ``None``, should be a dictionary of existing arg_params. Initialization
            will be copied from that.
        aux_params : dict
            If not ``None``, should be a dictionary of existing aux_params. Initialization
            will be copied from that.
        allow_missing : bool
            If ``True``, params could contain missing values, and the initializer will be
            called to fill those missing params.
        force_init : bool
            If ``True``, will force re-initialize even if already initialized.
        allow_extra : boolean, optional
            Whether allow extra parameters that are not needed by symbol.
            If this is True, no error will be thrown when arg_params or aux_params
            contain extra parameters that is not needed by the executor.
        """
        # backbone
        self.backbone_module.init_params(initializer=initializer,
                                         arg_params=arg_params,
                                         aux_params=aux_params,
                                         allow_missing=allow_missing,
                                         force_init=force_init,
                                         allow_extra=allow_extra)

    def prepare(self, data_batch, sparse_row_id_fn=None):
        """No-op prefetch hook (kept for Module API compatibility)."""
        if sparse_row_id_fn is not None:
            warnings.warn(
                UserWarning("sparse_row_id_fn is not invoked for BaseModule."))

    def allgather(self, tensor, name, shape, dtype, context):
        """Implement in-place AllGather using AllReduce.

        Each rank writes its slice into a zeroed buffer and the sum-allreduce
        reconstructs the concatenation of all ranks' tensors.
        """
        assert isinstance(tensor, nd.NDArray), type(tensor)
        assert isinstance(name, str), type(name)
        assert isinstance(shape, tuple), type(shape)
        assert isinstance(dtype, str), type(dtype)
        assert isinstance(context, mx.context.Context), type(context)
        total_tensor = self.get_ndarray(context=context,
                                        name=name,
                                        shape=shape,
                                        dtype=dtype)
        total_tensor[:] = 0  # reset array before all-reduce is very important
        total_tensor[self.rank * self.batch_size:self.rank * self.batch_size +
                     self.batch_size] = tensor
        hvd.allreduce_(total_tensor, average=False)  # all-reduce in-place
        return total_tensor

    def forward(self, data_batch, is_train=None):
        """Run the backbone; in training mode, gather features/labels from all ranks.

        Returns ``(total_features, total_labels)`` when ``is_train`` is truthy,
        otherwise ``None`` (outputs stay in ``get_outputs``).
        """
        self.backbone_module.forward(data_batch, is_train=is_train)
        if is_train:
            self.num_update += 1
            fc1 = self.backbone_module.get_outputs()[0]
            label = data_batch.label[0]

            total_features = self.allgather(tensor=fc1,
                                            name='total_feature',
                                            shape=(self.batch_size * self.size,
                                                   self.embedding_size),
                                            dtype='float32',
                                            context=self.gpu)
            total_labels = self.allgather(tensor=label,
                                          name='total_label',
                                          shape=(self.batch_size *
                                                 self.size, ),
                                          dtype='int32',
                                          context=self.cpu)
            return total_features, total_labels
        else:
            return None

    def backward_all(
        self,
        total_feature,
        total_label,
    ):
        """Compute the fc7 loss/gradient (full or sampled softmax) and backprop
        this rank's feature slice through the backbone."""
        # get memory bank learning rate
        self.memory_lr = self.memory_optimizer.lr_scheduler(self.num_update)

        self.grad_cache = self.get_ndarray(self.gpu, 'grad_cache',
                                           total_feature.shape)
        self.loss_cache = self.get_ndarray(self.gpu, 'loss_cache', [1])

        self.grad_cache[:] = 0
        self.loss_cache[:] = 0

        # sample_ratio == 1 means no sampling: use the full-softmax path.
        if not bool(config.sample_ratio - 1):
            grad, loss = self.backward(total_feature, total_label)
        else:
            grad, loss = self.backward_sample(total_feature, total_label)

        self.loss_cache[0] = loss

        total_feature_grad = grad
        total_feature_grad = hvd.allreduce(total_feature_grad, average=False)

        # Slice out this rank's portion of the gathered-feature gradient.
        fc1_grad = total_feature_grad[self.batch_size *
                                      self.rank:self.batch_size * self.rank +
                                      self.batch_size]
        self.backbone_module.backward(out_grads=[fc1_grad / self.size])

    def get_outputs(self, merge_multi_context=True):
        """
        Gets outputs of the previous forward computation.

        Returns
        -------
        list of NDArray or list of list of NDArray
            Output.
        """
        return self.backbone_module.get_outputs(
            merge_multi_context=merge_multi_context)

    def update(self):
        """
        Updates parameters according to the installed optimizer and the gradients computed
        in the previous forward-backward batch.
        """
        self.backbone_module.update()
        mx.nd.waitall()

    def bind(self, data_shapes, label_shapes=None, for_training=True):
        """Bind the backbone module's executors to the given shapes."""
        self.backbone_module.bind(data_shapes,
                                  label_shapes,
                                  for_training=for_training)

    def init_optimizer(self, optimizer_params, force_init=False):
        """
        Installs and initializes the backbone optimizer (Horovod-wrapped SGD).

        Parameters
        ----------
        optimizer_params : dict
            Keyword arguments forwarded to ``mx.optimizer.SGD``.
        force_init : bool
            Default ``False``, indicating whether we should force re-initializing the
            optimizer in the case an optimizer is already installed.
        """
        optimizer_backbone = DistributedOptimizer(
            mx.optimizer.SGD(**optimizer_params))
        self.backbone_module.init_optimizer('local',
                                            optimizer_backbone,
                                            force_init=force_init)

    def backward(self, total_feature, label):
        """Full-softmax path: every local class center participates.

        Returns (gradient w.r.t. gathered features, scalar global loss).
        """
        memory_bank = self.memory_bank
        assert memory_bank.num_local == memory_bank.num_sample, "pass"

        _data = self.get_ndarray2(self.gpu, "data_%d" % self.rank,
                                  total_feature)
        # Attach grad
        _data.attach_grad()
        memory_bank.weight.attach_grad()

        # Convert global labels to this rank's local label space.
        _label = self.get_ndarray2(self.gpu, 'label_%d' % self.rank, label)
        _label = _label - int(self.rank * memory_bank.num_local)
        _fc7, _one_hot = self.fc7_model.forward(_data,
                                                memory_bank.weight,
                                                mapping_label=_label,
                                                depth=memory_bank.num_local)

        # Sync max across ranks for a numerically stable softmax.
        max_fc7 = nd.max(_fc7, axis=1, keepdims=True)
        max_fc7 = nd.reshape(max_fc7, -1)

        total_max_fc7 = self.get_ndarray(context=self.gpu,
                                         name='total_max_fc7',
                                         shape=(max_fc7.shape[0], self.size),
                                         dtype='float32')
        total_max_fc7[:] = 0
        total_max_fc7[:, self.rank] = max_fc7
        hvd.allreduce_(total_max_fc7, average=False)

        global_max_fc7 = self.get_ndarray(context=self.gpu,
                                          name='global_max_fc7',
                                          shape=(max_fc7.shape[0], 1),
                                          dtype='float32')
        nd.max(total_max_fc7, axis=1, keepdims=True, out=global_max_fc7)

        # Calculate exp(logits)
        _fc7_grad = nd.broadcast_sub(_fc7, global_max_fc7)
        _fc7_grad = nd.exp(_fc7_grad)

        # Calculate sum (denominator is summed over all ranks' partitions)
        sum_fc7 = nd.sum(_fc7_grad, axis=1, keepdims=True)
        global_sum_fc7 = hvd.allreduce(sum_fc7, average=False)

        # Calculate prob
        _fc7_grad = nd.broadcast_div(_fc7_grad, global_sum_fc7)

        # Calculate loss
        tmp = _fc7_grad * _one_hot
        tmp = nd.sum(tmp, axis=1, keepdims=True)
        tmp = self.get_ndarray2(self.gpu, 'ctx_loss', tmp)
        tmp = hvd.allreduce(tmp, average=False)
        global_loss = -nd.mean(nd.log(tmp + 1e-30))

        # Softmax gradient: prob - one_hot.
        _fc7_grad = _fc7_grad - _one_hot

        # Backward
        _fc7.backward(out_grad=_fc7_grad)

        # Update center
        _weight_grad = memory_bank.weight.grad
        self.memory_optimizer.update(weight=memory_bank.weight,
                                     grad=_weight_grad,
                                     state=memory_bank.weight_mom,
                                     learning_rate=self.memory_lr)

        return _data.grad, global_loss

    def backward_sample(self, total_feature, label):
        """Sampled-softmax path: only ``num_sample`` local centers participate.

        Returns (gradient w.r.t. gathered features, scalar global loss).
        """
        this_rank_classes = int(self.memory_bank.num_sample)
        local_index, unique_sorted_global_label = self.memory_bank.sample(
            label)

        # Map each sampled absolute class id to its column in the sampled fc7.
        _mapping_dict = {}
        local_sampled_class = local_index + self.rank * self.memory_bank.num_local
        global_label_set = set(unique_sorted_global_label)
        for idx, absolute_label in enumerate(local_sampled_class):
            if absolute_label in global_label_set:
                _mapping_dict[
                    absolute_label] = idx + self.rank * self.memory_bank.num_sample

        # Labels not sampled on any rank map to -1 (ignored by the one-hot).
        label_list = list(label.asnumpy())
        mapping_label = []
        for i in range(len(label_list)):
            absolute_label = label_list[i]
            if absolute_label in _mapping_dict.keys():
                mapping_label.append(_mapping_dict[absolute_label])
            else:
                mapping_label.append(-1)

        mapping_label = nd.array(mapping_label, dtype=np.int32)

        # Get weight
        local_index = nd.array(local_index)
        local_index = self.get_ndarray2(self.gpu, "local_index", local_index)
        sample_weight, sample_weight_mom = self.memory_bank.get(local_index)

        # Sync to gpu
        if self.memory_bank.gpu:
            _data = self.get_ndarray2(self.gpu, "data_%d" % self.rank,
                                      total_feature)
            _weight = self.get_ndarray2(self.gpu, 'weight_%d' % self.rank,
                                        sample_weight)
            _weight_mom = self.get_ndarray2(self.gpu,
                                            'weight_mom_%d' % self.rank,
                                            sample_weight_mom)
        else:
            _data = self.get_ndarray2(self.gpu, "data_%d" % self.rank,
                                      total_feature)
            _weight = self.get_ndarray2(self.gpu, 'weight_%d' % self.rank,
                                        sample_weight)
            _weight_mom = self.get_ndarray2(self.gpu,
                                            'weight_mom_%d' % self.rank,
                                            sample_weight_mom)

        # Attach grad
        _data.attach_grad()
        _weight.attach_grad()

        # Convert label
        _label = self.get_ndarray2(self.gpu, 'mapping_label_%d' % self.rank,
                                   mapping_label)
        _label = _label - int(self.rank * self.memory_bank.num_sample)
        _fc7, _one_hot = self.fc7_model.forward(_data,
                                                _weight,
                                                mapping_label=_label,
                                                depth=this_rank_classes)

        # Sync max across ranks for a numerically stable softmax.
        max_fc7 = nd.max(_fc7, axis=1, keepdims=True)
        max_fc7 = nd.reshape(max_fc7, -1)

        total_max_fc7 = self.get_ndarray(context=self.gpu,
                                         name='total_max_fc7',
                                         shape=(max_fc7.shape[0], self.size),
                                         dtype='float32')
        total_max_fc7[:] = 0
        total_max_fc7[:, self.rank] = max_fc7
        hvd.allreduce_(total_max_fc7, average=False)

        global_max_fc7 = self.get_ndarray(context=self.gpu,
                                          name='global_max_fc7',
                                          shape=(max_fc7.shape[0], 1),
                                          dtype='float32')
        nd.max(total_max_fc7, axis=1, keepdims=True, out=global_max_fc7)

        # Calculate exp(logits)
        _fc7_grad = nd.broadcast_sub(_fc7, global_max_fc7)
        _fc7_grad = nd.exp(_fc7_grad)

        # Calculate sum
        sum_fc7 = nd.sum(_fc7_grad, axis=1, keepdims=True)
        global_sum_fc7 = hvd.allreduce(sum_fc7, average=False)

        # Calculate grad
        _fc7_grad = nd.broadcast_div(_fc7_grad, global_sum_fc7)

        # Calculate loss
        tmp = _fc7_grad * _one_hot
        tmp = nd.sum(tmp, axis=1, keepdims=True)
        tmp = self.get_ndarray2(self.gpu, 'ctx_loss', tmp)
        tmp = hvd.allreduce(tmp, average=False)
        global_loss = -nd.mean(nd.log(tmp + 1e-30))

        _fc7_grad = _fc7_grad - _one_hot

        # Backward
        _fc7.backward(out_grad=_fc7_grad)

        # Update center
        _weight_grad = _weight.grad
        self.memory_optimizer.update(weight=_weight,
                                     grad=_weight_grad,
                                     state=_weight_mom,
                                     learning_rate=self.memory_lr)
        if self.memory_bank.gpu:
            self.memory_bank.set(index=local_index,
                                 updated_weight=_weight,
                                 updated_weight_mom=_weight_mom)
        else:
            self.memory_bank.set(index=local_index,
                                 updated_weight=self.get_ndarray2(
                                     mx.cpu(), "cpu_weight_%d" % self.rank,
                                     _weight),
                                 updated_weight_mom=self.get_ndarray2(
                                     mx.cpu(), "cpu_weight_mom_%d" % self.rank,
                                     _weight_mom))
        return _data.grad, global_loss
import numpy as np


class CenterPositiveClassGet(object):
    """Select, from a batch of global labels, the labels owned by this rank.

    Rank ``r`` owns the contiguous absolute-label range
    ``[r * num_local, r * num_local + num_local)``.
    """
    def __init__(self, num_sample, num_local, rank):
        self.num_sample = num_sample
        self.num_local = num_local
        self.rank = rank
        self.rank_class_start = self.rank * num_local
        self.rank_class_end = self.rank_class_start + num_local

    def __call__(self, global_label):
        """
        Return
        ------
        positive_center_label: ndarray
            The entries of *global_label* that fall inside this rank's range.
        """
        # Boolean-AND the two range masks (the original multiplied them).
        in_range = (global_label >= self.rank_class_start) & \
                   (global_label < self.rank_class_end)
        return global_label[in_range]


class CenterNegetiveClassSample(object):
    """Sample negative class centers (class name typo kept for compatibility).

    Assumes ``num_local >= num_sample`` so the pool is always large enough.
    """
    def __init__(self, num_sample, num_local, rank):
        self.num_sample = num_sample
        self.num_local = num_local
        self.rank = rank
        self.negative_class_pool = np.arange(num_local)

    def __call__(self, positive_center_index):
        """
        Return
        ------
        negative_center_index: ndarray
            ``num_sample - len(positive_center_index)`` distinct local indices
            drawn uniformly from the classes NOT in the positive set.
        """
        pool = np.setdiff1d(self.negative_class_pool, positive_center_index)
        negative_sample_size = self.num_sample - len(positive_center_index)
        return np.random.choice(pool, negative_sample_size, replace=False)


class WeightIndexSampler(object):
    """Produce the local center indices used this step: all positives owned by
    this rank, padded with random negatives up to ``num_sample``."""
    def __init__(self, num_sample, num_local, rank):
        self.num_sample = num_sample
        self.num_local = num_local
        self.rank = rank
        self.rank_class_start = self.rank * num_local

        self.positive = CenterPositiveClassGet(num_sample, num_local, rank)
        self.negative = CenterNegetiveClassSample(num_sample, num_local, rank)

    def __call__(self, global_label):
        positive_center_label = self.positive(global_label)
        positive_center_index = positive_center_label - self.positive.rank_class_start
        # More positives than the budget: keep only the first num_sample.
        if len(positive_center_index) > self.num_sample:
            positive_center_index = positive_center_index[:self.num_sample]
        negative_center_index = self.negative(positive_center_index)
        final_center_index = np.concatenate(
            (positive_center_index, negative_center_index))
        # Exactly num_sample distinct indices (the original asserted the length
        # twice; the duplicate assert was removed).
        assert len(final_center_index) == len(
            np.unique(final_center_index)) == self.num_sample
        return final_center_index
    def __init__(self, margins=(1.0, 0.5, 0.0), loss_s=64, embedding_size=512):
        """Combined-margin loss; defaults (m1=1.0, m2=0.5, m3=0.0) are ArcFace.

        Parameters
        ----------
        margins: tuple of float
            (m1, m2, m3) of the combined margin cos(m1*t + m2) - m3.
        loss_s: float
            Feature scale applied to the logits.
        embedding_size: int
            Feature dimension (stored but not used in ``forward``).
        """
        # margins
        self.loss_m1 = margins[0]
        self.loss_m2 = margins[1]
        self.loss_m3 = margins[2]
        self.loss_s = loss_s
        self.embedding_size = embedding_size

    def forward(self, data, weight, mapping_label, depth):
        """Compute scaled margin logits inside an autograd scope.

        Returns (fc7 logits, one-hot of mapping_label); labels of -1 (unsampled
        classes) produce an all-zero one-hot row, i.e. no margin is applied.
        """
        with autograd.record():
            # Cosine similarity between L2-normalized features and centers.
            norm_data = nd.L2Normalization(data)
            norm_weight = nd.L2Normalization(weight)
            #
            fc7 = nd.dot(norm_data, norm_weight, transpose_b=True)
            #
            mapping_label_onehot = mx.nd.one_hot(indices=mapping_label,
                                                 depth=depth,
                                                 on_value=1.0,
                                                 off_value=0.0)
            # cosface
            if self.loss_m1 == 1.0 and self.loss_m2 == 0.0:
                _one_hot = mapping_label_onehot * self.loss_m3
                fc7 = fc7 - _one_hot
            else:
                # General combined margin: only target-class entries are changed
                # (the arccos of the zeroed non-target entries is masked out by
                # the one-hot multiplication below).
                fc7_onehot = fc7 * mapping_label_onehot
                cos_t = fc7_onehot
                t = nd.arccos(cos_t)
                if self.loss_m1 != 1.0:
                    t = t * self.loss_m1
                if self.loss_m2 != 0.0:
                    t = t + self.loss_m2
                margin_cos = nd.cos(t)
                if self.loss_m3 != 0.0:
                    margin_cos = margin_cos - self.loss_m3
                margin_fc7 = margin_cos
                margin_fc7_onehot = margin_fc7 * mapping_label_onehot
                diff = margin_fc7_onehot - fc7_onehot
                fc7 = fc7 + diff

            # NOTE(review): scaling placed inside the autograd scope, return
            # outside — the flattened source does not show the exact indent;
            # confirm against upstream if gradients look off.
            fc7 = fc7 * self.loss_s
        return fc7, mapping_label_onehot
class DistributedOptimizer(mx.optimizer.Optimizer):
    """Horovod allreduce wrapper around a regular MXNet optimizer.

    Gradients are sum-allreduced across ranks before the wrapped optimizer's
    update is applied. All other attributes delegate to the wrapped optimizer
    (which is why ``super().__init__`` is deliberately not called).
    """
    def __init__(self, optimizer, prefix=""):
        self._optimizer = optimizer
        self._prefix = prefix

    def __getattr__(self, item):
        # Fall through to the wrapped optimizer for anything not defined here.
        return getattr(self._optimizer, item)

    def create_state_multi_precision(self, index, weight):
        return self._optimizer.create_state_multi_precision(index, weight)

    def _do_allreduce(self, index, grad):
        """Sum-allreduce grad(s) in place; no-op when running single-process."""
        if hvd.size() == 1:
            return

        if isinstance(index, (tuple, list)):
            for i in range(len(index)):
                hvd.allreduce_(grad[i],
                               average=False,
                               name=self._prefix + str(index[i]),
                               priority=-i)
        else:
            hvd.allreduce_(grad, average=False, name=self._prefix + str(index))

    def update(self, index, weight, grad, state):
        self._do_allreduce(index, grad)
        self._optimizer.update(index, weight, grad, state)

    def update_multi_precision(self, index, weight, grad, state):
        self._do_allreduce(index, grad)
        self._optimizer.update_multi_precision(index, weight, grad, state)

    def set_learning_rate(self, lr):
        self._optimizer.set_learning_rate(lr)

    def set_lr_mult(self, args_lr_mult):
        self._optimizer.set_lr_mult(args_lr_mult)

    def set_wd_mult(self, args_wd_mult):
        self._optimizer.set_wd_mult(args_wd_mult)


class MemoryBankSGDOptimizer(object):
    """SGD-with-momentum updater for the memory-bank class centers.

    Momentum (0.9) and weight decay (5e-4) are fixed; the learning rate is
    supplied per call (the scheduler is driven by the training module).
    """
    def __init__(self, lr_scheduler, rescale_grad):
        self.lr_scheduler = lr_scheduler
        self.rescale_grad = rescale_grad
        self.momentum = 0.9
        self.wd = 5e-4

    def update(self, weight, grad, state, learning_rate):
        """Apply one in-place SGD-momentum step to *weight*.

        Raises
        ------
        ValueError
            If *state* (the momentum buffer) is None.
        """
        lr = learning_rate
        # do the regular sgd update flow
        kwargs = {'rescale_grad': self.rescale_grad}
        if self.momentum > 0:
            kwargs['momentum'] = self.momentum
        if state is not None:
            nd.sgd_mom_update(weight,
                              grad,
                              state,
                              out=weight,
                              lr=lr,
                              wd=self.wd,
                              **kwargs)
        else:
            # FIX: the original raised a bare ValueError with no message.
            raise ValueError(
                "MemoryBankSGDOptimizer.update requires a momentum state; "
                "got state=None")
eth0 \ +-x OMP_NUM_THREADS=2 \ +bash config.sh diff --git a/insightface/recognition/partial_fc/mxnet/setup-utils/README.md b/insightface/recognition/partial_fc/mxnet/setup-utils/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2cd205664c22b9b62b65978d711fdbd2e34aebda --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/setup-utils/README.md @@ -0,0 +1,24 @@ +# Docker run + +``` +# master +docker run -it \ +--network=host \ +--gpus all \ +-v /mnt:/mnt \ +-v /anxiang:/anxiang \ +-v /data:/data \ +-v /anxiang/share/ssh/:/root/.ssh \ +partical_fc:0.1 /bin/bash + + +# other +docker run -it \ +--network=host \ +-v /mnt:/mnt \ +-v /anxiang:/anxiang \ +-v /data:/data \ +-v /anxiang/share/ssh/:/root/.ssh \ +partical_fc:0.1 \ +bash -c "/usr/sbin/sshd -p 12345; sleep infinity" +``` \ No newline at end of file diff --git a/insightface/recognition/partial_fc/mxnet/setup-utils/install-horovod.sh b/insightface/recognition/partial_fc/mxnet/setup-utils/install-horovod.sh new file mode 100644 index 0000000000000000000000000000000000000000..e72e3564ef6befe06114fc91aa1ced2f64f49d5c --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/setup-utils/install-horovod.sh @@ -0,0 +1,2 @@ +#! 
/bin/bash +HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL pip install --no-cache-dir horovod==0.19.2 diff --git a/insightface/recognition/partial_fc/mxnet/setup-utils/install-mpi.sh b/insightface/recognition/partial_fc/mxnet/setup-utils/install-mpi.sh new file mode 100644 index 0000000000000000000000000000000000000000..74a337142f594e8d4642e7351c70ab8a70fb89cc --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/setup-utils/install-mpi.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# install mpi +#wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz + +#sudo rm -rf /usr/local/lib/openmpi /usr/local/lib/libmca* /usr/local/lib/libmpi* +#sudo rm -rf /usr/local/lib/libompitrace* /usr/local/lib/libopen* /usr/local/lib/liboshmem* /usr/local/lib/mpi_* + + +tar zxf openmpi-4.0.0.tar.gz +cd openmpi-4.0.0 || return +sudo ./configure --enable-orterun-prefix-by-default +sudo make -j 48 all +sudo make install +ldconfig \ No newline at end of file diff --git a/insightface/recognition/partial_fc/mxnet/setup-utils/requirements.txt b/insightface/recognition/partial_fc/mxnet/setup-utils/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..973a1dd5cdf9e69861279e12da4902ccaaeeb6c8 --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/setup-utils/requirements.txt @@ -0,0 +1,7 @@ +easydict==1.9 +horovod>=0.24 +mxboard==0.1.0 +opencv-python==4.2.0.34 +tqdm==4.48.2 +scipy +sklearn diff --git a/insightface/recognition/partial_fc/mxnet/symbol/__init__.py b/insightface/recognition/partial_fc/mxnet/symbol/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/partial_fc/mxnet/symbol/memonger.py b/insightface/recognition/partial_fc/mxnet/symbol/memonger.py new file mode 100644 index 0000000000000000000000000000000000000000..8ad610b57b821ec6b8f0087ee2569ad6fda4d177 --- /dev/null +++ 
import math


def prod(shape):
    """Return the product of the entries of *shape* (1 for an empty shape)."""
    ret = 1
    for s in shape:
        ret *= s
    return ret


def is_param(name):
    """Heuristic: an output name denotes a parameter (not an activation).

    'data' is the network input, never a parameter; everything ending in a
    standard parameter suffix is one.
    """
    if name == 'data':
        return False
    # FIX(idiom): single endswith with a tuple replaces four chained checks.
    return name.endswith(('weight', 'bias', 'beta', 'gamma'))


def make_mirror_plan(sym, threshold, plan_info=None, **kwargs):
    """Memory allocation planner with a given threshold.

    The user can pass in a network configuration,
    a threshold that limits memory per block,
    and input shape configurations.

    Parameters
    ----------
    sym : symbol
        Input configuration of symbols.
        The user needs to pre-mark the attribute "mirror_stage" on the nodes
        that can be book-kept as a stage.

        The algorithm will decide whether to disable mirroring on the stage nodes.

    threshold: integer
        A tuning parameter to tune the approximate size (MB) of each stage block.

    plan_info: dict, optional
        Used to hold plan information ('max_size' and 'save_size', in bytes).

    **kwargs:
        The arguments to infer shape.

    Returns
    -------
    alloc_sym: symbol
        A symbol with force mirror tagged on the nodes for better allocation.
    """
    threshold = threshold << 20  # MB -> bytes
    sym = sym.__copy__()
    internals = sym.get_internals()
    _, out_shapes, _ = internals.infer_shape(**kwargs)
    shape_dict = list(zip(internals.list_outputs(), out_shapes))
    total_size = 0
    param_size = 0
    local_size = 0
    save_size = 0
    max_size = 0
    last_stage = ''
    stage_decision = ''
    # NOTE: unused locals of the original (last_sb, last_local, period) removed.

    for idx, item in enumerate(shape_dict):
        sb = internals[idx]
        name, shape = item
        if is_param(name):
            param_size += prod(shape) * 4
            continue
        else:
            total_size += prod(shape) * 4
            local_size += prod(shape) * 4
            sb._set_attr(force_mirroring='True')

        if sb.attr('mirror_stage') is not None:
            stage = sb.attr('mirror_stage')
            if stage == 'True' or stage != last_stage:
                # Close the current block once it exceeds the threshold.
                if local_size > threshold:
                    save_size += prod(shape) * 4
                    max_size = max(max_size, local_size)
                    local_size = 0
                    stage_decision = 'False'
                    sb._set_attr(force_mirroring=stage_decision)
                else:
                    stage_decision = 'True'
                last_stage = stage
            elif stage == last_stage and stage_decision == 'False':
                save_size += prod(shape) * 4
                sb._set_attr(force_mirroring=stage_decision)

    if plan_info is not None:
        plan_info['max_size'] = max_size
        plan_info['save_size'] = save_size
    return sym


def get_cost(sym, type_dict=None, **kwargs):
    """Get the memory cost (MB) of the current plan by binding on GPU.

    FIX(doc): the original docstring claimed "bind on CPU" but the code binds
    ``mx.gpu()``; behavior kept, docstring corrected.

    NOTE: parses ``debug_str`` output — fragile across MXNet versions.
    """
    import mxnet as mx  # local import keeps the pure helpers importable without mxnet
    texec = sym.simple_bind(ctx=mx.gpu(),
                            grad_req='write',
                            type_dict=type_dict,
                            **kwargs)
    return int(texec.debug_str().split('\n')[-3].split()[1])


def search_plan(sym, ntrial=6, type_dict=None, **kwargs):
    """Quick heuristic search over possible plans to find a good memory plan.

    Parameters
    ----------
    sym : symbolic
        Symbolic configurations.

    ntrial: integer
        Additional grid search steps.
    """
    history = []
    threshold = 0
    min_threshold = None
    min_cost = None
    nbegin = 3

    # Phase 1: a few adaptive probes to bracket the useful threshold range.
    for k in range(nbegin):
        info = {}
        sym = make_mirror_plan(sym,
                               threshold=threshold,
                               plan_info=info,
                               **kwargs)
        cost = get_cost(sym, type_dict, **kwargs)
        save_size = info['save_size'] >> 20
        local_size = info['max_size'] >> 20
        guess = int(math.sqrt(save_size * local_size / 2))
        if min_cost is None or min_cost > cost:
            min_cost = cost
        if min_threshold is None or local_size < min_threshold:
            min_threshold = local_size
        print("Search threshold=%d MB, cost=%d MB" % (threshold, cost))
        history.append((cost, threshold, sym))
        threshold = guess

    # Phase 2: grid search between the bracketing thresholds.
    max_threshold = threshold * math.sqrt(2)
    step = int((max_threshold - min_threshold) / ntrial)
    threshold = min_threshold + step
    if step > 0:
        for k in range(ntrial):
            sym = make_mirror_plan(sym,
                                   threshold=threshold,
                                   plan_info=info,
                                   **kwargs)
            cost = get_cost(sym, type_dict, **kwargs)
            print("Search threshold=%d MB, cost=%d MB" % (threshold, cost))
            history.append((cost, threshold, sym))
            threshold += step

    history.sort(key=lambda x: x[0])
    cost, threshold, sym = history[0]
    print('Find best plan with threshold=%d, cost=%d MB' % (threshold, cost))
    return sym
def Conv(**kwargs):
    """Thin wrapper around mx.sym.Convolution (kept as an extension point)."""
    return mx.sym.Convolution(**kwargs)


def Act(data, act_type, name):
    """Activation helper: PReLU via LeakyReLU, anything else via Activation."""
    if act_type == 'prelu':
        return mx.sym.LeakyReLU(data=data, act_type='prelu', name=name)
    return mx.symbol.Activation(data=data, act_type=act_type, name=name)


def residual_unit_v1(data, num_filter, stride, dim_match, name, bottle_neck,
                     **kwargs):
    """ResNet v1 (post-activation) residual unit.

    The downsampling stride sits on the FIRST convolution of the stack.

    Parameters
    ----------
    data : Symbol
        Input feature map.
    num_filter : int
        Number of output channels.
    stride : tuple
        Stride of the downsampling convolution.
    dim_match : bool
        True when input and output shapes already agree (identity shortcut);
        otherwise a 1x1 conv + BN projection is used.
    name : str
        Prefix for all operator names.
    bottle_neck : bool
        Use the 1x1-3x3-1x1 bottleneck form instead of two 3x3 convs.
    """
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')

    def _bn(x, suffix):
        return mx.sym.BatchNorm(data=x, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + suffix)

    def _se(x):
        # squeeze-and-excitation gate: global pool -> reduce -> expand -> sigmoid
        w = mx.sym.Pooling(data=x, global_pool=True, kernel=(7, 7),
                           pool_type='avg', name=name + '_se_pool1')
        w = Conv(data=w, num_filter=num_filter // 16, kernel=(1, 1),
                 stride=(1, 1), pad=(0, 0), name=name + "_se_conv1",
                 workspace=workspace)
        w = Act(data=w, act_type=act_type, name=name + '_se_relu1')
        w = Conv(data=w, num_filter=num_filter, kernel=(1, 1), stride=(1, 1),
                 pad=(0, 0), name=name + "_se_conv2", workspace=workspace)
        w = mx.symbol.Activation(data=w, act_type='sigmoid',
                                 name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(x, w)

    if bottle_neck:
        body = Conv(data=data, num_filter=int(num_filter * 0.25),
                    kernel=(1, 1), stride=stride, pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = _bn(body, '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(3, 3), stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = _bn(body, '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu2')
        body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                    stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv3')
        body = _bn(body, '_bn3')
    else:
        body = Conv(data=data, num_filter=num_filter, kernel=(3, 3),
                    stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = _bn(body, '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = _bn(body, '_bn2')
    if use_se:
        body = _se(body)

    if dim_match:
        shortcut = data
    else:
        shortcut = Conv(data=data, num_filter=num_filter, kernel=(1, 1),
                        stride=stride, no_bias=True, workspace=workspace,
                        name=name + '_conv1sc')
        shortcut = _bn(shortcut, '_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return Act(data=body + shortcut, act_type=act_type, name=name + '_relu3')
def residual_unit_v1_L(data, num_filter, stride, dim_match, name, bottle_neck,
                       **kwargs):
    """ResNet v1 (post-activation) residual unit, "L" variant.

    Identical to residual_unit_v1 except that the downsampling stride sits on
    the LAST convolution of the stack instead of the first.

    Parameters
    ----------
    data : Symbol
        Input feature map.
    num_filter : int
        Number of output channels.
    stride : tuple
        Stride of the downsampling convolution.
    dim_match : bool
        True when input and output shapes already agree (identity shortcut).
    name : str
        Prefix for all operator names.
    bottle_neck : bool
        Use the 1x1-3x3-1x1 bottleneck form instead of two 3x3 convs.
    """
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')

    def _bn(x, suffix):
        return mx.sym.BatchNorm(data=x, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + suffix)

    def _se(x):
        # squeeze-and-excitation gate: global pool -> reduce -> expand -> sigmoid
        w = mx.sym.Pooling(data=x, global_pool=True, kernel=(7, 7),
                           pool_type='avg', name=name + '_se_pool1')
        w = Conv(data=w, num_filter=num_filter // 16, kernel=(1, 1),
                 stride=(1, 1), pad=(0, 0), name=name + "_se_conv1",
                 workspace=workspace)
        w = Act(data=w, act_type=act_type, name=name + '_se_relu1')
        w = Conv(data=w, num_filter=num_filter, kernel=(1, 1), stride=(1, 1),
                 pad=(0, 0), name=name + "_se_conv2", workspace=workspace)
        w = mx.symbol.Activation(data=w, act_type='sigmoid',
                                 name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(x, w)

    if bottle_neck:
        body = Conv(data=data, num_filter=int(num_filter * 0.25),
                    kernel=(1, 1), stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = _bn(body, '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(3, 3), stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = _bn(body, '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu2')
        body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                    stride=stride, pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv3')
        body = _bn(body, '_bn3')
    else:
        body = Conv(data=data, num_filter=num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = _bn(body, '_bn1')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = _bn(body, '_bn2')
    if use_se:
        body = _se(body)

    if dim_match:
        shortcut = data
    else:
        shortcut = Conv(data=data, num_filter=num_filter, kernel=(1, 1),
                        stride=stride, no_bias=True, workspace=workspace,
                        name=name + '_conv1sc')
        shortcut = _bn(shortcut, '_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return Act(data=body + shortcut, act_type=act_type, name=name + '_relu3')
def residual_unit_v2(data, num_filter, stride, dim_match, name, bottle_neck,
                     **kwargs):
    """ResNet v2 (pre-activation) residual unit, fb.resnet.torch flavour.

    BN/activation precede each convolution; the shortcut projection is taken
    from the pre-activated input and there is no activation after the add.

    Parameters
    ----------
    data : Symbol
        Input feature map.
    num_filter : int
        Number of output channels.
    stride : tuple
        Stride of the downsampling convolution.
    dim_match : bool
        True when input and output shapes already agree (identity shortcut).
    name : str
        Prefix for all operator names.
    bottle_neck : bool
        Use the 1x1-3x3-1x1 bottleneck form instead of two 3x3 convs.
    """
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')

    def _bn(x, suffix):
        return mx.sym.BatchNorm(data=x, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + suffix)

    def _se(x):
        # squeeze-and-excitation gate applied to the conv-branch output
        w = mx.sym.Pooling(data=x, global_pool=True, kernel=(7, 7),
                           pool_type='avg', name=name + '_se_pool1')
        w = Conv(data=w, num_filter=num_filter // 16, kernel=(1, 1),
                 stride=(1, 1), pad=(0, 0), name=name + "_se_conv1",
                 workspace=workspace)
        w = Act(data=w, act_type=act_type, name=name + '_se_relu1')
        w = Conv(data=w, num_filter=num_filter, kernel=(1, 1), stride=(1, 1),
                 pad=(0, 0), name=name + "_se_conv2", workspace=workspace)
        w = mx.symbol.Activation(data=w, act_type='sigmoid',
                                 name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(x, w)

    if bottle_neck:
        act1 = Act(data=_bn(data, '_bn1'), act_type=act_type,
                   name=name + '_relu1')
        body = Conv(data=act1, num_filter=int(num_filter * 0.25),
                    kernel=(1, 1), stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = Act(data=_bn(body, '_bn2'), act_type=act_type,
                   name=name + '_relu2')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(3, 3), stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = Act(data=_bn(body, '_bn3'), act_type=act_type,
                   name=name + '_relu3')
        body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                    stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv3')
    else:
        act1 = Act(data=_bn(data, '_bn1'), act_type=act_type,
                   name=name + '_relu1')
        body = Conv(data=act1, num_filter=num_filter, kernel=(3, 3),
                    stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = Act(data=_bn(body, '_bn2'), act_type=act_type,
                   name=name + '_relu2')
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
    if use_se:
        body = _se(body)

    if dim_match:
        shortcut = data
    else:
        # projection from the pre-activated input; conv only, no BN here
        shortcut = Conv(data=act1, num_filter=num_filter, kernel=(1, 1),
                        stride=stride, no_bias=True, workspace=workspace,
                        name=name + '_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return body + shortcut
def residual_unit_v3(data, num_filter, stride, dim_match, name, bottle_neck,
                     **kwargs):
    """IR ("improved residual") unit: BN-Conv-BN-Act-...-Conv-BN.

    A BatchNorm opens and closes the conv branch; the shortcut is identity or
    a 1x1 conv + BN projection, and no activation follows the addition.

    Parameters
    ----------
    data : Symbol
        Input feature map.
    num_filter : int
        Number of output channels.
    stride : tuple
        Stride of the downsampling convolution (on the last conv).
    dim_match : bool
        True when input and output shapes already agree (identity shortcut).
    name : str
        Prefix for all operator names.
    bottle_neck : bool
        Use the 1x1-3x3-1x1 bottleneck form instead of two 3x3 convs.
    """
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')

    def _bn(x, suffix):
        return mx.sym.BatchNorm(data=x, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + suffix)

    def _se(x):
        # squeeze-and-excitation gate: global pool -> reduce -> expand -> sigmoid
        w = mx.sym.Pooling(data=x, global_pool=True, kernel=(7, 7),
                           pool_type='avg', name=name + '_se_pool1')
        w = Conv(data=w, num_filter=num_filter // 16, kernel=(1, 1),
                 stride=(1, 1), pad=(0, 0), name=name + "_se_conv1",
                 workspace=workspace)
        w = Act(data=w, act_type=act_type, name=name + '_se_relu1')
        w = Conv(data=w, num_filter=num_filter, kernel=(1, 1), stride=(1, 1),
                 pad=(0, 0), name=name + "_se_conv2", workspace=workspace)
        w = mx.symbol.Activation(data=w, act_type='sigmoid',
                                 name=name + "_se_sigmoid")
        return mx.symbol.broadcast_mul(x, w)

    if bottle_neck:
        body = _bn(data, '_bn1')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(1, 1), stride=(1, 1), pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = _bn(body, '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=int(num_filter * 0.25),
                    kernel=(3, 3), stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = _bn(body, '_bn3')
        body = Act(data=body, act_type=act_type, name=name + '_relu2')
        body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                    stride=stride, pad=(0, 0), no_bias=True,
                    workspace=workspace, name=name + '_conv3')
        body = _bn(body, '_bn4')
    else:
        body = _bn(data, '_bn1')
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv1')
        body = _bn(body, '_bn2')
        body = Act(data=body, act_type=act_type, name=name + '_relu1')
        body = Conv(data=body, num_filter=num_filter, kernel=(3, 3),
                    stride=stride, pad=(1, 1), no_bias=True,
                    workspace=workspace, name=name + '_conv2')
        body = _bn(body, '_bn3')
    if use_se:
        body = _se(body)

    if dim_match:
        shortcut = data
    else:
        shortcut = Conv(data=data, num_filter=num_filter, kernel=(1, 1),
                        stride=stride, no_bias=True, workspace=workspace,
                        name=name + '_conv1sc')
        shortcut = _bn(shortcut, '_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return body + shortcut
def residual_unit_v3_x(data, num_filter, stride, dim_match, name, bottle_neck,
                       **kwargs):
    """ResNeXt unit (32 groups, bottleneck only) in the v3 (IR) layout.

    Parameters
    ----------
    data : Symbol
        Input feature map.
    num_filter : int
        Number of output channels.
    stride : tuple
        Stride of the downsampling convolution (on the last conv).
    dim_match : bool
        True when input and output shapes already agree (identity shortcut).
    name : str
        Prefix for all operator names.
    bottle_neck : bool
        Must be True; only the bottleneck form is implemented.
    """
    assert (bottle_neck)
    use_se = kwargs.get('version_se', 1)
    bn_mom = kwargs.get('bn_mom', 0.9)
    workspace = kwargs.get('workspace', 256)
    memonger = kwargs.get('memonger', False)
    act_type = kwargs.get('version_act', 'prelu')
    num_group = 32  # ResNeXt cardinality

    def _bn(x, suffix):
        return mx.sym.BatchNorm(data=x, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name=name + suffix)

    body = _bn(data, '_bn1')
    body = Conv(data=body, num_group=num_group,
                num_filter=int(num_filter * 0.5), kernel=(1, 1),
                stride=(1, 1), pad=(0, 0), no_bias=True, workspace=workspace,
                name=name + '_conv1')
    body = _bn(body, '_bn2')
    body = Act(data=body, act_type=act_type, name=name + '_relu1')
    body = Conv(data=body, num_group=num_group,
                num_filter=int(num_filter * 0.5), kernel=(3, 3),
                stride=(1, 1), pad=(1, 1), no_bias=True, workspace=workspace,
                name=name + '_conv2')
    body = _bn(body, '_bn3')
    body = Act(data=body, act_type=act_type, name=name + '_relu2')
    body = Conv(data=body, num_filter=num_filter, kernel=(1, 1),
                stride=stride, pad=(0, 0), no_bias=True, workspace=workspace,
                name=name + '_conv3')
    body = _bn(body, '_bn4')

    if use_se:
        # squeeze-and-excitation gate: global pool -> reduce -> expand -> sigmoid
        w = mx.sym.Pooling(data=body, global_pool=True, kernel=(7, 7),
                           pool_type='avg', name=name + '_se_pool1')
        w = Conv(data=w, num_filter=num_filter // 16, kernel=(1, 1),
                 stride=(1, 1), pad=(0, 0), name=name + "_se_conv1",
                 workspace=workspace)
        w = Act(data=w, act_type=act_type, name=name + '_se_relu1')
        w = Conv(data=w, num_filter=num_filter, kernel=(1, 1), stride=(1, 1),
                 pad=(0, 0), name=name + "_se_conv2", workspace=workspace)
        w = mx.symbol.Activation(data=w, act_type='sigmoid',
                                 name=name + "_se_sigmoid")
        body = mx.symbol.broadcast_mul(body, w)

    if dim_match:
        shortcut = data
    else:
        shortcut = Conv(data=data, num_filter=num_filter, kernel=(1, 1),
                        stride=stride, no_bias=True, workspace=workspace,
                        name=name + '_conv1sc')
        shortcut = _bn(shortcut, '_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return body + shortcut


def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck,
                  **kwargs):
    """Dispatch to the residual-unit variant selected by kwargs['version_unit'].

    version_unit 1 -> v1 (v1_L unless version_input == 0), 2 -> v2,
    anything else -> v3.
    """
    uv = kwargs.get('version_unit', 3)
    version_input = kwargs.get('version_input', 1)
    if uv == 1:
        if version_input == 0:
            return residual_unit_v1(data, num_filter, stride, dim_match, name,
                                    bottle_neck, **kwargs)
        return residual_unit_v1_L(data, num_filter, stride, dim_match, name,
                                  bottle_neck, **kwargs)
    elif uv == 2:
        return residual_unit_v2(data, num_filter, stride, dim_match, name,
                                bottle_neck, **kwargs)
    elif uv == 4:
        # BUG FIX: this branch used to call residual_unit_v4, which is not
        # defined anywhere in this module, so it always raised NameError at
        # call time.  Fail with an explicit, actionable error instead.
        raise ValueError(
            'version_unit=4 (residual_unit_v4) is not available in this '
            'symbol module')
    else:
        return residual_unit_v3(data, num_filter, stride, dim_match, name,
                                bottle_neck, **kwargs)
def resnet(units, num_stages, filter_list, num_classes, bottle_neck):
    """Return the ResNet backbone symbol.

    FIX: the docstring used to sit as a dead string expression after the
    kwargs dict; it is now a real docstring.

    Parameters
    ----------
    units : list of int
        Number of residual units in each stage.
    num_stages : int
        Number of stages; must equal len(units).
    filter_list : list of int
        Channel width of the stem plus each stage (len == num_stages + 1).
    num_classes : int
        Embedding size fed to the final FC (via symbol_utils.get_fc1).
    bottle_neck : bool
        Use bottleneck residual units (adds a final 512-ch 1x1 conv head).
    """
    bn_mom = config.bn_mom
    workspace = config.workspace
    kwargs = {
        'version_se': config.net_se,
        'version_input': config.net_input,
        'version_output': config.net_output,
        'version_unit': config.net_unit,
        'version_act': config.net_act,
        'bn_mom': bn_mom,
        'workspace': workspace,
        'memonger': config.memonger,
    }
    version_se = kwargs.get('version_se', 1)
    version_input = kwargs.get('version_input', 1)
    assert version_input >= 0
    version_output = kwargs.get('version_output', 'E')
    fc_type = version_output
    version_unit = kwargs.get('version_unit', 3)
    act_type = kwargs.get('version_act', 'prelu')
    memonger = kwargs.get('memonger', False)
    print(version_se, version_input, version_output, version_unit, act_type,
          memonger)
    num_unit = len(units)
    assert (num_unit == num_stages)
    data = mx.sym.Variable(name='data')

    if config.fp16:
        data = mx.sym.Cast(data=data, dtype=np.float16)

    if version_input == 0:
        # pixel normalization to roughly [-1, 1], then a 7x7/2 stem
        data = mx.sym.identity(data=data, name='id')
        data = data - 127.5
        data = data * 0.0078125
        body = Conv(data=data, num_filter=filter_list[0], kernel=(7, 7),
                    stride=(2, 2), pad=(3, 3), no_bias=True, name="conv0",
                    workspace=workspace)
    elif version_input == 2:
        # normalize via a fixed-gamma BatchNorm instead of arithmetic
        data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5,
                                momentum=bn_mom, name='bn_data')
        body = Conv(data=data, num_filter=filter_list[0], kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True, name="conv0",
                    workspace=workspace)
    else:
        # default: arithmetic normalization, 3x3 stride-1 stem
        data = mx.sym.identity(data=data, name='id')
        data = data - 127.5
        data = data * 0.0078125
        body = Conv(data=data, num_filter=filter_list[0], kernel=(3, 3),
                    stride=(1, 1), pad=(1, 1), no_bias=True, name="conv0",
                    workspace=workspace)
    body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                            momentum=bn_mom, name='bn0')
    body = Act(data=body, act_type=act_type, name='relu0')

    for i in range(num_stages):
        # first unit of each stage downsamples (stride 2) and changes width
        body = residual_unit(body, filter_list[i + 1], (2, 2), False,
                             name='stage%d_unit%d' % (i + 1, 1),
                             bottle_neck=bottle_neck, **kwargs)
        for j in range(units[i] - 1):
            body = residual_unit(body, filter_list[i + 1], (1, 1), True,
                                 name='stage%d_unit%d' % (i + 1, j + 2),
                                 bottle_neck=bottle_neck, **kwargs)
    if config.fp16:
        body = mx.sym.Cast(data=body, dtype=np.float32)

    if bottle_neck:
        # compress the wide bottleneck output to 512 channels before the head
        body = Conv(data=body, num_filter=512, kernel=(1, 1), stride=(1, 1),
                    pad=(0, 0), no_bias=True, name="convd",
                    workspace=workspace)
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name='bnd')
        body = Act(data=body, act_type=act_type, name='relud')

    fc1 = symbol_utils.get_fc1(body, num_classes, fc_type)
    return fc1


def get_symbol():
    """Build the full recognition symbol from the global `config`.

    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
    Original author Wei Wu

    Raises
    ------
    ValueError
        If config.num_layers has no known stage configuration.
    """
    num_classes = config.embedding_size
    num_layers = config.num_layers
    if num_layers >= 500:
        filter_list = [64, 256, 512, 1024, 2048]
        bottle_neck = True
    else:
        filter_list = [64, 64, 128, 256, 512]
        bottle_neck = False
    num_stages = 4
    # units per stage for every depth with published experiments
    units_by_depth = {
        18: [2, 2, 2, 2],
        34: [3, 4, 6, 3],
        49: [3, 4, 14, 3],
        50: [3, 4, 14, 3],
        74: [3, 6, 24, 3],
        90: [3, 8, 30, 3],
        98: [3, 4, 38, 3],
        99: [3, 8, 35, 3],
        100: [3, 13, 30, 3],
        101: [3, 4, 23, 3],
        124: [3, 13, 40, 5],
        134: [3, 10, 50, 3],
        136: [3, 13, 48, 3],
        140: [3, 15, 48, 3],
        152: [3, 8, 36, 3],
        160: [3, 24, 49, 3],
        200: [3, 24, 36, 3],
        269: [3, 30, 48, 8],
    }
    try:
        units = units_by_depth[num_layers]
    except KeyError:
        raise ValueError(
            "no experiments done on num_layers {}, you can do it yourself".
            format(num_layers))

    net = resnet(units=units,
                 num_stages=num_stages,
                 filter_list=filter_list,
                 num_classes=num_classes,
                 bottle_neck=bottle_neck)

    return net
import sys
import os
import mxnet as mx

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from default import config


def Conv(**kwargs):
    """Thin wrapper around mx.sym.Convolution (kept as an extension point)."""
    return mx.sym.Convolution(**kwargs)


def Act(data, act_type, name):
    """Activation helper: PReLU via LeakyReLU, anything else via Activation."""
    if act_type == 'prelu':
        return mx.sym.LeakyReLU(data=data, act_type='prelu', name=name)
    return mx.sym.Activation(data=data, act_type=act_type, name=name)


# module-wide BatchNorm momentum, taken from the project config
bn_mom = config.bn_mom


def Linear(data,
           num_filter=1,
           kernel=(1, 1),
           stride=(1, 1),
           pad=(0, 0),
           num_group=1,
           name=None,
           suffix=''):
    """Bias-free convolution followed by BatchNorm (a 'linear' block)."""
    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel,
                              num_group=num_group, stride=stride, pad=pad,
                              no_bias=True,
                              name='%s%s_conv2d' % (name, suffix))
    return mx.sym.BatchNorm(data=conv, name='%s%s_batchnorm' % (name, suffix),
                            fix_gamma=False, momentum=bn_mom)


def get_fc1(last_conv, num_classes, fc_type, input_channel=512):
    """Append the embedding head selected by `fc_type` to `last_conv`.

    FIX: an unrecognized `fc_type` used to fall through every branch and die
    with a confusing UnboundLocalError on `return fc1`; it now raises
    ValueError immediately.

    Parameters
    ----------
    last_conv : Symbol
        Output of the backbone.
    num_classes : int
        Embedding dimension produced by the head.
    fc_type : str
        One of 'Z', 'E', 'FC', 'SFC', 'GAP', 'GNAP', 'GDC', 'F', 'G', 'H',
        'I', 'J'.
    input_channel : int
        Channel count of `last_conv` (used by 'SFC' and 'GDC').

    Raises
    ------
    ValueError
        If `fc_type` is not one of the supported head names.
    """
    def _bn1(x):
        # shared "bn1" normalization used by most head variants
        return mx.sym.BatchNorm(data=x, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name='bn1')

    def _fc_bn(x):
        # FC to the embedding size followed by a fixed-gamma BN named 'fc1'
        fc = mx.sym.FullyConnected(data=x, num_hidden=num_classes,
                                   name='pre_fc1')
        return mx.sym.BatchNorm(data=fc, fix_gamma=True, eps=2e-5,
                                momentum=bn_mom, name='fc1')

    body = last_conv
    if fc_type == 'Z':
        body = _bn1(body)
        body = mx.symbol.Dropout(data=body, p=0.4)
        fc1 = body
    elif fc_type == 'E':
        body = _bn1(body)
        body = mx.symbol.Dropout(data=body, p=0.4)
        fc1 = _fc_bn(body)
    elif fc_type == 'FC':
        fc1 = _fc_bn(_bn1(body))
    elif fc_type == 'SFC':
        body = _bn1(body)
        body = Conv(data=body, num_filter=input_channel, kernel=(3, 3),
                    stride=(2, 2), pad=(1, 1), no_bias=True, name="convf",
                    num_group=input_channel)
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name='bnf')
        body = Act(data=body, act_type=config.net_act, name='reluf')
        body = Conv(data=body, num_filter=input_channel, kernel=(1, 1),
                    pad=(0, 0), stride=(1, 1), name="convf2")
        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                momentum=bn_mom, name='bnf2')
        body = Act(data=body, act_type=config.net_act, name='reluf2')
        fc1 = _fc_bn(body)
    elif fc_type == 'GAP':
        bn1 = _bn1(body)
        relu1 = Act(data=bn1, act_type=config.net_act, name='relu1')
        # kernel is ignored when global_pool=True but must still be supplied
        pool1 = mx.sym.Pooling(data=relu1, global_pool=True, kernel=(7, 7),
                               pool_type='avg', name='pool1')
        fc1 = _fc_bn(mx.sym.Flatten(data=pool1))
    elif fc_type == 'GNAP':  # mobilefacenet++
        filters_in = 512  # channel count assumed by mobilefacenet
        if num_classes > filters_in:
            body = mx.sym.Convolution(data=last_conv, num_filter=num_classes,
                                      kernel=(1, 1), stride=(1, 1),
                                      pad=(0, 0), no_bias=True, name='convx')
            body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5,
                                    momentum=0.9, name='convx_bn')
            body = Act(data=body, act_type=config.net_act, name='convx_relu')
            filters_in = num_classes
        else:
            body = last_conv
        body = mx.sym.BatchNorm(data=body, fix_gamma=True, eps=2e-5,
                                momentum=0.9, name='bn6f')

        # norm-aware rescaling: each spatial position is scaled by
        # mean(|x|) / |x| before global average pooling
        spatial_norm = mx.sym.sum(data=body * body, axis=1, keepdims=True)
        spatial_sqrt = mx.sym.sqrt(spatial_norm)
        spatial_mean = mx.sym.mean(spatial_sqrt)
        spatial_div_inverse = mx.sym.broadcast_div(spatial_mean, spatial_sqrt)
        spatial_attention_inverse = mx.symbol.tile(spatial_div_inverse,
                                                   reps=(1, filters_in, 1, 1))
        body = body * spatial_attention_inverse

        fc1 = mx.sym.Pooling(body, kernel=(7, 7), global_pool=True,
                             pool_type='avg')
        if num_classes < filters_in:
            fc1 = mx.sym.BatchNorm(data=fc1, fix_gamma=True, eps=2e-5,
                                   momentum=0.9, name='bn6w')
            fc1 = mx.sym.FullyConnected(data=fc1, num_hidden=num_classes,
                                        name='pre_fc1')
        else:
            fc1 = mx.sym.Flatten(data=fc1)
        fc1 = mx.sym.BatchNorm(data=fc1, fix_gamma=True, eps=2e-5,
                               momentum=0.9, name='fc1')
    elif fc_type == "GDC":  # mobilefacenet_v1: depthwise 7x7 "global" conv
        conv_6_dw = Linear(last_conv, num_filter=input_channel,
                           num_group=input_channel, kernel=(7, 7), pad=(0, 0),
                           stride=(1, 1), name="conv_6dw7_7")
        fc1 = _fc_bn(conv_6_dw)
    elif fc_type == 'F':
        body = _bn1(body)
        body = mx.symbol.Dropout(data=body, p=0.4)
        fc1 = mx.sym.FullyConnected(data=body, num_hidden=num_classes,
                                    name='fc1')
    elif fc_type == 'G':
        fc1 = mx.sym.FullyConnected(data=_bn1(body), num_hidden=num_classes,
                                    name='fc1')
    elif fc_type == 'H':
        fc1 = mx.sym.FullyConnected(data=body, num_hidden=num_classes,
                                    name='fc1')
    elif fc_type == 'I':
        fc1 = _fc_bn(_bn1(body))
    elif fc_type == 'J':
        fc1 = _fc_bn(body)
    else:
        raise ValueError('unknown fc_type %s' % fc_type)
    return fc1
eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + conv1 = Conv(data=bn1, + num_filter=num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn2 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act1 = Act(data=bn2, act_type=config.net_act, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=num_filter, + kernel=(3, 3), + stride=stride, + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn3 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return bn3 + shortcut + + +def residual_unit_v1l(data, num_filter, stride, dim_match, name, bottle_neck): + """Return ResNet Unit symbol for building ResNet + Parameters + ---------- + data : str + Input data + num_filter : int + Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the same, otherwise means differ + name : str + Base name of the operators + workspace : int + Workspace used in convolution operator + """ + workspace = config.workspace + bn_mom = config.bn_mom + memonger = False + use_se = config.net_se + act_type = config.net_act + # print('in unit1') + if bottle_neck: + conv1 = Conv(data=data, + num_filter=int(num_filter * 0.25), + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn1 = mx.sym.BatchNorm(data=conv1, 
+ fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn2 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act2 = Act(data=bn2, act_type=act_type, name=name + '_relu2') + conv3 = Conv(data=act2, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + bn3 = mx.sym.BatchNorm(data=conv3, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + + if use_se: + # se begin + body = mx.sym.Pooling(data=bn3, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn3 = mx.symbol.broadcast_mul(bn3, body) + # se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return Act(data=bn3 + shortcut, + act_type=act_type, + name=name + '_relu3') + else: + conv1 = Conv(data=data, + num_filter=num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + 
workspace=workspace, + name=name + '_conv1') + bn1 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=num_filter, + kernel=(3, 3), + stride=stride, + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn2 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn2') + if use_se: + # se begin + body = mx.sym.Pooling(data=bn2, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn2 = mx.symbol.broadcast_mul(bn2, body) + # se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return Act(data=bn2 + shortcut, + act_type=act_type, + name=name + '_relu3') + + +def get_head(data, version_input, num_filter): + bn_mom = config.bn_mom + workspace = config.workspace + kwargs = {'bn_mom': bn_mom, 'workspace': workspace} + data = data - 127.5 + data = data * 0.0078125 + # data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') + if version_input == 0: + body = Conv(data=data, + num_filter=num_filter, + kernel=(7, 7), + 
stride=(2, 2), + pad=(3, 3), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=config.net_act, name='relu0') + body = mx.sym.Pooling(data=body, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + pool_type='max') + else: + body = data + _num_filter = min(num_filter, 64) + body = Conv(data=body, + num_filter=_num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=config.net_act, name='relu0') + # body = residual_unit_v3(body, _num_filter, (2, 2), False, name='head', **kwargs) + body = residual_unit_v1l(body, + _num_filter, (2, 2), + False, + name='head', + bottle_neck=False) + return body diff --git a/insightface/recognition/partial_fc/mxnet/train_memory.py b/insightface/recognition/partial_fc/mxnet/train_memory.py new file mode 100644 index 0000000000000000000000000000000000000000..7a4eb7f59be32a991ecf5c87f87ecd539dd554ad --- /dev/null +++ b/insightface/recognition/partial_fc/mxnet/train_memory.py @@ -0,0 +1,173 @@ +import argparse +import logging +import os +import sys +import time + +import horovod.mxnet as hvd +import mxnet as mx + +import default +from callbacks import CallBackModelSave, CallBackLogging, CallBackCenterSave, CallBackVertification +from default import config +from image_iter import FaceImageIter, DummyIter +from memory_module import SampleDistributeModule +from memory_bank import MemoryBank +from memory_scheduler import get_scheduler +from memory_softmax import MarginLoss +from optimizer import MemoryBankSGDOptimizer +from symbol import resnet + +sys.path.append(os.path.join(os.path.dirname(__file__), 'symbol')) +os.environ['MXNET_BACKWARD_DO_MIRROR'] = '0' +os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0" 
+os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1" +os.environ['MXNET_USE_TENSORRT'] = "0" +os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2" +os.environ['MXNET_GPU_COPY_NTHREADS'] = "1" +os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54" +os.environ['HOROVOD_CYCLE_TIME'] = "0.1" +os.environ['HOROVOD_FUSION_THRESHOLD'] = "67108864" +os.environ['HOROVOD_NUM_NCCL_STREAMS'] = "2" +os.environ['MXNET_HOROVOD_NUM_GROUPS'] = "16" +os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD'] = "999" +os.environ['MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD'] = "25" + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train parall face network') + # general + parser.add_argument('--dataset', default='emore', help='dataset config') + parser.add_argument('--network', default='r100', help='network config') + parser.add_argument('--loss', default='cosface', help='loss config') + + args, rest = parser.parse_known_args() + default.generate_config(args.loss, args.dataset, args.network) + parser.add_argument('--models-root', + default="./test", + help='root directory to save model.') + args = parser.parse_args() + return args + + +def set_logger(logger, rank, models_root): + formatter = logging.Formatter("rank-id:" + str(rank) + + ":%(asctime)s-%(message)s") + file_handler = logging.FileHandler( + os.path.join(models_root, "%d_hist.log" % rank)) + stream_handler = logging.StreamHandler(sys.stdout) + file_handler.setFormatter(formatter) + stream_handler.setFormatter(formatter) + logger.addHandler(file_handler) + logger.addHandler(stream_handler) + logger.info('rank_id: %d' % rank) + + +def get_symbol_embedding(): + embedding = eval(config.net_name).get_symbol() + all_label = mx.symbol.Variable('softmax_label') + all_label = mx.symbol.BlockGrad(all_label) + out_list = [embedding, all_label] + out = mx.symbol.Group(out_list) + return out, embedding + + +def train_net(): + args = parse_args() + hvd.init() + + # Size is the number of total GPU, rank is the unique process(GPU) ID from 0 
to size, + # local_rank is the unique process(GPU) ID within this server + rank = hvd.rank() + local_rank = hvd.local_rank() + size = hvd.size() + + prefix = os.path.join(args.models_root, 'model') + prefix_dir = os.path.dirname(prefix) + if not os.path.exists(prefix_dir) and not local_rank: + os.makedirs(prefix_dir) + else: + time.sleep(2) + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + set_logger(logger, rank, prefix_dir) + + data_shape = (3, config.image_size, config.image_size) + + # We equally store the class centers (softmax linear transformation matrix) on all GPUs in order. + num_local = (config.num_classes + size - 1) // size + num_sample = int(num_local * config.sample_ratio) + memory_bank = MemoryBank( + num_sample=num_sample, + num_local=num_local, + rank=rank, + local_rank=local_rank, + embedding_size=config.embedding_size, + prefix=prefix_dir, + gpu=True) + + if config.debug: + train_iter = DummyIter(config.batch_size, data_shape, 1000 * 10000) + else: + train_iter = FaceImageIter( + batch_size=config.batch_size, + data_shape=data_shape, + path_imgrec=config.rec, + shuffle=True, + rand_mirror=True, + context=rank, + context_num=size) + train_data_iter = mx.io.PrefetchingIter(train_iter) + + esym, save_symbol = get_symbol_embedding() + margins = (config.loss_m1, config.loss_m2, config.loss_m3) + fc7_model = MarginLoss(margins, config.loss_s, config.embedding_size) + + # optimizer + # backbone lr_scheduler & optimizer + backbone_lr_scheduler, memory_bank_lr_scheduler = get_scheduler() + + backbone_kwargs = { + 'learning_rate': config.backbone_lr, + 'momentum': 0.9, + 'wd': 5e-4, + 'rescale_grad': 1.0 / (config.batch_size * size) * size, + 'multi_precision': config.fp16, + 'lr_scheduler': backbone_lr_scheduler, + } + + # memory_bank lr_scheduler & optimizer + memory_bank_optimizer = MemoryBankSGDOptimizer( + lr_scheduler=memory_bank_lr_scheduler, + rescale_grad=1.0 / config.batch_size / size, + ) + # + train_module = 
SampleDistributeModule( + symbol=esym, + fc7_model=fc7_model, + memory_bank=memory_bank, + memory_optimizer=memory_bank_optimizer) + # + if not config.debug and local_rank == 0: + cb_vert = CallBackVertification(esym, train_module) + cb_speed = CallBackLogging(rank, size, prefix_dir) + cb_save = CallBackModelSave(save_symbol, train_module, prefix, rank) + cb_center_save = CallBackCenterSave(memory_bank) + + def call_back_fn(params): + cb_speed(params) + if not config.debug and local_rank == 0: + cb_vert(params) + cb_center_save(params) + cb_save(params) + + train_module.fit( + train_data_iter, + optimizer_params=backbone_kwargs, + initializer=mx.init.Normal(0.1), + batch_end_callback=call_back_fn) + + +if __name__ == '__main__': + train_net() diff --git a/insightface/recognition/partial_fc/pytorch/README.md b/insightface/recognition/partial_fc/pytorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1e1b5f51b90c5dcab4952d0d13d7bf5674aaa71f --- /dev/null +++ b/insightface/recognition/partial_fc/pytorch/README.md @@ -0,0 +1,3 @@ +# partial fc + +PartialFC-Pytorch has been merged into [arcface_torch](https://github.com/deepinsight/insightface/tree/master/recognition/arcface_torch). 
diff --git a/insightface/recognition/partial_fc/unpack_glint360k.py b/insightface/recognition/partial_fc/unpack_glint360k.py new file mode 100644 index 0000000000000000000000000000000000000000..1ec8a9daec358c12b8cec1adc9d7ef2883bd7bff --- /dev/null +++ b/insightface/recognition/partial_fc/unpack_glint360k.py @@ -0,0 +1,58 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os + +import cv2 +import mxnet as mx + + +def main(args): + include_datasets = args.include.split(',') + rec_list = [] + for ds in include_datasets: + path_imgrec = os.path.join(ds, 'train.rec') + path_imgidx = os.path.join(ds, 'train.idx') + imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') # pylint: disable=redefined-variable-type + rec_list.append(imgrec) + if not os.path.exists(args.output): + os.makedirs(args.output) + + # + imgid = 0 + for ds_id in range(len(rec_list)): + imgrec = rec_list[ds_id] + s = imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + assert header.flag > 0 + seq_identity = range(int(header.label[0]), int(header.label[1])) + + for identity in seq_identity: + s = imgrec.read_idx(identity) + header, _ = mx.recordio.unpack(s) + for _idx in range(int(header.label[0]), int(header.label[1])): + s = imgrec.read_idx(_idx) + _header, _img = mx.recordio.unpack(s) + label = int(_header.label[0]) + class_path = os.path.join(args.output, "id_%d" % label) + if not os.path.exists(class_path): + os.makedirs(class_path) + + image_path = os.path.join(class_path, "%d_%d.jpg" % (label, imgid)) + with open(image_path, "wb") as ff: + ff.write(_img) + + imgid += 1 + if imgid % 10000 == 0: + print(imgid) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='do dataset merge') + # general + parser.add_argument('--include', default='', type=str, help='') + parser.add_argument('--output', default='', type=str, help='') + args = parser.parse_args() + main(args) 
diff --git a/insightface/recognition/subcenter_arcface/README.md b/insightface/recognition/subcenter_arcface/README.md new file mode 100644 index 0000000000000000000000000000000000000000..15b9e6b4029e896ae223da0a84279ed4d01a97f0 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/README.md @@ -0,0 +1,48 @@ + +## Subcenter ArcFace + +### 1. Motivation + +We introduce one extra hyperparameter (subcenter number `loss_K`) to ArcFace to relax the intra-class compactness constraint. In our experiments, we find ``loss_K=3`` can achieve a good balance between accuracy and robustness. + +![difference](https://insightface.ai/assets/img/github/subcenterarcfacediff.png) + +### 2. Implementation + +The training process of Subcenter ArcFace is almost same as [ArcFace](https://github.com/deepinsight/insightface/tree/master/recognition/ArcFace) +The increased GPU memory consumption can be easily alleviated by our parallel framework. + +![framework](https://insightface.ai/assets/img/github/subcenterarcfaceframework.png) + +### 3. Training Dataset + +1. MS1MV0 (The noise rate is around 50%), download link ([baidu drive](https://pan.baidu.com/s/1bSamN5CLiSrxOuGi-Lx7tw), code ``8ql0``) ([dropbox](https://www.dropbox.com/sh/y2mj25uj440f7bl/AABc7pCJvUvxEcmXs8WYi9Zaa?dl=0)) + +### 4. Training Steps + +1). Train Sub-center ArcFace (``loss_K=3``) on MS1MV0. + +2). Drop non-dominant subcenters and high-confident noisy data (``>75 degrees``). + + `` + python drop.py --data --model --threshold 75 --k 3 --output + `` + +3). Train ArcFace on the new ``MS1MV0-Drop75`` dataset. + +### 5. Pretrained Models and Logs + [baidu drive](https://pan.baidu.com/s/1yikOW1Xzm1XIHu0uv0RdRw) code ``3jsh``. 
[gdrive](https://drive.google.com/file/d/1h8Ybz6mJ7n2IfLbDv2HUU37OdVHn7YPg/view?usp=sharing) + +### Citation + +If you find *Sub-center ArcFace* useful in your research, please consider to cite the following related papers: + +``` +@inproceedings{deng2020subcenter, + title={Sub-center ArcFace: Boosting Face Recognition by Large-scale Noisy Web Faces}, + author={Deng, Jiankang and Guo, Jia and Liu, Tongliang and Gong, Mingming and Zafeiriou, Stefanos}, + booktitle={Proceedings of the IEEE Conference on European Conference on Computer Vision}, + year={2020} +} +``` + diff --git a/insightface/recognition/subcenter_arcface/common/build_eval_pack.py b/insightface/recognition/subcenter_arcface/common/build_eval_pack.py new file mode 100644 index 0000000000000000000000000000000000000000..23208ceebfd1204c5596d53966952ec8d4c88cb3 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/common/build_eval_pack.py @@ -0,0 +1,136 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +#import mxnet as mx +#from mxnet import ndarray as nd +import argparse +import cv2 +import pickle +import numpy as np +import sys +import os +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'common')) +sys.path.append( + os.path.join(os.path.dirname(__file__), '..', '..', 'RetinaFace')) +import face_align +from retinaface import RetinaFace + + +def to_rgb(img): + w, h = img.shape + ret = np.empty((w, h, 3), dtype=np.uint8) + ret[:, :, 0] = ret[:, :, 1] = ret[:, :, 2] = img + return ret + + +def IOU(Reframe, GTframe): + x1 = Reframe[0] + y1 = Reframe[1] + width1 = Reframe[2] - Reframe[0] + height1 = Reframe[3] - Reframe[1] + + x2 = GTframe[0] + y2 = GTframe[1] + width2 = GTframe[2] - GTframe[0] + height2 = GTframe[3] - GTframe[1] + + endx = max(x1 + width1, x2 + width2) + startx = min(x1, x2) + width = width1 + width2 - (endx - startx) + + endy = max(y1 + height1, y2 + height2) + starty = min(y1, y2) + height = 
height1 + height2 - (endy - starty) + + if width <= 0 or height <= 0: + ratio = 0 + else: + Area = width * height + Area1 = width1 * height1 + Area2 = width2 * height2 + ratio = Area * 1. / (Area1 + Area2 - Area) + return ratio + + +parser = argparse.ArgumentParser(description='Package eval images') +# general +parser.add_argument('--data-dir', default='', help='') +parser.add_argument('--image-size', type=int, default=112, help='') +parser.add_argument('--gpu', type=int, default=0, help='') +parser.add_argument('--det-prefix', type=str, default='./model/R50', help='') +parser.add_argument('--output', default='./', help='path to save.') +parser.add_argument('--align-mode', default='arcface', help='align mode.') +args = parser.parse_args() + +gpu_id = args.gpu + +detector = RetinaFace(args.det_prefix, 0, gpu_id, network='net3') +target_size = 400 +max_size = 800 + + +def get_norm_crop(image_path): + im = cv2.imread(image_path) + im_shape = im.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + im_scale = float(target_size) / float(im_size_min) + # prevent bigger axis from being more than max_size: + if np.round(im_scale * im_size_max) > max_size: + im_scale = float(max_size) / float(im_size_max) + bbox, landmark = detector.detect(im, threshold=0.5, scales=[im_scale]) + #print(im.shape, bbox.shape, landmark.shape) + if bbox.shape[0] == 0: + bbox, landmark = detector.detect( + im, + threshold=0.05, + scales=[im_scale * 0.75, im_scale, im_scale * 2.0]) + print('refine', im.shape, bbox.shape, landmark.shape) + nrof_faces = bbox.shape[0] + if nrof_faces > 0: + det = bbox[:, 0:4] + img_size = np.asarray(im.shape)[0:2] + bindex = 0 + if nrof_faces > 1: + bounding_box_size = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img_size / 2 + offsets = np.vstack([(det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0]]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + bindex = 
np.argmax(bounding_box_size - offset_dist_squared * + 2.0) # some extra weight on the centering + #_bbox = bounding_boxes[bindex, 0:4] + _landmark = landmark[bindex] + warped = face_align.norm_crop(im, + landmark=_landmark, + image_size=args.image_size, + mode=args.align_mode) + return warped + else: + return None + + +bins = [] +issame_list = [] +pp = 0 +for line in open(os.path.join(args.data_dir, 'pairs_label.txt'), 'r'): + pp += 1 + if pp % 100 == 0: + print('processing', pp) + line = line.strip().split() + assert len(line) == 3 + path1 = os.path.join(args.data_dir, line[0]) + path2 = os.path.join(args.data_dir, line[1]) + im1 = get_norm_crop(path1) + im2 = get_norm_crop(path2) + issame = True + if line[2] == '0': + issame = False + issame_list.append(issame) + for im in [im1, im2]: + _, s = cv2.imencode('.jpg', im) + bins.append(s) + +with open(args.output, 'wb') as f: + pickle.dump((bins, issame_list), f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/insightface/recognition/subcenter_arcface/common/face_align.py b/insightface/recognition/subcenter_arcface/common/face_align.py new file mode 100644 index 0000000000000000000000000000000000000000..4f48a7691dacb54d1847a748660db0ed02371d63 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/common/face_align.py @@ -0,0 +1,71 @@ +import cv2 +import numpy as np +from skimage import transform as trans + +src1 = np.array([[51.642, 50.115], [57.617, 49.990], [35.740, 69.007], + [51.157, 89.050], [57.025, 89.702]], + dtype=np.float32) +#<--left +src2 = np.array([[45.031, 50.118], [65.568, 50.872], [39.677, 68.111], + [45.177, 86.190], [64.246, 86.758]], + dtype=np.float32) + +#---frontal +src3 = np.array([[39.730, 51.138], [72.270, 51.138], [56.000, 68.493], + [42.463, 87.010], [69.537, 87.010]], + dtype=np.float32) + +#-->right +src4 = np.array([[46.845, 50.872], [67.382, 50.118], [72.737, 68.111], + [48.167, 86.758], [67.236, 86.190]], + dtype=np.float32) + +#-->right profile +src5 = np.array([[54.796, 
49.990], [60.771, 50.115], [76.673, 69.007], + [55.388, 89.702], [61.257, 89.050]], + dtype=np.float32) + +src = np.array([src1, src2, src3, src4, src5]) +src_map = {112: src, 224: src * 2} + +arcface_src = np.array( + [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366], + [41.5493, 92.3655], [70.7299, 92.2041]], + dtype=np.float32) + +arcface_src = np.expand_dims(arcface_src, axis=0) + +# In[66]: + + +# lmk is prediction; src is template +def estimate_norm(lmk, image_size=112, mode='arcface'): + assert lmk.shape == (5, 2) + tform = trans.SimilarityTransform() + lmk_tran = np.insert(lmk, 2, values=np.ones(5), axis=1) + min_M = [] + min_index = [] + min_error = float('inf') + if mode == 'arcface': + assert image_size == 112 + src = arcface_src + else: + src = src_map[image_size] + for i in np.arange(src.shape[0]): + tform.estimate(lmk, src[i]) + M = tform.params[0:2, :] + results = np.dot(M, lmk_tran.T) + results = results.T + error = np.sum(np.sqrt(np.sum((results - src[i])**2, axis=1))) + # print(error) + if error < min_error: + min_error = error + min_M = M + min_index = i + return min_M, min_index + + +def norm_crop(img, landmark, image_size=112, mode='arcface'): + M, pose_index = estimate_norm(landmark, image_size, mode) + warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0) + return warped diff --git a/insightface/recognition/subcenter_arcface/common/flops_counter.py b/insightface/recognition/subcenter_arcface/common/flops_counter.py new file mode 100644 index 0000000000000000000000000000000000000000..809424142da4db304ac57539f8cc87c827eaea8e --- /dev/null +++ b/insightface/recognition/subcenter_arcface/common/flops_counter.py @@ -0,0 +1,120 @@ +''' +@author: insightface +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import os +import json +import argparse +import numpy as np +import mxnet as mx + + +def is_no_bias(attr): + ret = False + if 
'no_bias' in attr and (attr['no_bias'] == True + or attr['no_bias'] == 'True'): + ret = True + return ret + + +def count_fc_flops(input_filter, output_filter, attr): + #print(input_filter, output_filter ,attr) + ret = 2 * input_filter * output_filter + if is_no_bias(attr): + ret -= output_filter + return int(ret) + + +def count_conv_flops(input_shape, output_shape, attr): + kernel = attr['kernel'][1:-1].split(',') + kernel = [int(x) for x in kernel] + + #print('kernel', kernel) + if is_no_bias(attr): + ret = (2 * input_shape[1] * kernel[0] * kernel[1] - + 1) * output_shape[2] * output_shape[3] * output_shape[1] + else: + ret = 2 * input_shape[1] * kernel[0] * kernel[1] * output_shape[ + 2] * output_shape[3] * output_shape[1] + num_group = 1 + if 'num_group' in attr: + num_group = int(attr['num_group']) + ret /= num_group + return int(ret) + + +def count_flops(sym, **data_shapes): + all_layers = sym.get_internals() + #print(all_layers) + arg_shapes, out_shapes, aux_shapes = all_layers.infer_shape(**data_shapes) + out_shape_dict = dict(zip(all_layers.list_outputs(), out_shapes)) + + nodes = json.loads(sym.tojson())['nodes'] + nodeid_shape = {} + for nodeid, node in enumerate(nodes): + name = node['name'] + layer_name = name + "_output" + if layer_name in out_shape_dict: + nodeid_shape[nodeid] = out_shape_dict[layer_name] + #print(nodeid_shape) + FLOPs = 0 + for nodeid, node in enumerate(nodes): + flops = 0 + if node['op'] == 'Convolution': + output_shape = nodeid_shape[nodeid] + name = node['name'] + attr = node['attrs'] + input_nodeid = node['inputs'][0][0] + input_shape = nodeid_shape[input_nodeid] + flops = count_conv_flops(input_shape, output_shape, attr) + elif node['op'] == 'FullyConnected': + attr = node['attrs'] + output_shape = nodeid_shape[nodeid] + input_nodeid = node['inputs'][0][0] + input_shape = nodeid_shape[input_nodeid] + output_filter = output_shape[1] + input_filter = input_shape[1] * input_shape[2] * input_shape[3] + #assert len(input_shape)==4 
and input_shape[2]==1 and input_shape[3]==1 + flops = count_fc_flops(input_filter, output_filter, attr) + #print(node, flops) + FLOPs += flops + + return FLOPs + + +def flops_str(FLOPs): + preset = [(1e12, 'T'), (1e9, 'G'), (1e6, 'M'), (1e3, 'K')] + + for p in preset: + if FLOPs // p[0] > 0: + N = FLOPs / p[0] + ret = "%.1f%s" % (N, p[1]) + return ret + ret = "%.1f" % (FLOPs) + return ret + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='flops counter') + # general + #parser.add_argument('--model', default='../models2/y2-arcface-retinat1/model,1', help='path to load model.') + #parser.add_argument('--model', default='../models2/r100fc-arcface-retinaa/model,1', help='path to load model.') + parser.add_argument('--model', + default='../models2/r50fc-arcface-emore/model,1', + help='path to load model.') + args = parser.parse_args() + _vec = args.model.split(',') + assert len(_vec) == 2 + prefix = _vec[0] + epoch = int(_vec[1]) + print('loading', prefix, epoch) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + all_layers = sym.get_internals() + sym = all_layers['fc1_output'] + FLOPs = count_flops(sym, data=(1, 3, 112, 112)) + print('FLOPs:', FLOPs) diff --git a/insightface/recognition/subcenter_arcface/common/rec2image.py b/insightface/recognition/subcenter_arcface/common/rec2image.py new file mode 100644 index 0000000000000000000000000000000000000000..21e5ec4822fc4afe6d317c87bf9b6e47d5eb8051 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/common/rec2image.py @@ -0,0 +1,60 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import sys +import mxnet as mx +from mxnet import ndarray as nd +import random +import argparse +import cv2 +import time +import sklearn +import numpy as np + + +def main(args): + include_datasets = args.include.split(',') + rec_list = [] + for ds in include_datasets: + path_imgrec = os.path.join(ds, 
'train.rec') + path_imgidx = os.path.join(ds, 'train.idx') + imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') # pylint: disable=redefined-variable-type + rec_list.append(imgrec) + if not os.path.exists(args.output): + os.makedirs(args.output) + for ds_id in range(len(rec_list)): + id_list = [] + imgrec = rec_list[ds_id] + s = imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + assert header.flag > 0 + print('header0 label', header.label) + header0 = (int(header.label[0]), int(header.label[1])) + seq_identity = range(int(header.label[0]), int(header.label[1])) + pp = 0 + for identity in seq_identity: + id_dir = os.path.join(args.output, "%d_%d" % (ds_id, identity)) + os.makedirs(id_dir) + pp += 1 + if pp % 10 == 0: + print('processing id', pp) + s = imgrec.read_idx(identity) + header, _ = mx.recordio.unpack(s) + imgid = 0 + for _idx in range(int(header.label[0]), int(header.label[1])): + s = imgrec.read_idx(_idx) + _header, _img = mx.recordio.unpack(s) + _img = mx.image.imdecode(_img).asnumpy()[:, :, ::-1] # to bgr + image_path = os.path.join(id_dir, "%d.jpg" % imgid) + cv2.imwrite(image_path, _img) + imgid += 1 + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='do dataset merge') + # general + parser.add_argument('--include', default='', type=str, help='') + parser.add_argument('--output', default='', type=str, help='') + args = parser.parse_args() + main(args) diff --git a/insightface/recognition/subcenter_arcface/common/rec2shufrec.py b/insightface/recognition/subcenter_arcface/common/rec2shufrec.py new file mode 100644 index 0000000000000000000000000000000000000000..cf916b4ad4c45a9d59394159fa743f5b0b76b511 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/common/rec2shufrec.py @@ -0,0 +1,72 @@ +import os +import os.path as osp +import sys +import datetime +import glob +import shutil +import numbers +import mxnet as mx +from mxnet import ndarray as nd +from mxnet import io +from mxnet import 
recordio +import random +import argparse +import cv2 +import time +import numpy as np + +def main(args): + ds = args.input + path_imgrec = osp.join(ds, 'train.rec') + path_imgidx = osp.join(ds, 'train.idx') + imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') # pylint: disable=redefined-variable-type + if not osp.exists(args.output): + os.makedirs(args.output) + writer = mx.recordio.MXRecordIO(osp.join(args.output, 'train.rec'), 'w') + s = imgrec.read_idx(0) + header, _ = recordio.unpack(s) + if header.flag > 0: + print('header0 label', header.label) + header0 = (int(header.label[0]), int(header.label[1])) + imgidx = list(range(1, int(header.label[0]))) + else: + imgidx = list(imgrec.keys) + random.shuffle(imgidx) + label_stat = None + print('total images:', len(imgidx)) + for i, idx in enumerate(imgidx): + if i%10000==0: + print('processing', i, idx) + s = imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + if label_stat is None: + label_stat = [label, label] + else: + label_stat[0] = min(label, label_stat[0]) + label_stat[1] = max(label, label_stat[1]) + wheader = mx.recordio.IRHeader(0, label, i, 0) + ws = mx.recordio.pack(wheader, img) + writer.write(ws) + print('label_stat:', label_stat) + writer.close() + if args.copy_vers: + for binfile in glob.glob(osp.join(args.input, '*.bin')): + target_file = osp.join(args.output, binfile.split('/')[-1]) + shutil.copyfile(binfile, target_file) + with open(osp.join(args.output, 'property'), 'w') as f: + f.write("%d,112,112\n"%(int(label_stat[1])+1)) + f.write("%d\n"%len(imgidx)) + f.write("shuffled\n") + f.write("%s\n"%(datetime.datetime.now())) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='convert rec to shuffled rec') + # general + parser.add_argument('--input', default='', type=str, help='') + parser.add_argument('--output', default='', type=str, help='') + 
parser.add_argument('--copy-vers', action='store_true', help='copy verification bins') + args = parser.parse_args() + main(args) diff --git a/insightface/recognition/subcenter_arcface/common/rec_builder.py b/insightface/recognition/subcenter_arcface/common/rec_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..1d517152f85c9d822160c542ce2a6e51aa2660b5 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/common/rec_builder.py @@ -0,0 +1,109 @@ +import os +import sys +import mxnet as mx +from mxnet import ndarray as nd +import random +import argparse +import cv2 +import time +import sklearn +import numpy as np + + +class SeqRecBuilder(): + def __init__(self, path, image_size=(112, 112)): + self.path = path + self.image_size = image_size + self.last_label = -1 + self.widx = 0 + if not os.path.exists(path): + os.makedirs(path) + self.writer = mx.recordio.MXIndexedRecordIO( + os.path.join(path, 'train.idx'), os.path.join(path, 'train.rec'), + 'w') + self.label_stat = [-1, -1] + + def add(self, label, img, is_image=True): + #img should be BGR + #if self.sis: + # assert label>=self.last_label + idx = self.widx + self.widx += 1 + header = mx.recordio.IRHeader(0, label, idx, 0) + if is_image: + s = mx.recordio.pack_img(header, img, quality=95, img_fmt='.jpg') + else: + s = mx.recordio.pack(header, img) + self.writer.write_idx(idx, s) + if self.label_stat[0] < 0: + self.label_stat = [label, label] + else: + self.label_stat[0] = min(self.label_stat[0], label) + self.label_stat[1] = max(self.label_stat[1], label) + + def close(self): + with open(os.path.join(self.path, 'property'), 'w') as f: + f.write("%d,%d,%d\n" % (self.label_stat[1] + 1, self.image_size[0], + self.image_size[1])) + + +class RecBuilder(): + def __init__(self, path, image_size=(112, 112)): + self.path = path + self.image_size = image_size + self.last_label = -1 + self.widx = 1 + if not os.path.exists(path): + os.makedirs(path) + self.writer = 
mx.recordio.MXIndexedRecordIO( + os.path.join(path, 'train.idx'), os.path.join(path, 'train.rec'), + 'w') + self.label_stat = [-1, -1] + self.identities = [] + + def add(self, label, imgs): + #img should be BGR + assert label >= 0 + assert label > self.last_label + assert len(imgs) > 0 + idflag = [self.widx, -1] + for img in imgs: + idx = self.widx + self.widx += 1 + header = mx.recordio.IRHeader(0, label, idx, 0) + if isinstance(img, np.ndarray): + s = mx.recordio.pack_img(header, + img, + quality=95, + img_fmt='.jpg') + else: + s = mx.recordio.pack(header, img) + self.writer.write_idx(idx, s) + idflag[1] = self.widx + self.identities.append(idflag) + if self.label_stat[0] < 0: + self.label_stat = [label, label] + else: + self.label_stat[0] = min(self.label_stat[0], label) + self.label_stat[1] = max(self.label_stat[1], label) + self.last_label = label + + def close(self): + id_idx = self.widx + for id_flag in self.identities: + idx = self.widx + self.widx += 1 + _header = mx.recordio.IRHeader(0, id_flag, idx, 0) + s = mx.recordio.pack(_header, b'') + self.writer.write_idx(idx, s) + + print('id0:', (id_idx, self.widx)) + idx = 0 + _header = mx.recordio.IRHeader(0, (id_idx, self.widx), idx, 1) + s = mx.recordio.pack(_header, b'') + self.writer.write_idx(idx, s) + print('label stat:', self.label_stat) + + with open(os.path.join(self.path, 'property'), 'w') as f: + f.write("%d,%d,%d\n" % (self.label_stat[1] + 1, self.image_size[0], + self.image_size[1])) diff --git a/insightface/recognition/subcenter_arcface/common/verification.py b/insightface/recognition/subcenter_arcface/common/verification.py new file mode 100644 index 0000000000000000000000000000000000000000..f46942a2581b56319af2f057d29bfe6e7694efd4 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/common/verification.py @@ -0,0 +1,423 @@ +"""Helper for evaluation on the Labeled Faces in the Wild dataset +""" + +# MIT License +# +# Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, 
free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import argparse +import sys +import numpy as np +from scipy import misc +from sklearn.model_selection import KFold +from scipy import interpolate +import sklearn +import cv2 +import math +import datetime +import pickle +from sklearn.decomposition import PCA +import mxnet as mx +from mxnet import ndarray as nd + + +class LFold: + def __init__(self, n_splits=2, shuffle=False): + self.n_splits = n_splits + if self.n_splits > 1: + self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle) + + def split(self, indices): + if self.n_splits > 1: + return self.k_fold.split(indices) + else: + return [(indices, indices)] + + +def calculate_roc(thresholds, + embeddings1, + embeddings2, + actual_issame, + nrof_folds=10, + pca=0): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + #print('pca', pca) + + if pca == 0: + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + #print('train_set', train_set) + #print('test_set', test_set) + if pca > 0: + print('doing pca on', fold_idx) + embed1_train = embeddings1[train_set] + embed2_train = embeddings2[train_set] + _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) + #print(_embed_train.shape) + pca_model = PCA(n_components=pca) + pca_model.fit(_embed_train) + embed1 = pca_model.transform(embeddings1) + embed2 = pca_model.transform(embeddings2) + embed1 = sklearn.preprocessing.normalize(embed1) + embed2 = 
sklearn.preprocessing.normalize(embed2) + #print(embed1.shape, embed2.shape) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, dist[train_set], actual_issame[train_set]) + best_threshold_index = np.argmax(acc_train) + #print('threshold', thresholds[best_threshold_index]) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, + threshold_idx], fprs[fold_idx, + threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], + actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + return tpr, fpr, accuracy + + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum( + np.logical_and(np.logical_not(predict_issame), + np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) + fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn) + acc = float(tp + tn) / dist.size + return tpr, fpr, acc + + +def calculate_val(thresholds, + embeddings1, + embeddings2, + actual_issame, + far_target, + nrof_folds=10): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + val = np.zeros(nrof_folds) + far = np.zeros(nrof_folds) + + diff = np.subtract(embeddings1, 
embeddings2) + dist = np.sum(np.square(diff), 1) + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the threshold that gives FAR = far_target + far_train = np.zeros(nrof_thresholds) + for threshold_idx, threshold in enumerate(thresholds): + _, far_train[threshold_idx] = calculate_val_far( + threshold, dist[train_set], actual_issame[train_set]) + if np.max(far_train) >= far_target: + f = interpolate.interp1d(far_train, thresholds, kind='slinear') + threshold = f(far_target) + else: + threshold = 0.0 + + val[fold_idx], far[fold_idx] = calculate_val_far( + threshold, dist[test_set], actual_issame[test_set]) + + val_mean = np.mean(val) + far_mean = np.mean(far) + val_std = np.std(val) + return val_mean, val_std, far_mean + + +def calculate_val_far(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) + false_accept = np.sum( + np.logical_and(predict_issame, np.logical_not(actual_issame))) + n_same = np.sum(actual_issame) + n_diff = np.sum(np.logical_not(actual_issame)) + #print(true_accept, false_accept) + #print(n_same, n_diff) + val = float(true_accept) / float(n_same) + far = float(false_accept) / float(n_diff) + return val, far + + +def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + tpr, fpr, accuracy = calculate_roc(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + nrof_folds=nrof_folds, + pca=pca) + thresholds = np.arange(0, 4, 0.001) + val, val_std, far = calculate_val(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + 1e-3, + nrof_folds=nrof_folds) + return tpr, fpr, accuracy, val, val_std, far + + +def load_bin(path, image_size): + try: + with open(path, 'rb') as f: + bins, issame_list = pickle.load(f) #py2 
+ except UnicodeDecodeError as e: + with open(path, 'rb') as f: + bins, issame_list = pickle.load(f, encoding='bytes') #py3 + data_list = [] + for flip in [0, 1]: + data = nd.empty( + (len(issame_list) * 2, 3, image_size[0], image_size[1])) + data_list.append(data) + for i in range(len(issame_list) * 2): + _bin = bins[i] + img = mx.image.imdecode(_bin) + if img.shape[1] != image_size[0]: + img = mx.image.resize_short(img, image_size[0]) + img = nd.transpose(img, axes=(2, 0, 1)) + for flip in [0, 1]: + if flip == 1: + img = mx.ndarray.flip(data=img, axis=2) + data_list[flip][i][:] = img + if i % 1000 == 0: + print('loading bin', i) + print(data_list[0].shape) + return (data_list, issame_list) + + +def test(data_set, + mx_model, + batch_size, + nfolds=10, + data_extra=None, + label_shape=None): + print('testing verification..') + data_list = data_set[0] + issame_list = data_set[1] + model = mx_model + embeddings_list = [] + if data_extra is not None: + _data_extra = nd.array(data_extra) + time_consumed = 0.0 + if label_shape is None: + _label = nd.ones((batch_size, )) + else: + _label = nd.ones(label_shape) + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + #print(_data.shape, _label.shape) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data, ), label=(_label, )) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), + label=(_label, )) + model.forward(db, is_train=False) + net_out = model.get_outputs() + #_arg, _aux = model.get_params() + #__arg = {} + #for k,v in _arg.iteritems(): + # __arg[k] = v.as_in_context(_ctx) + #_arg = __arg + #_arg["data"] = _data.as_in_context(_ctx) + #_arg["softmax_label"] = _label.as_in_context(_ctx) + #for k,v in _arg.iteritems(): + # print(k,v.context) + #exe = sym.bind(_ctx, _arg ,args_grad=None, 
grad_req="null", aux_states=_aux) + #exe.forward(is_train=False) + #net_out = exe.outputs + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + #print(_embeddings.shape) + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + ba = bb + embeddings_list.append(embeddings) + + _xnorm = 0.0 + _xnorm_cnt = 0 + for embed in embeddings_list: + for i in range(embed.shape[0]): + _em = embed[i] + _norm = np.linalg.norm(_em) + #print(_em.shape, _norm) + _xnorm += _norm + _xnorm_cnt += 1 + _xnorm /= _xnorm_cnt + + embeddings = embeddings_list[0].copy() + embeddings = sklearn.preprocessing.normalize(embeddings) + acc1 = 0.0 + std1 = 0.0 + #_, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=10) + #acc1, std1 = np.mean(accuracy), np.std(accuracy) + + #print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far)) + #embeddings = np.concatenate(embeddings_list, axis=1) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + print(embeddings.shape) + print('infer time', time_consumed) + _, _, accuracy, val, val_std, far = evaluate(embeddings, + issame_list, + nrof_folds=nfolds) + acc2, std2 = np.mean(accuracy), np.std(accuracy) + return acc1, std1, acc2, std2, _xnorm, embeddings_list + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='do verification') + # general + parser.add_argument('--data-dir', default='', help='') + parser.add_argument('--model', + default='../model/softmax,50', + help='path to load model.') + parser.add_argument('--target', + default='lfw,cfp_ff,cfp_fp,agedb_30', + help='test targets.') + parser.add_argument('--gpu', default=0, type=int, help='gpu id') + parser.add_argument('--batch-size', default=32, type=int, help='') + 
parser.add_argument('--max', default='', type=str, help='') + parser.add_argument('--mode', default=0, type=int, help='') + parser.add_argument('--nfolds', default=10, type=int, help='') + args = parser.parse_args() + #sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common')) + #import face_image + #prop = face_image.load_property(args.data_dir) + #image_size = prop.image_size + image_size = [112, 112] + print('image_size', image_size) + ctx = mx.gpu(args.gpu) + nets = [] + vec = args.model.split(',') + prefix = args.model.split(',')[0] + epochs = [] + if len(vec) == 1: + pdir = os.path.dirname(prefix) + for fname in os.listdir(pdir): + if not fname.endswith('.params'): + continue + _file = os.path.join(pdir, fname) + if _file.startswith(prefix): + epoch = int(fname.split('.')[0].split('-')[1]) + epochs.append(epoch) + epochs = sorted(epochs, reverse=True) + if len(args.max) > 0: + _max = [int(x) for x in args.max.split(',')] + assert len(_max) == 2 + if len(epochs) > _max[1]: + epochs = epochs[_max[0]:_max[1]] + + else: + epochs = [int(x) for x in vec[1].split('|')] + print('model number', len(epochs)) + time0 = datetime.datetime.now() + for epoch in epochs: + print('loading', prefix, epoch) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + #arg_params, aux_params = ch_dev(arg_params, aux_params, ctx) + all_layers = sym.get_internals() + sym = all_layers['fc1_output'] + model = mx.mod.Module(symbol=sym, context=ctx, label_names=None) + #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))]) + model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], + image_size[1]))]) + model.set_params(arg_params, aux_params) + nets.append(model) + time_now = datetime.datetime.now() + diff = time_now - time0 + print('model loading time', diff.total_seconds()) + + ver_list = [] + ver_name_list = [] + for name in args.target.split(','): + path = 
os.path.join(args.data_dir, name + ".bin") + if os.path.exists(path): + print('loading.. ', name) + data_set = load_bin(path, image_size) + ver_list.append(data_set) + ver_name_list.append(name) + + if args.mode == 0: + for i in range(len(ver_list)): + results = [] + for model in nets: + acc1, std1, acc2, std2, xnorm, embeddings_list = test( + ver_list[i], model, args.batch_size, args.nfolds) + print('[%s]XNorm: %f' % (ver_name_list[i], xnorm)) + print('[%s]Accuracy: %1.5f+-%1.5f' % + (ver_name_list[i], acc1, std1)) + print('[%s]Accuracy-Flip: %1.5f+-%1.5f' % + (ver_name_list[i], acc2, std2)) + results.append(acc2) + print('Max of [%s] is %1.5f' % (ver_name_list[i], np.max(results))) + elif args.mode == 1: + model = nets[0] + test_badcase(ver_list[0], model, args.batch_size, args.target) + else: + model = nets[0] + dumpR(ver_list[0], model, args.batch_size, args.target) diff --git a/insightface/recognition/subcenter_arcface/drop.py b/insightface/recognition/subcenter_arcface/drop.py new file mode 100644 index 0000000000000000000000000000000000000000..24d20bae8175159f3c0a8e26e0528340733042a4 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/drop.py @@ -0,0 +1,212 @@ +import os +import shutil +import datetime +import sys +from mxnet import ndarray as nd +import mxnet as mx +import random +import argparse +import numbers +import cv2 +import time +import pickle +import sklearn +import sklearn.preprocessing +from easydict import EasyDict as edict +import numpy as np +sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common')) +from rec_builder import * + + +def get_embedding(args, imgrec, a, b, image_size, model): + ocontents = [] + for idx in range(a, b): + s = imgrec.read_idx(idx) + ocontents.append(s) + embeddings = None + #print(len(ocontents)) + ba = 0 + rlabel = -1 + imgs = [] + contents = [] + while True: + bb = min(ba + args.batch_size, len(ocontents)) + if ba >= bb: + break + _batch_size = bb - ba + #_batch_size2 = max(_batch_size, 
args.ctx_num) + _batch_size2 = _batch_size + if _batch_size % args.ctx_num != 0: + _batch_size2 = ((_batch_size // args.ctx_num) + 1) * args.ctx_num + data = np.zeros((_batch_size2, 3, image_size[0], image_size[1])) + count = bb - ba + ii = 0 + for i in range(ba, bb): + header, img = mx.recordio.unpack(ocontents[i]) + contents.append(img) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + if rlabel < 0: + rlabel = int(label) + + img = mx.image.imdecode(img) + rgb = img.asnumpy() + bgr = rgb[:, :, ::-1] + imgs.append(bgr) + img = rgb.transpose((2, 0, 1)) + data[ii] = img + ii += 1 + while ii < _batch_size2: + data[ii] = data[0] + ii += 1 + nddata = nd.array(data) + db = mx.io.DataBatch(data=(nddata, )) + model.forward(db, is_train=False) + net_out = model.get_outputs() + net_out = net_out[0].asnumpy() + if embeddings is None: + embeddings = np.zeros((len(ocontents), net_out.shape[1])) + embeddings[ba:bb, :] = net_out[0:_batch_size, :] + ba = bb + embeddings = sklearn.preprocessing.normalize(embeddings) + return embeddings, rlabel, contents + + +def main(args): + print(args) + image_size = (112, 112) + print('image_size', image_size) + vec = args.model.split(',') + prefix = vec[0] + epoch = int(vec[1]) + print('loading', prefix, epoch) + ctx = [] + cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip() + if len(cvd) > 0: + for i in range(len(cvd.split(','))): + ctx.append(mx.gpu(i)) + if len(ctx) == 0: + ctx = [mx.cpu()] + print('use cpu') + else: + print('gpu num:', len(ctx)) + args.ctx_num = len(ctx) + args.batch_size *= args.ctx_num + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + W = None + i = 0 + while True: + key = 'fc7_%d_weight' % i + i += 1 + if key not in arg_params: + break + _W = arg_params[key].asnumpy() + #_W = _W.reshape( (-1, 10, 512) ) + if W is None: + W = _W + else: + W = np.concatenate((W, _W), axis=0) + K = args.k + W = sklearn.preprocessing.normalize(W) + W = W.reshape((-1, K, 512)) + 
all_layers = sym.get_internals() + sym = all_layers['fc1_output'] + model = mx.mod.Module(symbol=sym, context=ctx, label_names=None) + model.bind(data_shapes=[('data', (args.ctx_num, 3, image_size[0], + image_size[1]))]) + model.set_params(arg_params, aux_params) + print('W:', W.shape) + path_imgrec = os.path.join(args.data, 'train.rec') + path_imgidx = os.path.join(args.data, 'train.idx') + imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') # pylint: disable=redefined-variable-type + id_list = [] + s = imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + assert header.flag > 0 + print('header0 label', header.label) + header0 = (int(header.label[0]), int(header.label[1])) + #assert(header.flag==1) + imgidx = range(1, int(header.label[0])) + id2range = {} + a, b = int(header.label[0]), int(header.label[1]) + seq_identity = range(a, b) + print(len(seq_identity)) + image_count = 0 + pp = 0 + for wid, identity in enumerate(seq_identity): + pp += 1 + s = imgrec.read_idx(identity) + header, _ = mx.recordio.unpack(s) + contents = [] + a, b = int(header.label[0]), int(header.label[1]) + _count = b - a + id_list.append((wid, a, b, _count)) + image_count += _count + pp = 0 + if not os.path.exists(args.output): + os.makedirs(args.output) + ret = np.zeros((image_count, K + 1), dtype=np.float32) + output_dir = args.output + builder = SeqRecBuilder(output_dir) + print(ret.shape) + imid = 0 + da = datetime.datetime.now() + label = 0 + num_images = 0 + cos_thresh = np.cos(np.pi * args.threshold / 180.0) + for id_item in id_list: + wid = id_item[0] + pp += 1 + if pp % 40 == 0: + db = datetime.datetime.now() + print('processing id', pp, (db - da).total_seconds()) + da = db + x, _, contents = get_embedding(args, imgrec, id_item[1], id_item[2], + image_size, model) + subcenters = W[wid] + K_stat = np.zeros((K, ), dtype=np.int) + for i in range(x.shape[0]): + _x = x[i] + sim = np.dot(subcenters, _x) # len(sim)==K + mc = np.argmax(sim) + K_stat[mc] += 1 + 
dominant_index = np.argmax(K_stat) + dominant_center = subcenters[dominant_index] + sim = np.dot(x, dominant_center) + idx = np.where(sim > cos_thresh)[0] + num_drop = x.shape[0] - len(idx) + if len(idx) == 0: + continue + #print("labelid %d dropped %d, from %d to %d"% (wid, num_drop, x.shape[0], len(idx))) + num_images += len(idx) + for _idx in idx: + c = contents[_idx] + builder.add(label, c, is_image=False) + label += 1 + builder.close() + + print('total:', num_images) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='') + # general + parser.add_argument('--data', + default='/bigdata/faces_ms1m_full', + type=str, + help='') + parser.add_argument('--output', + default='/bigdata/ms1m_full_k3drop075', + type=str, + help='') + parser.add_argument( + '--model', + default= + '../Evaluation/IJB/pretrained_models/r50-arcfacesc-msf-k3z/model,2', + help='path to load model.') + parser.add_argument('--batch-size', default=16, type=int, help='') + parser.add_argument('--threshold', default=75, type=float, help='') + parser.add_argument('--k', default=3, type=int, help='') + args = parser.parse_args() + main(args) diff --git a/insightface/recognition/subcenter_arcface/image_iter.py b/insightface/recognition/subcenter_arcface/image_iter.py new file mode 100644 index 0000000000000000000000000000000000000000..2a079778689559bd04f77d4409c9da44fef5f93c --- /dev/null +++ b/insightface/recognition/subcenter_arcface/image_iter.py @@ -0,0 +1,312 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random +import logging +import sys +import numbers +import math +import sklearn +import datetime +import numpy as np +import cv2 +from config import config + +import mxnet as mx +from mxnet import ndarray as nd +from mxnet import io +from mxnet import recordio + +logger = logging.getLogger() + + +class FaceImageIter(io.DataIter): + def __init__(self, + batch_size, + data_shape, + 
path_imgrec=None, + shuffle=False, + aug_list=None, + mean=None, + rand_mirror=False, + cutoff=0, + color_jittering=0, + images_filter=0, + data_name='data', + label_name='softmax_label', + **kwargs): + super(FaceImageIter, self).__init__() + assert path_imgrec + if path_imgrec: + logging.info('loading recordio %s...', path_imgrec) + path_imgidx = path_imgrec[0:-4] + ".idx" + self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, + 'r') # pylint: disable=redefined-variable-type + s = self.imgrec.read_idx(0) + header, _ = recordio.unpack(s) + if header.flag > 0: + print('header0 label', header.label) + self.header0 = (int(header.label[0]), int(header.label[1])) + #assert(header.flag==1) + self.imgidx = list(range(1, int(header.label[0]))) + #self.imgidx = [] + #self.id2range = {} + #self.seq_identity = range(int(header.label[0]), int(header.label[1])) + #for identity in self.seq_identity: + # s = self.imgrec.read_idx(identity) + # header, _ = recordio.unpack(s) + # a,b = int(header.label[0]), int(header.label[1]) + # count = b-a + # if count= len(self.seq): + raise StopIteration + idx = self.seq[self.cur] + self.cur += 1 + if self.imgrec is not None: + s = self.imgrec.read_idx(idx) + header, img = recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + return label, img, None, None + else: + label, fname, bbox, landmark = self.imglist[idx] + return label, self.read_image(fname), bbox, landmark + else: + s = self.imgrec.read() + if s is None: + raise StopIteration + header, img = recordio.unpack(s) + return header.label, img, None, None + + def brightness_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + src *= alpha + return src + + def contrast_aug(self, src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = nd.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = (3.0 * (1.0 - alpha) / gray.size) * nd.sum(gray) + src *= alpha + src += gray + return src + + def saturation_aug(self, 
src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = nd.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = nd.sum(gray, axis=2, keepdims=True) + gray *= (1.0 - alpha) + src *= alpha + src += gray + return src + + def color_aug(self, img, x): + #augs = [self.brightness_aug, self.contrast_aug, self.saturation_aug] + #random.shuffle(augs) + #for aug in augs: + # #print(img.shape) + # img = aug(img, x) + # #print(img.shape) + #return img + return self.CJA(img) + + def mirror_aug(self, img): + _rd = random.randint(0, 1) + if _rd == 1: + for c in range(img.shape[2]): + img[:, :, c] = np.fliplr(img[:, :, c]) + return img + + def compress_aug(self, img): + from PIL import Image + from io import BytesIO + buf = BytesIO() + img = Image.fromarray(img.asnumpy(), 'RGB') + q = random.randint(2, 20) + img.save(buf, format='JPEG', quality=q) + buf = buf.getvalue() + img = Image.open(BytesIO(buf)) + return nd.array(np.asarray(img, 'float32')) + + def next(self): + if not self.is_init: + self.reset() + self.is_init = True + """Returns the next batch of data.""" + #print('in next', self.cur, self.labelcur) + self.nbatch += 1 + batch_size = self.batch_size + i = 0 + try: + while i < batch_size: + label, s, bbox, landmark = self.next_sample() + try: + _data = self.imdecode(s) + except Exception as e: + logging.debug('Invalid decoding, skipping: %s', str(e)) + continue + if _data.shape[0] != self.data_shape[1]: + _data = mx.image.resize_short(_data, self.data_shape[1]) + _data = _data.asnumpy().astype(np.float32) + if self.rand_mirror and np.random.rand() < 0.5: + _data = _data[:, ::-1, :] + if self.color_jittering > 0: + if self.color_jittering > 1: + _rd = random.randint(0, 1) + if _rd == 1: + _data = self.compress_aug(_data) + _data = self.color_aug(_data, 0.125) + #if self.nd_mean is not None: + # _data -= self.nd_mean + # _data *= 0.0078125 + if self.cutoff > 0: + _rd = random.randint(0, 1) + if _rd == 1: + #print('do cutoff aug', self.cutoff) + centerh = 
random.randint(0, _data.shape[0] - 1) + centerw = random.randint(0, _data.shape[1] - 1) + half = self.cutoff // 2 + starth = max(0, centerh - half) + endh = min(_data.shape[0], centerh + half) + startw = max(0, centerw - half) + endw = min(_data.shape[1], centerw + half) + #print(starth, endh, startw, endw, _data.shape) + _data[starth:endh, startw:endw, :] = 128 + #_data -= 127.5 + #_data /= 128.0 + #self.batch_data[i] = self.postprocess_data(_data) + _data = _data.transpose((2, 0, 1)) + self.batch_data[i] = _data + self.batch_label[i] = label + i += 1 + except StopIteration: + if i < batch_size: + raise StopIteration + + batch_data = nd.array(self.batch_data) + batch_label = nd.array(self.batch_label) + return io.DataBatch([batch_data], [batch_label], batch_size - i) + + def check_data_shape(self, data_shape): + """Checks if the input data shape is valid""" + if not len(data_shape) == 3: + raise ValueError( + 'data_shape should have length 3, with dimensions CxHxW') + if not data_shape[0] == 3: + raise ValueError( + 'This iterator expects inputs to have 3 channels.') + + def check_valid_image(self, data): + """Checks if the input data is valid""" + if len(data[0].shape) == 0: + raise RuntimeError('Data shape is wrong') + + def imdecode(self, s): + """Decodes a string or byte string to an NDArray. + See mx.img.imdecode for more details.""" + img = mx.image.imdecode(s) #mx.ndarray + return img + + def read_image(self, fname): + """Reads an input image `fname` and returns the decoded raw bytes. + + Example usage: + ---------- + >>> dataIter.read_image('Face.jpg') # returns decoded raw bytes. 
+ """ + with open(os.path.join(self.path_root, fname), 'rb') as fin: + img = fin.read() + return img + + def augmentation_transform(self, data): + """Transforms input data with specified augmentation.""" + for aug in self.auglist: + data = [ret for src in data for ret in aug(src)] + return data + + def postprocess_data(self, datum): + """Final postprocessing step before image is loaded into the batch.""" + return nd.transpose(datum, axes=(2, 0, 1)) + + +class FaceImageIterList(io.DataIter): + def __init__(self, iter_list): + assert len(iter_list) > 0 + self.provide_data = iter_list[0].provide_data + self.provide_label = iter_list[0].provide_label + self.iter_list = iter_list + self.cur_iter = None + + def reset(self): + self.cur_iter.reset() + + def next(self): + self.cur_iter = random.choice(self.iter_list) + while True: + try: + ret = self.cur_iter.next() + except StopIteration: + self.cur_iter.reset() + continue + return ret diff --git a/insightface/recognition/subcenter_arcface/parall_module_local_v1.py b/insightface/recognition/subcenter_arcface/parall_module_local_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..5f68610ad24dbfe680032c4010241afce91a4e59 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/parall_module_local_v1.py @@ -0,0 +1,655 @@ +''' +@author: insightface +''' + +import logging +import copy +import time +import os + +import mxnet as mx +import numpy as np +from mxnet import context as ctx +from mxnet.initializer import Uniform +from mxnet.module.base_module import BaseModule +from mxnet.module.module import Module +from mxnet import metric +from mxnet.model import BatchEndParam +from mxnet import io +import mxnet.ndarray as nd +from config import config + + +class ParallModule(BaseModule): + def __init__(self, + symbol, + data_names, + label_names, + logger=logging, + context=ctx.cpu(), + work_load_list=None, + asymbol=None, + args=None): + super(ParallModule, self).__init__(logger=logger) + self._symbol = 
symbol + self._asymbol = asymbol + self._data_names = data_names + self._label_names = label_names + self._context = context + self._work_load_list = work_load_list + self._num_classes = config.num_classes + self._batch_size = args.batch_size + self._verbose = args.verbose + self._emb_size = config.emb_size + self._local_class_start = args.local_class_start + self._iter = 0 + + self._curr_module = None + + self._num_workers = config.num_workers + self._num_ctx = len(self._context) + self._ctx_num_classes = args.ctx_num_classes + self._nd_cache = {} + self._ctx_cpu = mx.cpu() + self._ctx_single_gpu = self._context[-1] + self._fixed_param_names = None + self._curr_module = Module(self._symbol, + self._data_names, + self._label_names, + logger=self.logger, + context=self._context, + work_load_list=self._work_load_list, + fixed_param_names=self._fixed_param_names) + self._arcface_modules = [] + self._ctx_class_start = [] + for i in range(len(self._context)): + + args._ctxid = i + _module = Module(self._asymbol(args), + self._data_names, + self._label_names, + logger=self.logger, + context=mx.gpu(i), + work_load_list=self._work_load_list, + fixed_param_names=self._fixed_param_names) + self._arcface_modules.append(_module) + _c = args.local_class_start + i * args.ctx_num_classes + self._ctx_class_start.append(_c) + self._usekv = False + if self._usekv: + self._distkv = mx.kvstore.create('dist_sync') + self._kvinit = {} + + def _reset_bind(self): + self.binded = False + self._curr_module = None + + @property + def data_names(self): + return self._data_names + + @property + def output_names(self): + return self._symbol.list_outputs() + + @property + def data_shapes(self): + assert self.binded + return self._curr_module.data_shapes + + @property + def label_shapes(self): + assert self.binded + return self._curr_module.label_shapes + + @property + def output_shapes(self): + assert self.binded + return self._curr_module.output_shapes + + def get_export_params(self): + assert 
self.binded and self.params_initialized + _g, _x = self._curr_module.get_params() + g = _g.copy() + x = _x.copy() + return g, x + + def get_params(self): + assert self.binded and self.params_initialized + _g, _x = self._curr_module.get_params() + g = _g.copy() + x = _x.copy() + for _module in self._arcface_modules: + _g, _x = _module.get_params() + ag = _g.copy() + ax = _x.copy() + g.update(ag) + x.update(ax) + return g, x + + def set_params(self, + arg_params, + aux_params, + allow_missing=False, + force_init=True, + allow_extra=False): + g = arg_params + x = aux_params + #ag = {} + #ax = {} + rk = [] + for k in g: + v = g[k] + if k.startswith('fc7'): + p1 = k.find('_') + p2 = k.rfind('_') + _ctxid = int(k[p1 + 1:p2]) + self._arcface_modules[_ctxid].set_params({k: v}, {}) + rk.append(k) + for k in rk: + del g[k] + self._curr_module.set_params(g, x) + #self._arcface_module.set_params(ag, ax) + + def init_params(self, + initializer=Uniform(0.01), + arg_params=None, + aux_params=None, + allow_missing=False, + force_init=False, + allow_extra=False): + if self.params_initialized and not force_init: + return + assert self.binded, 'call bind before initializing the parameters' + #TODO init the same weights with all work nodes + self._curr_module.init_params(initializer=initializer, + arg_params=None, + aux_params=None, + allow_missing=allow_missing, + force_init=force_init, + allow_extra=allow_extra) + for _module in self._arcface_modules: + #_initializer = initializer + _initializer = mx.init.Normal(0.01) + _module.init_params(initializer=_initializer, + arg_params=None, + aux_params=None, + allow_missing=allow_missing, + force_init=force_init, + allow_extra=allow_extra) + self.params_initialized = True + + def bind(self, + data_shapes, + label_shapes=None, + for_training=True, + inputs_need_grad=False, + force_rebind=False, + shared_module=None): + print('in_bind', self.params_initialized, data_shapes, label_shapes) + if self.params_initialized: + arg_params, 
aux_params = self.get_params() + + # force rebinding is typically used when one want to switch from + # training to prediction phase. + if force_rebind: + self._reset_bind() + + if self.binded: + self.logger.warning('Already binded, ignoring bind()') + return + + assert shared_module is None, 'shared_module for MutableModule is not supported' + self.for_training = for_training + self.inputs_need_grad = inputs_need_grad + self.binded = True + self._curr_module.bind(data_shapes, + label_shapes, + for_training, + inputs_need_grad, + force_rebind=False, + shared_module=None) + _data_shape = data_shapes[0][1] + print('_data_shape', _data_shape, label_shapes) + for _module in self._arcface_modules: + _module.bind( + [('data', + (_data_shape[0] * self._num_workers, self._emb_size))], + [('softmax_label', (_data_shape[0] * self._num_workers, ))], + for_training, + True, + force_rebind=False, + shared_module=None) + if self.params_initialized: + self.set_params(arg_params, aux_params) + + def init_optimizer(self, + kvstore='local', + optimizer='sgd', + optimizer_params=(('learning_rate', 0.01), ), + force_init=False): + assert self.binded and self.params_initialized + if self.optimizer_initialized and not force_init: + self.logger.warning('optimizer already initialized, ignoring.') + return + + self._curr_module.init_optimizer(kvstore, + optimizer, + optimizer_params, + force_init=force_init) + for _module in self._arcface_modules: + _module.init_optimizer(kvstore, + optimizer, + optimizer_params, + force_init=force_init) + self.optimizer_initialized = True + + def kv_push(self, key, value): + #if value.context!=mx.cpu(): + # value = value.as_in_context(mx.cpu()) + if not key in self._kvinit: + self._distkv.init(key, nd.zeros_like(value)) + self._kvinit[key] = 1 + self._distkv.push(key, value) + + #get fc1 and partial fc7 + def forward(self, data_batch, is_train=None): + #g,x = self.get_params() + #print('{fc7_weight[0][0]}', self._iter, g['fc7_0_weight'].asnumpy()[0][0]) + 
#print('{pre_fc1_weight[0][0]}', self._iter, g['pre_fc1_weight'].asnumpy()[0][0]) + + assert self.binded and self.params_initialized + self._curr_module.forward(data_batch, is_train=is_train) + if is_train: + self._iter += 1 + fc1, label = self._curr_module.get_outputs( + merge_multi_context=True) + global_fc1 = fc1 + self.global_label = label.as_in_context(self._ctx_cpu) + + for i, _module in enumerate(self._arcface_modules): + _label = self.global_label - self._ctx_class_start[i] + db_global_fc1 = io.DataBatch([global_fc1], [_label]) + _module.forward(db_global_fc1) #fc7 with margin + #print('forward end') + + def get_ndarray(self, context, name, shape): + key = "%s_%s" % (name, context) + #print(key) + if not key in self._nd_cache: + v = nd.zeros(shape=shape, ctx=context) + self._nd_cache[key] = v + else: + v = self._nd_cache[key] + return v + + def get_ndarray2(self, context, name, arr): + key = "%s_%s" % (name, context) + #print(key) + if not key in self._nd_cache: + v = nd.zeros(shape=arr.shape, ctx=context) + self._nd_cache[key] = v + else: + v = self._nd_cache[key] + arr.copyto(v) + return v + + def backward(self, out_grads=None): + #print('in backward') + assert self.binded and self.params_initialized + #tmp_ctx = self._ctx_cpu + tmp_ctx = self._ctx_single_gpu + fc7_outs = [] + ctx_fc7_max = self.get_ndarray(tmp_ctx, 'ctx_fc7_max', + (self._batch_size, len(self._context))) + #local_fc7_max = nd.zeros( (self.global_label.shape[0],1), ctx=mx.cpu()) + arcface_module_outputs = [] + for i, _module in enumerate(self._arcface_modules): + #_fc7 = _module.get_outputs(merge_multi_context=True)[0] + out = _module.get_outputs(merge_multi_context=True) + #print(out[0].shape) + #print(out[1].shape) + arcface_module_outputs.append(out) + _fc7 = out[0] + fc7_outs.append(_fc7) + _fc7_max = nd.max(_fc7, axis=1).as_in_context(tmp_ctx) + ctx_fc7_max[:, i] = _fc7_max + + local_fc7_max = self.get_ndarray(tmp_ctx, 'local_fc7_max', + (self._batch_size, 1)) + nd.max(ctx_fc7_max, 
axis=1, keepdims=True, out=local_fc7_max) + global_fc7_max = local_fc7_max + #local_fc7_sum = None + local_fc7_sum = self.get_ndarray(tmp_ctx, 'local_fc7_sum', + (self._batch_size, 1)) + local_fc7_sum[:, :] = 0.0 + for i, _module in enumerate(self._arcface_modules): + _max = self.get_ndarray2(fc7_outs[i].context, 'fc7_max', + global_fc7_max) + fc7_outs[i] = nd.broadcast_sub(fc7_outs[i], _max) + fc7_outs[i] = nd.exp(fc7_outs[i]) + _sum = nd.sum(fc7_outs[i], axis=1, + keepdims=True).as_in_context(tmp_ctx) + local_fc7_sum += _sum + global_fc7_sum = local_fc7_sum + + if self._iter % self._verbose == 0: + #_ctx = self._context[-1] + _ctx = self._ctx_cpu + _probs = [] + for i, _module in enumerate(self._arcface_modules): + _prob = self.get_ndarray2(_ctx, '_fc7_prob_%d' % i, + fc7_outs[i]) + _probs.append(_prob) + fc7_prob = self.get_ndarray( + _ctx, 'test_fc7_prob', + (self._batch_size, self._ctx_num_classes * len(self._context))) + nd.concat(*_probs, dim=1, out=fc7_prob) + fc7_pred = nd.argmax(fc7_prob, axis=1) + local_label = self.global_label - self._local_class_start + #local_label = self.get_ndarray2(_ctx, 'test_label', local_label) + _pred = nd.equal(fc7_pred, local_label) + print('{fc7_acc}', self._iter, nd.mean(_pred).asnumpy()[0]) + + #local_fc1_grad = [] + #fc1_grad_ctx = self._ctx_cpu + fc1_grad_ctx = self._ctx_single_gpu + local_fc1_grad = self.get_ndarray(fc1_grad_ctx, 'local_fc1_grad', + (self._batch_size, self._emb_size)) + local_fc1_grad[:, :] = 0.0 + total_eloss = [] + celoss_verbose = 1000 + if self._iter % celoss_verbose == 0: + fc7_celoss = self.get_ndarray(tmp_ctx, 'test_fc7_celoss', + (self._batch_size, )) + fc7_celoss[:] = 0.0 + + for i, _module in enumerate(self._arcface_modules): + _sum = self.get_ndarray2(fc7_outs[i].context, 'fc7_sum', + global_fc7_sum) + fc7_outs[i] = nd.broadcast_div(fc7_outs[i], _sum) + a = i * self._ctx_num_classes + b = (i + 1) * self._ctx_num_classes + _label = self.global_label - self._ctx_class_start[i] + _label = 
self.get_ndarray2(fc7_outs[i].context, 'label', _label) + onehot_label = self.get_ndarray( + fc7_outs[i].context, 'label_onehot', + (self._batch_size, self._ctx_num_classes)) + nd.one_hot(_label, + depth=self._ctx_num_classes, + on_value=1.0, + off_value=0.0, + out=onehot_label) + #print(fc7_outs[i].shape, onehot_label.shape) + + if self._iter % celoss_verbose == 0: + _ce_loss = fc7_outs[i] * onehot_label + _ce_loss = nd.sum(_ce_loss, axis=1) + fc7_celoss += _ce_loss.as_in_context(tmp_ctx) + fc7_outs[i] -= onehot_label + + out = arcface_module_outputs[i] + out_grads = [fc7_outs[i]] + for j in range(1, len(out)): + eloss = out[j] + #print('eloss%d:'%j, eloss.shape) + #print(out_grads[0].shape) + #egrad_shape = (out_grads[0].shape[0], eloss.shape[0]) + egrad_shape = eloss.shape + egrad = self.get_ndarray(fc7_outs[i].context, 'egrad%d' % j, + egrad_shape) + #egrad[:][:] = 1.0/egrad_shape[0] + egrad[:][:] = 1.0 + out_grads.append(egrad) + if self._iter % self._verbose == 0: + total_eloss.append(np.mean(eloss.asnumpy())) + + _module.backward(out_grads=out_grads) + #ctx_fc1_grad = _module.get_input_grads()[0].as_in_context(mx.cpu()) + ctx_fc1_grad = self.get_ndarray2(fc1_grad_ctx, + 'ctx_fc1_grad_%d' % i, + _module.get_input_grads()[0]) + local_fc1_grad += ctx_fc1_grad + + if self._iter % self._verbose == 0 and len(total_eloss) > 0: + print('{eloss}', self._iter, np.mean(total_eloss)) + #if self._iter%self._verbose==0: + if self._iter % celoss_verbose == 0: + ce_loss = nd.log(fc7_celoss) * -1.0 + ce_loss = nd.mean(ce_loss) + print('CELOSS,%d,%f' % (self._iter, ce_loss.asnumpy())) + + global_fc1_grad = local_fc1_grad + self._curr_module.backward(out_grads=[global_fc1_grad]) + + def update(self): + assert self.binded and self.params_initialized and self.optimizer_initialized + self._curr_module.update() + for i, _module in enumerate(self._arcface_modules): + _module.update() + mx.nd.waitall() + + def get_outputs(self, merge_multi_context=True): + assert self.binded and 
self.params_initialized + return self._curr_module.get_outputs( + merge_multi_context=merge_multi_context) + #return self._arcface_module.get_outputs(merge_multi_context=merge_multi_context) + + def get_input_grads(self, merge_multi_context=True): + assert self.binded and self.params_initialized and self.inputs_need_grad + return self._curr_module.get_input_grads( + merge_multi_context=merge_multi_context) + + def update_metric(self, eval_metric, labels): + assert self.binded and self.params_initialized + #self._curr_module.update_metric(eval_metric, labels) + #label = labels[0] + #print(label.shape) + #self._arcface_module.update_metric(eval_metric, labels) + + def install_monitor(self, mon): + """ Install monitor on all executors """ + assert self.binded + self._curr_module.install_monitor(mon) + + def forward_backward(self, data_batch): + """A convenient function that calls both ``forward`` and ``backward``.""" + self.forward(data_batch, is_train=True) # get fc1 and partial fc7 + self.backward() + + def fit(self, + train_data, + eval_data=None, + eval_metric='acc', + epoch_end_callback=None, + batch_end_callback=None, + kvstore='local', + optimizer='sgd', + optimizer_params=(('learning_rate', 0.01), ), + eval_end_callback=None, + eval_batch_end_callback=None, + initializer=Uniform(0.01), + arg_params=None, + aux_params=None, + allow_missing=False, + force_rebind=False, + force_init=False, + begin_epoch=0, + num_epoch=None, + validation_metric=None, + monitor=None, + sparse_row_id_fn=None): + """Trains the module parameters. + + Checkout `Module Tutorial `_ to see + a end-to-end use-case. + + Parameters + ---------- + train_data : DataIter + Train DataIter. + eval_data : DataIter + If not ``None``, will be used as validation set and the performance + after each epoch will be evaluated. + eval_metric : str or EvalMetric + Defaults to 'accuracy'. The performance measure used to display during training. 
+ Other possible predefined metrics are: + 'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'. + epoch_end_callback : function or list of functions + Each callback will be called with the current `epoch`, `symbol`, `arg_params` + and `aux_params`. + batch_end_callback : function or list of function + Each callback will be called with a `BatchEndParam`. + kvstore : str or KVStore + Defaults to 'local'. + optimizer : str or Optimizer + Defaults to 'sgd'. + optimizer_params : dict + Defaults to ``(('learning_rate', 0.01),)``. The parameters for + the optimizer constructor. + The default value is not a dict, just to avoid pylint warning on dangerous + default values. + eval_end_callback : function or list of function + These will be called at the end of each full evaluation, with the metrics over + the entire evaluation set. + eval_batch_end_callback : function or list of function + These will be called at the end of each mini-batch during evaluation. + initializer : Initializer + The initializer is called to initialize the module parameters when they are + not already initialized. + arg_params : dict + Defaults to ``None``, if not ``None``, should be existing parameters from a trained + model or loaded from a checkpoint (previously saved model). In this case, + the value here will be used to initialize the module parameters, unless they + are already initialized by the user via a call to `init_params` or `fit`. + `arg_params` has a higher priority than `initializer`. + aux_params : dict + Defaults to ``None``. Similar to `arg_params`, except for auxiliary states. + allow_missing : bool + Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params` + and `aux_params` are not ``None``. If this is ``True``, then the missing parameters + will be initialized via the `initializer`. + force_rebind : bool + Defaults to ``False``. Whether to force rebinding the executors if already bound. + force_init : bool + Defaults to ``False``. 
Indicates whether to force initialization even if the + parameters are already initialized. + begin_epoch : int + Defaults to 0. Indicates the starting epoch. Usually, if resumed from a + checkpoint saved at a previous training phase at epoch N, then this value should be + N+1. + num_epoch : int + Number of epochs for training. + sparse_row_id_fn : A callback function + The function takes `data_batch` as an input and returns a dict of + str -> NDArray. The resulting dict is used for pulling row_sparse + parameters from the kvstore, where the str key is the name of the param, + and the value is the row id of the param to pull. + + Examples + -------- + >>> # An example of using fit for training. + >>> # Assume training dataIter and validation dataIter are ready + >>> # Assume loading a previously checkpointed model + >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3) + >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer='sgd', + ... optimizer_params={'learning_rate':0.01, 'momentum': 0.9}, + ... arg_params=arg_params, aux_params=aux_params, + ... 
eval_metric='acc', num_epoch=10, begin_epoch=3) + """ + assert num_epoch is not None, 'please specify number of epochs' + assert arg_params is None and aux_params is None + + self.bind(data_shapes=train_data.provide_data, + label_shapes=train_data.provide_label, + for_training=True, + force_rebind=force_rebind) + if monitor is not None: + self.install_monitor(monitor) + self.init_params(initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + allow_missing=allow_missing, + force_init=force_init) + self.init_optimizer(kvstore=kvstore, + optimizer=optimizer, + optimizer_params=optimizer_params) + + if validation_metric is None: + validation_metric = eval_metric + if not isinstance(eval_metric, metric.EvalMetric): + eval_metric = metric.create(eval_metric) + epoch_eval_metric = copy.deepcopy(eval_metric) + + ################################################################################ + # training loop + ################################################################################ + for epoch in range(begin_epoch, num_epoch): + tic = time.time() + eval_metric.reset() + epoch_eval_metric.reset() + nbatch = 0 + data_iter = iter(train_data) + end_of_batch = False + next_data_batch = next(data_iter) + while not end_of_batch: + data_batch = next_data_batch + if monitor is not None: + monitor.tic() + self.forward_backward(data_batch) + self.update() + assert not isinstance(data_batch, list) + + #if isinstance(data_batch, list): + # #print('XXX') + # self.update_metric(eval_metric, + # [db.label for db in data_batch], + # pre_sliced=True) + # self.update_metric(epoch_eval_metric, + # [db.label for db in data_batch], + # pre_sliced=True) + #else: + # #print('before update metric') + # self.update_metric(eval_metric, data_batch.label) + # self.update_metric(epoch_eval_metric, data_batch.label) + #labels = data_batch.label + #labels = [self.global_label] + #self.update_metric(eval_metric, labels) + #self.update_metric(epoch_eval_metric, labels) + + 
try: + # pre fetch next batch + next_data_batch = next(data_iter) + self.prepare(next_data_batch, + sparse_row_id_fn=sparse_row_id_fn) + except StopIteration: + end_of_batch = True + + if monitor is not None: + monitor.toc_print() + + #if end_of_batch: + # eval_name_vals = epoch_eval_metric.get_name_value() + + if batch_end_callback is not None: + batch_end_params = BatchEndParam(epoch=epoch, + nbatch=nbatch, + eval_metric=None, + locals=locals()) + batch_end_callback(batch_end_params) + #for callback in _as_list(batch_end_callback): + # callback(batch_end_params) + nbatch += 1 + + # one epoch of training is finished + #for name, val in eval_name_vals: + # self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) + toc = time.time() + self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) + + # sync aux params across devices + arg_params, aux_params = self.get_params() + self.set_params(arg_params, aux_params) + + # end of 1 epoch, reset the data-iter for another epoch + train_data.reset() diff --git a/insightface/recognition/subcenter_arcface/sample_config.py b/insightface/recognition/subcenter_arcface/sample_config.py new file mode 100644 index 0000000000000000000000000000000000000000..084a5ae070c8ece66429df9284c0514e14fea861 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/sample_config.py @@ -0,0 +1,224 @@ +import numpy as np +import os +from easydict import EasyDict as edict + +config = edict() + +config.bn_mom = 0.9 +config.workspace = 256 +config.emb_size = 512 +config.ckpt_embedding = True +config.net_se = 0 +config.net_act = 'prelu' +config.net_unit = 3 +config.net_input = 1 +config.net_blocks = [1, 4, 6, 2] +config.net_output = 'E' +config.net_multiplier = 1.0 +config.val_targets = ['lfw', 'cfp_fp', 'agedb_30'] +config.ce_loss = True +config.fc7_lr_mult = 1.0 +config.fc7_wd_mult = 1.0 +config.fc7_no_bias = False +config.max_steps = 0 +config.data_rand_mirror = True +config.data_cutoff = False +config.data_color = 0 
# ---- remaining global training flags ----
config.data_images_filter = 0
config.count_flops = True
config.memonger = False  # memory-monger path is currently non-functional

# number of sub-centers per class (Sub-center ArcFace K)
config.loss_K = 3

config.flip_noise_prob = 0.0

# ---- network settings ----
# Each entry is an edict of overrides applied on top of `config` by
# generate_config(); keys mirror the config.* fields above.
network = edict()
network.r100 = edict(net_name='fresnet', num_layers=100)
network.r100fc = edict(net_name='fresnet', num_layers=100, net_output='FC')
network.r50 = edict(net_name='fresnet', num_layers=50)
network.r50v1 = edict(net_name='fresnet', num_layers=50, net_unit=1)
network.d169 = edict(net_name='fdensenet', num_layers=169,
                     per_batch_size=64, densenet_dropout=0.0)
network.d201 = edict(net_name='fdensenet', num_layers=201,
                     per_batch_size=64, densenet_dropout=0.0)
network.y1 = edict(net_name='fmobilefacenet', emb_size=128, net_output='GDC')
network.y2 = edict(net_name='fmobilefacenet', emb_size=256, net_output='GDC',
                   net_blocks=[2, 8, 16, 4])
network.m1 = edict(net_name='fmobilenet', emb_size=256, net_output='GDC',
                   net_multiplier=1.0)
network.m05 = edict(net_name='fmobilenet', emb_size=256, net_output='GDC',
                    net_multiplier=0.5)
network.mnas = edict(net_name='fmnasnet', emb_size=256, net_output='GDC',
                     net_multiplier=1.0)
network.mnas05 = edict(net_name='fmnasnet', emb_size=256, net_output='GDC',
                       net_multiplier=0.5)
network.mnas025 = edict(net_name='fmnasnet', emb_size=256, net_output='GDC',
                        net_multiplier=0.25)
network.vargfacenet = edict(net_name='vargfacenet', net_multiplier=1.25,
                            emb_size=512, net_output='J')

# ---- dataset settings ----
dataset = edict()
dataset.emore = edict(
    dataset='emore',
    dataset_path='../datasets/faces_emore',
    num_classes=85742,
    image_shape=(112, 112, 3),
    val_targets=['lfw', 'cfp_fp', 'agedb_30'],
)
dataset.retina = edict(
    dataset='retina',
    dataset_path='../datasets/ms1m-retinaface-t1',
    num_classes=93431,
    image_shape=(112, 112, 3),
    val_targets=['lfw', 'cfp_fp', 'agedb_30'],
)

# ---- loss settings ----
# margin_softmax parameters follow the combined-margin formulation:
# s * (cos(m1*theta + m2) - m3)
loss = edict()
loss.softmax = edict(loss_name='softmax')
loss.nsoftmax = edict(loss_name='margin_softmax',
                      loss_s=64.0, loss_m1=1.0, loss_m2=0.0, loss_m3=0.0)
loss.arcface = edict(loss_name='margin_softmax',
                     loss_s=64.0, loss_m1=1.0, loss_m2=0.5, loss_m3=0.0)
loss.cosface = edict(loss_name='margin_softmax',
                     loss_s=64.0, loss_m1=1.0, loss_m2=0.0, loss_m3=0.35)
loss.combined = edict(loss_name='margin_softmax',
                      loss_s=64.0, loss_m1=1.0, loss_m2=0.3, loss_m3=0.2)
loss.triplet = edict(
    loss_name='triplet',
    images_per_identity=5,
    triplet_alpha=0.3,
    triplet_bag_size=7200,
    triplet_max_ap=0.0,
    per_batch_size=60,
    lr=0.05,
)
# The atriplet entry is extended with further fields immediately below
# in the original file, so only loss_name is set here.
loss.atriplet = edict(loss_name='atriplet')
+loss.atriplet.images_per_identity = 5 +loss.atriplet.triplet_alpha = 0.35 +loss.atriplet.triplet_bag_size = 7200 +loss.atriplet.triplet_max_ap = 0.0 +loss.atriplet.per_batch_size = 60 +loss.atriplet.lr = 0.05 + +# default settings +default = edict() + +# default network +default.network = 'r100' +default.pretrained = '' +default.pretrained_epoch = 1 +# default dataset +default.dataset = 'emore' +default.loss = 'arcface' +default.frequent = 20 +default.verbose = 2000 +default.kvstore = 'device' + +default.end_epoch = 10000 +default.lr = 0.1 +default.wd = 0.0005 +default.mom = 0.9 +default.per_batch_size = 128 +default.ckpt = 3 +default.lr_steps = '100000,160000,220000' +default.models_root = './models' + + +def generate_config(_network, _dataset, _loss): + for k, v in loss[_loss].items(): + config[k] = v + if k in default: + default[k] = v + for k, v in network[_network].items(): + config[k] = v + if k in default: + default[k] = v + for k, v in dataset[_dataset].items(): + config[k] = v + if k in default: + default[k] = v + config.loss = _loss + config.network = _network + config.dataset = _dataset + config.num_workers = 1 + if 'DMLC_NUM_WORKER' in os.environ: + config.num_workers = int(os.environ['DMLC_NUM_WORKER']) diff --git a/insightface/recognition/subcenter_arcface/symbol/fdensenet.py b/insightface/recognition/subcenter_arcface/symbol/fdensenet.py new file mode 100644 index 0000000000000000000000000000000000000000..b3d49ee876f85dc48645543d5d8ad1170928566d --- /dev/null +++ b/insightface/recognition/subcenter_arcface/symbol/fdensenet.py @@ -0,0 +1,169 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable= arguments-differ +"""DenseNet, implemented in Gluon.""" + +import sys +import os +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import symbol_utils +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Act(): + if config.net_act == 'prelu': + return nn.PReLU() + else: + return nn.Activation(config.net_act) + + +# Helpers +def _make_dense_block(num_layers, bn_size, growth_rate, dropout, stage_index): + out = nn.HybridSequential(prefix='stage%d_' % stage_index) + with out.name_scope(): + for _ in range(num_layers): + out.add(_make_dense_layer(growth_rate, bn_size, dropout)) + return out + + +def _make_dense_layer(growth_rate, bn_size, dropout): + new_features = nn.HybridSequential(prefix='') + new_features.add(nn.BatchNorm()) + #new_features.add(nn.Activation('relu')) + new_features.add(Act()) + new_features.add( + nn.Conv2D(bn_size * growth_rate, kernel_size=1, use_bias=False)) + new_features.add(nn.BatchNorm()) + #new_features.add(nn.Activation('relu')) + new_features.add(Act()) + new_features.add( + nn.Conv2D(growth_rate, kernel_size=3, padding=1, use_bias=False)) + if dropout: + new_features.add(nn.Dropout(dropout)) + + out = gluon.contrib.nn.HybridConcurrent(axis=1, prefix='') + out.add(gluon.contrib.nn.Identity()) + out.add(new_features) + + return out + + +def _make_transition(num_output_features): + out = nn.HybridSequential(prefix='') + out.add(nn.BatchNorm()) + 
#out.add(nn.Activation('relu')) + out.add(Act()) + out.add(nn.Conv2D(num_output_features, kernel_size=1, use_bias=False)) + out.add(nn.AvgPool2D(pool_size=2, strides=2)) + return out + + +# Net +class DenseNet(nn.HybridBlock): + r"""Densenet-BC model from the + `"Densely Connected Convolutional Networks" `_ paper. + + Parameters + ---------- + num_init_features : int + Number of filters to learn in the first convolution layer. + growth_rate : int + Number of filters to add each layer (`k` in the paper). + block_config : list of int + List of integers for numbers of layers in each pooling block. + bn_size : int, default 4 + Multiplicative factor for number of bottle neck layers. + (i.e. bn_size * k features in the bottleneck layer) + dropout : float, default 0 + Rate of dropout after each dense layer. + classes : int, default 1000 + Number of classification classes. + """ + def __init__(self, + num_init_features, + growth_rate, + block_config, + bn_size=4, + dropout=0, + classes=1000, + **kwargs): + + super(DenseNet, self).__init__(**kwargs) + with self.name_scope(): + self.features = nn.HybridSequential(prefix='') + self.features.add( + nn.Conv2D(num_init_features, + kernel_size=3, + strides=1, + padding=1, + use_bias=False)) + self.features.add(nn.BatchNorm()) + self.features.add(nn.Activation('relu')) + self.features.add(nn.MaxPool2D(pool_size=3, strides=2, padding=1)) + # Add dense blocks + num_features = num_init_features + for i, num_layers in enumerate(block_config): + self.features.add( + _make_dense_block(num_layers, bn_size, growth_rate, + dropout, i + 1)) + num_features = num_features + num_layers * growth_rate + if i != len(block_config) - 1: + self.features.add(_make_transition(num_features // 2)) + num_features = num_features // 2 + self.features.add(nn.BatchNorm()) + self.features.add(nn.Activation('relu')) + #self.features.add(nn.AvgPool2D(pool_size=7)) + #self.features.add(nn.Flatten()) + + #self.output = nn.Dense(classes) + + def 
hybrid_forward(self, F, x): + x = self.features(x) + #x = self.output(x) + return x + + +# Specification +densenet_spec = { + 121: (64, 32, [6, 12, 24, 16]), + 161: (96, 48, [6, 12, 36, 24]), + 169: (64, 32, [6, 12, 32, 32]), + 201: (64, 32, [6, 12, 48, 32]) +} + + +# Constructor +def get_symbol(): + num_layers = config.num_layers + num_init_features, growth_rate, block_config = densenet_spec[num_layers] + net = DenseNet(num_init_features, + growth_rate, + block_config, + dropout=config.densenet_dropout) + data = mx.sym.Variable(name='data') + data = data - 127.5 + data = data * 0.0078125 + body = net(data) + fc1 = symbol_utils.get_fc1(body, config.emb_size, config.net_output) + return fc1 diff --git a/insightface/recognition/subcenter_arcface/symbol/fmnasnet.py b/insightface/recognition/subcenter_arcface/symbol/fmnasnet.py new file mode 100644 index 0000000000000000000000000000000000000000..118beb94298cbaaef43811fdc4154bdb47a66bd9 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/symbol/fmnasnet.py @@ -0,0 +1,213 @@ +import sys +import os +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import symbol_utils +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Act(): + if config.net_act == 'prelu': + return nn.PReLU() + else: + return nn.Activation(config.net_act) + + +def ConvBlock(channels, kernel_size, strides, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, + kernel_size, + strides=strides, + padding=1, + use_bias=False), nn.BatchNorm(scale=True), + Act() + #nn.Activation('relu') + ) + return out + + +def Conv1x1(channels, is_linear=False, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add(nn.Conv2D(channels, 1, padding=0, use_bias=False), + nn.BatchNorm(scale=True)) + if not is_linear: + #out.add(nn.Activation('relu')) + 
out.add(Act()) + return out + + +def DWise(channels, strides, kernel_size=3, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, + kernel_size, + strides=strides, + padding=kernel_size // 2, + groups=channels, + use_bias=False), nn.BatchNorm(scale=True), + Act() + #nn.Activation('relu') + ) + return out + + +class SepCONV(nn.HybridBlock): + def __init__(self, + inp, + output, + kernel_size, + depth_multiplier=1, + with_bn=True, + **kwargs): + super(SepCONV, self).__init__(**kwargs) + with self.name_scope(): + self.net = nn.HybridSequential() + cn = int(inp * depth_multiplier) + + if output is None: + self.net.add( + nn.Conv2D(in_channels=inp, + channels=cn, + groups=inp, + kernel_size=kernel_size, + strides=(1, 1), + padding=kernel_size // 2, + use_bias=not with_bn)) + else: + self.net.add( + nn.Conv2D(in_channels=inp, + channels=cn, + groups=inp, + kernel_size=kernel_size, + strides=(1, 1), + padding=kernel_size // 2, + use_bias=False), + nn.BatchNorm(), + Act(), + #nn.Activation('relu'), + nn.Conv2D(in_channels=cn, + channels=output, + kernel_size=(1, 1), + strides=(1, 1), + use_bias=not with_bn)) + + self.with_bn = with_bn + self.act = Act() + #self.act = nn.Activation('relu') + if with_bn: + self.bn = nn.BatchNorm() + + def hybrid_forward(self, F, x): + x = self.net(x) + if self.with_bn: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +class ExpandedConv(nn.HybridBlock): + def __init__(self, + inp, + oup, + t, + strides, + kernel=3, + same_shape=True, + **kwargs): + super(ExpandedConv, self).__init__(**kwargs) + + self.same_shape = same_shape + self.strides = strides + with self.name_scope(): + self.bottleneck = nn.HybridSequential() + self.bottleneck.add( + Conv1x1(inp * t, prefix="expand_"), + DWise(inp * t, self.strides, kernel, prefix="dwise_"), + Conv1x1(oup, is_linear=True, prefix="linear_")) + + def hybrid_forward(self, F, x): + out = self.bottleneck(x) + if self.strides 
== 1 and self.same_shape: + out = F.elemwise_add(out, x) + return out + + +def ExpandedConvSequence(t, k, inp, oup, repeats, first_strides, **kwargs): + seq = nn.HybridSequential(**kwargs) + with seq.name_scope(): + seq.add(ExpandedConv(inp, oup, t, first_strides, k, same_shape=False)) + curr_inp = oup + for i in range(1, repeats): + seq.add(ExpandedConv(curr_inp, oup, t, 1)) + curr_inp = oup + return seq + + +class MNasNet(nn.HybridBlock): + def __init__(self, m=1.0, **kwargs): + super(MNasNet, self).__init__(**kwargs) + + self.first_oup = int(32 * m) + self.second_oup = int(16 * m) + #self.second_oup = int(32*m) + self.interverted_residual_setting = [ + # t, c, n, s, k + [3, int(24 * m), 3, 2, 3, "stage2_"], # -> 56x56 + [3, int(40 * m), 3, 2, 5, "stage3_"], # -> 28x28 + [6, int(80 * m), 3, 2, 5, "stage4_1_"], # -> 14x14 + [6, int(96 * m), 2, 1, 3, "stage4_2_"], # -> 14x14 + [6, int(192 * m), 4, 2, 5, "stage5_1_"], # -> 7x7 + [6, int(320 * m), 1, 1, 3, "stage5_2_"], # -> 7x7 + ] + self.last_channels = int(1024 * m) + + with self.name_scope(): + self.features = nn.HybridSequential() + self.features.add( + ConvBlock(self.first_oup, 3, 1, prefix="stage1_conv0_")) + self.features.add( + SepCONV(self.first_oup, + self.second_oup, + 3, + prefix="stage1_sepconv0_")) + inp = self.second_oup + for i, (t, c, n, s, k, + prefix) in enumerate(self.interverted_residual_setting): + oup = c + self.features.add( + ExpandedConvSequence(t, k, inp, oup, n, s, prefix=prefix)) + inp = oup + + self.features.add(Conv1x1(self.last_channels, prefix="stage5_3_")) + #self.features.add(nn.GlobalAvgPool2D()) + #self.features.add(nn.Flatten()) + #self.output = nn.Dense(num_classes) + def hybrid_forward(self, F, x): + x = self.features(x) + #x = self.output(x) + return x + + def num_output_channel(self): + return self.last_channels + + +def get_symbol(): + net = MNasNet(config.net_multiplier) + data = mx.sym.Variable(name='data') + data = data - 127.5 + data = data * 0.0078125 + body = 
net(data) + fc1 = symbol_utils.get_fc1(body, + config.emb_size, + config.net_output, + input_channel=net.num_output_channel()) + return fc1 diff --git a/insightface/recognition/subcenter_arcface/symbol/fmobilefacenet.py b/insightface/recognition/subcenter_arcface/symbol/fmobilefacenet.py new file mode 100644 index 0000000000000000000000000000000000000000..f498264ea5f765a5f9ff41f12569c8e6e70810e8 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/symbol/fmobilefacenet.py @@ -0,0 +1,224 @@ +import sys +import os +import mxnet as mx +import symbol_utils +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Act(data, act_type, name): + #ignore param act_type, set it in this function + if act_type == 'prelu': + body = mx.sym.LeakyReLU(data=data, act_type='prelu', name=name) + else: + body = mx.sym.Activation(data=data, act_type=act_type, name=name) + return body + + +def Conv(data, + num_filter=1, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + num_group=1, + name=None, + suffix=''): + conv = mx.sym.Convolution(data=data, + num_filter=num_filter, + kernel=kernel, + num_group=num_group, + stride=stride, + pad=pad, + no_bias=True, + name='%s%s_conv2d' % (name, suffix)) + bn = mx.sym.BatchNorm(data=conv, + name='%s%s_batchnorm' % (name, suffix), + fix_gamma=False, + momentum=config.bn_mom) + act = Act(data=bn, + act_type=config.net_act, + name='%s%s_relu' % (name, suffix)) + return act + + +def Linear(data, + num_filter=1, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + num_group=1, + name=None, + suffix=''): + conv = mx.sym.Convolution(data=data, + num_filter=num_filter, + kernel=kernel, + num_group=num_group, + stride=stride, + pad=pad, + no_bias=True, + name='%s%s_conv2d' % (name, suffix)) + bn = mx.sym.BatchNorm(data=conv, + name='%s%s_batchnorm' % (name, suffix), + fix_gamma=False, + momentum=config.bn_mom) + return bn + + +def ConvOnly(data, + num_filter=1, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + 
num_group=1, + name=None, + suffix=''): + conv = mx.sym.Convolution(data=data, + num_filter=num_filter, + kernel=kernel, + num_group=num_group, + stride=stride, + pad=pad, + no_bias=True, + name='%s%s_conv2d' % (name, suffix)) + return conv + + +def DResidual(data, + num_out=1, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + num_group=1, + name=None, + suffix=''): + conv = Conv(data=data, + num_filter=num_group, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name='%s%s_conv_sep' % (name, suffix)) + conv_dw = Conv(data=conv, + num_filter=num_group, + num_group=num_group, + kernel=kernel, + pad=pad, + stride=stride, + name='%s%s_conv_dw' % (name, suffix)) + proj = Linear(data=conv_dw, + num_filter=num_out, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name='%s%s_conv_proj' % (name, suffix)) + return proj + + +def Residual(data, + num_block=1, + num_out=1, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + num_group=1, + name=None, + suffix=''): + identity = data + for i in range(num_block): + shortcut = identity + conv = DResidual(data=identity, + num_out=num_out, + kernel=kernel, + stride=stride, + pad=pad, + num_group=num_group, + name='%s%s_block' % (name, suffix), + suffix='%d' % i) + identity = conv + shortcut + return identity + + +def get_symbol(): + num_classes = config.emb_size + print('in_network', config) + fc_type = config.net_output + data = mx.symbol.Variable(name="data") + data = data - 127.5 + data = data * 0.0078125 + blocks = config.net_blocks + conv_1 = Conv(data, + num_filter=64, + kernel=(3, 3), + pad=(1, 1), + stride=(2, 2), + name="conv_1") + if blocks[0] == 1: + conv_2_dw = Conv(conv_1, + num_group=64, + num_filter=64, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_2_dw") + else: + conv_2_dw = Residual(conv_1, + num_block=blocks[0], + num_out=64, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + num_group=64, + name="res_2") + conv_23 = DResidual(conv_2_dw, + num_out=64, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + 
num_group=128, + name="dconv_23") + conv_3 = Residual(conv_23, + num_block=blocks[1], + num_out=64, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + num_group=128, + name="res_3") + conv_34 = DResidual(conv_3, + num_out=128, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + num_group=256, + name="dconv_34") + conv_4 = Residual(conv_34, + num_block=blocks[2], + num_out=128, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + num_group=256, + name="res_4") + conv_45 = DResidual(conv_4, + num_out=128, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + num_group=512, + name="dconv_45") + conv_5 = Residual(conv_45, + num_block=blocks[3], + num_out=128, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + num_group=256, + name="res_5") + conv_6_sep = Conv(conv_5, + num_filter=512, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_6sep") + + fc1 = symbol_utils.get_fc1(conv_6_sep, num_classes, fc_type) + return fc1 diff --git a/insightface/recognition/subcenter_arcface/symbol/fmobilenet.py b/insightface/recognition/subcenter_arcface/symbol/fmobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..bdbf8a559f3d9a374d1a4e753f509e90e4391290 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/symbol/fmobilenet.py @@ -0,0 +1,275 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys +import os +import mxnet as mx +import symbol_utils +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Act(data, act_type, name): + #ignore param act_type, set it in this function + if act_type == 'prelu': + body = mx.sym.LeakyReLU(data=data, act_type='prelu', name=name) + else: + body = mx.sym.Activation(data=data, act_type=act_type, name=name) + return body + + +def Conv(data, + num_filter=1, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + num_group=1, + name=None, + suffix=''): + conv = mx.sym.Convolution(data=data, + num_filter=num_filter, + kernel=kernel, + num_group=num_group, + stride=stride, + pad=pad, + no_bias=True, + name='%s%s_conv2d' % (name, suffix)) + bn = mx.sym.BatchNorm(data=conv, + name='%s%s_batchnorm' % (name, suffix), + fix_gamma=True) + act = Act(data=bn, + act_type=config.net_act, + name='%s%s_relu' % (name, suffix)) + return act + + +def ConvOnly(data, + num_filter=1, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + num_group=1, + name=None, + suffix=''): + conv = mx.sym.Convolution(data=data, + num_filter=num_filter, + kernel=kernel, + num_group=num_group, + stride=stride, + pad=pad, + no_bias=True, + name='%s%s_conv2d' % (name, suffix)) + return conv + + +def get_symbol(): + num_classes = config.emb_size + bn_mom = config.bn_mom + workspace = config.workspace + data = mx.symbol.Variable(name="data") # 224 + data = data - 127.5 + data = data * 0.0078125 + fc_type = config.net_output + bf = int(32 * config.net_multiplier) + if config.net_input == 0: + conv_1 = Conv(data, + num_filter=bf, + kernel=(3, 3), + pad=(1, 1), + stride=(2, 2), + name="conv_1") # 224/112 + else: + conv_1 = Conv(data, + num_filter=bf, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_1") # 224/112 + conv_2_dw = Conv(conv_1, + num_group=bf, + num_filter=bf, + kernel=(3, 3), + pad=(1, 1), + 
stride=(1, 1), + name="conv_2_dw") # 112/112 + conv_2 = Conv(conv_2_dw, + num_filter=bf * 2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_2") # 112/112 + conv_3_dw = Conv(conv_2, + num_group=bf * 2, + num_filter=bf * 2, + kernel=(3, 3), + pad=(1, 1), + stride=(2, 2), + name="conv_3_dw") # 112/56 + conv_3 = Conv(conv_3_dw, + num_filter=bf * 4, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_3") # 56/56 + conv_4_dw = Conv(conv_3, + num_group=bf * 4, + num_filter=bf * 4, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_4_dw") # 56/56 + conv_4 = Conv(conv_4_dw, + num_filter=bf * 4, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_4") # 56/56 + conv_5_dw = Conv(conv_4, + num_group=bf * 4, + num_filter=bf * 4, + kernel=(3, 3), + pad=(1, 1), + stride=(2, 2), + name="conv_5_dw") # 56/28 + conv_5 = Conv(conv_5_dw, + num_filter=bf * 8, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_5") # 28/28 + conv_6_dw = Conv(conv_5, + num_group=bf * 8, + num_filter=bf * 8, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_6_dw") # 28/28 + conv_6 = Conv(conv_6_dw, + num_filter=bf * 8, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_6") # 28/28 + conv_7_dw = Conv(conv_6, + num_group=bf * 8, + num_filter=bf * 8, + kernel=(3, 3), + pad=(1, 1), + stride=(2, 2), + name="conv_7_dw") # 28/14 + conv_7 = Conv(conv_7_dw, + num_filter=bf * 16, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_7") # 14/14 + + conv_8_dw = Conv(conv_7, + num_group=bf * 16, + num_filter=bf * 16, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_8_dw") # 14/14 + conv_8 = Conv(conv_8_dw, + num_filter=bf * 16, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_8") # 14/14 + conv_9_dw = Conv(conv_8, + num_group=bf * 16, + num_filter=bf * 16, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_9_dw") # 14/14 + conv_9 = Conv(conv_9_dw, + num_filter=bf * 16, + kernel=(1, 1), + pad=(0, 0), + 
stride=(1, 1), + name="conv_9") # 14/14 + conv_10_dw = Conv(conv_9, + num_group=bf * 16, + num_filter=bf * 16, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_10_dw") # 14/14 + conv_10 = Conv(conv_10_dw, + num_filter=bf * 16, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_10") # 14/14 + conv_11_dw = Conv(conv_10, + num_group=bf * 16, + num_filter=bf * 16, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_11_dw") # 14/14 + conv_11 = Conv(conv_11_dw, + num_filter=bf * 16, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_11") # 14/14 + conv_12_dw = Conv(conv_11, + num_group=bf * 16, + num_filter=bf * 16, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_12_dw") # 14/14 + conv_12 = Conv(conv_12_dw, + num_filter=bf * 16, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_12") # 14/14 + + conv_13_dw = Conv(conv_12, + num_group=bf * 16, + num_filter=bf * 16, + kernel=(3, 3), + pad=(1, 1), + stride=(2, 2), + name="conv_13_dw") # 14/7 + conv_13 = Conv(conv_13_dw, + num_filter=bf * 32, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_13") # 7/7 + conv_14_dw = Conv(conv_13, + num_group=bf * 32, + num_filter=bf * 32, + kernel=(3, 3), + pad=(1, 1), + stride=(1, 1), + name="conv_14_dw") # 7/7 + conv_14 = Conv(conv_14_dw, + num_filter=bf * 32, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="conv_14") # 7/7 + body = conv_14 + fc1 = symbol_utils.get_fc1(body, num_classes, fc_type) + return fc1 diff --git a/insightface/recognition/subcenter_arcface/symbol/fresnet.py b/insightface/recognition/subcenter_arcface/symbol/fresnet.py new file mode 100644 index 0000000000000000000000000000000000000000..7b17788d1a58d82161ef1931478c306656ffdbd9 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/symbol/fresnet.py @@ -0,0 +1,1191 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +''' +Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py +Original author Wei Wu + +Implemented the following paper: + +Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks" +''' +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import sys +import os +import mxnet as mx +import numpy as np +import symbol_utils +import memonger +import sklearn +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Conv(**kwargs): + #name = kwargs.get('name') + #_weight = mx.symbol.Variable(name+'_weight') + #_bias = mx.symbol.Variable(name+'_bias', lr_mult=2.0, wd_mult=0.0) + #body = mx.sym.Convolution(weight = _weight, bias = _bias, **kwargs) + body = mx.sym.Convolution(**kwargs) + return body + + +def Act(data, act_type, name): + if act_type == 'prelu': + body = mx.sym.LeakyReLU(data=data, act_type='prelu', name=name) + else: + body = mx.symbol.Activation(data=data, act_type=act_type, name=name) + return body + + +def residual_unit_v1(data, num_filter, stride, dim_match, name, bottle_neck, + **kwargs): + """Return ResNet Unit symbol for building ResNet + Parameters + ---------- + data : str + Input data + num_filter : int + 
Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the same, otherwise means differ + name : str + Base name of the operators + workspace : int + Workspace used in convolution operator + """ + use_se = kwargs.get('version_se', 1) + bn_mom = kwargs.get('bn_mom', 0.9) + workspace = kwargs.get('workspace', 256) + memonger = kwargs.get('memonger', False) + act_type = kwargs.get('version_act', 'prelu') + #print('in unit1') + if bottle_neck: + conv1 = Conv(data=data, + num_filter=int(num_filter * 0.25), + kernel=(1, 1), + stride=stride, + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn1 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn2 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act2 = Act(data=bn2, act_type=act_type, name=name + '_relu2') + conv3 = Conv(data=act2, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + bn3 = mx.sym.BatchNorm(data=conv3, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + + if use_se: + #se begin + body = mx.sym.Pooling(data=bn3, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + 
kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn3 = mx.symbol.broadcast_mul(bn3, body) + #se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return Act(data=bn3 + shortcut, + act_type=act_type, + name=name + '_relu3') + else: + conv1 = Conv(data=data, + num_filter=num_filter, + kernel=(3, 3), + stride=stride, + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn1 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn2 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn2') + if use_se: + #se begin + body = mx.sym.Pooling(data=bn2, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn2 = mx.symbol.broadcast_mul(bn2, body) + #se end + + if dim_match: + shortcut = data + 
else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return Act(data=bn2 + shortcut, + act_type=act_type, + name=name + '_relu3') + + +def residual_unit_v1_L(data, num_filter, stride, dim_match, name, bottle_neck, + **kwargs): + """Return ResNet Unit symbol for building ResNet + Parameters + ---------- + data : str + Input data + num_filter : int + Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the same, otherwise means differ + name : str + Base name of the operators + workspace : int + Workspace used in convolution operator + """ + use_se = kwargs.get('version_se', 1) + bn_mom = kwargs.get('bn_mom', 0.9) + workspace = kwargs.get('workspace', 256) + memonger = kwargs.get('memonger', False) + act_type = kwargs.get('version_act', 'prelu') + #print('in unit1') + if bottle_neck: + conv1 = Conv(data=data, + num_filter=int(num_filter * 0.25), + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn1 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn2 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act2 = Act(data=bn2, act_type=act_type, name=name + '_relu2') + conv3 = Conv(data=act2, + num_filter=num_filter, + kernel=(1, 1), 
+ stride=stride, + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + bn3 = mx.sym.BatchNorm(data=conv3, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + + if use_se: + #se begin + body = mx.sym.Pooling(data=bn3, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn3 = mx.symbol.broadcast_mul(bn3, body) + #se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return Act(data=bn3 + shortcut, + act_type=act_type, + name=name + '_relu3') + else: + conv1 = Conv(data=data, + num_filter=num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn1 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=num_filter, + kernel=(3, 3), + stride=stride, + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn2 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn2') + if use_se: + #se begin + body = mx.sym.Pooling(data=bn2, + 
global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn2 = mx.symbol.broadcast_mul(bn2, body) + #se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return Act(data=bn2 + shortcut, + act_type=act_type, + name=name + '_relu3') + + +def residual_unit_v2(data, num_filter, stride, dim_match, name, bottle_neck, + **kwargs): + """Return ResNet Unit symbol for building ResNet + Parameters + ---------- + data : str + Input data + num_filter : int + Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the same, otherwise means differ + name : str + Base name of the operators + workspace : int + Workspace used in convolution operator + """ + use_se = kwargs.get('version_se', 1) + bn_mom = kwargs.get('bn_mom', 0.9) + workspace = kwargs.get('workspace', 256) + memonger = kwargs.get('memonger', False) + act_type = kwargs.get('version_act', 'prelu') + #print('in unit2') + if bottle_neck: + # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper + bn1 = 
mx.sym.BatchNorm(data=data, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv1 = Conv(data=act1, + num_filter=int(num_filter * 0.25), + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn2 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act2 = Act(data=bn2, act_type=act_type, name=name + '_relu2') + conv2 = Conv(data=act2, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=stride, + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn3 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + act3 = Act(data=bn3, act_type=act_type, name=name + '_relu3') + conv3 = Conv(data=act3, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + if use_se: + #se begin + body = mx.sym.Pooling(data=conv3, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + conv3 = mx.symbol.broadcast_mul(conv3, body) + if dim_match: + shortcut = data + else: + shortcut = Conv(data=act1, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return conv3 + shortcut + else: + bn1 = mx.sym.BatchNorm(data=data, + 
fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv1 = Conv(data=act1, + num_filter=num_filter, + kernel=(3, 3), + stride=stride, + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn2 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn2') + act2 = Act(data=bn2, act_type=act_type, name=name + '_relu2') + conv2 = Conv(data=act2, + num_filter=num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + if use_se: + #se begin + body = mx.sym.Pooling(data=conv2, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + conv2 = mx.symbol.broadcast_mul(conv2, body) + if dim_match: + shortcut = data + else: + shortcut = Conv(data=act1, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return conv2 + shortcut + + +def residual_unit_v3(data, num_filter, stride, dim_match, name, bottle_neck, + **kwargs): + """Return ResNet Unit symbol for building ResNet + Parameters + ---------- + data : str + Input data + num_filter : int + Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the 
same, otherwise means differ + name : str + Base name of the operators + workspace : int + Workspace used in convolution operator + """ + use_se = kwargs.get('version_se', 1) + bn_mom = kwargs.get('bn_mom', 0.9) + workspace = kwargs.get('workspace', 256) + memonger = kwargs.get('memonger', False) + act_type = kwargs.get('version_act', 'prelu') + #print('in unit3') + if bottle_neck: + bn1 = mx.sym.BatchNorm(data=data, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + conv1 = Conv(data=bn1, + num_filter=int(num_filter * 0.25), + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn2 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act1 = Act(data=bn2, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn3 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + act2 = Act(data=bn3, act_type=act_type, name=name + '_relu2') + conv3 = Conv(data=act2, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + bn4 = mx.sym.BatchNorm(data=conv3, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn4') + + if use_se: + #se begin + body = mx.sym.Pooling(data=bn4, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = 
mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn4 = mx.symbol.broadcast_mul(bn4, body) + #se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return bn4 + shortcut + else: + bn1 = mx.sym.BatchNorm(data=data, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + conv1 = Conv(data=bn1, + num_filter=num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn2 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act1 = Act(data=bn2, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=num_filter, + kernel=(3, 3), + stride=stride, + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn3 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + if use_se: + #se begin + body = mx.sym.Pooling(data=bn3, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn3 = mx.symbol.broadcast_mul(bn3, body) + #se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + 
num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return bn3 + shortcut + + +def residual_unit_v3_x(data, num_filter, stride, dim_match, name, bottle_neck, + **kwargs): + """Return ResNeXt Unit symbol for building ResNeXt + Parameters + ---------- + data : str + Input data + num_filter : int + Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the same, otherwise means differ + name : str + Base name of the operators + workspace : int + Workspace used in convolution operator + """ + assert (bottle_neck) + use_se = kwargs.get('version_se', 1) + bn_mom = kwargs.get('bn_mom', 0.9) + workspace = kwargs.get('workspace', 256) + memonger = kwargs.get('memonger', False) + act_type = kwargs.get('version_act', 'prelu') + num_group = 32 + #print('in unit3') + bn1 = mx.sym.BatchNorm(data=data, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + conv1 = Conv(data=bn1, + num_group=num_group, + num_filter=int(num_filter * 0.5), + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn2 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act1 = Act(data=bn2, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_group=num_group, + num_filter=int(num_filter * 0.5), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn3 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + act2 = Act(data=bn3, act_type=act_type, 
name=name + '_relu2') + conv3 = Conv(data=act2, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + bn4 = mx.sym.BatchNorm(data=conv3, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn4') + + if use_se: + #se begin + body = mx.sym.Pooling(data=bn4, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn4 = mx.symbol.broadcast_mul(bn4, body) + #se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return bn4 + shortcut + + +def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck, + **kwargs): + uv = kwargs.get('version_unit', 3) + version_input = kwargs.get('version_input', 1) + if uv == 1: + if version_input == 0: + return residual_unit_v1(data, num_filter, stride, dim_match, name, + bottle_neck, **kwargs) + else: + return residual_unit_v1_L(data, num_filter, stride, dim_match, + name, bottle_neck, **kwargs) + elif uv == 2: + return residual_unit_v2(data, num_filter, stride, dim_match, name, + bottle_neck, **kwargs) + elif uv == 4: + return residual_unit_v4(data, num_filter, stride, dim_match, name, + bottle_neck, **kwargs) + else: + return 
residual_unit_v3(data, num_filter, stride, dim_match, name, + bottle_neck, **kwargs) + + +def resnet(units, num_stages, filter_list, num_classes, bottle_neck): + bn_mom = config.bn_mom + workspace = config.workspace + kwargs = { + 'version_se': config.net_se, + 'version_input': config.net_input, + 'version_output': config.net_output, + 'version_unit': config.net_unit, + 'version_act': config.net_act, + 'bn_mom': bn_mom, + 'workspace': workspace, + 'memonger': config.memonger, + } + """Return ResNet symbol of + Parameters + ---------- + units : list + Number of units in each stage + num_stages : int + Number of stage + filter_list : list + Channel size of each stage + num_classes : int + Ouput size of symbol + dataset : str + Dataset type, only cifar10 and imagenet supports + workspace : int + Workspace used in convolution operator + """ + version_se = kwargs.get('version_se', 1) + version_input = kwargs.get('version_input', 1) + assert version_input >= 0 + version_output = kwargs.get('version_output', 'E') + fc_type = version_output + version_unit = kwargs.get('version_unit', 3) + act_type = kwargs.get('version_act', 'prelu') + memonger = kwargs.get('memonger', False) + print(version_se, version_input, version_output, version_unit, act_type, + memonger) + num_unit = len(units) + assert (num_unit == num_stages) + data = mx.sym.Variable(name='data') + if config.fp16: + data = mx.sym.Cast(data=data, dtype=np.float16) + if version_input == 0: + #data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') + data = mx.sym.identity(data=data, name='id') + data = data - 127.5 + data = data * 0.0078125 + body = Conv(data=data, + num_filter=filter_list[0], + kernel=(7, 7), + stride=(2, 2), + pad=(3, 3), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=act_type, name='relu0') + #body = 
mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max') + elif version_input == 2: + data = mx.sym.BatchNorm(data=data, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='bn_data') + body = Conv(data=data, + num_filter=filter_list[0], + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=act_type, name='relu0') + else: + data = mx.sym.identity(data=data, name='id') + data = data - 127.5 + data = data * 0.0078125 + body = data + body = Conv(data=body, + num_filter=filter_list[0], + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=act_type, name='relu0') + + for i in range(num_stages): + #if version_input==0: + # body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False, + # name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, **kwargs) + #else: + # body = residual_unit(body, filter_list[i+1], (2, 2), False, + # name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, **kwargs) + if i==num_stages-1 and config.fp16: + body = mx.sym.Cast(data=body, dtype=np.float32) + body = residual_unit(body, + filter_list[i + 1], (2, 2), + False, + name='stage%d_unit%d' % (i + 1, 1), + bottle_neck=bottle_neck, + **kwargs) + for j in range(units[i] - 1): + body = residual_unit(body, + filter_list[i + 1], (1, 1), + True, + name='stage%d_unit%d' % (i + 1, j + 2), + bottle_neck=bottle_neck, + **kwargs) + + if bottle_neck: + body = Conv(data=body, + num_filter=512, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + name="convd", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, 
+ name='bnd') + body = Act(data=body, act_type=act_type, name='relud') + + fc1 = symbol_utils.get_fc1(body, num_classes, fc_type) + return fc1 + + +def get_symbol(): + """ + Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py + Original author Wei Wu + """ + num_classes = config.emb_size + num_layers = config.num_layers + if num_layers >= 500: + filter_list = [64, 256, 512, 1024, 2048] + bottle_neck = True + else: + filter_list = [64, 64, 128, 256, 512] + bottle_neck = False + num_stages = 4 + if num_layers == 18: + units = [2, 2, 2, 2] + elif num_layers == 34: + units = [3, 4, 6, 3] + elif num_layers == 49: + units = [3, 4, 14, 3] + elif num_layers == 50: + units = [3, 4, 14, 3] + elif num_layers == 74: + units = [3, 6, 24, 3] + elif num_layers == 90: + units = [3, 8, 30, 3] + elif num_layers == 98: + units = [3, 4, 38, 3] + elif num_layers == 99: + units = [3, 8, 35, 3] + elif num_layers == 100: + units = [3, 13, 30, 3] + elif num_layers == 134: + units = [3, 10, 50, 3] + elif num_layers == 136: + units = [3, 13, 48, 3] + elif num_layers == 140: + units = [3, 15, 48, 3] + elif num_layers == 124: + units = [3, 13, 40, 5] + elif num_layers == 160: + units = [3, 24, 49, 3] + elif num_layers == 101: + units = [3, 4, 23, 3] + elif num_layers == 152: + units = [3, 8, 36, 3] + elif num_layers == 200: + units = [3, 24, 36, 3] + elif num_layers == 269: + units = [3, 30, 48, 8] + else: + raise ValueError( + "no experiments done on num_layers {}, you can do it yourself". 
+ format(num_layers)) + + net = resnet(units=units, + num_stages=num_stages, + filter_list=filter_list, + num_classes=num_classes, + bottle_neck=bottle_neck) + + if config.memonger: + dshape = (config.per_batch_size, config.image_shape[2], + config.image_shape[0], config.image_shape[1]) + net_mem_planned = memonger.search_plan(net, data=dshape) + old_cost = memonger.get_cost(net, data=dshape) + new_cost = memonger.get_cost(net_mem_planned, data=dshape) + + print('Old feature map cost=%d MB' % old_cost) + print('New feature map cost=%d MB' % new_cost) + net = net_mem_planned + return net diff --git a/insightface/recognition/subcenter_arcface/symbol/memonger.py b/insightface/recognition/subcenter_arcface/symbol/memonger.py new file mode 100644 index 0000000000000000000000000000000000000000..8ad610b57b821ec6b8f0087ee2569ad6fda4d177 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/symbol/memonger.py @@ -0,0 +1,175 @@ +import mxnet as mx +import math + + +def prod(shape): + """Get product of the shape. + """ + ret = 1 + for s in shape: + ret *= s + return ret + + +def is_param(name): + """Quick script to check if name is a parameter. + """ + if name == 'data': + return False + if name.endswith('weight'): + return True + if name.endswith('bias'): + return True + if name.endswith('beta'): + return True + if name.endswith('gamma'): + return True + return False + + +def make_mirror_plan(sym, threshold, plan_info=None, **kwargs): + """Memory allocation planner with a given threshold. + + The user can pass in a network configuration, + a threshold that limits memory per block. + And input shape configurations. + + Parameters + ---------- + sym : symbol + Input configuration of symbols. + The user need to pre-mark the attribute "mirror_stage" on the nodes + that can be book-kept as stage + + The algorithm will decide whether to disbale mirror on the stage nodes. 
+ + threshold: integer + A tuning parameter to tune the approximate size of each stage blocks + + plan_info: dict, optional + Used to hold plan information. + + **kwargs: + The arguments to infer shape. + + Returns + ------- + alloc_sym: symbol + A symbol with force mirror tagged on the nodes for better allocation. + """ + threshold = threshold << 20 + sym = sym.__copy__() + internals = sym.get_internals() + _, out_shapes, _ = internals.infer_shape(**kwargs) + shape_dict = list(zip(internals.list_outputs(), out_shapes)) + total_size = 0 + param_size = 0 + local_size = 0 + save_size = 0 + max_size = 0 + last_sb = None + last_local = 0 + period = 1 + last_stage = '' + stage_decision = '' + + for idx, item in enumerate(shape_dict): + sb = internals[idx] + name, shape = item + if is_param(name): + param_size += prod(shape) * 4 + continue + else: + total_size += prod(shape) * 4 + local_size += prod(shape) * 4 + sb._set_attr(force_mirroring='True') + + if sb.attr('mirror_stage') is not None: + stage = sb.attr('mirror_stage') + if stage == 'True' or stage != last_stage: + if local_size > threshold: + save_size += prod(shape) * 4 + max_size = max(max_size, local_size) + local_size = 0 + stage_decision = 'False' + sb._set_attr(force_mirroring=stage_decision) + else: + stage_decision = 'True' + pass + last_stage = stage + elif stage == last_stage and stage_decision == 'False': + save_size += prod(shape) * 4 + sb._set_attr(force_mirroring=stage_decision) + + if plan_info is not None: + plan_info['max_size'] = max_size + plan_info['save_size'] = save_size + return sym + + +def get_cost(sym, type_dict=None, **kwargs): + """Get the cost of the current symbolic plan by running bind on CPU. 
+ + sym : Symbolic Variable + + """ + texec = sym.simple_bind(ctx=mx.gpu(), + grad_req='write', + type_dict=type_dict, + **kwargs) + return int(texec.debug_str().split('\n')[-3].split()[1]) + + +def search_plan(sym, ntrial=6, type_dict=None, **kwargs): + """Quickly heurestic search over possible plans to find good memory plan. + + Parameters + ---------- + sym : symbolic + Symbolic configurations + + ntrial: integer + Additional grid search steps + """ + history = [] + threshold = 0 + min_threshold = None + min_cost = None + nbegin = 3 + + for k in range(nbegin): + info = {} + sym = make_mirror_plan(sym, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + save_size = info['save_size'] >> 20 + local_size = info['max_size'] >> 20 + guess = int(math.sqrt(save_size * local_size / 2)) + if min_cost is None or min_cost > cost: + min_cost = cost + if min_threshold is None or local_size < min_threshold: + min_threshold = local_size + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold = guess + + max_threshold = threshold * math.sqrt(2) + step = int((max_threshold - min_threshold) / ntrial) + threshold = min_threshold + step + if step > 0: + for k in range(ntrial): + sym = make_mirror_plan(sym, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold += step + + history.sort(key=lambda x: x[0]) + cost, threshold, sym = history[0] + print('Find best plan with threshold=%d, cost=%d MB' % (threshold, cost)) + return sym diff --git a/insightface/recognition/subcenter_arcface/symbol/memonger_v2.py b/insightface/recognition/subcenter_arcface/symbol/memonger_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..92963de5e4256855fa3192107b2e584c1fa449bb --- /dev/null +++ 
b/insightface/recognition/subcenter_arcface/symbol/memonger_v2.py @@ -0,0 +1,300 @@ +import mxnet as mx +import math + + +def prod(shape): + """Get product of the shape. + """ + ret = 1 + for s in shape: + ret *= s + return ret + + +def is_param(name): + """Quick script to check if name is a parameter. + """ + if name == 'data': + return False + if name.endswith('weight'): + return True + if name.endswith('bias'): + return True + if name.endswith('beta'): + return True + if name.endswith('gamma'): + return True + return False + + +def make_mirror_plan(sym, threshold, plan_info=None, **kwargs): + """Memory allocation planner with a given threshold. + + The user can pass in a network configuration, + a threshold that limits memory per block. + And input shape configurations. + + Parameters + ---------- + sym : symbol + Input configuration of symbols. + The user need to pre-mark the attribute "mirror_stage" on the nodes + that can be book-kept as stage + + The algorithm will decide whether to disbale mirror on the stage nodes. + + threshold: integer + A tuning parameter to tune the approximate size of each stage blocks + + plan_info: dict, optional + Used to hold plan information. + + **kwargs: + The arguments to infer shape. + + Returns + ------- + alloc_sym: symbol + A symbol with force mirror tagged on the nodes for better allocation. 
+ """ + threshold = threshold << 20 + sym = sym.__copy__() + internals = sym.get_internals() + _, out_shapes, _ = internals.infer_shape(**kwargs) + shape_dict = list(zip(internals.list_outputs(), out_shapes)) + total_size = 0 + param_size = 0 + local_size = 0 + save_size = 0 + max_size = 0 + last_sb = None + last_local = 0 + period = 1 + last_stage = '' + stage_decision = '' + + for idx, item in enumerate(shape_dict): + sb = internals[idx] + name, shape = item + if is_param(name): + param_size += prod(shape) * 4 + continue + else: + total_size += prod(shape) * 4 + local_size += prod(shape) * 4 + sb._set_attr(force_mirroring='True') + + if sb.attr('mirror_stage') is not None: + stage = sb.attr('mirror_stage') + if stage == 'True' or stage != last_stage: + if local_size > threshold: + save_size += prod(shape) * 4 + max_size = max(max_size, local_size) + local_size = 0 + stage_decision = 'False' + sb._set_attr(force_mirroring=stage_decision) + else: + stage_decision = 'True' + pass + last_stage = stage + elif stage == last_stage and stage_decision == 'False': + save_size += prod(shape) * 4 + sb._set_attr(force_mirroring=stage_decision) + + if plan_info is not None: + plan_info['max_size'] = max_size + plan_info['save_size'] = save_size + return sym + + +def get_cost(sym, type_dict=None, **kwargs): + """Get the cost of the current symbolic plan by running bind on CPU. + + sym : Symbolic Variable + + """ + texec = sym.simple_bind(ctx=mx.gpu(), + grad_req='write', + type_dict=type_dict, + **kwargs) + return int(texec.debug_str().split('\n')[-3].split()[1]) + + +def search_plan(sym, ntrial=6, type_dict=None, **kwargs): + """Quickly heurestic search over possible plans to find good memory plan. 
+ + Parameters + ---------- + sym : symbolic + Symbolic configurations + + ntrial: integer + Additional grid search steps + """ + history = [] + threshold = 0 + min_threshold = None + min_cost = None + nbegin = 3 + + for k in range(nbegin): + info = {} + sym = make_mirror_plan(sym, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + save_size = info['save_size'] >> 20 + local_size = info['max_size'] >> 20 + guess = int(math.sqrt(save_size * local_size / 2)) + if min_cost is None or min_cost > cost: + min_cost = cost + if min_threshold is None or local_size < min_threshold: + min_threshold = local_size + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold = guess + + max_threshold = threshold * math.sqrt(2) + step = int((max_threshold - min_threshold) / ntrial) + threshold = min_threshold + step + if step > 0: + for k in range(ntrial): + sym = make_mirror_plan(sym, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold += step + + history.sort(key=lambda x: x[0]) + cost, threshold, sym = history[0] + print('Find best plan with threshold=%d, cost=%d MB' % (threshold, cost)) + return sym + + +def make_mirror_plan_to_layer(sym, + layer_name, + threshold, + plan_info=None, + **kwargs): + """ + sym is the original symbal + layer_name is a name to which layer of the network should be set as mirror + threshhold is the approximate size of each mirror block + """ + threshold = threshold << 20 + sym = sym.__copy__() + internals = sym.get_internals() + _, out_shapes, _ = internals.infer_shape(**kwargs) + shape_dict = list(zip(internals.list_outputs(), out_shapes)) + total_size = 0 + param_size = 0 + local_size = 0 + save_size = 0 + max_size = 0 + last_stage = '' + stage_decision = '' + switch = True + + for 
idx, item in enumerate(shape_dict): + sb = internals[idx] + name, shape = item + #print(name, switch) + if is_param(name): + param_size += prod(shape) * 4 + continue + elif switch and not 'bn' in name: + total_size += prod(shape) * 4 + local_size += prod(shape) * 4 + sb._set_attr(force_mirroring='True') + print('set force_mirroring', name, total_size, local_size) + if layer_name != '' and layer_name in name: + switch = False + + if sb.attr('mirror_stage') is not None: + stage = sb.attr('mirror_stage') + #print(name, stage) + if stage == 'True' or stage != last_stage: + if local_size > threshold: + save_size += prod(shape) * 4 + max_size = max(max_size, local_size) + local_size = 0 + stage_decision = 'False' + sb._set_attr(force_mirroring=stage_decision) + else: + stage_decision = 'True' + pass + last_stage = stage + elif stage == last_stage and stage_decision == 'False': + save_size += prod(shape) * 4 + sb._set_attr(force_mirroring=stage_decision) + + if plan_info is not None: + plan_info['max_size'] = max_size + plan_info['save_size'] = save_size + return sym + + +def search_plan_to_layer(sym, + layer_name=None, + threshold=500, + ntrial=6, + type_dict=None, + **kwargs): + """Quickly heurestic search over possible plans to find good memory plan. 
+ + Parameters + ---------- + sym : symbolic + Symbolic configurations + + ntrial: integer + Additional grid search steps + """ + history = [] + min_threshold = None + min_cost = None + nbegin = 10 + + for k in range(nbegin): + info = {} + sym = make_mirror_plan_to_layer(sym, + layer_name=layer_name, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + save_size = info['save_size'] >> 20 + local_size = info['max_size'] >> 20 + guess = 300 * (k + 1) + if min_cost is None or min_cost > cost: + min_cost = cost + if min_threshold is None or local_size < min_threshold: + min_threshold = local_size + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold = guess + + max_threshold = threshold * math.sqrt(2) + step = int((max_threshold - min_threshold) / ntrial) + print(min_threshold, max_threshold, step) + threshold = min_threshold + step + if step > 0: + for k in range(ntrial): + sym = make_mirror_plan_to_layer(sym, + layer_name=layer_name, + threshold=threshold, + plan_info=info, + **kwargs) + cost = get_cost(sym, type_dict, **kwargs) + print("Search threshold=%d MB, cost=%d MB" % (threshold, cost)) + history.append((cost, threshold, sym)) + threshold += step + + history.sort(key=lambda x: x[0]) + cost, threshold, sym = history[0] + print('Find best plan with threshold=%d, cost=%d MB' % (threshold, cost)) + return sym diff --git a/insightface/recognition/subcenter_arcface/symbol/symbol_utils.py b/insightface/recognition/subcenter_arcface/symbol/symbol_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3eb6f1cf96cb674aea22250cf372f6aab95590f8 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/symbol/symbol_utils.py @@ -0,0 +1,595 @@ +import sys +import os +import mxnet as mx +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Conv(**kwargs): + #name = kwargs.get('name') + #_weight 
= mx.symbol.Variable(name+'_weight') + #_bias = mx.symbol.Variable(name+'_bias', lr_mult=2.0, wd_mult=0.0) + #body = mx.sym.Convolution(weight = _weight, bias = _bias, **kwargs) + body = mx.sym.Convolution(**kwargs) + return body + + +def Act(data, act_type, name): + #ignore param act_type, set it in this function + if act_type == 'prelu': + body = mx.sym.LeakyReLU(data=data, act_type='prelu', name=name) + else: + body = mx.sym.Activation(data=data, act_type=act_type, name=name) + return body + + +bn_mom = config.bn_mom + + +def Linear(data, + num_filter=1, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + num_group=1, + name=None, + suffix=''): + conv = mx.sym.Convolution(data=data, + num_filter=num_filter, + kernel=kernel, + num_group=num_group, + stride=stride, + pad=pad, + no_bias=True, + name='%s%s_conv2d' % (name, suffix)) + bn = mx.sym.BatchNorm(data=conv, + name='%s%s_batchnorm' % (name, suffix), + fix_gamma=False, + momentum=bn_mom) + return bn + + +def get_fc1(last_conv, num_classes, fc_type, input_channel=512): + body = last_conv + if fc_type == 'Z': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + body = mx.symbol.Dropout(data=body, p=0.4) + fc1 = body + elif fc_type == 'E': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + body = mx.symbol.Dropout(data=body, p=0.4) + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'FC': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'SFC': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + 
momentum=bn_mom, + name='bn1') + body = Conv(data=body, + num_filter=input_channel, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + no_bias=True, + name="convf", + num_group=input_channel) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bnf') + body = Act(data=body, act_type=config.net_act, name='reluf') + body = Conv(data=body, + num_filter=input_channel, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + name="convf2") + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bnf2') + body = Act(data=body, act_type=config.net_act, name='reluf2') + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'GAP': + bn1 = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + relu1 = Act(data=bn1, act_type=config.net_act, name='relu1') + # Although kernel is not used here when global_pool=True, we should put one + pool1 = mx.sym.Pooling(data=relu1, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name='pool1') + flat = mx.sym.Flatten(data=pool1) + fc1 = mx.sym.FullyConnected(data=flat, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'GNAP': #mobilefacenet++ + filters_in = 512 # param in mobilefacenet + if num_classes > filters_in: + body = mx.sym.Convolution(data=last_conv, + num_filter=num_classes, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + name='convx') + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=0.9, + name='convx_bn') + body = Act(data=body, act_type=config.net_act, name='convx_relu') + filters_in = num_classes + else: + body = last_conv + body = mx.sym.BatchNorm(data=body, + fix_gamma=True, + eps=2e-5, + momentum=0.9, + 
name='bn6f') + + spatial_norm = body * body + spatial_norm = mx.sym.sum(data=spatial_norm, axis=1, keepdims=True) + spatial_sqrt = mx.sym.sqrt(spatial_norm) + #spatial_mean=mx.sym.mean(spatial_sqrt, axis=(1,2,3), keepdims=True) + spatial_mean = mx.sym.mean(spatial_sqrt) + spatial_div_inverse = mx.sym.broadcast_div(spatial_mean, spatial_sqrt) + + spatial_attention_inverse = mx.symbol.tile(spatial_div_inverse, + reps=(1, filters_in, 1, 1)) + body = body * spatial_attention_inverse + #body = mx.sym.broadcast_mul(body, spatial_div_inverse) + + fc1 = mx.sym.Pooling(body, + kernel=(7, 7), + global_pool=True, + pool_type='avg') + if num_classes < filters_in: + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=0.9, + name='bn6w') + fc1 = mx.sym.FullyConnected(data=fc1, + num_hidden=num_classes, + name='pre_fc1') + else: + fc1 = mx.sym.Flatten(data=fc1) + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=0.9, + name='fc1') + elif fc_type == "GDC": #mobilefacenet_v1 + conv_6_dw = Linear(last_conv, + num_filter=input_channel, + num_group=input_channel, + kernel=(7, 7), + pad=(0, 0), + stride=(1, 1), + name="conv_6dw7_7") + conv_6_f = mx.sym.FullyConnected(data=conv_6_dw, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=conv_6_f, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'F': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + body = mx.symbol.Dropout(data=body, p=0.4) + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='fc1') + elif fc_type == 'G': + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='fc1') + elif fc_type == 'H': + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='fc1') + elif fc_type == 'I': + body = mx.sym.BatchNorm(data=body, + 
fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn1') + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + elif fc_type == 'J': + fc1 = mx.sym.FullyConnected(data=body, + num_hidden=num_classes, + name='pre_fc1') + fc1 = mx.sym.BatchNorm(data=fc1, + fix_gamma=True, + eps=2e-5, + momentum=bn_mom, + name='fc1') + return fc1 + + +def residual_unit_v3(data, num_filter, stride, dim_match, name, **kwargs): + """Return ResNet Unit symbol for building ResNet + Parameters + ---------- + data : str + Input data + num_filter : int + Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the same, otherwise means differ + name : str + Base name of the operators + workspace : int + Workspace used in convolution operator + """ + bn_mom = kwargs.get('bn_mom', 0.9) + workspace = kwargs.get('workspace', 256) + memonger = kwargs.get('memonger', False) + #print('in unit3') + bn1 = mx.sym.BatchNorm(data=data, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + conv1 = Conv(data=bn1, + num_filter=num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn2 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act1 = Act(data=bn2, act_type=config.net_act, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=num_filter, + kernel=(3, 3), + stride=stride, + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn3 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 
1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return bn3 + shortcut + + +def residual_unit_v1l(data, num_filter, stride, dim_match, name, bottle_neck): + """Return ResNet Unit symbol for building ResNet + Parameters + ---------- + data : str + Input data + num_filter : int + Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the same, otherwise means differ + name : str + Base name of the operators + workspace : int + Workspace used in convolution operator + """ + workspace = config.workspace + bn_mom = config.bn_mom + memonger = False + use_se = config.net_se + act_type = config.net_act + #print('in unit1') + if bottle_neck: + conv1 = Conv(data=data, + num_filter=int(num_filter * 0.25), + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn1 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=int(num_filter * 0.25), + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn2 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn2') + act2 = Act(data=bn2, act_type=act_type, name=name + '_relu2') + conv3 = Conv(data=act2, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + pad=(0, 0), + no_bias=True, + workspace=workspace, + name=name + '_conv3') + bn3 = mx.sym.BatchNorm(data=conv3, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_bn3') + + if use_se: + 
#se begin + body = mx.sym.Pooling(data=bn3, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) + body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn3 = mx.symbol.broadcast_mul(bn3, body) + #se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return Act(data=bn3 + shortcut, + act_type=act_type, + name=name + '_relu3') + else: + conv1 = Conv(data=data, + num_filter=num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv1') + bn1 = mx.sym.BatchNorm(data=conv1, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn1') + act1 = Act(data=bn1, act_type=act_type, name=name + '_relu1') + conv2 = Conv(data=act1, + num_filter=num_filter, + kernel=(3, 3), + stride=stride, + pad=(1, 1), + no_bias=True, + workspace=workspace, + name=name + '_conv2') + bn2 = mx.sym.BatchNorm(data=conv2, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_bn2') + if use_se: + #se begin + body = mx.sym.Pooling(data=bn2, + global_pool=True, + kernel=(7, 7), + pool_type='avg', + name=name + '_se_pool1') + body = Conv(data=body, + num_filter=num_filter // 16, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1", + workspace=workspace) 
+ body = Act(data=body, act_type=act_type, name=name + '_se_relu1') + body = Conv(data=body, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv2", + workspace=workspace) + body = mx.symbol.Activation(data=body, + act_type='sigmoid', + name=name + "_se_sigmoid") + bn2 = mx.symbol.broadcast_mul(bn2, body) + #se end + + if dim_match: + shortcut = data + else: + conv1sc = Conv(data=data, + num_filter=num_filter, + kernel=(1, 1), + stride=stride, + no_bias=True, + workspace=workspace, + name=name + '_conv1sc') + shortcut = mx.sym.BatchNorm(data=conv1sc, + fix_gamma=False, + momentum=bn_mom, + eps=2e-5, + name=name + '_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return Act(data=bn2 + shortcut, + act_type=act_type, + name=name + '_relu3') + + +def get_head(data, version_input, num_filter): + bn_mom = config.bn_mom + workspace = config.workspace + kwargs = {'bn_mom': bn_mom, 'workspace': workspace} + data = data - 127.5 + data = data * 0.0078125 + #data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') + if version_input == 0: + body = Conv(data=data, + num_filter=num_filter, + kernel=(7, 7), + stride=(2, 2), + pad=(3, 3), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=config.net_act, name='relu0') + body = mx.sym.Pooling(data=body, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + pool_type='max') + else: + body = data + _num_filter = min(num_filter, 64) + body = Conv(data=body, + num_filter=_num_filter, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + no_bias=True, + name="conv0", + workspace=workspace) + body = mx.sym.BatchNorm(data=body, + fix_gamma=False, + eps=2e-5, + momentum=bn_mom, + name='bn0') + body = Act(data=body, act_type=config.net_act, name='relu0') + #body = residual_unit_v3(body, _num_filter, (2, 2), False, 
name='head', **kwargs) + body = residual_unit_v1l(body, + _num_filter, (2, 2), + False, + name='head', + bottle_neck=False) + return body diff --git a/insightface/recognition/subcenter_arcface/symbol/vargfacenet.py b/insightface/recognition/subcenter_arcface/symbol/vargfacenet.py new file mode 100644 index 0000000000000000000000000000000000000000..434e8beb835a190725d430e9b4200ceca4360ac6 --- /dev/null +++ b/insightface/recognition/subcenter_arcface/symbol/vargfacenet.py @@ -0,0 +1,578 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +''' +Author: Horizon Robotics Inc. +The company is committed to be the global leader of edge AI platform. +The model implemented in this scripts runs ~200fps on the Sunrise 2. +Sunrise 2 is the second generation of an embedded AI chip designed by Horizon Robotics, +targeting to empower AIoT devices by AI. + +Implemented the following paper: +Mengjia Yan, Mengao Zhao, Zining Xu, Qian Zhang, Guoli Wang, Zhizhong Su. 
"VarGFaceNet: An Efficient Variable Group Convolutional Neural Network for Lightweight Face Recognition" (https://arxiv.org/abs/1910.04985) + +''' + +import os +import sys + +import mxnet as mx +import symbol_utils + +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from config import config + + +def Act(data, act_type, name): + if act_type == 'prelu': + body = mx.sym.LeakyReLU(data=data, act_type='prelu', name=name) + else: + body = mx.symbol.Activation(data=data, act_type=act_type, name=name) + return body + + +def get_setting_params(**kwargs): + # bn_params + bn_mom = kwargs.get('bn_mom', 0.9) + bn_eps = kwargs.get('bn_eps', 2e-5) + fix_gamma = kwargs.get('fix_gamma', False) + use_global_stats = kwargs.get('use_global_stats', False) + # net_setting param + workspace = kwargs.get('workspace', 512) + act_type = kwargs.get('act_type', 'prelu') + use_se = kwargs.get('use_se', True) + se_ratio = kwargs.get('se_ratio', 4) + group_base = kwargs.get('group_base', 8) + + setting_params = {} + setting_params['bn_mom'] = bn_mom + setting_params['bn_eps'] = bn_eps + setting_params['fix_gamma'] = fix_gamma + setting_params['use_global_stats'] = use_global_stats + setting_params['workspace'] = workspace + setting_params['act_type'] = act_type + setting_params['use_se'] = use_se + setting_params['se_ratio'] = se_ratio + setting_params['group_base'] = group_base + + return setting_params + + +def se_block(data, num_filter, setting_params, name): + se_ratio = setting_params['se_ratio'] + act_type = setting_params['act_type'] + + pool1 = mx.sym.Pooling(data=data, + global_pool=True, + pool_type='avg', + name=name + '_se_pool1') + conv1 = mx.sym.Convolution(data=pool1, + num_filter=num_filter // se_ratio, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + name=name + "_se_conv1") + act1 = Act(data=conv1, act_type=act_type, name=name + '_se_act1') + + conv2 = mx.sym.Convolution(data=act1, + num_filter=num_filter, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + 
name=name + "_se_conv2") + act2 = mx.symbol.Activation(data=conv2, + act_type='sigmoid', + name=name + "_se_sigmoid") + out_data = mx.symbol.broadcast_mul(data, act2) + return out_data + + +def separable_conv2d(data, + in_channels, + out_channels, + kernel, + pad, + setting_params, + stride=(1, 1), + factor=1, + bias=False, + bn_dw_out=True, + act_dw_out=True, + bn_pw_out=True, + act_pw_out=True, + dilate=1, + name=None): + bn_mom = setting_params['bn_mom'] + bn_eps = setting_params['bn_eps'] + fix_gamma = setting_params['fix_gamma'] + use_global_stats = setting_params['use_global_stats'] + workspace = setting_params['workspace'] + group_base = setting_params['group_base'] + act_type = setting_params['act_type'] + assert in_channels % group_base == 0 + + # depthwise + dw_out = mx.sym.Convolution(data=data, + num_filter=int(in_channels * factor), + kernel=kernel, + pad=pad, + stride=stride, + no_bias=False if bias else True, + num_group=int(in_channels / group_base), + dilate=(dilate, dilate), + workspace=workspace, + name=name + '_conv2d_depthwise') + if bn_dw_out: + dw_out = mx.sym.BatchNorm(data=dw_out, + fix_gamma=fix_gamma, + eps=bn_eps, + momentum=bn_mom, + use_global_stats=use_global_stats, + name=name + '_conv2d_depthwise_bn') + if act_dw_out: + dw_out = Act(data=dw_out, + act_type=act_type, + name=name + '_conv2d_depthwise_act') + # pointwise + pw_out = mx.sym.Convolution(data=dw_out, + num_filter=out_channels, + kernel=(1, 1), + stride=(1, 1), + pad=(0, 0), + num_group=1, + no_bias=False if bias else True, + workspace=workspace, + name=name + '_conv2d_pointwise') + if bn_pw_out: + pw_out = mx.sym.BatchNorm(data=pw_out, + fix_gamma=fix_gamma, + eps=bn_eps, + momentum=bn_mom, + use_global_stats=use_global_stats, + name=name + '_conv2d_pointwise_bn') + if act_pw_out: + pw_out = Act(data=pw_out, + act_type=act_type, + name=name + '_conv2d_pointwise_act') + return pw_out + + +def vargnet_block(data, + n_out_ch1, + n_out_ch2, + n_out_ch3, + setting_params, + 
factor=2, + dim_match=True, + multiplier=1, + kernel=(3, 3), + stride=(1, 1), + dilate=1, + with_dilate=False, + name=None): + use_se = setting_params['use_se'] + act_type = setting_params['act_type'] + + out_channels_1 = int(n_out_ch1 * multiplier) + out_channels_2 = int(n_out_ch2 * multiplier) + out_channels_3 = int(n_out_ch3 * multiplier) + + pad = (((kernel[0] - 1) * dilate + 1) // 2, + ((kernel[1] - 1) * dilate + 1) // 2) + + if with_dilate: + stride = (1, 1) + if dim_match: + short_cut = data + else: + short_cut = separable_conv2d(data=data, + in_channels=out_channels_1, + out_channels=out_channels_3, + kernel=kernel, + pad=pad, + setting_params=setting_params, + stride=stride, + factor=factor, + bias=False, + act_pw_out=False, + dilate=dilate, + name=name + '_shortcut') + sep1_data = separable_conv2d(data=data, + in_channels=out_channels_1, + out_channels=out_channels_2, + kernel=kernel, + pad=pad, + setting_params=setting_params, + stride=stride, + factor=factor, + bias=False, + dilate=dilate, + name=name + '_sep1_data') + sep2_data = separable_conv2d(data=sep1_data, + in_channels=out_channels_2, + out_channels=out_channels_3, + kernel=kernel, + pad=pad, + setting_params=setting_params, + stride=(1, 1), + factor=factor, + bias=False, + dilate=dilate, + act_pw_out=False, + name=name + '_sep2_data') + + if use_se: + sep2_data = se_block(data=sep2_data, + num_filter=out_channels_3, + setting_params=setting_params, + name=name) + + out_data = sep2_data + short_cut + out_data = Act(data=out_data, + act_type=act_type, + name=name + '_out_data_act') + return out_data + + +def vargnet_branch_merge_block(data, + n_out_ch1, + n_out_ch2, + n_out_ch3, + setting_params, + factor=2, + dim_match=False, + multiplier=1, + kernel=(3, 3), + stride=(2, 2), + dilate=1, + with_dilate=False, + name=None): + act_type = setting_params['act_type'] + + out_channels_1 = int(n_out_ch1 * multiplier) + out_channels_2 = int(n_out_ch2 * multiplier) + out_channels_3 = int(n_out_ch3 * 
multiplier) + + pad = (((kernel[0] - 1) * dilate + 1) // 2, + ((kernel[1] - 1) * dilate + 1) // 2) + + if with_dilate: + stride = (1, 1) + if dim_match: + short_cut = data + else: + short_cut = separable_conv2d(data=data, + in_channels=out_channels_1, + out_channels=out_channels_3, + kernel=kernel, + pad=pad, + setting_params=setting_params, + stride=stride, + factor=factor, + bias=False, + act_pw_out=False, + dilate=dilate, + name=name + '_shortcut') + sep1_data_brach1 = separable_conv2d(data=data, + in_channels=out_channels_1, + out_channels=out_channels_2, + kernel=kernel, + pad=pad, + setting_params=setting_params, + stride=stride, + factor=factor, + bias=False, + dilate=dilate, + act_pw_out=False, + name=name + '_sep1_data_branch') + sep1_data_brach2 = separable_conv2d(data=data, + in_channels=out_channels_1, + out_channels=out_channels_2, + kernel=kernel, + pad=pad, + setting_params=setting_params, + stride=stride, + factor=factor, + bias=False, + dilate=dilate, + act_pw_out=False, + name=name + '_sep2_data_branch') + sep1_data = sep1_data_brach1 + sep1_data_brach2 + sep1_data = Act(data=sep1_data, + act_type=act_type, + name=name + '_sep1_data_act') + sep2_data = separable_conv2d(data=sep1_data, + in_channels=out_channels_2, + out_channels=out_channels_3, + kernel=kernel, + pad=pad, + setting_params=setting_params, + stride=(1, 1), + factor=factor, + bias=False, + dilate=dilate, + act_pw_out=False, + name=name + '_sep2_data') + out_data = sep2_data + short_cut + out_data = Act(data=out_data, + act_type=act_type, + name=name + '_out_data_act') + return out_data + + +def add_vargnet_conv_block(data, + stage, + units, + in_channels, + out_channels, + setting_params, + kernel=(3, 3), + stride=(2, 2), + multiplier=1, + factor=2, + dilate=1, + with_dilate=False, + name=None): + assert stage >= 2, 'stage is {}, stage must be set >=2'.format(stage) + data = vargnet_branch_merge_block(data=data, + n_out_ch1=in_channels, + n_out_ch2=out_channels, + 
n_out_ch3=out_channels, + setting_params=setting_params, + factor=factor, + dim_match=False, + multiplier=multiplier, + kernel=kernel, + stride=stride, + dilate=dilate, + with_dilate=with_dilate, + name=name + + '_stage_{}_unit_1'.format(stage)) + for i in range(units - 1): + data = vargnet_block(data=data, + n_out_ch1=out_channels, + n_out_ch2=out_channels, + n_out_ch3=out_channels, + setting_params=setting_params, + factor=factor, + dim_match=True, + multiplier=multiplier, + kernel=kernel, + stride=(1, 1), + dilate=dilate, + with_dilate=with_dilate, + name=name + + '_stage_{}_unit_{}'.format(stage, i + 2)) + return data + + +def add_head_block(data, + num_filter, + setting_params, + multiplier, + head_pooling=False, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + name=None): + bn_mom = setting_params['bn_mom'] + bn_eps = setting_params['bn_eps'] + fix_gamma = setting_params['fix_gamma'] + use_global_stats = setting_params['use_global_stats'] + workspace = setting_params['workspace'] + act_type = setting_params['act_type'] + channels = int(num_filter * multiplier) + + conv1 = mx.sym.Convolution(data=data, + num_filter=channels, + kernel=kernel, + pad=pad, + stride=stride, + no_bias=True, + num_group=1, + workspace=workspace, + name=name + '_conv1') + bn1 = mx.sym.BatchNorm(data=conv1, + fix_gamma=fix_gamma, + eps=bn_eps, + momentum=bn_mom, + use_global_stats=use_global_stats, + name=name + '_conv1_bn') + + act1 = Act(data=bn1, act_type=act_type, name=name + '_conv1_act') + + if head_pooling: + head_data = mx.symbol.Pooling(data=act1, + kernel=(3, 3), + stride=(2, 2), + pad=(1, 1), + pool_type='max', + name=name + '_max_pooling') + else: + head_data = vargnet_block(data=act1, + n_out_ch1=num_filter, + n_out_ch2=num_filter, + n_out_ch3=num_filter, + setting_params=setting_params, + factor=1, + dim_match=False, + multiplier=multiplier, + kernel=kernel, + stride=(2, 2), + dilate=1, + with_dilate=False, + name=name + '_head_pooling') + return head_data + + +def 
add_emb_block(data, + input_channels, + last_channels, + emb_size, + fc_type, + setting_params, + bias=False, + name=None): + bn_mom = setting_params['bn_mom'] + bn_eps = setting_params['bn_eps'] + fix_gamma = setting_params['fix_gamma'] + use_global_stats = setting_params['use_global_stats'] + workspace = setting_params['workspace'] + act_type = setting_params['act_type'] + group_base = setting_params['group_base'] + # last channels + if input_channels != last_channels: + data = mx.sym.Convolution(data=data, + num_filter=last_channels, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + no_bias=False if bias else True, + workspace=workspace, + name=name + '_convx') + data = mx.sym.BatchNorm(data=data, + fix_gamma=fix_gamma, + eps=bn_eps, + momentum=bn_mom, + use_global_stats=use_global_stats, + name=name + '_convx_bn') + data = Act(data=data, act_type=act_type, name=name + '_convx_act') + # depthwise + convx_depthwise = mx.sym.Convolution(data=data, + num_filter=last_channels, + num_group=int(last_channels / + group_base), + kernel=(7, 7), + pad=(0, 0), + stride=(1, 1), + no_bias=False if bias else True, + workspace=workspace, + name=name + '_convx_depthwise') + convx_depthwise = mx.sym.BatchNorm(data=convx_depthwise, + fix_gamma=fix_gamma, + eps=bn_eps, + momentum=bn_mom, + use_global_stats=use_global_stats, + name=name + '_convx_depthwise_bn') + # pointwise + convx_pointwise = mx.sym.Convolution(data=convx_depthwise, + num_filter=last_channels // 2, + kernel=(1, 1), + pad=(0, 0), + stride=(1, 1), + no_bias=False if bias else True, + workspace=workspace, + name=name + '_convx_pointwise') + convx_pointwise = mx.sym.BatchNorm(data=convx_pointwise, + fix_gamma=fix_gamma, + eps=bn_eps, + momentum=bn_mom, + use_global_stats=use_global_stats, + name=name + '_convx_pointwise_bn') + convx_pointwise = Act(data=convx_pointwise, + act_type=act_type, + name=name + '_convx_pointwise_act') + + fc1 = symbol_utils.get_fc1(convx_pointwise, emb_size, fc_type) + return fc1 + + +def 
get_symbol(): + multiplier = config.net_multiplier + emb_size = config.emb_size + fc_type = config.net_output + + kwargs = { + 'use_se': config.net_se, + 'act_type': config.net_act, + 'bn_mom': config.bn_mom, + 'workspace': config.workspace, + } + + setting_params = get_setting_params(**kwargs) + + factor = 2 + head_pooling = False + num_stage = 3 + stage_list = [2, 3, 4] + units = [3, 7, 4] + filter_list = [32, 64, 128, 256] + last_channels = 1024 + dilate_list = [1, 1, 1] + with_dilate_list = [False, False, False] + + data = mx.sym.Variable(name='data') + data = mx.sym.identity(data=data, name='id') + data = data - 127.5 + data = data * 0.0078125 + + body = add_head_block(data=data, + num_filter=filter_list[0], + setting_params=setting_params, + multiplier=multiplier, + head_pooling=head_pooling, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + name="vargface_head") + + for i in range(num_stage): + body = add_vargnet_conv_block(data=body, + stage=stage_list[i], + units=units[i], + in_channels=filter_list[i], + out_channels=filter_list[i + 1], + setting_params=setting_params, + kernel=(3, 3), + stride=(2, 2), + multiplier=multiplier, + factor=factor, + dilate=dilate_list[i], + with_dilate=with_dilate_list[i], + name="vargface") + emb_feat = add_emb_block(data=body, + input_channels=filter_list[3], + last_channels=last_channels, + emb_size=emb_size, + fc_type=fc_type, + setting_params=setting_params, + bias=False, + name='embed') + return emb_feat + + +if __name__ == '__main__': + get_symbol() diff --git a/insightface/recognition/subcenter_arcface/train_parall.py b/insightface/recognition/subcenter_arcface/train_parall.py new file mode 100644 index 0000000000000000000000000000000000000000..72b1ff7d29b4865a5e237be2165997824f11519f --- /dev/null +++ b/insightface/recognition/subcenter_arcface/train_parall.py @@ -0,0 +1,420 @@ +''' +@author: insightface +''' + +import os +import sys +import math +import random +import logging +import pickle +import sklearn +import 
numpy as np +from image_iter import FaceImageIter +import mxnet as mx +from mxnet import ndarray as nd +import argparse +import mxnet.optimizer as optimizer +from config import config, default, generate_config +sys.path.append(os.path.join(os.path.dirname(__file__), 'symbol')) +sys.path.append(os.path.join(os.path.dirname(__file__), 'common')) +import verification + +import fresnet +import fmobilefacenet +import fmobilenet +import fmnasnet +import fdensenet +import vargfacenet + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +args = None + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train parall face network') + # general + parser.add_argument('--dataset', + default=default.dataset, + help='dataset config') + parser.add_argument('--network', + default=default.network, + help='network config') + parser.add_argument('--loss', default=default.loss, help='loss config') + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset, args.loss) + parser.add_argument('--models-root', + default=default.models_root, + help='root directory to save model.') + parser.add_argument('--pretrained', + default=default.pretrained, + help='pretrained model to load') + parser.add_argument('--pretrained-epoch', + type=int, + default=default.pretrained_epoch, + help='pretrained epoch to load') + parser.add_argument( + '--ckpt', + type=int, + default=default.ckpt, + help= + 'checkpoint saving option. 0: discard saving. 1: save when necessary. 
2: always save' + ) + parser.add_argument( + '--verbose', + type=int, + default=default.verbose, + help='do verification testing and model saving every verbose batches') + parser.add_argument('--lr', + type=float, + default=default.lr, + help='start learning rate') + parser.add_argument('--lr-steps', + type=str, + default=default.lr_steps, + help='steps of lr changing') + parser.add_argument('--wd', + type=float, + default=default.wd, + help='weight decay') + parser.add_argument('--mom', + type=float, + default=default.mom, + help='momentum') + parser.add_argument('--frequent', + type=int, + default=default.frequent, + help='') + parser.add_argument('--per-batch-size', + type=int, + default=default.per_batch_size, + help='batch size in each context') + parser.add_argument('--kvstore', + type=str, + default=default.kvstore, + help='kvstore setting') + parser.add_argument('--worker-id', + type=int, + default=0, + help='worker id for dist training, starts from 0') + parser.add_argument('--extra-model-name', + type=str, + default='', + help='extra model name') + args = parser.parse_args() + return args + + +def get_symbol_embedding(): + embedding = eval(config.net_name).get_symbol() + all_label = mx.symbol.Variable('softmax_label') + #embedding = mx.symbol.BlockGrad(embedding) + all_label = mx.symbol.BlockGrad(all_label) + out_list = [embedding, all_label] + out = mx.symbol.Group(out_list) + return out + + +def get_symbol_arcface(args): + embedding = mx.symbol.Variable('data') + all_label = mx.symbol.Variable('softmax_label') + gt_label = all_label + is_softmax = True + #print('call get_sym_arcface with', args, config) + if config.loss_name == 'margin_softmax': + _weight = mx.symbol.Variable("fc7_%d_weight" % args._ctxid, + shape=(args.ctx_num_classes * + config.loss_K, config.emb_size), + lr_mult=config.fc7_lr_mult, + wd_mult=config.fc7_wd_mult) + nweight = mx.symbol.L2Normalization(_weight, mode='instance') + nembedding = mx.symbol.L2Normalization(embedding, + 
mode='instance', + name='fc1n_%d' % args._ctxid) + fc7 = mx.sym.FullyConnected(data=nembedding, + weight=nweight, + no_bias=True, + num_hidden=args.ctx_num_classes * + config.loss_K, + name='fc7_%d' % args._ctxid) + if config.loss_K > 1: + sim_s3 = mx.symbol.reshape( + fc7, (-1, args.ctx_num_classes, config.loss_K)) + + sim = mx.symbol.max(sim_s3, axis=2) + fc7 = sim + + if config.loss_m1 != 1.0 or config.loss_m2 != 0.0 or config.loss_m3 != 0.0: + gt_one_hot = mx.sym.one_hot(gt_label, + depth=args.ctx_num_classes, + on_value=1.0, + off_value=0.0) + if config.loss_m1 == 1.0 and config.loss_m2 == 0.0: + _one_hot = gt_one_hot * args.margin_b + fc7 = fc7 - _one_hot + else: + fc7_onehot = fc7 * gt_one_hot + cos_t = fc7_onehot + t = mx.sym.arccos(cos_t) + if config.loss_m1 != 1.0: + t = t * config.loss_m1 + if config.loss_m2 != 0.0: + t = t + config.loss_m2 + margin_cos = mx.sym.cos(t) + if config.loss_m3 != 0.0: + margin_cos = margin_cos - config.loss_m3 + margin_fc7 = margin_cos + margin_fc7_onehot = margin_fc7 * gt_one_hot + diff = margin_fc7_onehot - fc7_onehot + fc7 = fc7 + diff + fc7 = fc7 * config.loss_s + out_list = [] + out_list.append(fc7) + if config.loss_name == 'softmax': #softmax + out_list.append(gt_label) + out = mx.symbol.Group(out_list) + return out + + +def train_net(args): + #_seed = 727 + #random.seed(_seed) + #np.random.seed(_seed) + #mx.random.seed(_seed) + ctx = [] + cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip() + if len(cvd) > 0: + for i in range(len(cvd.split(','))): + ctx.append(mx.gpu(i)) + if len(ctx) == 0: + ctx = [mx.cpu()] + print('use cpu') + else: + print('gpu num:', len(ctx)) + if len(args.extra_model_name) == 0: + prefix = os.path.join( + args.models_root, + '%s-%s-%s' % (args.network, args.loss, args.dataset), 'model') + else: + prefix = os.path.join( + args.models_root, '%s-%s-%s-%s' % + (args.network, args.loss, args.dataset, args.extra_model_name), + 'model') + prefix_dir = os.path.dirname(prefix) + print('prefix', prefix) + if 
not os.path.exists(prefix_dir): + os.makedirs(prefix_dir) + args.ctx_num = len(ctx) + if args.per_batch_size == 0: + args.per_batch_size = 128 + args.batch_size = args.per_batch_size * args.ctx_num + args.rescale_threshold = 0 + args.image_channel = config.image_shape[2] + config.batch_size = args.batch_size + config.per_batch_size = args.per_batch_size + data_dir = config.dataset_path + path_imgrec = None + path_imglist = None + image_size = config.image_shape[0:2] + assert len(image_size) == 2 + assert image_size[0] == image_size[1] + print('image_size', image_size) + print('num_classes', config.num_classes) + path_imgrec = os.path.join(data_dir, "train.rec") + + data_shape = (args.image_channel, image_size[0], image_size[1]) + + num_workers = config.num_workers + global_num_ctx = num_workers * args.ctx_num + if config.num_classes % global_num_ctx == 0: + args.ctx_num_classes = config.num_classes // global_num_ctx + else: + args.ctx_num_classes = config.num_classes // global_num_ctx + 1 + args.local_num_classes = args.ctx_num_classes * args.ctx_num + args.local_class_start = args.local_num_classes * args.worker_id + + #if len(args.partial)==0: + # local_classes_range = (0, args.num_classes) + #else: + # _vec = args.partial.split(',') + # local_classes_range = (int(_vec[0]), int(_vec[1])) + + #args.partial_num_classes = local_classes_range[1] - local_classes_range[0] + #args.partial_start = local_classes_range[0] + + print('Called with argument:', args, config) + mean = None + + begin_epoch = 0 + base_lr = args.lr + base_wd = args.wd + base_mom = args.mom + arg_params = None + aux_params = None + if len(args.pretrained) == 0: + esym = get_symbol_embedding() + asym = get_symbol_arcface + else: + assert False + + if config.num_workers == 1: + from parall_module_local_v1 import ParallModule + else: + from parall_module_dist import ParallModule + + model = ParallModule( + context=ctx, + symbol=esym, + data_names=['data'], + label_names=['softmax_label'], + 
asymbol=asym, + args=args, + ) + val_dataiter = None + train_dataiter = FaceImageIter( + batch_size=args.batch_size, + data_shape=data_shape, + path_imgrec=path_imgrec, + shuffle=True, + rand_mirror=config.data_rand_mirror, + mean=mean, + cutoff=config.data_cutoff, + color_jittering=config.data_color, + images_filter=config.data_images_filter, + ) + + if config.net_name == 'fresnet' or config.net_name == 'fmobilefacenet': + initializer = mx.init.Xavier(rnd_type='gaussian', + factor_type="out", + magnitude=2) #resnet style + else: + initializer = mx.init.Xavier(rnd_type='uniform', + factor_type="in", + magnitude=2) + + _rescale = 1.0 / args.batch_size + opt = optimizer.SGD(learning_rate=base_lr, + momentum=base_mom, + wd=base_wd, + rescale_grad=_rescale) + _cb = mx.callback.Speedometer(args.batch_size, args.frequent) + + ver_list = [] + ver_name_list = [] + for name in config.val_targets: + path = os.path.join(data_dir, name + ".bin") + if os.path.exists(path): + data_set = verification.load_bin(path, image_size) + ver_list.append(data_set) + ver_name_list.append(name) + print('ver', name) + + def ver_test(nbatch): + results = [] + for i in range(len(ver_list)): + acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test( + ver_list[i], model, args.batch_size, 10, None, None) + print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm)) + #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1)) + print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % + (ver_name_list[i], nbatch, acc2, std2)) + results.append(acc2) + return results + + highest_acc = [0.0, 0.0] #lfw and target + #for i in range(len(ver_list)): + # highest_acc.append(0.0) + global_step = [0] + save_step = [0] + lr_steps = [int(x) for x in args.lr_steps.split(',')] + print('lr_steps', lr_steps) + + def _batch_callback(param): + #global global_step + global_step[0] += 1 + mbatch = global_step[0] + for step in lr_steps: + if mbatch == step: + opt.lr *= 0.1 + print('lr change 
to', opt.lr) + break + + _cb(param) + if mbatch % 1000 == 0: + print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch) + + if mbatch >= 0 and mbatch % args.verbose == 0: + acc_list = ver_test(mbatch) + save_step[0] += 1 + msave = save_step[0] + do_save = False + is_highest = False + if len(acc_list) > 0: + #lfw_score = acc_list[0] + #if lfw_score>highest_acc[0]: + # highest_acc[0] = lfw_score + # if lfw_score>=0.998: + # do_save = True + score = sum(acc_list) + if acc_list[-1] >= highest_acc[-1]: + if acc_list[-1] > highest_acc[-1]: + is_highest = True + else: + if score >= highest_acc[0]: + is_highest = True + highest_acc[0] = score + highest_acc[-1] = acc_list[-1] + #if lfw_score>=0.99: + # do_save = True + if is_highest: + do_save = True + if args.ckpt == 0: + do_save = False + elif args.ckpt == 2: + do_save = True + elif args.ckpt == 3: + msave = 1 + + if do_save: + print('saving', msave) + if config.ckpt_embedding: + arg, aux = model.get_export_params() + else: + arg, aux = model.get_params() + all_layers = model.symbol.get_internals() + _sym = all_layers['fc1_output'] + mx.model.save_checkpoint(prefix, msave, _sym, arg, aux) + print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1])) + if config.max_steps > 0 and mbatch > config.max_steps: + sys.exit(0) + + epoch_cb = None + train_dataiter = mx.io.PrefetchingIter(train_dataiter) + + model.fit( + train_dataiter, + begin_epoch=begin_epoch, + num_epoch=999999, + eval_data=val_dataiter, + #eval_metric = eval_metrics, + kvstore=args.kvstore, + optimizer=opt, + #optimizer_params = optimizer_params, + initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + allow_missing=True, + batch_end_callback=_batch_callback, + epoch_end_callback=epoch_cb) + + +def main(): + global args + args = parse_args() + train_net(args) + + +if __name__ == '__main__': + main() diff --git a/insightface/recognition/vpl/README.md b/insightface/recognition/vpl/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..e2f26c2e21139a4131f30d657d2af7511407e45d --- /dev/null +++ b/insightface/recognition/vpl/README.md @@ -0,0 +1,21 @@ + +# Variational Prototype Learning for Deep Face Recognition + +This is the Pytorch implementation of our paper [Variational Prototype Learning for Deep Face Recognition](https://openaccess.thecvf.com/content/CVPR2021/papers/Deng_Variational_Prototype_Learning_for_Deep_Face_Recognition_CVPR_2021_paper.pdf) which is accepted by CVPR-2021. + +## How to run + +Define a new configure file such as `configs/example_ms1m.py`, and start the training process by: + +`` +bash run.sh configs/example_ms1m.py +`` + +## Results + +Results on WebFace600K(subset of WebFace260M), loss is margin-based softmax. + +| Backbone | Dataset | VPL? | Mask | Children | African | Caucasian | South Asian | East Asian | MR-All | +|------------|------------|------------|--------|----------|---------|-----------|-------------|------------|--------| +| R50 | WebFace600K | NO | 78.949 | 74.772 | 89.231 | 94.114 | 92.308 | 73.765 | 90.591 | +| R50 | WebFace600K | YES | 78.884 | 75.739 | 89.424 | 94.220 | 92.609 | 74.365 | 90.942 | diff --git a/insightface/recognition/vpl/backbones/__init__.py b/insightface/recognition/vpl/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a71f3ed9e8e6232fd23e282399c82ce2d6a662c1 --- /dev/null +++ b/insightface/recognition/vpl/backbones/__init__.py @@ -0,0 +1,19 @@ +from .iresnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200 + + +def get_model(name, **kwargs): + if name == "r18": + return iresnet18(False, **kwargs) + elif name == "r34": + return iresnet34(False, **kwargs) + elif name == "r50": + return iresnet50(False, **kwargs) + elif name == "r100": + return iresnet100(False, **kwargs) + elif name == "r200": + return iresnet200(False, **kwargs) + elif name == "r1024": + from .iresnet1024 import iresnet1024 + return iresnet1024(False, **kwargs) + else: + 
raise ValueError() diff --git a/insightface/recognition/vpl/backbones/iresnet.py b/insightface/recognition/vpl/backbones/iresnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c6d3b9c240c24687d432197f976ee01fbf423216 --- /dev/null +++ b/insightface/recognition/vpl/backbones/iresnet.py @@ -0,0 +1,187 @@ +import torch +from torch import nn + +__all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200'] + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=1, + stride=stride, + bias=False) + + +class IBasicBlock(nn.Module): + expansion = 1 + def __init__(self, inplanes, planes, stride=1, downsample=None, + groups=1, base_width=64, dilation=1): + super(IBasicBlock, self).__init__() + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,) + self.conv1 = conv3x3(inplanes, planes) + self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,) + self.prelu = nn.PReLU(planes) + self.conv2 = conv3x3(planes, planes, stride) + self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.bn1(x) + out = self.conv1(out) + out = self.bn2(out) + out = self.prelu(out) + out = self.conv2(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + return out + + +class IResNet(nn.Module): + fc_scale = 7 * 7 + def __init__(self, + block, layers, dropout=0, num_features=512, 
zero_init_residual=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=False): + super(IResNet, self).__init__() + self.fp16 = fp16 + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05) + self.prelu = nn.PReLU(self.inplanes) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, + 512, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2]) + self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,) + self.dropout = nn.Dropout(p=dropout, inplace=True) + self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features) + self.features = nn.BatchNorm1d(num_features, eps=1e-05) + nn.init.constant_(self.features.weight, 1.0) + self.features.weight.requires_grad = False + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if zero_init_residual: + for m in self.modules(): + if isinstance(m, IBasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if 
stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ), + ) + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation)) + + return nn.Sequential(*layers) + + def forward(self, x): + with torch.cuda.amp.autocast(self.fp16): + x = self.conv1(x) + x = self.bn1(x) + x = self.prelu(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.bn2(x) + x = torch.flatten(x, 1) + x = self.dropout(x) + x = self.fc(x.float() if self.fp16 else x) + x = self.features(x) + return x + + +def _iresnet(arch, block, layers, pretrained, progress, **kwargs): + model = IResNet(block, layers, **kwargs) + if pretrained: + raise ValueError() + return model + + +def iresnet18(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained, + progress, **kwargs) + + +def iresnet34(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained, + progress, **kwargs) + + +def iresnet50(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained, + progress, **kwargs) + + +def iresnet100(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained, + progress, **kwargs) + + +def iresnet200(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained, + progress, **kwargs) + diff --git a/insightface/recognition/vpl/backbones/iresnet1024.py 
b/insightface/recognition/vpl/backbones/iresnet1024.py new file mode 100644 index 0000000000000000000000000000000000000000..c0ed0a9e2342f57b6832e68cc700241a80785e66 --- /dev/null +++ b/insightface/recognition/vpl/backbones/iresnet1024.py @@ -0,0 +1,171 @@ +import torch +from torch import nn + +assert torch.__version__ >= "1.8.1" +from torch.utils.checkpoint import checkpoint_sequential + +__all__ = ['iresnet1024'] + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=1, + stride=stride, + bias=False) + + +class IBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, + groups=1, base_width=64, dilation=1): + super(IBasicBlock, self).__init__() + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05, ) + self.conv1 = conv3x3(inplanes, planes) + self.bn2 = nn.BatchNorm2d(planes, eps=1e-05, ) + self.prelu = nn.PReLU(planes) + self.conv2 = conv3x3(planes, planes, stride) + self.bn3 = nn.BatchNorm2d(planes, eps=1e-05, ) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.bn1(x) + out = self.conv1(out) + out = self.bn2(out) + out = self.prelu(out) + out = self.conv2(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + return out + + +class IResNet(nn.Module): + fc_scale = 7 * 7 + + def __init__(self, + block, layers, dropout=0, num_features=512, zero_init_residual=False, + groups=1, 
width_per_group=64, replace_stride_with_dilation=None, fp16=False): + super(IResNet, self).__init__() + self.fp16 = fp16 + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05) + self.prelu = nn.PReLU(self.inplanes) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, + 512, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2]) + self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05, ) + self.dropout = nn.Dropout(p=dropout, inplace=True) + self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features) + self.features = nn.BatchNorm1d(num_features, eps=1e-05) + nn.init.constant_(self.features.weight, 1.0) + self.features.weight.requires_grad = False + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if zero_init_residual: + for m in self.modules(): + if isinstance(m, IBasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * 
block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.BatchNorm2d(planes * block.expansion, eps=1e-05, ), + ) + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation)) + + return nn.Sequential(*layers) + + def forward(self, x): + with torch.cuda.amp.autocast(self.fp16): + x = self.conv1(x) + x = self.bn1(x) + x = self.prelu(x) + x = checkpoint_sequential(self.layer1, 10, x) + x = checkpoint_sequential(self.layer2, 10, x) + x = checkpoint_sequential(self.layer3, 10, x) + x = checkpoint_sequential(self.layer4, 10, x) + x = self.bn2(x) + x = torch.flatten(x, 1) + x = self.dropout(x) + x = self.fc(x.float() if self.fp16 else x) + x = self.features(x) + return x + + +def _iresnet(arch, block, layers, pretrained, progress, **kwargs): + model = IResNet(block, layers, **kwargs) + if pretrained: + raise ValueError() + return model + + +def iresnet1024(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet1024', IBasicBlock, [256, 256, 512, 256], pretrained, + progress, **kwargs) diff --git a/insightface/recognition/vpl/configs/__init__.py b/insightface/recognition/vpl/configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/vpl/configs/base.py b/insightface/recognition/vpl/configs/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ba82b214c174bb4afae1dbd49a126f34c971c6e2 --- /dev/null +++ b/insightface/recognition/vpl/configs/base.py @@ -0,0 +1,19 @@ +from easydict import EasyDict as edict + +config = edict() +config.embedding_size = 512 +config.sample_rate = 1 +config.fp16 = False +config.tf32 = 
False +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.batch_size = 128 +config.lr = 0.1 # when batch size is 512 +config.warmup_epoch = -1 +config.loss = 'arcface' +config.network = 'r50' +config.output = None +config.val_targets = ['lfw', "cfp_fp", "agedb_30"] +config.vpl = {'start_iters': 8000, 'allowed_delta': 200, 'lambda': 0.15, 'mode': -1, 'momentum': False} #mode==-1 disables vpl + + diff --git a/insightface/recognition/vpl/configs/example_ms1mv3.py b/insightface/recognition/vpl/configs/example_ms1mv3.py new file mode 100644 index 0000000000000000000000000000000000000000..14dcdbe6d20af4a384a0ad083e12ea5ca3018ac0 --- /dev/null +++ b/insightface/recognition/vpl/configs/example_ms1mv3.py @@ -0,0 +1,18 @@ +from easydict import EasyDict as edict + +config = edict() +config.dataset = "ms1mv3" +config.fp16 = True +config.batch_size = 128 +config.vpl = {'start_iters': 8000, 'allowed_delta': 200, 'lambda': 0.15, 'mode': 0, 'momentum': False} + +config.rec = "/train_tmp/ms1m-retinaface-t1" +config.num_classes = 93431 +config.num_image = 5179510 +config.num_epoch = 25 + +def lr_step_func(epoch): + return ((epoch + 1) / (4 + 1)) ** 2 if epoch < -1 else 0.1 ** len( + [m for m in [11, 17, 22] if m - 1 <= epoch]) +config.lr_func = lr_step_func + diff --git a/insightface/recognition/vpl/dataset.py b/insightface/recognition/vpl/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..24d3f632b5aeb8356d9dfaaca8ceb0bf8dc6197e --- /dev/null +++ b/insightface/recognition/vpl/dataset.py @@ -0,0 +1,113 @@ +import numbers +import os +import queue as Queue +import threading + +import mxnet as mx +import numpy as np +import torch +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + + +class BackgroundGenerator(threading.Thread): + def __init__(self, generator, local_rank, max_prefetch=6): + super(BackgroundGenerator, self).__init__() + self.queue = Queue.Queue(max_prefetch) + self.generator = generator + 
self.local_rank = local_rank + self.daemon = True + self.start() + + def run(self): + torch.cuda.set_device(self.local_rank) + for item in self.generator: + self.queue.put(item) + self.queue.put(None) + + def next(self): + next_item = self.queue.get() + if next_item is None: + raise StopIteration + return next_item + + def __next__(self): + return self.next() + + def __iter__(self): + return self + + +class DataLoaderX(DataLoader): + def __init__(self, local_rank, **kwargs): + super(DataLoaderX, self).__init__(**kwargs) + self.stream = torch.cuda.Stream(local_rank) + self.local_rank = local_rank + + def __iter__(self): + self.iter = super(DataLoaderX, self).__iter__() + self.iter = BackgroundGenerator(self.iter, self.local_rank) + self.preload() + return self + + def preload(self): + self.batch = next(self.iter, None) + if self.batch is None: + return None + with torch.cuda.stream(self.stream): + for k in range(len(self.batch)): + self.batch[k] = self.batch[k].to(device=self.local_rank, + non_blocking=True) + + def __next__(self): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + if batch is None: + raise StopIteration + self.preload() + return batch + + +class MXFaceDataset(Dataset): + def __init__(self, root_dir, local_rank): + super(MXFaceDataset, self).__init__() + self.transform = transforms.Compose( + [transforms.ToPILImage(), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ]) + self.root_dir = root_dir + self.local_rank = local_rank + path_imgrec = os.path.join(root_dir, 'train.rec') + path_imgidx = os.path.join(root_dir, 'train.idx') + self.imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + s = self.imgrec.read_idx(0) + header, _ = mx.recordio.unpack(s) + #print('ddd1') + if header.flag > 0: + if len(header.label)==2: + self.header0 = (int(header.label[0]), int(header.label[1])) + self.imgidx = np.array(range(1, 
int(header.label[0]))) + else: + self.imgidx = np.array(list(self.imgrec.keys)) + else: + self.imgidx = np.array(list(self.imgrec.keys)) + #print('ddd2') + + def __getitem__(self, index): + idx = self.imgidx[index] + s = self.imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + label = header.label + if not isinstance(label, numbers.Number): + label = label[0] + sample = mx.image.imdecode(img).asnumpy() + label = torch.tensor(label, dtype=torch.long) + if self.transform is not None: + sample = self.transform(sample) + return sample, label + + def __len__(self): + return len(self.imgidx) + diff --git a/insightface/recognition/vpl/eval/__init__.py b/insightface/recognition/vpl/eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/vpl/eval/verification.py b/insightface/recognition/vpl/eval/verification.py new file mode 100644 index 0000000000000000000000000000000000000000..edacf8d8136bc2dadb3d24d37fd2a812d0a443ee --- /dev/null +++ b/insightface/recognition/vpl/eval/verification.py @@ -0,0 +1,409 @@ +"""Helper for evaluation on the Labeled Faces in the Wild dataset +""" + +# MIT License +# +# Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +import datetime +import os +import pickle + +import mxnet as mx +import numpy as np +import sklearn +import torch +from mxnet import ndarray as nd +from scipy import interpolate +from sklearn.decomposition import PCA +from sklearn.model_selection import KFold + + +class LFold: + def __init__(self, n_splits=2, shuffle=False): + self.n_splits = n_splits + if self.n_splits > 1: + self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle) + + def split(self, indices): + if self.n_splits > 1: + return self.k_fold.split(indices) + else: + return [(indices, indices)] + + +def calculate_roc(thresholds, + embeddings1, + embeddings2, + actual_issame, + nrof_folds=10, + pca=0): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + + if pca == 0: + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + if pca > 0: + print('doing pca on', fold_idx) + embed1_train = embeddings1[train_set] + embed2_train = embeddings2[train_set] + _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) + pca_model = 
PCA(n_components=pca) + pca_model.fit(_embed_train) + embed1 = pca_model.transform(embeddings1) + embed2 = pca_model.transform(embeddings2) + embed1 = sklearn.preprocessing.normalize(embed1) + embed2 = sklearn.preprocessing.normalize(embed2) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, dist[train_set], actual_issame[train_set]) + best_threshold_index = np.argmax(acc_train) + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], + actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + return tpr, fpr, accuracy + + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum( + np.logical_and(np.logical_not(predict_issame), + np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) + fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn) + acc = float(tp + tn) / dist.size + return tpr, fpr, acc + + +def calculate_val(thresholds, + embeddings1, + embeddings2, + actual_issame, + far_target, + nrof_folds=10): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = LFold(n_splits=nrof_folds, shuffle=False) + 
+ val = np.zeros(nrof_folds) + far = np.zeros(nrof_folds) + + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + indices = np.arange(nrof_pairs) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + + # Find the threshold that gives FAR = far_target + far_train = np.zeros(nrof_thresholds) + for threshold_idx, threshold in enumerate(thresholds): + _, far_train[threshold_idx] = calculate_val_far( + threshold, dist[train_set], actual_issame[train_set]) + if np.max(far_train) >= far_target: + f = interpolate.interp1d(far_train, thresholds, kind='slinear') + threshold = f(far_target) + else: + threshold = 0.0 + + val[fold_idx], far[fold_idx] = calculate_val_far( + threshold, dist[test_set], actual_issame[test_set]) + + val_mean = np.mean(val) + far_mean = np.mean(far) + val_std = np.std(val) + return val_mean, val_std, far_mean + + +def calculate_val_far(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) + false_accept = np.sum( + np.logical_and(predict_issame, np.logical_not(actual_issame))) + n_same = np.sum(actual_issame) + n_diff = np.sum(np.logical_not(actual_issame)) + # print(true_accept, false_accept) + # print(n_same, n_diff) + val = float(true_accept) / float(n_same) + far = float(false_accept) / float(n_diff) + return val, far + + +def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + tpr, fpr, accuracy = calculate_roc(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + nrof_folds=nrof_folds, + pca=pca) + thresholds = np.arange(0, 4, 0.001) + val, val_std, far = calculate_val(thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + 1e-3, + nrof_folds=nrof_folds) + return tpr, fpr, accuracy, val, val_std, far + +@torch.no_grad() 
def load_bin(path, image_size):
    """Load a packed verification set (``*.bin``) into torch tensors.

    Returns ``(data_list, issame_list)`` where ``data_list`` holds two
    tensors of shape ``(2 * len(issame_list), 3, H, W)``: index 0 the
    original images, index 1 the horizontally flipped copies.
    """
    try:
        with open(path, 'rb') as f:
            bins, issame_list = pickle.load(f)  # pickle written by py2
    except UnicodeDecodeError:
        with open(path, 'rb') as f:
            bins, issame_list = pickle.load(f, encoding='bytes')  # py3
    data_list = [
        torch.empty((len(issame_list) * 2, 3, image_size[0], image_size[1]))
        for _ in (0, 1)
    ]
    for idx in range(len(issame_list) * 2):
        _bin = bins[idx]
        img = mx.image.imdecode(_bin)
        if img.shape[1] != image_size[0]:
            img = mx.image.resize_short(img, image_size[0])
        img = nd.transpose(img, axes=(2, 0, 1))  # HWC -> CHW
        for flip in (0, 1):
            if flip == 1:
                img = mx.ndarray.flip(data=img, axis=2)
            data_list[flip][idx][:] = torch.from_numpy(img.asnumpy())
        if idx % 1000 == 0:
            print('loading bin', idx)
    print(data_list[0].shape)
    return data_list, issame_list


@torch.no_grad()
def test(data_set, backbone, batch_size, nfolds=10):
    """Run face verification on a loaded ``load_bin`` data set.

    Returns ``(acc1, std1, acc2, std2, xnorm, embeddings_list)``.
    ``acc1``/``std1`` are kept at 0.0 for backward compatibility with the
    historical return signature; ``acc2``/``std2`` are the mean/std accuracy
    over ``nfolds`` folds using flip-augmented, L2-normalised embeddings.
    """
    print('testing verification..')
    data_list = data_set[0]
    issame_list = data_set[1]
    embeddings_list = []
    time_consumed = 0.0
    for data in data_list:
        embeddings = None
        ba = 0
        while ba < data.shape[0]:
            bb = min(ba + batch_size, data.shape[0])
            count = bb - ba
            # Slice a full batch ending at bb; the leading (batch_size-count)
            # rows are overlap and are discarded below.
            _data = data[bb - batch_size: bb]
            time0 = datetime.datetime.now()
            img = ((_data / 255) - 0.5) / 0.5  # map [0,255] -> [-1,1]
            net_out: torch.Tensor = backbone(img)
            _embeddings = net_out.detach().cpu().numpy()
            time_consumed += (datetime.datetime.now() - time0).total_seconds()
            if embeddings is None:
                embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
            embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :]
            ba = bb
        embeddings_list.append(embeddings)

    # Average embedding norm before normalisation (diagnostic only).
    _xnorm = 0.0
    _xnorm_cnt = 0
    for embed in embeddings_list:
        for i in range(embed.shape[0]):
            _xnorm += np.linalg.norm(embed[i])
            _xnorm_cnt += 1
    _xnorm /= _xnorm_cnt

    acc1 = 0.0
    std1 = 0.0
    # Sum original + flipped embeddings, then L2-normalise.
    embeddings = embeddings_list[0] + embeddings_list[1]
    embeddings = sklearn.preprocessing.normalize(embeddings)
    print(embeddings.shape)
    print('infer time', time_consumed)
    _, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=nfolds)
    acc2, std2 = np.mean(accuracy), np.std(accuracy)
    return acc1, std1, acc2, std2, _xnorm, embeddings_list


@torch.no_grad()
def dumpR(data_set,
          backbone,
          batch_size,
          name='',
          data_extra=None,
          label_shape=None):
    """Compute flip-summed, normalised embeddings and pickle them to temp.bin.

    ``data_extra`` and ``label_shape`` are unused legacy parameters kept for
    backward compatibility with the old MXNet signature.

    Fix: the previous body still called the MXNet ``model.forward`` path and
    referenced ``_label``/``model``/``_data_extra``, none of which exist here
    (guaranteed NameError). It now extracts embeddings through the torch
    ``backbone``, mirroring ``test`` above.
    """
    print('dump verification embedding..')
    data_list = data_set[0]
    issame_list = data_set[1]
    embeddings_list = []
    time_consumed = 0.0
    for data in data_list:
        embeddings = None
        ba = 0
        while ba < data.shape[0]:
            bb = min(ba + batch_size, data.shape[0])
            count = bb - ba
            _data = data[bb - batch_size: bb]
            time0 = datetime.datetime.now()
            img = ((_data / 255) - 0.5) / 0.5
            net_out = backbone(img)
            _embeddings = net_out.detach().cpu().numpy()
            time_consumed += (datetime.datetime.now() - time0).total_seconds()
            if embeddings is None:
                embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
            embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :]
            ba = bb
        embeddings_list.append(embeddings)
    embeddings = embeddings_list[0] + embeddings_list[1]
    embeddings = sklearn.preprocessing.normalize(embeddings)
    outname = os.path.join('temp.bin')
    with open(outname, 'wb') as f:
        pickle.dump((embeddings, issame_list), f,
                    protocol=pickle.HIGHEST_PROTOCOL)


# NOTE(review): a large commented-out MXNet CLI entry point used to live
# here; it was dead code referencing the removed mx.mod API and has been
# deleted.


# --- file: insightface/recognition/vpl/eval_ijbc.py ---
# coding: utf-8

import os
import pickle

import matplotlib
import pandas as pd

matplotlib.use('Agg')
import matplotlib.pyplot as plt
import timeit
import sklearn
import argparse
from sklearn.metrics import roc_curve, auc

from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap
from prettytable import PrettyTable
from pathlib import Path
import sys
import warnings

sys.path.insert(0, "../")
warnings.filterwarnings("ignore")

parser = argparse.ArgumentParser(description='do ijb test')
# general
parser.add_argument('--model-prefix', default='', help='path to load model.')
parser.add_argument('--image-path', default='', type=str, help='')
parser.add_argument('--result-dir', default='.', type=str, help='')
parser.add_argument('--batch-size', default=128, type=int, help='')
parser.add_argument('--network', default='iresnet50', type=str, help='')
parser.add_argument('--job', default='insightface', type=str, help='job name')
parser.add_argument('--target', default='IJBC', type=str, help='target, set to IJBC or IJBB')
args = parser.parse_args()

target = args.target
model_path = args.model_prefix
image_path = args.image_path
result_dir = args.result_dir
gpu_id = None
use_norm_score = True  # if True, TestMode(N1)
use_detector_score = True  # if True, TestMode(D1)
use_flip_test = True  # if True, TestMode(F1)
job = args.job
batch_size = args.batch_size

import cv2
import numpy as np
import torch
from skimage import transform as trans
import backbones


class Embedding(object):
    """Torch-based face embedding extractor with 5-point similarity alignment."""

    def __init__(self, prefix, data_shape, batch_size=1):
        image_size = (112, 112)
        self.image_size = image_size
        weight = torch.load(prefix)
        # getattr instead of eval: same lookup, no arbitrary-code evaluation.
        resnet = getattr(backbones, args.network)(False).cuda()
        resnet.load_state_dict(weight)
        model = torch.nn.DataParallel(resnet)
        self.model = model
        self.model.eval()
        # ArcFace canonical 5 landmark destinations for a 112x112 crop.
        src = np.array([
            [30.2946, 51.6963],
            [65.5318, 51.5014],
            [48.0252, 71.7366],
            [33.5493, 92.3655],
            [62.7299, 92.2041]], dtype=np.float32)
        src[:, 0] += 8.0
        self.src = src
        self.batch_size = batch_size
        self.data_shape = data_shape

    def get(self, rimg, landmark):
        """Align *rimg* by *landmark* (68 or 5 points) and return a
        (2, 3, 112, 112) uint8 blob: aligned image + horizontal flip."""
        assert landmark.shape[0] == 68 or landmark.shape[0] == 5
        assert landmark.shape[1] == 2
        if landmark.shape[0] == 68:
            # Reduce 68-point annotation to the standard 5 points.
            landmark5 = np.zeros((5, 2), dtype=np.float32)
            landmark5[0] = (landmark[36] + landmark[39]) / 2
            landmark5[1] = (landmark[42] + landmark[45]) / 2
            landmark5[2] = landmark[30]
            landmark5[3] = landmark[48]
            landmark5[4] = landmark[54]
        else:
            landmark5 = landmark
        tform = trans.SimilarityTransform()
        tform.estimate(landmark5, self.src)
        M = tform.params[0:2, :]
        img = cv2.warpAffine(rimg,
                             M, (self.image_size[1], self.image_size[0]),
                             borderValue=0.0)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_flip = np.fliplr(img)
        img = np.transpose(img, (2, 0, 1))  # 3*112*112, RGB
        img_flip = np.transpose(img_flip, (2, 0, 1))
        input_blob = np.zeros((2, 3, self.image_size[1], self.image_size[0]), dtype=np.uint8)
        input_blob[0] = img
        input_blob[1] = img_flip
        return input_blob

    @torch.no_grad()
    def forward_db(self, batch_data):
        """Embed a (2*batch, 3, 112, 112) blob; returns (batch, 2*feat_dim)."""
        imgs = torch.Tensor(batch_data).cuda()
        imgs.div_(255).sub_(0.5).div_(0.5)  # map [0,255] -> [-1,1]
        feat = self.model(imgs)
        feat = feat.reshape([self.batch_size, 2 * feat.shape[1]])
        return feat.cpu().numpy()


def divideIntoNstrand(listTemp, n):
    """Round-robin split of *listTemp* into n sublists; sublists may be empty
    when n exceeds the number of elements."""
    twoList = [[] for _ in range(n)]
    for i, e in enumerate(listTemp):
        twoList[i % n].append(e)
    return twoList


def read_template_media_list(path):
    """Read (template_id, media_id) columns from an IJB meta file."""
    ijb_meta = pd.read_csv(path, sep=' ', header=None).values
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int is the documented replacement.
    templates = ijb_meta[:, 1].astype(int)
    medias = ijb_meta[:, 2].astype(int)
    return templates, medias


def read_template_pair_list(path):
    """Read (template_id_1, template_id_2, label) columns from a pair file."""
    pairs = pd.read_csv(path, sep=' ', header=None).values
    t1 = pairs[:, 0].astype(int)
    t2 = pairs[:, 1].astype(int)
    label = pairs[:, 2].astype(int)
    return t1, t2, label


def read_image_feature(path):
    """Unpickle precomputed image features from *path*."""
    with open(path, 'rb') as fid:
        img_feats = pickle.load(fid)
    return img_feats


def get_image_feature(img_path, files_list, model_path, epoch, gpu_id):
    """Embed every listed image; returns (img_feats, faceness_scores).

    ``img_feats`` is (num_images, 1024): concat of original + flipped
    512-d embeddings. The trailing ``len(files) % batch_size`` images are
    processed with a second, smaller-batch Embedding instance.
    """
    batch_size = args.batch_size
    data_shape = (3, 112, 112)

    files = files_list
    print('files:', len(files))
    rare_size = len(files) % batch_size
    faceness_scores = []
    batch = 0
    img_feats = np.empty((len(files), 1024), dtype=np.float32)

    batch_data = np.empty((2 * batch_size, 3, 112, 112))
    embedding = Embedding(model_path, data_shape, batch_size)
    for img_index, each_line in enumerate(files[:len(files) - rare_size]):
        name_lmk_score = each_line.strip().split(' ')
        img_name = os.path.join(img_path, name_lmk_score[0])
        img = cv2.imread(img_name)
        lmk = np.array([float(x) for x in name_lmk_score[1:-1]],
                       dtype=np.float32)
        lmk = lmk.reshape((5, 2))
        input_blob = embedding.get(img, lmk)

        batch_data[2 * (img_index - batch * batch_size)][:] = input_blob[0]
        batch_data[2 * (img_index - batch * batch_size) + 1][:] = input_blob[1]
        if (img_index + 1) % batch_size == 0:
            print('batch', batch)
            img_feats[batch * batch_size:batch * batch_size +
                      batch_size][:] = embedding.forward_db(batch_data)
            batch += 1
        faceness_scores.append(name_lmk_score[-1])

    # Remainder images (len(files) % batch_size) in one smaller batch.
    batch_data = np.empty((2 * rare_size, 3, 112, 112))
    embedding = Embedding(model_path, data_shape, rare_size)
    for img_index, each_line in enumerate(files[len(files) - rare_size:]):
        name_lmk_score = each_line.strip().split(' ')
        img_name = os.path.join(img_path, name_lmk_score[0])
        img = cv2.imread(img_name)
        lmk = np.array([float(x) for x in name_lmk_score[1:-1]],
                       dtype=np.float32)
        lmk = lmk.reshape((5, 2))
        input_blob = embedding.get(img, lmk)
        batch_data[2 * img_index][:] = input_blob[0]
        batch_data[2 * img_index + 1][:] = input_blob[1]
        if (img_index + 1) % rare_size == 0:
            print('batch', batch)
            img_feats[len(files) -
                      rare_size:][:] = embedding.forward_db(batch_data)
            batch += 1
        faceness_scores.append(name_lmk_score[-1])
    faceness_scores = np.array(faceness_scores).astype(np.float32)
    return img_feats, faceness_scores
def image2template_feature(img_feats=None, templates=None, medias=None):
    """Aggregate per-image features into one L2-normalised feature per template.

    1. media pooling: features of images from the same media (video) are
       averaged into a single media feature;
    2. template pooling: media features are summed per template;
    3. template features are L2-normalised.

    Returns ``(template_norm_feats, unique_templates)``.
    """
    unique_templates = np.unique(templates)
    template_feats = np.zeros((len(unique_templates), img_feats.shape[1]))

    for count_template, uqt in enumerate(unique_templates):
        (ind_t,) = np.where(templates == uqt)
        face_norm_feats = img_feats[ind_t]
        face_medias = medias[ind_t]
        unique_medias, unique_media_counts = np.unique(face_medias,
                                                       return_counts=True)
        media_norm_feats = []
        for u, ct in zip(unique_medias, unique_media_counts):
            (ind_m,) = np.where(face_medias == u)
            if ct == 1:
                media_norm_feats += [face_norm_feats[ind_m]]
            else:  # frames of the same video collapse into one mean feature
                media_norm_feats += [
                    np.mean(face_norm_feats[ind_m], axis=0, keepdims=True)
                ]
        media_norm_feats = np.array(media_norm_feats)
        template_feats[count_template] = np.sum(media_norm_feats, axis=0)
        if count_template % 2000 == 0:
            print('Finish Calculating {} template features.'.format(
                count_template))
    template_norm_feats = sklearn.preprocessing.normalize(template_feats)
    return template_norm_feats, unique_templates


def verification(template_norm_feats=None,
                 unique_templates=None,
                 p1=None,
                 p2=None):
    """Cosine similarity for each template pair ``(p1[i], p2[i])``.

    Pairs are processed in chunks of 100k to bound peak memory; returns a
    1-D score array aligned with p1/p2.
    """
    template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int)
    for count_template, uqt in enumerate(unique_templates):
        template2id[uqt] = count_template

    score = np.zeros((len(p1),))  # cosine distance between pairs

    total_pairs = np.arange(len(p1))
    batchsize = 100000  # chunked instead of all pairs at once (memory limit)
    sublists = [
        total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize)
    ]
    total_sublists = len(sublists)
    for c, s in enumerate(sublists):
        feat1 = template_norm_feats[template2id[p1[s]]]
        feat2 = template_norm_feats[template2id[p2[s]]]
        similarity_score = np.sum(feat1 * feat2, -1)
        score[s] = similarity_score.flatten()
        if c % 10 == 0:
            print('Finish {}/{} pairs.'.format(c, total_sublists))
    return score


def verification2(template_norm_feats=None,
                  unique_templates=None,
                  p1=None,
                  p2=None):
    """Identical to :func:`verification`; the previous copy-pasted duplicate
    body is replaced by delegation. Kept as a separate public name for
    backward compatibility."""
    return verification(template_norm_feats, unique_templates, p1, p2)


def read_score(path):
    """Unpickle a saved score/feature blob from *path*."""
    with open(path, 'rb') as fid:
        img_feats = pickle.load(fid)
    return img_feats


# # Step1: Load Meta Data

assert target == 'IJBC' or target == 'IJBB'
# =============================================================
# load image and template relationships for template feature embedding
# tid --> template id, mid --> media id
# format: image_name tid mid
# =============================================================
start = timeit.default_timer()
templates, medias = read_template_media_list(
    os.path.join('%s/meta' % image_path,
                 '%s_face_tid_mid.txt' % target.lower()))
stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))

# =============================================================
# load template pairs for template-to-template verification
# tid : template id, label : 1/0
# format: tid_1 tid_2 label
# =============================================================
start = timeit.default_timer()
p1, p2, label = read_template_pair_list(
    os.path.join('%s/meta' % image_path,
                 '%s_template_pair_label.txt' % target.lower()))
stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))

# # Step 2: Get Image Features
# img_feats: [image_num x feats_dim] (e.g. 227630 x 1024)
start = timeit.default_timer()
img_path = '%s/loose_crop' % image_path
img_list_path = '%s/meta/%s_name_5pts_score.txt' % (image_path, target.lower())
# Context manager closes the list file (the old code leaked the handle).
with open(img_list_path) as img_list:
    files = img_list.readlines()
files_list = files

img_feats, faceness_scores = get_image_feature(img_path, files_list,
                                               model_path, 0, gpu_id)
stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))
print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0],
                                          img_feats.shape[1]))

# # Step3: Get Template Features
# Feature norm (N1) and detector faceness score (D1) down-weight noisy
# (non-face) samples before aggregation.
start = timeit.default_timer()

if use_flip_test:
    # F2: add the embeddings of the original and flipped image.
    img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] \
        + img_feats[:, img_feats.shape[1] // 2:]
else:
    img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2]

if not use_norm_score:
    # normalise features to remove norm information
    img_input_feats = img_input_feats / np.sqrt(
        np.sum(img_input_feats ** 2, -1, keepdims=True))

if use_detector_score:
    print(img_input_feats.shape, faceness_scores.shape)
    img_input_feats = img_input_feats * faceness_scores[:, np.newaxis]

template_norm_feats, unique_templates = image2template_feature(
    img_input_feats, templates, medias)
stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))

# # Step 4: Get Template Similarity Scores
start = timeit.default_timer()
score = verification(template_norm_feats, unique_templates, p1, p2)
stop = timeit.default_timer()
print('Time: %.2f s. ' % (stop - start))

save_path = os.path.join(result_dir, args.job)
if not os.path.exists(save_path):
    os.makedirs(save_path)

score_save_file = os.path.join(save_path, "%s.npy" % target.lower())
np.save(score_save_file, score)

# # Step 5: Get ROC Curves and TPR@FPR Table
files = [score_save_file]
methods = []
scores = []
for file in files:
    methods.append(Path(file).stem)
    scores.append(np.load(file))

methods = np.array(methods)
scores = dict(zip(methods, scores))
colours = dict(
    zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2')))
x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1]
tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels])
fig = plt.figure()
for method in methods:
    fpr, tpr, _ = roc_curve(label, scores[method])
    roc_auc = auc(fpr, tpr)
    fpr = np.flipud(fpr)
    tpr = np.flipud(tpr)  # select largest tpr at same fpr
    plt.plot(fpr,
             tpr,
             color=colours[method],
             lw=1,
             label=('[%s (AUC = %0.4f %%)]' %
                    (method.split('-')[-1], roc_auc * 100)))
    tpr_fpr_row = []
    tpr_fpr_row.append("%s-%s" % (method, target))
    for fpr_iter in np.arange(len(x_labels)):
        # nearest fpr grid point -> report its tpr
        _, min_index = min(
            list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr)))))
        tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100))
    tpr_fpr_table.add_row(tpr_fpr_row)
plt.xlim([10 ** -6, 0.1])
plt.ylim([0.3, 1.0])
plt.grid(linestyle='--', linewidth=1)
plt.xticks(x_labels)
plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True))
plt.xscale('log')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC on IJB')
plt.legend(loc="lower right")
fig.savefig(os.path.join(save_path, '%s.pdf' % target.lower()))
print(tpr_fpr_table)
# --- file: insightface/recognition/vpl/inference.py ---
import argparse

import numpy as np
import torch


@torch.no_grad()
def inference(weight, name, img):
    """Embed one image with backbone *name* loaded from checkpoint *weight*
    and print the feature vector. When *img* is None a random 112x112 image
    is used (smoke-test mode); otherwise *img* is a path read with cv2."""
    # Lazy imports: cv2 and the project-local `backbones` package are only
    # needed when the function actually runs, so importing this module does
    # not require them.
    import cv2
    from backbones import get_model

    if img is None:
        img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.uint8)
    else:
        img = cv2.imread(img)

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.transpose(img, (2, 0, 1))
    img = torch.from_numpy(img).unsqueeze(0).float()
    img.div_(255).sub_(0.5).div_(0.5)  # map [0,255] -> [-1,1]
    net = get_model(name, fp16=False)
    net.load_state_dict(torch.load(weight))
    net.eval()
    feat = net(img).numpy()
    print(feat)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='PyTorch ArcFace Training')
    parser.add_argument('--network', type=str, default='r50', help='backbone network')
    parser.add_argument('--weight', type=str, default='')
    parser.add_argument('--img', type=str, default=None)
    args = parser.parse_args()
    inference(args.weight, args.network, args.img)


# --- file: insightface/recognition/vpl/losses.py ---
from torch import nn


def get_loss(name):
    """Return the margin head for *name*: 'cosface' or 'arcface'.

    Raises ValueError for any other name (now with a message instead of a
    bare ``ValueError()``)."""
    if name == "cosface":
        return CosFace()
    elif name == "arcface":
        return ArcFace()
    else:
        raise ValueError("unknown loss name: %r (expected 'cosface' or 'arcface')" % (name,))


class CosFace(nn.Module):
    """CosFace additive-cosine margin: s * (cos(theta) - m) on the target
    logit. Rows whose label is -1 (samples not owned by this partition in
    partial-FC training) get no margin, only the global scaling by s."""

    def __init__(self, s=64.0, m=0.40):
        super(CosFace, self).__init__()
        self.s = s  # feature scale
        self.m = m  # cosine margin

    def forward(self, cosine, label):
        # NOTE: subtracts the margin from `cosine` in place.
        index = torch.where(label != -1)[0]
        m_hot = torch.zeros(index.size()[0], cosine.size()[1], device=cosine.device)
        m_hot.scatter_(1, label[index, None], self.m)
        cosine[index] -= m_hot
        ret = cosine * self.s
        return ret


class ArcFace(nn.Module):
    """ArcFace additive-angular margin: s * cos(theta + m) on the target logit."""
def __init__(self, s=64.0, m=0.5): + super(ArcFace, self).__init__() + self.s = s + self.m = m + + def forward(self, cosine: torch.Tensor, label): + index = torch.where(label != -1)[0] + m_hot = torch.zeros(index.size()[0], cosine.size()[1], device=cosine.device) + m_hot.scatter_(1, label[index, None], self.m) + cosine.acos_() + cosine[index] += m_hot + cosine.cos_().mul_(self.s) + return cosine diff --git a/insightface/recognition/vpl/onnx_helper.py b/insightface/recognition/vpl/onnx_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..3b4c47c6e4e1731e1d1d69ed4159765091463b6a --- /dev/null +++ b/insightface/recognition/vpl/onnx_helper.py @@ -0,0 +1,199 @@ +import argparse +import datetime +import os +import os.path as osp + +import cv2 +import numpy as np +import onnx +import onnxruntime +from onnx import numpy_helper + + +class ArcFaceORT: + def __init__(self, model_path): + self.model_path = model_path + + def check(self, test_img=None): + max_model_size_mb = 1024 + max_feat_dim = 512 + max_time_cost = 15 + + if not os.path.exists(self.model_path): + return "model_path not exists" + if not os.path.isdir(self.model_path): + return "model_path should be directory" + onnx_files = [] + for _file in os.listdir(self.model_path): + print('file_:', _file) + if _file.endswith('.onnx'): + onnx_files.append(osp.join(self.model_path, _file)) + if len(onnx_files) == 0: + return "do not have onnx files" + self.model_file = sorted(onnx_files)[-1] + print('use onnx-model:', self.model_file) + try: + session = onnxruntime.InferenceSession(self.model_file, None) + except: + return "load onnx failed" + + input_cfg = session.get_inputs()[0] + input_shape = input_cfg.shape + print('input-shape:', input_shape) + if len(input_shape) != 4: + return "length of input_shape should be 4" + if not isinstance(input_shape[0], str): + # return "input_shape[0] should be str to support batch-inference" + print('reset input-shape[0] to None') + model = 
onnx.load(self.model_file) + model.graph.input[0].type.tensor_type.shape.dim[0].dim_param = 'None' + new_model_file = osp.join(self.model_path, 'zzzzrefined.onnx') + onnx.save(model, new_model_file) + self.model_file = new_model_file + print('use new onnx-model:', self.model_file) + try: + session = onnxruntime.InferenceSession(self.model_file, None) + except: + return "load onnx failed" + + input_cfg = session.get_inputs()[0] + input_shape = input_cfg.shape + print('new-input-shape:', input_shape) + + self.image_size = tuple(input_shape[2:4][::-1]) + # print('image_size:', self.image_size) + input_name = input_cfg.name + outputs = session.get_outputs() + output_names = [] + for o in outputs: + output_names.append(o.name) + # print(o.name, o.shape) + if len(output_names) != 1: + return "number of output nodes should be 1" + self.session = session + self.input_name = input_name + self.output_names = output_names + # print(self.output_names) + model = onnx.load(self.model_file) + graph = model.graph + if len(graph.node) < 8: + return "too small onnx graph" + + input_size = (112, 112) + self.crop = None + if True: + crop_file = osp.join(self.model_path, 'crop.txt') + if osp.exists(crop_file): + lines = open(crop_file, 'r').readlines() + if len(lines) != 6: + return "crop.txt should contain 6 lines" + lines = [int(x) for x in lines] + self.crop = lines[:4] + input_size = tuple(lines[4:6]) + if input_size != self.image_size: + return "input-size is inconsistant with onnx model input, %s vs %s" % (input_size, self.image_size) + + self.model_size_mb = os.path.getsize(self.model_file) / float(1024 * 1024) + if self.model_size_mb > max_model_size_mb: + return "max model size exceed, given %.3f-MB" % self.model_size_mb + + input_mean = None + input_std = None + if True: + pn_file = osp.join(self.model_path, 'pixel_norm.txt') + if osp.exists(pn_file): + lines = open(pn_file, 'r').readlines() + if len(lines) != 2: + return "pixel_norm.txt should contain 2 lines" + input_mean = 
float(lines[0]) + input_std = float(lines[1]) + if input_mean is not None or input_std is not None: + if input_mean is None or input_std is None: + return "please set input_mean and input_std simultaneously" + else: + find_sub = False + find_mul = False + for nid, node in enumerate(graph.node[:8]): + print(nid, node.name) + if node.name.startswith('Sub') or node.name.startswith('_minus'): + find_sub = True + if node.name.startswith('Mul') or node.name.startswith('_mul'): + find_mul = True + if find_sub and find_mul: + # mxnet arcface model + input_mean = 0.0 + input_std = 1.0 + else: + input_mean = 127.5 + input_std = 127.5 + self.input_mean = input_mean + self.input_std = input_std + for initn in graph.initializer: + weight_array = numpy_helper.to_array(initn) + + dt = weight_array.dtype + if dt.itemsize < 4: + return 'invalid weight type - (%s:%s)' % (initn.name, dt.name) + if test_img is None: + test_img = np.random.randint(0, 255, size=(self.image_size[1], self.image_size[0], 3), dtype=np.uint8) + else: + test_img = cv2.resize(test_img, self.image_size) + feat, cost = self.benchmark(test_img) + if feat.shape[1] > max_feat_dim: + return "max feat dim exceed, given %d" % feat.shape[1] + self.feat_dim = feat.shape[1] + cost_ms = cost * 1000 + if cost_ms > max_time_cost: + return "max time cost exceed, given %.4f" % cost_ms + self.cost_ms = cost_ms + print( + 'check stat:, model-size-mb: %.4f, feat-dim: %d, time-cost-ms: %.4f, input-mean: %.3f, input-std: %.3f' % ( + self.model_size_mb, self.feat_dim, self.cost_ms, self.input_mean, self.input_std)) + return None + + def meta_info(self): + return {'model-size-mb': self.model_size_mb, 'feature-dim': self.feat_dim, 'infer': self.cost_ms} + + def forward(self, imgs): + if not isinstance(imgs, list): + imgs = [imgs] + input_size = self.image_size + if self.crop is not None: + nimgs = [] + for img in imgs: + nimg = img[self.crop[1]:self.crop[3], self.crop[0]:self.crop[2], :] + if nimg.shape[0] != input_size[1] or 
nimg.shape[1] != input_size[0]: + nimg = cv2.resize(nimg, input_size) + nimgs.append(nimg) + imgs = nimgs + blob = cv2.dnn.blobFromImages(imgs, 1.0 / self.input_std, input_size, + (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] + return net_out + + def benchmark(self, img): + input_size = self.image_size + if self.crop is not None: + nimg = img[self.crop[1]:self.crop[3], self.crop[0]:self.crop[2], :] + if nimg.shape[0] != input_size[1] or nimg.shape[1] != input_size[0]: + nimg = cv2.resize(nimg, input_size) + img = nimg + blob = cv2.dnn.blobFromImage(img, 1.0 / self.input_std, input_size, + (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + costs = [] + for _ in range(50): + ta = datetime.datetime.now() + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] + tb = datetime.datetime.now() + cost = (tb - ta).total_seconds() + costs.append(cost) + costs = sorted(costs) + cost = costs[5] + return net_out, cost + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--model_root", help="onnx model root, default is './'", default="./") + args = parser.parse_args() + ArcFaceORT(args.model_root).check() diff --git a/insightface/recognition/vpl/onnx_ijbc.py b/insightface/recognition/vpl/onnx_ijbc.py new file mode 100644 index 0000000000000000000000000000000000000000..05b50bfad4b4cf38903b89f596263a8e29a50d3e --- /dev/null +++ b/insightface/recognition/vpl/onnx_ijbc.py @@ -0,0 +1,267 @@ +import argparse +import os +import pickle +import timeit + +import cv2 +import mxnet as mx +import numpy as np +import pandas as pd +import prettytable +import skimage.transform +from sklearn.metrics import roc_curve +from sklearn.preprocessing import normalize + +from onnx_helper import ArcFaceORT + +SRC = np.array( + [ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 
92.2041]] + , dtype=np.float32) +SRC[:, 0] += 8.0 + + +class AlignedDataSet(mx.gluon.data.Dataset): + def __init__(self, root, lines, align=True): + self.lines = lines + self.root = root + self.align = align + + def __len__(self): + return len(self.lines) + + def __getitem__(self, idx): + each_line = self.lines[idx] + name_lmk_score = each_line.strip().split(' ') + name = os.path.join(self.root, name_lmk_score[0]) + img = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB) + landmark5 = np.array([float(x) for x in name_lmk_score[1:-1]], dtype=np.float32).reshape((5, 2)) + st = skimage.transform.SimilarityTransform() + st.estimate(landmark5, SRC) + img = cv2.warpAffine(img, st.params[0:2, :], (112, 112), borderValue=0.0) + img_1 = np.expand_dims(img, 0) + img_2 = np.expand_dims(np.fliplr(img), 0) + output = np.concatenate((img_1, img_2), axis=0).astype(np.float32) + output = np.transpose(output, (0, 3, 1, 2)) + output = mx.nd.array(output) + return output + + +def extract(model_root, dataset): + model = ArcFaceORT(model_path=model_root) + model.check() + feat_mat = np.zeros(shape=(len(dataset), 2 * model.feat_dim)) + + def batchify_fn(data): + return mx.nd.concat(*data, dim=0) + + data_loader = mx.gluon.data.DataLoader( + dataset, 128, last_batch='keep', num_workers=4, + thread_pool=True, prefetch=16, batchify_fn=batchify_fn) + num_iter = 0 + for batch in data_loader: + batch = batch.asnumpy() + batch = (batch - model.input_mean) / model.input_std + feat = model.session.run(model.output_names, {model.input_name: batch})[0] + feat = np.reshape(feat, (-1, model.feat_dim * 2)) + feat_mat[128 * num_iter: 128 * num_iter + feat.shape[0], :] = feat + num_iter += 1 + if num_iter % 50 == 0: + print(num_iter) + return feat_mat + + +def read_template_media_list(path): + ijb_meta = pd.read_csv(path, sep=' ', header=None).values + templates = ijb_meta[:, 1].astype(np.int) + medias = ijb_meta[:, 2].astype(np.int) + return templates, medias + + +def read_template_pair_list(path): + 
pairs = pd.read_csv(path, sep=' ', header=None).values + t1 = pairs[:, 0].astype(np.int) + t2 = pairs[:, 1].astype(np.int) + label = pairs[:, 2].astype(np.int) + return t1, t2, label + + +def read_image_feature(path): + with open(path, 'rb') as fid: + img_feats = pickle.load(fid) + return img_feats + + +def image2template_feature(img_feats=None, + templates=None, + medias=None): + unique_templates = np.unique(templates) + template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) + for count_template, uqt in enumerate(unique_templates): + (ind_t,) = np.where(templates == uqt) + face_norm_feats = img_feats[ind_t] + face_medias = medias[ind_t] + unique_medias, unique_media_counts = np.unique(face_medias, return_counts=True) + media_norm_feats = [] + for u, ct in zip(unique_medias, unique_media_counts): + (ind_m,) = np.where(face_medias == u) + if ct == 1: + media_norm_feats += [face_norm_feats[ind_m]] + else: # image features from the same video will be aggregated into one feature + media_norm_feats += [np.mean(face_norm_feats[ind_m], axis=0, keepdims=True), ] + media_norm_feats = np.array(media_norm_feats) + template_feats[count_template] = np.sum(media_norm_feats, axis=0) + if count_template % 2000 == 0: + print('Finish Calculating {} template features.'.format( + count_template)) + template_norm_feats = normalize(template_feats) + return template_norm_feats, unique_templates + + +def verification(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) + total_pairs = np.array(range(len(p1))) + batchsize = 100000 + sublists = [total_pairs[i: i + batchsize] for i in range(0, len(p1), batchsize)] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = 
template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def verification2(template_norm_feats=None, + unique_templates=None, + p1=None, + p2=None): + template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) + for count_template, uqt in enumerate(unique_templates): + template2id[uqt] = count_template + score = np.zeros((len(p1),)) # save cosine distance between pairs + total_pairs = np.array(range(len(p1))) + batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation + sublists = [total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize)] + total_sublists = len(sublists) + for c, s in enumerate(sublists): + feat1 = template_norm_feats[template2id[p1[s]]] + feat2 = template_norm_feats[template2id[p2[s]]] + similarity_score = np.sum(feat1 * feat2, -1) + score[s] = similarity_score.flatten() + if c % 10 == 0: + print('Finish {}/{} pairs.'.format(c, total_sublists)) + return score + + +def main(args): + use_norm_score = True # if Ture, TestMode(N1) + use_detector_score = True # if Ture, TestMode(D1) + use_flip_test = True # if Ture, TestMode(F1) + assert args.target == 'IJBC' or args.target == 'IJBB' + + start = timeit.default_timer() + templates, medias = read_template_media_list( + os.path.join('%s/meta' % args.image_path, '%s_face_tid_mid.txt' % args.target.lower())) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + + start = timeit.default_timer() + p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % args.image_path, + '%s_template_pair_label.txt' % args.target.lower())) + stop = timeit.default_timer() + print('Time: %.2f s. 
' % (stop - start)) + + start = timeit.default_timer() + img_path = '%s/loose_crop' % args.image_path + img_list_path = '%s/meta/%s_name_5pts_score.txt' % (args.image_path, args.target.lower()) + img_list = open(img_list_path) + files = img_list.readlines() + dataset = AlignedDataSet(root=img_path, lines=files, align=True) + img_feats = extract(args.model_root, dataset) + + faceness_scores = [] + for each_line in files: + name_lmk_score = each_line.split() + faceness_scores.append(name_lmk_score[-1]) + faceness_scores = np.array(faceness_scores).astype(np.float32) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0], img_feats.shape[1])) + start = timeit.default_timer() + + if use_flip_test: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + img_feats[:, img_feats.shape[1] // 2:] + else: + img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + + if use_norm_score: + img_input_feats = img_input_feats + else: + img_input_feats = img_input_feats / np.sqrt(np.sum(img_input_feats ** 2, -1, keepdims=True)) + + if use_detector_score: + print(img_input_feats.shape, faceness_scores.shape) + img_input_feats = img_input_feats * faceness_scores[:, np.newaxis] + else: + img_input_feats = img_input_feats + + template_norm_feats, unique_templates = image2template_feature( + img_input_feats, templates, medias) + stop = timeit.default_timer() + print('Time: %.2f s. ' % (stop - start)) + + start = timeit.default_timer() + score = verification(template_norm_feats, unique_templates, p1, p2) + stop = timeit.default_timer() + print('Time: %.2f s. 
' % (stop - start)) + save_path = os.path.join(args.result_dir, "{}_result".format(args.target)) + if not os.path.exists(save_path): + os.makedirs(save_path) + score_save_file = os.path.join(save_path, "{}.npy".format(args.model_root)) + np.save(score_save_file, score) + files = [score_save_file] + methods = [] + scores = [] + for file in files: + methods.append(os.path.basename(file)) + scores.append(np.load(file)) + methods = np.array(methods) + scores = dict(zip(methods, scores)) + x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] + tpr_fpr_table = prettytable.PrettyTable(['Methods'] + [str(x) for x in x_labels]) + for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, args.target)) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) + print(tpr_fpr_table) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='do ijb test') + # general + parser.add_argument('--model-root', default='', help='path to load model.') + parser.add_argument('--image-path', default='', type=str, help='') + parser.add_argument('--result-dir', default='.', type=str, help='') + parser.add_argument('--target', default='IJBC', type=str, help='target, set to IJBC or IJBB') + main(parser.parse_args()) diff --git a/insightface/recognition/vpl/run.sh b/insightface/recognition/vpl/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..f86c5acb7f8a7d59d714cc1902134a3fe23f0a39 --- /dev/null +++ b/insightface/recognition/vpl/run.sh @@ -0,0 +1,2 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -u -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py $@ +#ps -ef | grep "train" | 
grep -v grep | awk '{print "kill -9 "$2}' | sh diff --git a/insightface/recognition/vpl/torch2onnx.py b/insightface/recognition/vpl/torch2onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..4e7a68e682b649804a853f999c1281a1a4b49191 --- /dev/null +++ b/insightface/recognition/vpl/torch2onnx.py @@ -0,0 +1,59 @@ +import numpy as np +import onnx +import torch + + +def convert_onnx(net, path_module, output, opset=11, simplify=False): + assert isinstance(net, torch.nn.Module) + img = np.random.randint(0, 255, size=(112, 112, 3), dtype=np.int32) + img = img.astype(np.float) + img = (img / 255. - 0.5) / 0.5 # torch style norm + img = img.transpose((2, 0, 1)) + img = torch.from_numpy(img).unsqueeze(0).float() + + weight = torch.load(path_module) + net.load_state_dict(weight) + net.eval() + torch.onnx.export(net, img, output, keep_initializers_as_inputs=False, verbose=False, opset_version=opset) + model = onnx.load(output) + graph = model.graph + graph.input[0].type.tensor_type.shape.dim[0].dim_param = 'None' + if simplify: + from onnxsim import simplify + model, check = simplify(model) + assert check, "Simplified ONNX model could not be validated" + onnx.save(model, output) + + +if __name__ == '__main__': + import os + import argparse + from backbones import get_model + + parser = argparse.ArgumentParser(description='ArcFace PyTorch to onnx') + parser.add_argument('input', type=str, help='input backbone.pth file or path') + parser.add_argument('--output', type=str, default=None, help='output onnx path') + parser.add_argument('--network', type=str, default=None, help='backbone network') + parser.add_argument('--simplify', type=bool, default=True, help='onnx simplify') + args = parser.parse_args() + input_file = args.input + if os.path.isdir(input_file): + input_file = os.path.join(input_file, "backbone.pth") + assert os.path.exists(input_file) + model_name = os.path.basename(os.path.dirname(input_file)).lower() + params = model_name.split("_") + if 
len(params) >= 3 and params[1] in ('arcface', 'cosface'): + if args.network is None: + args.network = params[2] + assert args.network is not None + print(args) + backbone_onnx = get_model(args.network, dropout=0) + + output_path = args.output + if output_path is None: + output_path = os.path.join(os.path.dirname(__file__), 'onnx') + if not os.path.exists(output_path): + os.makedirs(output_path) + assert os.path.isdir(output_path) + output_file = os.path.join(output_path, "%s.onnx" % model_name) + convert_onnx(backbone_onnx, input_file, output_file, simplify=args.simplify) diff --git a/insightface/recognition/vpl/train.py b/insightface/recognition/vpl/train.py new file mode 100644 index 0000000000000000000000000000000000000000..45a594ba8dc99500496b23db10fe69c00b4919b9 --- /dev/null +++ b/insightface/recognition/vpl/train.py @@ -0,0 +1,180 @@ +import argparse +import logging +import os +import time + +import torch +import torch.distributed as dist +import torch.nn.functional as F +import torch.utils.data.distributed +from torch.nn.utils import clip_grad_norm_ + +import losses +from backbones import get_model +from dataset import MXFaceDataset, DataLoaderX +from torch.utils.data import DataLoader, Dataset +from vpl import VPL +from utils.utils_amp import MaxClipGradScaler +from utils.utils_callbacks import CallBackVerification, CallBackLogging, CallBackModelCheckpoint +from utils.utils_logging import AverageMeter, init_logging +from utils.utils_dist import concat_all_gather, batch_shuffle_ddp, batch_unshuffle_ddp +from utils.utils_config import get_config + + +def main(args): + cfg = get_config(args.config) + if not cfg.tf32: + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + try: + world_size = int(os.environ['WORLD_SIZE']) + rank = int(os.environ['RANK']) + dist_url = "tcp://{}:{}".format(os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"]) + except KeyError: + world_size = 1 + rank = 0 + dist_url = "tcp://127.0.0.1:12584" + 
+ dist.init_process_group(backend='nccl', init_method=dist_url, rank=rank, world_size=world_size) + local_rank = args.local_rank + torch.cuda.set_device(local_rank) + + if not os.path.exists(cfg.output) and rank==0: + os.makedirs(cfg.output) + else: + time.sleep(2) + + log_root = logging.getLogger() + init_logging(log_root, rank, cfg.output) + if rank==0: + logging.info(args) + logging.info(cfg) + + train_set = MXFaceDataset(root_dir=cfg.rec, local_rank=local_rank) + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_set, shuffle=True) + train_loader = DataLoaderX( + local_rank=local_rank, dataset=train_set, batch_size=cfg.batch_size, + sampler=train_sampler, num_workers=2, pin_memory=True, drop_last=True) + + dropout = 0.4 if cfg.dataset == "webface" else 0 + backbone = get_model(cfg.network, dropout=dropout, fp16=cfg.fp16).to(local_rank) + backbone_onnx = get_model(cfg.network, dropout=dropout, fp16=False) + + if args.resume: + try: + backbone_pth = os.path.join(cfg.output, "backbone.pth") + backbone.load_state_dict(torch.load(backbone_pth, map_location=torch.device(local_rank))) + if rank==0: + logging.info("backbone resume successfully!") + except (FileNotFoundError, KeyError, IndexError, RuntimeError): + logging.info("resume fail, backbone init successfully!") + + for ps in backbone.parameters(): + dist.broadcast(ps, 0) + backbone = torch.nn.parallel.DistributedDataParallel( + module=backbone, broadcast_buffers=False, device_ids=[local_rank]) + backbone.train() + + cfg_vpl = cfg.vpl + vpl_momentum = cfg_vpl['momentum'] + if vpl_momentum: + backbone_w = get_model(cfg.network, dropout=dropout, fp16=cfg.fp16).to(local_rank) + backbone_w.train() + for param_b, param_w in zip(backbone.module.parameters(), backbone_w.parameters()): + param_w.data.copy_(param_b.data) + param_w.requires_grad = False + + margin_softmax = losses.get_loss(cfg.loss) + module_fc = VPL( + rank=rank, local_rank=local_rank, world_size=world_size, resume=args.resume, + 
batch_size=cfg.batch_size, margin_softmax=margin_softmax, num_classes=cfg.num_classes, + sample_rate=cfg.sample_rate, embedding_size=cfg.embedding_size, prefix=cfg.output, + cfg = cfg_vpl) + #print('AAA') + + opt_backbone = torch.optim.SGD( + params=[{'params': backbone.parameters()}], + lr=cfg.lr / 512 * cfg.batch_size * world_size, + momentum=0.9, weight_decay=cfg.weight_decay) + opt_pfc = torch.optim.SGD( + params=[{'params': module_fc.parameters()}], + lr=cfg.lr / 512 * cfg.batch_size * world_size, + momentum=0.9, weight_decay=cfg.weight_decay) + + #print('AAA') + scheduler_backbone = torch.optim.lr_scheduler.LambdaLR( + optimizer=opt_backbone, lr_lambda=cfg.lr_func) + scheduler_pfc = torch.optim.lr_scheduler.LambdaLR( + optimizer=opt_pfc, lr_lambda=cfg.lr_func) + + start_epoch = 0 + total_step = int(len(train_set) / cfg.batch_size / world_size * cfg.num_epoch) + if rank==0: logging.info("Total Step is: %d" % total_step) + + #for epoch in range(start_epoch, cfg.num_epoch): + # _lr = cfg.lr_func(epoch) + # logging.info('%d:%f'%(epoch, _lr)) + + callback_verification = CallBackVerification(10000, rank, cfg.val_targets, cfg.rec) + callback_logging = CallBackLogging(50, rank, total_step, cfg.batch_size, world_size, None) + callback_checkpoint = CallBackModelCheckpoint(rank, cfg.output) + + loss = AverageMeter() + global_step = 0 + grad_amp = MaxClipGradScaler(cfg.batch_size, 128 * cfg.batch_size, growth_interval=100) if cfg.fp16 else None + use_batch_shuffle = True + alpha = 0.999 + for epoch in range(start_epoch, cfg.num_epoch): + train_sampler.set_epoch(epoch) + for step, (img, label) in enumerate(train_loader): + global_step += 1 + #img = img.to(memory_format=torch.channels_last) + features = F.normalize(backbone(img)) + feature_w = None + if vpl_momentum: + with torch.no_grad(): + for param_b, param_w in zip(backbone.module.parameters(), backbone_w.parameters()): + param_w.data = param_w.data * alpha + param_b.data * (1. 
- alpha) + if use_batch_shuffle: + img_w, idx_unshuffle = batch_shuffle_ddp(img, rank, world_size) + + feature_w = F.normalize(backbone_w(img_w)) + if use_batch_shuffle: + feature_w = batch_unshuffle_ddp(feature_w, idx_unshuffle, rank, world_size) + feature_w = feature_w.detach() + + x_grad, loss_v = module_fc.forward_backward(label, features, opt_pfc, feature_w) + if cfg.fp16: + features.backward(grad_amp.scale(x_grad)) + grad_amp.unscale_(opt_backbone) + clip_grad_norm_(backbone.parameters(), max_norm=5, norm_type=2) + grad_amp.step(opt_backbone) + grad_amp.update() + else: + features.backward(x_grad) + clip_grad_norm_(backbone.parameters(), max_norm=5, norm_type=2) + opt_backbone.step() + + opt_pfc.step() + module_fc.update() + opt_backbone.zero_grad() + opt_pfc.zero_grad() + loss.update(loss_v, 1) + callback_logging(global_step, loss, epoch, cfg.fp16, grad_amp) + callback_verification(global_step, backbone) + callback_checkpoint(global_step, backbone, module_fc, backbone_onnx) + scheduler_backbone.step() + scheduler_pfc.step() + dist.destroy_process_group() + + +if __name__ == "__main__": + torch.backends.cudnn.benchmark = True + parser = argparse.ArgumentParser(description='PyTorch ArcFace-VPL Training') + parser.add_argument('config', type=str, help='py config file') + parser.add_argument('--local_rank', type=int, default=0, help='local_rank') + parser.add_argument('--resume', type=int, default=0, help='model resuming') + args_ = parser.parse_args() + main(args_) + diff --git a/insightface/recognition/vpl/utils/__init__.py b/insightface/recognition/vpl/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/vpl/utils/plot.py b/insightface/recognition/vpl/utils/plot.py new file mode 100644 index 0000000000000000000000000000000000000000..ccc588e5c01ca550b69c385aeb3fd139c59fb88a --- /dev/null +++ b/insightface/recognition/vpl/utils/plot.py @@ -0,0 +1,72 
@@ +# coding: utf-8 + +import os +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from menpo.visualize.viewmatplotlib import sample_colours_from_colourmap +from prettytable import PrettyTable +from sklearn.metrics import roc_curve, auc + +image_path = "/data/anxiang/IJB_release/IJBC" +files = [ + "./ms1mv3_arcface_r100/ms1mv3_arcface_r100/ijbc.npy" +] + + +def read_template_pair_list(path): + pairs = pd.read_csv(path, sep=' ', header=None).values + t1 = pairs[:, 0].astype(np.int) + t2 = pairs[:, 1].astype(np.int) + label = pairs[:, 2].astype(np.int) + return t1, t2, label + + +p1, p2, label = read_template_pair_list( + os.path.join('%s/meta' % image_path, + '%s_template_pair_label.txt' % 'ijbc')) + +methods = [] +scores = [] +for file in files: + methods.append(file.split('/')[-2]) + scores.append(np.load(file)) + +methods = np.array(methods) +scores = dict(zip(methods, scores)) +colours = dict( + zip(methods, sample_colours_from_colourmap(methods.shape[0], 'Set2'))) +x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] +tpr_fpr_table = PrettyTable(['Methods'] + [str(x) for x in x_labels]) +fig = plt.figure() +for method in methods: + fpr, tpr, _ = roc_curve(label, scores[method]) + roc_auc = auc(fpr, tpr) + fpr = np.flipud(fpr) + tpr = np.flipud(tpr) # select largest tpr at same fpr + plt.plot(fpr, + tpr, + color=colours[method], + lw=1, + label=('[%s (AUC = %0.4f %%)]' % + (method.split('-')[-1], roc_auc * 100))) + tpr_fpr_row = [] + tpr_fpr_row.append("%s-%s" % (method, "IJBC")) + for fpr_iter in np.arange(len(x_labels)): + _, min_index = min( + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) + tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + tpr_fpr_table.add_row(tpr_fpr_row) +plt.xlim([10 ** -6, 0.1]) +plt.ylim([0.3, 1.0]) +plt.grid(linestyle='--', linewidth=1) +plt.xticks(x_labels) +plt.yticks(np.linspace(0.3, 1.0, 8, endpoint=True)) +plt.xscale('log') +plt.xlabel('False 
Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('ROC on IJB') +plt.legend(loc="lower right") +print(tpr_fpr_table) diff --git a/insightface/recognition/vpl/utils/utils_amp.py b/insightface/recognition/vpl/utils/utils_amp.py new file mode 100644 index 0000000000000000000000000000000000000000..c60067e99059e8802664d3664823d30ac4cf5ccc --- /dev/null +++ b/insightface/recognition/vpl/utils/utils_amp.py @@ -0,0 +1,81 @@ +from typing import Dict, List + +import torch +from torch._six import container_abcs +from torch.cuda.amp import GradScaler + + +class _MultiDeviceReplicator(object): + """ + Lazily serves copies of a tensor to requested devices. Copies are cached per-device. + """ + + def __init__(self, master_tensor: torch.Tensor) -> None: + assert master_tensor.is_cuda + self.master = master_tensor + self._per_device_tensors: Dict[torch.device, torch.Tensor] = {} + + def get(self, device) -> torch.Tensor: + retval = self._per_device_tensors.get(device, None) + if retval is None: + retval = self.master.to(device=device, non_blocking=True, copy=True) + self._per_device_tensors[device] = retval + return retval + + +class MaxClipGradScaler(GradScaler): + def __init__(self, init_scale, max_scale: float, growth_interval=100): + GradScaler.__init__(self, init_scale=init_scale, growth_interval=growth_interval) + self.max_scale = max_scale + + def scale_clip(self): + if self.get_scale() == self.max_scale: + self.set_growth_factor(1) + elif self.get_scale() < self.max_scale: + self.set_growth_factor(2) + elif self.get_scale() > self.max_scale: + self._scale.fill_(self.max_scale) + self.set_growth_factor(1) + + def scale(self, outputs): + """ + Multiplies ('scales') a tensor or list of tensors by the scale factor. + + Returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, outputs are returned + unmodified. + + Arguments: + outputs (Tensor or iterable of Tensors): Outputs to scale. 
+ """ + if not self._enabled: + return outputs + self.scale_clip() + # Short-circuit for the common case. + if isinstance(outputs, torch.Tensor): + assert outputs.is_cuda + if self._scale is None: + self._lazy_init_scale_growth_tracker(outputs.device) + assert self._scale is not None + return outputs * self._scale.to(device=outputs.device, non_blocking=True) + + # Invoke the more complex machinery only if we're treating multiple outputs. + stash: List[_MultiDeviceReplicator] = [] # holds a reference that can be overwritten by apply_scale + + def apply_scale(val): + if isinstance(val, torch.Tensor): + assert val.is_cuda + if len(stash) == 0: + if self._scale is None: + self._lazy_init_scale_growth_tracker(val.device) + assert self._scale is not None + stash.append(_MultiDeviceReplicator(self._scale)) + return val * stash[0].get(val.device) + elif isinstance(val, container_abcs.Iterable): + iterable = map(apply_scale, val) + if isinstance(val, list) or isinstance(val, tuple): + return type(val)(iterable) + else: + return iterable + else: + raise ValueError("outputs must be a Tensor or an iterable of Tensors") + return apply_scale(outputs) diff --git a/insightface/recognition/vpl/utils/utils_callbacks.py b/insightface/recognition/vpl/utils/utils_callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..43fb708438436b5a1ff5294a8e5be6781364e838 --- /dev/null +++ b/insightface/recognition/vpl/utils/utils_callbacks.py @@ -0,0 +1,112 @@ +import logging +import os +import time +from typing import List + +import torch + +from eval import verification +from torch2onnx import convert_onnx +from utils.utils_logging import AverageMeter + + +class CallBackVerification(object): + def __init__(self, frequent, rank, val_targets, rec_prefix, image_size=(112, 112)): + self.frequent: int = frequent + self.rank: int = rank + self.highest_acc: float = 0.0 + self.highest_acc_list: List[float] = [0.0] * len(val_targets) + self.ver_list: List[object] = [] + 
self.ver_name_list: List[str] = [] + if self.rank is 0: + self.init_dataset(val_targets=val_targets, data_dir=rec_prefix, image_size=image_size) + + def ver_test(self, backbone: torch.nn.Module, global_step: int): + results = [] + for i in range(len(self.ver_list)): + acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test( + self.ver_list[i], backbone, 10, 10) + logging.info('[%s][%d]XNorm: %f' % (self.ver_name_list[i], global_step, xnorm)) + logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (self.ver_name_list[i], global_step, acc2, std2)) + if acc2 > self.highest_acc_list[i]: + self.highest_acc_list[i] = acc2 + logging.info( + '[%s][%d]Accuracy-Highest: %1.5f' % (self.ver_name_list[i], global_step, self.highest_acc_list[i])) + results.append(acc2) + + def init_dataset(self, val_targets, data_dir, image_size): + for name in val_targets: + path = os.path.join(data_dir, name + ".bin") + if os.path.exists(path): + data_set = verification.load_bin(path, image_size) + self.ver_list.append(data_set) + self.ver_name_list.append(name) + + def __call__(self, num_update, backbone: torch.nn.Module): + if self.rank is 0 and num_update > 0 and num_update % self.frequent == 0: + backbone.eval() + self.ver_test(backbone, num_update) + backbone.train() + + +class CallBackLogging(object): + def __init__(self, frequent, rank, total_step, batch_size, world_size, writer=None): + self.frequent: int = frequent + self.rank: int = rank + self.time_start = time.time() + self.total_step: int = total_step + self.batch_size: int = batch_size + self.world_size: int = world_size + self.writer = writer + + self.init = False + self.tic = 0 + + def __call__(self, global_step, loss: AverageMeter, epoch: int, fp16: bool, grad_scaler: torch.cuda.amp.GradScaler): + if self.rank is 0 and global_step > 0 and global_step % self.frequent == 0: + if self.init: + try: + speed: float = self.frequent * self.batch_size / (time.time() - self.tic) + speed_total = speed * self.world_size + except 
ZeroDivisionError: + speed_total = float('inf') + + time_now = (time.time() - self.time_start) / 3600 + time_total = time_now / ((global_step + 1) / self.total_step) + time_for_end = time_total - time_now + if self.writer is not None: + self.writer.add_scalar('time_for_end', time_for_end, global_step) + self.writer.add_scalar('loss', loss.avg, global_step) + if fp16: + msg = "Speed %.2f samples/sec Loss %.4f Epoch: %d Global Step: %d " \ + "Fp16 Grad Scale: %2.f Required: %1.f hours" % ( + speed_total, loss.avg, epoch, global_step, grad_scaler.get_scale(), time_for_end + ) + else: + msg = "Speed %.2f samples/sec Loss %.4f Epoch: %d Global Step: %d Required: %1.f hours" % ( + speed_total, loss.avg, epoch, global_step, time_for_end + ) + logging.info(msg) + loss.reset() + self.tic = time.time() + else: + self.init = True + self.tic = time.time() + + +class CallBackModelCheckpoint(object): + def __init__(self, rank, output="./"): + self.rank: int = rank + self.output: str = output + + def __call__(self, global_step, backbone, partial_fc, backbone_onnx): + if global_step > 100 and self.rank is 0: + path_module = os.path.join(self.output, "backbone.pth") + path_onnx = os.path.join(self.output, "backbone.onnx") + torch.save(backbone.module.state_dict(), path_module) + logging.info("Pytorch Model Saved in '{}'".format(path_module)) + convert_onnx(backbone_onnx, path_module, path_onnx) + logging.info("Onnx Model Saved in '{}'".format(path_onnx)) + + if global_step > 100 and partial_fc is not None: + partial_fc.save_params() diff --git a/insightface/recognition/vpl/utils/utils_config.py b/insightface/recognition/vpl/utils/utils_config.py new file mode 100644 index 0000000000000000000000000000000000000000..372ed921f127d7182190e62d141cf72f19089f55 --- /dev/null +++ b/insightface/recognition/vpl/utils/utils_config.py @@ -0,0 +1,16 @@ +import importlib +import os +import os.path as osp + +def get_config(config_file): + assert config_file.startswith('configs/'), 'config file 
setting must start with configs/' + temp_config_name = osp.basename(config_file) + temp_module_name = osp.splitext(temp_config_name)[0] + config = importlib.import_module("configs.base") + cfg = config.config + config = importlib.import_module("configs.%s"%temp_module_name) + job_cfg = config.config + cfg.update(job_cfg) + if cfg.output is None: + cfg.output = osp.join('work_dirs', temp_module_name) + return cfg diff --git a/insightface/recognition/vpl/utils/utils_dist.py b/insightface/recognition/vpl/utils/utils_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..c191f288f7b272a2b94a63b13a82aa2dbe7ba39e --- /dev/null +++ b/insightface/recognition/vpl/utils/utils_dist.py @@ -0,0 +1,57 @@ +import torch + +@torch.no_grad() +def concat_all_gather(tensor): + """ + Performs all_gather operation on the provided tensors. + *** Warning ***: torch.distributed.all_gather has no gradient. + """ + tensors_gather = [torch.ones_like(tensor) + for _ in range(torch.distributed.get_world_size())] + torch.distributed.all_gather(tensors_gather, tensor, async_op=False) + + output = torch.cat(tensors_gather, dim=0) + return output + +@torch.no_grad() +def batch_shuffle_ddp(x, rank, world_size): + """ + Batch shuffle, for making use of BatchNorm. + *** Only support DistributedDataParallel (DDP) model. *** + """ + # gather from all gpus + batch_size_this = x.shape[0] + x_gather = concat_all_gather(x) + batch_size_all = x_gather.shape[0] + + + # random shuffle index + idx_shuffle = torch.randperm(batch_size_all).cuda() + + # broadcast to all gpus + torch.distributed.broadcast(idx_shuffle, src=0) + + # index for restoring + idx_unshuffle = torch.argsort(idx_shuffle) + + # shuffled index for this gpu + idx_this = idx_shuffle.view(world_size, -1)[rank] + + return x_gather[idx_this], idx_unshuffle + +@torch.no_grad() +def batch_unshuffle_ddp(x, idx_unshuffle, rank, world_size): + """ + Undo batch shuffle. + *** Only support DistributedDataParallel (DDP) model. 
*** + """ + # gather from all gpus + batch_size_this = x.shape[0] + x_gather = concat_all_gather(x) + batch_size_all = x_gather.shape[0] + + + # restored index for this gpu + idx_this = idx_unshuffle.view(world_size, -1)[rank] + + return x_gather[idx_this] diff --git a/insightface/recognition/vpl/utils/utils_logging.py b/insightface/recognition/vpl/utils/utils_logging.py new file mode 100644 index 0000000000000000000000000000000000000000..7d58012ead1b0245abc8e778c0f47b036a1250c1 --- /dev/null +++ b/insightface/recognition/vpl/utils/utils_logging.py @@ -0,0 +1,40 @@ +import logging +import os +import sys + + +class AverageMeter(object): + """Computes and stores the average and current value + """ + + def __init__(self): + self.val = None + self.avg = None + self.sum = None + self.count = None + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def init_logging(log_root, rank, models_root): + if rank is 0: + log_root.setLevel(logging.INFO) + formatter = logging.Formatter("Training: %(asctime)s-%(message)s") + handler_file = logging.FileHandler(os.path.join(models_root, "training.log")) + handler_stream = logging.StreamHandler(sys.stdout) + handler_file.setFormatter(formatter) + handler_stream.setFormatter(formatter) + log_root.addHandler(handler_file) + log_root.addHandler(handler_stream) + log_root.info('rank_id: %d' % rank) diff --git a/insightface/recognition/vpl/utils/utils_os.py b/insightface/recognition/vpl/utils/utils_os.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/recognition/vpl/vpl.py b/insightface/recognition/vpl/vpl.py new file mode 100644 index 0000000000000000000000000000000000000000..039fa76ccf8b768f3b69385849e584d32a74856e --- /dev/null +++ b/insightface/recognition/vpl/vpl.py @@ -0,0 
+1,186 @@ +import logging +import os + +import torch +import torch.distributed as dist +from torch.nn import Module +from torch.nn.functional import normalize, linear +from torch.nn.parameter import Parameter + + +class VPL(Module): + """ + Modified from Partial-FC + """ + + @torch.no_grad() + def __init__(self, rank, local_rank, world_size, batch_size, resume, + margin_softmax, num_classes, sample_rate=1.0, embedding_size=512, prefix="./", cfg=None): + super(VPL, self).__init__() + # + assert sample_rate==1.0 + assert not resume + self.num_classes: int = num_classes + self.rank: int = rank + self.local_rank: int = local_rank + self.device: torch.device = torch.device("cuda:{}".format(self.local_rank)) + self.world_size: int = world_size + self.batch_size: int = batch_size + self.margin_softmax: callable = margin_softmax + self.sample_rate: float = sample_rate + self.embedding_size: int = embedding_size + self.prefix: str = prefix + self.num_local: int = num_classes // world_size + int(rank < num_classes % world_size) + self.class_start: int = num_classes // world_size * rank + min(rank, num_classes % world_size) + self.num_sample: int = int(self.sample_rate * self.num_local) + + self.weight_name = os.path.join(self.prefix, "rank_{}_softmax_weight.pt".format(self.rank)) + self.weight_mom_name = os.path.join(self.prefix, "rank_{}_softmax_weight_mom.pt".format(self.rank)) + + self.weight = torch.normal(0, 0.01, (self.num_local, self.embedding_size), device=self.device) + self.weight_mom: torch.Tensor = torch.zeros_like(self.weight) + logging.info("softmax weight init successfully!") + logging.info("softmax weight mom init successfully!") + self.stream: torch.cuda.Stream = torch.cuda.Stream(local_rank) + + self.index = None + self.update = lambda: 0 + self.sub_weight = Parameter(self.weight) + self.sub_weight_mom = self.weight_mom + + #vpl variables + + self._iters = 0 + self.cfg = cfg + self.vpl_mode = -1 + if self.cfg is not None: + self.vpl_mode = self.cfg['mode'] 
+ if self.vpl_mode>=0: + self.register_buffer("queue", torch.randn(self.num_local, self.embedding_size, device=self.device)) + self.queue = normalize(self.queue) + self.register_buffer("queue_iters", torch.zeros((self.num_local,), dtype=torch.long, device=self.device)) + self.register_buffer("queue_lambda", torch.zeros((self.num_local,), dtype=torch.float32, device=self.device)) + + + def save_params(self): + pass + #torch.save(self.weight.data, self.weight_name) + #torch.save(self.weight_mom, self.weight_mom_name) + + @torch.no_grad() + def sample(self, total_label): + index_positive = (self.class_start <= total_label) & (total_label < self.class_start + self.num_local) + total_label[~index_positive] = -1 + total_label[index_positive] -= self.class_start + return index_positive + + def forward(self, total_features, norm_weight): + torch.cuda.current_stream().wait_stream(self.stream) + logits = linear(total_features, norm_weight) + return logits + + @torch.no_grad() + def update(self): + self.weight_mom[self.index] = self.sub_weight_mom + self.weight[self.index] = self.sub_weight + + def prepare(self, label, optimizer): + with torch.cuda.stream(self.stream): + total_label = torch.zeros( + size=[self.batch_size * self.world_size], device=self.device, dtype=torch.long) + dist.all_gather(list(total_label.chunk(self.world_size, dim=0)), label) + index_positive = self.sample(total_label) + optimizer.state.pop(optimizer.param_groups[-1]['params'][0], None) + optimizer.param_groups[-1]['params'][0] = self.sub_weight + optimizer.state[self.sub_weight]['momentum_buffer'] = self.sub_weight_mom + norm_weight = normalize(self.sub_weight) + return total_label, norm_weight, index_positive + + @torch.no_grad() + def prepare_queue_lambda(self, label, iters): + self.queue_lambda[:] = 0.0 + if iters>self.cfg['start_iters']: + allowed_delta = self.cfg['allowed_delta'] + if self.vpl_mode==0: + past_iters = iters - self.queue_iters + idx = torch.where(past_iters <= allowed_delta)[0] + 
self.queue_lambda[idx] = self.cfg['lambda'] + + if iters % 2000 == 0 and self.rank == 0: + logging.info('[%d]use-lambda: %d/%d'%(iters,len(idx), self.num_local)) + + @torch.no_grad() + def set_queue(self, total_features, total_label, index_positive, iters): + local_label = total_label[index_positive] + sel_features = normalize(total_features[index_positive,:]) + self.queue[local_label,:] = sel_features + self.queue_iters[local_label] = iters + + def forward_backward(self, label, features, optimizer, feature_w): + self._iters += 1 + total_label, norm_weight, index_positive = self.prepare(label, optimizer) + total_features = torch.zeros( + size=[self.batch_size * self.world_size, self.embedding_size], device=self.device) + dist.all_gather(list(total_features.chunk(self.world_size, dim=0)), features.data) + total_features.requires_grad = True + + if feature_w is not None: + total_feature_w = torch.zeros( + size=[self.batch_size * self.world_size, self.embedding_size], device=self.device) + dist.all_gather(list(total_feature_w.chunk(self.world_size, dim=0)), feature_w.data) + + if self.vpl_mode>=0: + self.prepare_queue_lambda(total_label, self._iters) + _lambda = self.queue_lambda.view(self.num_local, 1) + injected_weight = norm_weight*(1.0-_lambda) + self.queue*_lambda + injected_norm_weight = normalize(injected_weight) + logits = self.forward(total_features, injected_norm_weight) + else: + logits = self.forward(total_features, norm_weight) + + logits = self.margin_softmax(logits, total_label) + + with torch.no_grad(): + max_fc = torch.max(logits, dim=1, keepdim=True)[0] + dist.all_reduce(max_fc, dist.ReduceOp.MAX) + + # calculate exp(logits) and all-reduce + logits_exp = torch.exp(logits - max_fc) + logits_sum_exp = logits_exp.sum(dim=1, keepdims=True) + dist.all_reduce(logits_sum_exp, dist.ReduceOp.SUM) + + # calculate prob + logits_exp.div_(logits_sum_exp) + + # get one-hot + grad = logits_exp + index = torch.where(total_label != -1)[0] + one_hot = 
torch.zeros(size=[index.size()[0], grad.size()[1]], device=grad.device) + one_hot.scatter_(1, total_label[index, None], 1) + + # calculate loss + loss = torch.zeros(grad.size()[0], 1, device=grad.device) + loss[index] = grad[index].gather(1, total_label[index, None]) + dist.all_reduce(loss, dist.ReduceOp.SUM) + loss_v = loss.clamp_min_(1e-30).log_().mean() * (-1) + + # calculate grad + grad[index] -= one_hot + grad.div_(self.batch_size * self.world_size) + + logits.backward(grad) + if total_features.grad is not None: + total_features.grad.detach_() + x_grad: torch.Tensor = torch.zeros_like(features, requires_grad=True) + # feature gradient all-reduce + dist.reduce_scatter(x_grad, list(total_features.grad.chunk(self.world_size, dim=0))) + x_grad = x_grad * self.world_size + #vpl set queue + if self.vpl_mode>=0: + if feature_w is None: + self.set_queue(total_features.detach(), total_label, index_positive, self._iters) + else: + self.set_queue(total_feature_w, total_label, index_positive, self._iters) + # backward backbone + return x_grad, loss_v + diff --git a/insightface/reconstruction/PBIDR/INSTALL.md b/insightface/reconstruction/PBIDR/INSTALL.md new file mode 100644 index 0000000000000000000000000000000000000000..9db65aa6e102fc3b94c1ba02b466edfbf1a67bdd --- /dev/null +++ b/insightface/reconstruction/PBIDR/INSTALL.md @@ -0,0 +1,27 @@ +## MANUAL INSTALL + +```bash +conda create -n pbidr python=3.7 +pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html +conda install -c conda-forge scikit-image shapely rtree pyembree +pip install trimesh[all] +``` + +#### Install Pytorch3D + +```bash +wget https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz +tar xzf 1.10.0.tar.gz +export CUB_HOME=$PWD/cub-1.10.0 +wget https://github.com/facebookresearch/pytorch3d/archive/refs/tags/v0.4.0.tar.gz +tar xzf v0.4.0.tar.gz +export TORCH_CUDA_ARCH_LIST="7.5" +cd pytorch3d-0.4.0 && pip install -e . 
+```
+
+#### Install Other Requirements
+
+```bash
+pip install -r requirements.txt
+```
+
diff --git a/insightface/reconstruction/PBIDR/README.md b/insightface/reconstruction/PBIDR/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..03610f154f1c004406e93a168ea4cc206565e0c9
--- /dev/null
+++ b/insightface/reconstruction/PBIDR/README.md
@@ -0,0 +1,75 @@
+# Facial Geometric Detail Recovery via Implicit Representation
+
+:herb: **[Facial Geometric Detail Recovery via Implicit Representation](https://arxiv.org/abs/2203.09692)**
+
+Xingyu Ren, Alexandros Lattas, Baris Gecer, Jiankang Deng, Chao Ma, Xiaokang Yang, and Stefanos Zafeiriou.
+
+*arXiv Preprint 2022*
+
+## Introduction
+
+![overview](https://github.com/deepinsight/insightface/blob/master/reconstruction/PBIDR/figures/overview.jpg)
+
+This paper introduces a single facial image geometric detail recovery algorithm. The method generates complete high-fidelity texture maps from occluded facial images, and employs implicit renderer and shape functions to derive fine geometric details by decoupled specular normals. As a bonus, it disentangles the facial texture into approximate diffuse albedo, diffuse and specular shading in a self-supervision manner.
+
+## Installation
+
+Please refer to the installation and usage of [IDR](https://github.com/lioryariv/idr).
+
+The code is compatible with python 3.7 and pytorch 1.7.1. In addition, the following packages are required:
+numpy, scikit-image, trimesh (with pyembree), opencv, torchvision, pytorch3d 0.4.0.
+
+You can see [INSTALL.md](INSTALL.md) for manual installation.
+
+## Tutorial
+
+### Data Preprocessing
+
+ We have provided several textured meshes from [Google Drive](https://drive.google.com/file/d/1R7MdWawdMSjQUOnciJ5mb1pcwoY61Tzc/view?usp=sharing) and [Baidu Drive](https://pan.baidu.com/s/16mAqB_7mlbW2--0__patWA) (password: wp47). Otherwise, please refer to [OSTeC](https://github.com/barisgecer/OSTeC) to make a textured mesh first.
+ +Please download raw textured meshes and run: + + ```shell +cd ./code +bash script/data_process.sh + ``` + + You can synthesize the auxiliary image sets for the next implicit details recovery. + +### Train & Eval + +You can start the training phase with the following script. + + ```shell +cd ./code +bash script/fast_train.sh + ``` + + We also provide a script for eval: + + ```shell +cd ./code +bash script/fast_eval.sh + ``` + +## Citation + + If any parts of our paper and codes are helpful to your work, please generously citing: + + ``` +@misc{ren2022facial, + title={Facial Geometric Detail Recovery via Implicit Representation}, + author={Xingyu Ren and Alexandros Lattas and Baris Gecer and Jiankang Deng and Chao Ma and Xiaokang Yang and Stefanos Zafeiriou}, + year={2022}, + eprint={2203.09692}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + ``` + +## Reference + + We refer to the following repositories when implementing our whole pipeline. Thanks for their great work. + + - [barisgecer/OSTeC](https://github.com/barisgecer/OSTeC) + - [lioryariv/idr](https://github.com/lioryariv/idr) diff --git a/insightface/reconstruction/PBIDR/code/confs/test.conf b/insightface/reconstruction/PBIDR/code/confs/test.conf new file mode 100644 index 0000000000000000000000000000000000000000..8851fb43a5ffdcbf3dfc4cd6c9cae7e2e3d2c4bf --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/confs/test.conf @@ -0,0 +1,72 @@ +train{ + expname = test + dataset_class = datasets.dataset.IFDataset + model_class = model.renderer.IFNetwork + loss_class = model.loss.IFLoss + learning_rate = 1.0e-4 + num_pixels = 2048 + plot_freq = 100 + alpha_milestones = [250, 500, 750, 1000, 1250] + alpha_factor = 2 + sched_milestones = [1000,1500] + sched_factor = 0.5 +} +plot{ + plot_nimgs = 1 + max_depth = 3.0 + resolution = 100 +} +loss{ + eikonal_weight = 0.1 + mask_weight = 100.0 + reg_weight = 5.0 + normal_weight = 1.0 + alpha = 50.0 +} +dataset{ + data_dir = Test + img_res = [1024, 1024] + 
scan_id = 0 +} +model{ + feature_vector_size = 256 + implicit_network + { + d_in = 3 + d_out = 1 + dims = [512, 512, 512, 512, 512, 512, 512, 512] + geometric_init = True + bias = 0.6 + skip_in = [4] + weight_norm = True + multires = 6 + } + diffuse_network + { + dims = [128] + weight_norm = True + multires_view = 6 + } + specular_network + { + dims = [128] + weight_norm = True + multires_view = 4 + } + albedo_network + { + dims = [256, 256, 256, 256] + weight_norm = True + multires_view = 4 + } + ray_tracer + { + object_bounding_sphere = 1.0 + sdf_threshold = 5.0e-5 + line_search_step = 0.5 + line_step_iters = 3 + sphere_tracing_iters = 10 + n_steps = 100 + n_secant_steps = 8 + } +} \ No newline at end of file diff --git a/insightface/reconstruction/PBIDR/code/datasets/dataset.py b/insightface/reconstruction/PBIDR/code/datasets/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ae12367bd0a9ca88d9e7422a2f0ed4e13a0c8104 --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/datasets/dataset.py @@ -0,0 +1,154 @@ +import os +import torch +import numpy as np + +import utils.general as utils +from utils import rend_util + +class IFDataset(torch.utils.data.Dataset): + """Dataset for a class of objects, where each datapoint is a SceneInstanceDataset.""" + + def __init__(self, + train_cameras, + data_dir, + img_res, + scan_id=0, + cam_file=None + ): + + self.instance_dir = os.path.join('../data', data_dir, 'scan{0}'.format(scan_id)) + + self.total_pixels = img_res[0] * img_res[1] + self.img_res = img_res + + assert os.path.exists(self.instance_dir), "Data directory is empty" + + self.sampling_idx = None + self.train_cameras = train_cameras + + image_dir = '{0}/image'.format(self.instance_dir) + image_paths = sorted(utils.glob_imgs(image_dir)) + mask_dir = '{0}/mask'.format(self.instance_dir) + mask_paths = sorted(utils.glob_imgs(mask_dir)) + + self.n_images = len(image_paths) + + self.cam_file = '{0}/cameras.npz'.format(self.instance_dir) + 
if cam_file is not None: + self.cam_file = '{0}/{1}'.format(self.instance_dir, cam_file) + + camera_dict = np.load(self.cam_file) + scale_mats = [camera_dict['scale_mat_%d' % idx].astype(np.float32) for idx in range(self.n_images)] + world_mats = [camera_dict['world_mat_%d' % idx].astype(np.float32) for idx in range(self.n_images)] + + + self.intrinsics_all = [] + self.pose_all = [] + for scale_mat, world_mat in zip(scale_mats, world_mats): + P = world_mat @ scale_mat + P = P[:3, :4] + intrinsics, pose = rend_util.load_K_Rt_from_P(None, P) + self.intrinsics_all.append(torch.from_numpy(intrinsics).float()) + self.pose_all.append(torch.from_numpy(pose).float()) + + self.rgb_images = [] + for path in image_paths: + rgb = rend_util.load_rgb(path) + rgb = rgb.reshape(3, -1).transpose(1, 0) + self.rgb_images.append(torch.from_numpy(rgb).float()) + + self.object_masks = [] + for path in mask_paths: + object_mask = rend_util.load_mask_white_bg(path) + object_mask = object_mask.reshape(-1) + self.object_masks.append(torch.from_numpy(object_mask).bool()) + + def __len__(self): + return self.n_images + + def __getitem__(self, idx): + uv = np.mgrid[0:self.img_res[0], 0:self.img_res[1]].astype(np.int32) + uv = torch.from_numpy(np.flip(uv, axis=0).copy()).float() + uv = uv.reshape(2, -1).transpose(1, 0) + + sample = { + "object_mask": self.object_masks[idx], + "uv": uv, + "intrinsics": self.intrinsics_all[idx], + } + + ground_truth = { + "rgb": self.rgb_images[idx] + } + + if self.sampling_idx is not None: + ground_truth["rgb"] = self.rgb_images[idx][self.sampling_idx, :] + sample["object_mask"] = self.object_masks[idx][self.sampling_idx] + sample["uv"] = uv[self.sampling_idx, :] + + if not self.train_cameras: + sample["pose"] = self.pose_all[idx] + + return idx, sample, ground_truth + + def collate_fn(self, batch_list): + # get list of dictionaries and returns input, ground_true as dictionary for all batch instances + batch_list = zip(*batch_list) + + all_parsed = [] + for 
entry in batch_list: + if type(entry[0]) is dict: + # make them all into a new dict + ret = {} + for k in entry[0].keys(): + ret[k] = torch.stack([obj[k] for obj in entry]) + all_parsed.append(ret) + else: + all_parsed.append(torch.LongTensor(entry)) + + return tuple(all_parsed) + + def change_sampling_idx(self, sampling_size): + if sampling_size == -1: + self.sampling_idx = None + else: + self.sampling_idx = torch.randperm(self.total_pixels)[:sampling_size] + + def get_scale_mat(self): + return np.load(self.cam_file)['scale_mat_0'] + + def get_gt_pose(self, scaled=False): + # Load gt pose without normalization to unit sphere + camera_dict = np.load(self.cam_file) + world_mats = [camera_dict['world_mat_%d' % idx].astype(np.float32) for idx in range(self.n_images)] + scale_mats = [camera_dict['scale_mat_%d' % idx].astype(np.float32) for idx in range(self.n_images)] + + pose_all = [] + for scale_mat, world_mat in zip(scale_mats, world_mats): + P = world_mat + if scaled: + P = world_mat @ scale_mat + P = P[:3, :4] + _, pose = rend_util.load_K_Rt_from_P(None, P) + pose_all.append(torch.from_numpy(pose).float()) + + return torch.cat([p.float().unsqueeze(0) for p in pose_all], 0) + + def get_pose_init(self): + # get noisy initializations obtained with the linear method + cam_file = '{0}/cameras_linear_init.npz'.format(self.instance_dir) + camera_dict = np.load(cam_file) + scale_mats = [camera_dict['scale_mat_%d' % idx].astype(np.float32) for idx in range(self.n_images)] + world_mats = [camera_dict['world_mat_%d' % idx].astype(np.float32) for idx in range(self.n_images)] + + init_pose = [] + for scale_mat, world_mat in zip(scale_mats, world_mats): + P = world_mat @ scale_mat + P = P[:3, :4] + _, pose = rend_util.load_K_Rt_from_P(None, P) + init_pose.append(pose) + init_pose = torch.cat([torch.Tensor(pose).float().unsqueeze(0) for pose in init_pose], 0).cuda() + init_quat = rend_util.rot_to_quat(init_pose[:, :3, :3]) + init_quat = torch.cat([init_quat, init_pose[:, :3, 
3]], 1) + + return init_quat diff --git a/insightface/reconstruction/PBIDR/code/evaluation/eval.py b/insightface/reconstruction/PBIDR/code/evaluation/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..efb1b6ae374cb7b490d2f50e528795d8616c0051 --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/evaluation/eval.py @@ -0,0 +1,211 @@ +import sys +sys.path.append('../code') +import argparse +import GPUtil +import os +from pyhocon import ConfigFactory +import torch +import numpy as np +import cvxpy as cp +from PIL import Image +import math + +import utils.general as utils +import utils.plots as plt +from utils import rend_util + +def evaluate(**kwargs): + torch.set_default_dtype(torch.float32) + + conf = ConfigFactory.parse_file(kwargs['conf']) + exps_folder_name = kwargs['exps_folder_name'] + evals_folder_name = kwargs['evals_folder_name'] + eval_rendering = kwargs['eval_rendering'] + eval_animation = kwargs['eval_animation'] + + expname = conf.get_string('train.expname') + kwargs['expname'] + scan_id = kwargs['scan_id'] if kwargs['scan_id'] != -1 else conf.get_int('dataset.scan_id', default=-1) + if scan_id != -1: + expname = expname + '_{0}'.format(scan_id) + + if kwargs['timestamp'] == 'latest': + if os.path.exists(os.path.join('../', kwargs['exps_folder_name'], expname)): + timestamps = os.listdir(os.path.join('../', kwargs['exps_folder_name'], expname)) + if (len(timestamps)) == 0: + print('WRONG EXP FOLDER') + exit() + else: + timestamp = sorted(timestamps)[-1] + else: + print('WRONG EXP FOLDER') + exit() + else: + timestamp = kwargs['timestamp'] + + utils.mkdir_ifnotexists(os.path.join('../', evals_folder_name)) + expdir = os.path.join('../', exps_folder_name, expname) + evaldir = os.path.join('../', evals_folder_name, expname) + utils.mkdir_ifnotexists(evaldir) + + dataset_conf = conf.get_config('dataset') + model = utils.get_class(conf.get_string('train.model_class'))(conf=conf.get_config('model'),\ + id=scan_id, 
datadir=dataset_conf['data_dir']) + if torch.cuda.is_available(): + model.cuda() + + + if kwargs['scan_id'] != -1: + dataset_conf['scan_id'] = kwargs['scan_id'] + eval_dataset = utils.get_class(conf.get_string('train.dataset_class'))(False, **dataset_conf) + + if eval_rendering: + eval_dataloader = torch.utils.data.DataLoader(eval_dataset, + batch_size=1, + shuffle=False, + collate_fn=eval_dataset.collate_fn + ) + total_pixels = eval_dataset.total_pixels + img_res = eval_dataset.img_res + + old_checkpnts_dir = os.path.join(expdir, timestamp, 'checkpoints') + + saved_model_state = torch.load(os.path.join(old_checkpnts_dir, 'ModelParameters', str(kwargs['checkpoint']) + ".pth")) + model.load_state_dict(saved_model_state["model_state_dict"]) + epoch = saved_model_state['epoch'] + + #################################################################################################################### + print("evaluating...") + + model.eval() + + detail_3dmm, detail_3dmm_subdivision_full = plt.get_displacement_mesh(model) + detail_3dmm.export('{0}/Detailed_3dmm_{1}.obj'.format(evaldir, epoch), 'obj') + detail_3dmm_subdivision_full.export('{0}/Subdivide_full_{1}.obj'.format(evaldir, epoch), 'obj') + + if eval_animation: + sdf_np0, sdf_np1 = plt.get_displacement_animation(model) + np.save('{0}/Cropped_Detailed_sdf_{1}.npy'.format(evaldir, epoch), sdf_np0) + np.save('{0}/Cropped_Subdivide_full_{1}.npy'.format(evaldir, epoch), sdf_np1) + + if eval_rendering: + images_dir = '{0}/rendering'.format(evaldir) + utils.mkdir_ifnotexists(images_dir) + + psnrs = [] + for data_index, (indices, model_input, ground_truth) in enumerate(eval_dataloader): + model_input["intrinsics"] = model_input["intrinsics"].cuda() + model_input["uv"] = model_input["uv"].cuda() + model_input["object_mask"] = model_input["object_mask"].cuda() + model_input['pose'] = model_input['pose'].cuda() + + split = utils.split_input(model_input, total_pixels) + res = [] + for s in split: + out = model(s) + 
res.append({ + 'rgb_values': out['rgb_values'].detach(), + 'diffuse_values': out['diffuse_values'].detach(), + 'specular_values': out['specular_values'].detach(), + 'albedo_values': out['albedo_values'].detach(), + }) + + batch_size = ground_truth['rgb'].shape[0] + model_outputs = utils.merge_output(res, total_pixels, batch_size) + rgb_eval = model_outputs['rgb_values'] + rgb_eval = rgb_eval.reshape(batch_size, total_pixels, 3) + rgb_eval = (rgb_eval + 1.) / 2. + rgb_eval = plt.lin2img(rgb_eval, img_res).detach().cpu().numpy()[0] + rgb_eval = rgb_eval.transpose(1, 2, 0) + img = Image.fromarray((rgb_eval * 255).astype(np.uint8)) + img.save('{0}/eval_{1}.png'.format(images_dir,'%03d' % indices[0])) + + diffuse_eval = model_outputs['diffuse_values'] + diffuse_eval = diffuse_eval.reshape(batch_size, total_pixels, 3) + diffuse_eval = (diffuse_eval + 1.) / 2. + diffuse_eval = plt.lin2img(diffuse_eval, img_res).detach().cpu().numpy()[0] + diffuse_eval = diffuse_eval.transpose(1, 2, 0) + img = Image.fromarray((diffuse_eval * 255).astype(np.uint8)) + img.save('{0}/eval_{1}_diffuse.png'.format(images_dir, '%03d' % indices[0])) + + specular_eval = model_outputs['specular_values'] + specular_eval = specular_eval.reshape(batch_size, total_pixels, 3) + specular_eval = (specular_eval + 1.) / 2. + specular_eval = plt.lin2img(specular_eval, img_res).detach().cpu().numpy()[0] + specular_eval = specular_eval.transpose(1, 2, 0) + img = Image.fromarray((specular_eval * 255).astype(np.uint8)) + img.save('{0}/eval_{1}_specular.png'.format(images_dir, '%03d' % indices[0])) + + albedo_eval = model_outputs['albedo_values'] + albedo_eval = albedo_eval.reshape(batch_size, total_pixels, 3) + albedo_eval = (albedo_eval + 1.) / 2. 
+ albedo_eval = plt.lin2img(albedo_eval, img_res).detach().cpu().numpy()[0] + albedo_eval = albedo_eval.transpose(1, 2, 0) + img = Image.fromarray((albedo_eval * 255).astype(np.uint8)) + img.save('{0}/eval_{1}_albedo.png'.format(images_dir, '%03d' % indices[0])) + + rgb_gt = ground_truth['rgb'] + rgb_gt = (rgb_gt + 1.) / 2. + rgb_gt = plt.lin2img(rgb_gt, img_res).numpy()[0] + rgb_gt = rgb_gt.transpose(1, 2, 0) + + mask = model_input['object_mask'] + mask = plt.lin2img(mask.unsqueeze(-1), img_res).cpu().numpy()[0] + mask = mask.transpose(1, 2, 0) + + rgb_eval_masked = rgb_eval * mask + rgb_gt_masked = rgb_gt * mask + + psnr = calculate_psnr(rgb_eval_masked, rgb_gt_masked, mask) + psnrs.append(psnr) + + psnrs = np.array(psnrs).astype(np.float64) + print("RENDERING EVALUATION {2}: psnr mean = {0} ; psnr std = {1}".format("%.2f" % psnrs.mean(), "%.2f" % psnrs.std(), scan_id)) + + + +def calculate_psnr(img1, img2, mask): + # img1 and img2 have range [0, 1] + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) + mse = np.mean((img1 - img2)**2) * (img2.shape[0] * img2.shape[1]) / mask.sum() + if mse == 0: + return float('inf') + return 20 * math.log10(1.0 / math.sqrt(mse)) + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--conf', type=str, default='./confs/test.conf') + parser.add_argument('--expname', type=str, default='', help='The experiment name to be evaluated.') + parser.add_argument('--exps_folder', type=str, default='exps', help='The experiments folder name.') + parser.add_argument('--gpu', type=str, default='auto', help='GPU to use [default: GPU auto]') + parser.add_argument('--timestamp', default='latest', type=str, help='The experiemnt timestamp to test.') + parser.add_argument('--checkpoint', default='latest',type=str,help='The trained model checkpoint to test') + parser.add_argument('--scan_id', type=int, default=0, help='If set, taken to be the scan id.') + parser.add_argument('--resolution', 
default=512, type=int, help='Grid resolution for marching cube') + parser.add_argument('--is_uniform_grid', default=False, action="store_true", help='If set, evaluate marching cube with uniform grid.') + parser.add_argument('--eval_rendering', default=False, action="store_true",help='If set, evaluate rendering quality.') + parser.add_argument('--eval_animation', default=False, action="store_true",help='If set, evaluate rendering quality.') + + opt = parser.parse_args() + + if opt.gpu == "auto": + deviceIDs = GPUtil.getAvailable(order='memory', limit=1, maxLoad=0.5, maxMemory=0.5, includeNan=False, excludeID=[], excludeUUID=[]) + gpu = deviceIDs[0] + else: + gpu = opt.gpu + + if (not gpu == 'ignore'): + os.environ["CUDA_VISIBLE_DEVICES"] = '{0}'.format(gpu) + + evaluate(conf=opt.conf, + expname=opt.expname, + exps_folder_name=opt.exps_folder, + evals_folder_name='evals', + timestamp=opt.timestamp, + checkpoint=opt.checkpoint, + scan_id=opt.scan_id, + resolution=opt.resolution, + eval_rendering=opt.eval_rendering, + eval_animation=opt.eval_animation + ) diff --git a/insightface/reconstruction/PBIDR/code/model/embedder.py b/insightface/reconstruction/PBIDR/code/model/embedder.py new file mode 100644 index 0000000000000000000000000000000000000000..02f2c083560667833cd6d0c3c86cf4a109c69a66 --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/model/embedder.py @@ -0,0 +1,50 @@ +import torch + +""" Positional encoding embedding. Code was taken from https://github.com/bmild/nerf. """ + +class Embedder: + def __init__(self, **kwargs): + self.kwargs = kwargs + self.create_embedding_fn() + + def create_embedding_fn(self): + embed_fns = [] + d = self.kwargs['input_dims'] + out_dim = 0 + if self.kwargs['include_input']: + embed_fns.append(lambda x: x) + out_dim += d + + max_freq = self.kwargs['max_freq_log2'] + N_freqs = self.kwargs['num_freqs'] + + if self.kwargs['log_sampling']: + freq_bands = 2. 
** torch.linspace(0., max_freq, N_freqs) + else: + freq_bands = torch.linspace(2.**0., 2.**max_freq, N_freqs) + + for freq in freq_bands: + for p_fn in self.kwargs['periodic_fns']: + embed_fns.append(lambda x, p_fn=p_fn, + freq=freq: p_fn(x * freq)) + out_dim += d + + self.embed_fns = embed_fns + self.out_dim = out_dim + + def embed(self, inputs): + return torch.cat([fn(inputs) for fn in self.embed_fns], -1) + +def get_embedder(multires): + embed_kwargs = { + 'include_input': True, + 'input_dims': 3, + 'max_freq_log2': multires-1, + 'num_freqs': multires, + 'log_sampling': True, + 'periodic_fns': [torch.sin, torch.cos], + } + + embedder_obj = Embedder(**embed_kwargs) + def embed(x, eo=embedder_obj): return eo.embed(x) + return embed, embedder_obj.out_dim diff --git a/insightface/reconstruction/PBIDR/code/model/loss.py b/insightface/reconstruction/PBIDR/code/model/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9843ad350fd86182f08c51896d670b55bf6a760d --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/model/loss.py @@ -0,0 +1,69 @@ +import torch +from torch import nn +from torch.nn import functional as F + +class IFLoss(nn.Module): + def __init__(self, eikonal_weight, mask_weight, reg_weight, normal_weight, alpha): + super().__init__() + self.eikonal_weight = eikonal_weight + self.mask_weight = mask_weight + self.reg_weight = reg_weight + self.normal_weight = normal_weight + self.alpha = alpha + self.l1_loss = nn.L1Loss(reduction='sum') + self.l2_loss = nn.MSELoss(reduction='sum') + self.cosine = nn.CosineSimilarity() + + def get_rgb_loss(self,rgb_values, rgb_gt, network_object_mask, object_mask): + if (network_object_mask & object_mask).sum() == 0: + return torch.tensor(0.0).cuda().float() + + rgb_values = rgb_values[network_object_mask & object_mask] + rgb_gt = rgb_gt.reshape(-1, 3)[network_object_mask & object_mask] + rgb_loss = self.l1_loss(rgb_values, rgb_gt) / float(object_mask.shape[0]) + return rgb_loss + + def 
get_eikonal_loss(self, grad_theta): + if grad_theta.shape[0] == 0: + return torch.tensor(0.0).cuda().float() + + eikonal_loss = ((grad_theta.norm(2, dim=1) - 1) ** 2).mean() + return eikonal_loss + + def get_mask_loss(self, sdf_output, network_object_mask, object_mask): + mask = ~(network_object_mask & object_mask) + if mask.sum() == 0: + return torch.tensor(0.0).cuda().float() + sdf_pred = -self.alpha * sdf_output[mask] + gt = object_mask[mask].float() + mask_loss = (1 / self.alpha) * F.binary_cross_entropy_with_logits(sdf_pred.squeeze(), gt, reduction='sum') / float(object_mask.shape[0]) + return mask_loss + + def get_reg_loss(self, point_gt, point_pre): + loss = self.l2_loss(point_gt, point_pre) / len(point_pre) + return loss + + def forward(self, model_outputs, ground_truth): + rgb_gt = ground_truth['rgb'].cuda() + network_object_mask = model_outputs['network_object_mask'] + object_mask = model_outputs['object_mask'] + + rgb_loss = self.get_rgb_loss(model_outputs['rgb_values'], rgb_gt, network_object_mask, object_mask) + mask_loss = self.get_mask_loss(model_outputs['sdf_output'], network_object_mask, object_mask) + eikonal_loss = self.get_eikonal_loss(model_outputs['grad_theta']) + reg_loss = self.get_reg_loss(model_outputs['points_mesh_ray_gt'], model_outputs['points_pre']) + normal_loss = 1 - torch.mean(self.cosine(model_outputs['points_mesh_ray_normals'], model_outputs['surface_normals'])) + loss = rgb_loss + \ + self.eikonal_weight * eikonal_loss + \ + self.mask_weight * mask_loss + \ + self.reg_weight * reg_loss + \ + self.normal_weight * normal_loss + + return { + 'loss': loss, + 'rgb_loss': rgb_loss, + 'eikonal_loss': eikonal_loss, + 'mask_loss': mask_loss, + 'reg_loss': reg_loss, + 'normal_loss': normal_loss, + } diff --git a/insightface/reconstruction/PBIDR/code/model/ray_tracing.py b/insightface/reconstruction/PBIDR/code/model/ray_tracing.py new file mode 100644 index 0000000000000000000000000000000000000000..2e9ea8352a2d7d7134d5b62833745d84a1d25fed 
--- /dev/null +++ b/insightface/reconstruction/PBIDR/code/model/ray_tracing.py @@ -0,0 +1,301 @@ +import torch +import torch.nn as nn +from utils import rend_util + +class RayTracing(nn.Module): + def __init__( + self, + object_bounding_sphere=1.0, + sdf_threshold=5.0e-5, + line_search_step=0.5, + line_step_iters=1, + sphere_tracing_iters=10, + n_steps=100, + n_secant_steps=8, + ): + super().__init__() + + self.object_bounding_sphere = object_bounding_sphere + self.sdf_threshold = sdf_threshold + self.sphere_tracing_iters = sphere_tracing_iters + self.line_step_iters = line_step_iters + self.line_search_step = line_search_step + self.n_steps = n_steps + self.n_secant_steps = n_secant_steps + + def forward(self, + sdf, + cam_loc, + object_mask, + ray_directions + ): + + batch_size, num_pixels, _ = ray_directions.shape + + sphere_intersections, mask_intersect = rend_util.get_sphere_intersection(cam_loc, ray_directions, r=self.object_bounding_sphere) + + curr_start_points, unfinished_mask_start, acc_start_dis, acc_end_dis, min_dis, max_dis = \ + self.sphere_tracing(batch_size, num_pixels, sdf, cam_loc, ray_directions, mask_intersect, sphere_intersections) + + network_object_mask = (acc_start_dis < acc_end_dis) + + # The non convergent rays should be handled by the sampler + sampler_mask = unfinished_mask_start + sampler_net_obj_mask = torch.zeros_like(sampler_mask).bool().cuda() + if sampler_mask.sum() > 0: + sampler_min_max = torch.zeros((batch_size, num_pixels, 2)).cuda() + sampler_min_max.reshape(-1, 2)[sampler_mask, 0] = acc_start_dis[sampler_mask] + sampler_min_max.reshape(-1, 2)[sampler_mask, 1] = acc_end_dis[sampler_mask] + + sampler_pts, sampler_net_obj_mask, sampler_dists = self.ray_sampler(sdf, + cam_loc, + object_mask, + ray_directions, + sampler_min_max, + sampler_mask + ) + + curr_start_points[sampler_mask] = sampler_pts[sampler_mask] + acc_start_dis[sampler_mask] = sampler_dists[sampler_mask] + network_object_mask[sampler_mask] = 
        # --- tail of RayTracing.forward(); its head precedes this chunk. ---
        # The line below completes an assignment started above (writes the
        # sampler's object mask back into network_object_mask).
            sampler_net_obj_mask[sampler_mask]

        print('----------------------------------------------------------------')
        print('RayTracing: object = {0}/{1}, secant on {2}/{3}.'
              .format(network_object_mask.sum(), len(network_object_mask), sampler_net_obj_mask.sum(), sampler_mask.sum()))
        print('----------------------------------------------------------------')

        # At eval time only the converged points/mask/distances are needed.
        if not self.training:
            return curr_start_points, \
                   network_object_mask, \
                   acc_start_dis

        ray_directions = ray_directions.reshape(-1, 3)
        mask_intersect = mask_intersect.reshape(-1)

        # in_mask:  pixels inside the GT mask the network missed;
        # out_mask: pixels outside the GT mask (both excluding sampler hits).
        in_mask = ~network_object_mask & object_mask & ~sampler_mask
        out_mask = ~object_mask & ~sampler_mask

        mask_left_out = (in_mask | out_mask) & ~mask_intersect
        if mask_left_out.sum() > 0:  # project the origin to the not intersect points on the sphere
            cam_left_out = cam_loc.unsqueeze(1).repeat(1, num_pixels, 1).reshape(-1, 3)[mask_left_out]
            rays_left_out = ray_directions[mask_left_out]
            acc_start_dis[mask_left_out] = -torch.bmm(rays_left_out.view(-1, 1, 3), cam_left_out.view(-1, 3, 1)).squeeze()
            curr_start_points[mask_left_out] = cam_left_out + acc_start_dis[mask_left_out].unsqueeze(1) * rays_left_out

        mask = (in_mask | out_mask) & mask_intersect

        if mask.sum() > 0:
            min_dis[network_object_mask & out_mask] = acc_start_dis[network_object_mask & out_mask]

            # For the remaining rays use the point of minimal SDF value.
            min_mask_points, min_mask_dist = self.minimal_sdf_points(num_pixels, sdf, cam_loc, ray_directions, mask, min_dis, max_dis)

            curr_start_points[mask] = min_mask_points
            acc_start_dis[mask] = min_mask_dist

        return curr_start_points, \
               network_object_mask, \
               acc_start_dis


    def sphere_tracing(self, batch_size, num_pixels, sdf, cam_loc, ray_directions, mask_intersect, sphere_intersections):
        ''' Run sphere tracing algorithm for max iterations from both sides of unit sphere intersection '''

        # Entry/exit points of each ray with the bounding sphere.
        # NOTE(review): all buffers are allocated with .cuda(); this module
        # assumes a CUDA device is available — confirm before CPU use.
        sphere_intersections_points = cam_loc.reshape(batch_size, 1, 1, 3) + sphere_intersections.unsqueeze(-1) * ray_directions.unsqueeze(2)

        unfinished_mask_start = mask_intersect.reshape(-1).clone()
        unfinished_mask_end = mask_intersect.reshape(-1).clone()

        # Initialize start current points
        curr_start_points = torch.zeros(batch_size * num_pixels, 3).cuda().float()
        curr_start_points[unfinished_mask_start] = sphere_intersections_points[:,:,0,:].reshape(-1,3)[unfinished_mask_start]
        acc_start_dis = torch.zeros(batch_size * num_pixels).cuda().float()
        acc_start_dis[unfinished_mask_start] = sphere_intersections.reshape(-1,2)[unfinished_mask_start,0]

        # Initialize end current points
        curr_end_points = torch.zeros(batch_size * num_pixels, 3).cuda().float()
        curr_end_points[unfinished_mask_end] = sphere_intersections_points[:,:,1,:].reshape(-1,3)[unfinished_mask_end]
        acc_end_dis = torch.zeros(batch_size * num_pixels).cuda().float()
        acc_end_dis[unfinished_mask_end] = sphere_intersections.reshape(-1,2)[unfinished_mask_end,1]

        # Initialize min and max depth
        min_dis = acc_start_dis.clone()
        max_dis = acc_end_dis.clone()

        # Iterate on the rays (from both sides) till finding a surface
        iters = 0

        next_sdf_start = torch.zeros_like(acc_start_dis).cuda()
        next_sdf_start[unfinished_mask_start] = sdf(curr_start_points[unfinished_mask_start])

        next_sdf_end = torch.zeros_like(acc_end_dis).cuda()
        next_sdf_end[unfinished_mask_end] = sdf(curr_end_points[unfinished_mask_end])

        while True:
            # Update sdf — values at/below the threshold are clamped to 0 so
            # those rays stop advancing.
            curr_sdf_start = torch.zeros_like(acc_start_dis).cuda()
            curr_sdf_start[unfinished_mask_start] = next_sdf_start[unfinished_mask_start]
            curr_sdf_start[curr_sdf_start <= self.sdf_threshold] = 0

            curr_sdf_end = torch.zeros_like(acc_end_dis).cuda()
            curr_sdf_end[unfinished_mask_end] = next_sdf_end[unfinished_mask_end]
            curr_sdf_end[curr_sdf_end <= self.sdf_threshold] = 0

            # Update masks
            unfinished_mask_start = unfinished_mask_start & (curr_sdf_start > self.sdf_threshold)
            unfinished_mask_end = unfinished_mask_end & (curr_sdf_end > self.sdf_threshold)

            if (unfinished_mask_start.sum() == 0 and unfinished_mask_end.sum() == 0) or iters == self.sphere_tracing_iters:
                break
            iters += 1

            # Make step
            # Update distance: advance the front by the SDF value, pull the
            # back in by the SDF value (classic two-sided sphere tracing).
            acc_start_dis = acc_start_dis + curr_sdf_start
            acc_end_dis = acc_end_dis - curr_sdf_end

            # Update points
            curr_start_points = (cam_loc.unsqueeze(1) + acc_start_dis.reshape(batch_size, num_pixels, 1) * ray_directions).reshape(-1, 3)
            curr_end_points = (cam_loc.unsqueeze(1) + acc_end_dis.reshape(batch_size, num_pixels, 1) * ray_directions).reshape(-1, 3)

            # Fix points which wrongly crossed the surface
            next_sdf_start = torch.zeros_like(acc_start_dis).cuda()
            next_sdf_start[unfinished_mask_start] = sdf(curr_start_points[unfinished_mask_start])

            next_sdf_end = torch.zeros_like(acc_end_dis).cuda()
            next_sdf_end[unfinished_mask_end] = sdf(curr_end_points[unfinished_mask_end])

            not_projected_start = next_sdf_start < 0
            not_projected_end = next_sdf_end < 0
            not_proj_iters = 0
            # Line search: back off overshooting rays by a geometrically
            # shrinking fraction of the last step.
            while (not_projected_start.sum() > 0 or not_projected_end.sum() > 0) and not_proj_iters < self.line_step_iters:
                # Step backwards
                acc_start_dis[not_projected_start] -= ((1 - self.line_search_step) / (2 ** not_proj_iters)) * curr_sdf_start[not_projected_start]
                curr_start_points[not_projected_start] = (cam_loc.unsqueeze(1) + acc_start_dis.reshape(batch_size, num_pixels, 1) * ray_directions).reshape(-1, 3)[not_projected_start]

                acc_end_dis[not_projected_end] += ((1 - self.line_search_step) / (2 ** not_proj_iters)) * curr_sdf_end[not_projected_end]
                curr_end_points[not_projected_end] = (cam_loc.unsqueeze(1) + acc_end_dis.reshape(batch_size, num_pixels, 1) * ray_directions).reshape(-1, 3)[not_projected_end]

                # Calc sdf
                next_sdf_start[not_projected_start] = sdf(curr_start_points[not_projected_start])
                next_sdf_end[not_projected_end] = sdf(curr_end_points[not_projected_end])

                # Update mask
                not_projected_start = next_sdf_start < 0
                not_projected_end = next_sdf_end < 0
                not_proj_iters += 1

            # A ray is finished once its two fronts cross.
            unfinished_mask_start = unfinished_mask_start & (acc_start_dis < acc_end_dis)
            unfinished_mask_end = unfinished_mask_end & (acc_start_dis < acc_end_dis)

        return curr_start_points, unfinished_mask_start, acc_start_dis, acc_end_dis, min_dis, max_dis

    def ray_sampler(self, sdf, cam_loc, object_mask, ray_directions, sampler_min_max, sampler_mask):
        ''' Sample the ray in a given range and run secant on rays which have sign transition '''

        batch_size, num_pixels, _ = ray_directions.shape
        n_total_pxl = batch_size * num_pixels
        sampler_pts = torch.zeros(n_total_pxl, 3).cuda().float()
        sampler_dists = torch.zeros(n_total_pxl).cuda().float()

        # n_steps uniform samples along each unresolved ray interval.
        intervals_dist = torch.linspace(0, 1, steps=self.n_steps).cuda().view(1, 1, -1)

        pts_intervals = sampler_min_max[:, :, 0].unsqueeze(-1) + intervals_dist * (sampler_min_max[:, :, 1] - sampler_min_max[:, :, 0]).unsqueeze(-1)
        points = cam_loc.reshape(batch_size, 1, 1, 3) + pts_intervals.unsqueeze(-1) * ray_directions.unsqueeze(2)

        # Get the non convergent rays
        mask_intersect_idx = torch.nonzero(sampler_mask).flatten()
        points = points.reshape((-1, self.n_steps, 3))[sampler_mask, :, :]
        pts_intervals = pts_intervals.reshape((-1, self.n_steps))[sampler_mask]

        # Evaluate the SDF in chunks to bound peak memory.
        sdf_val_all = []
        for pnts in torch.split(points.reshape(-1, 3), 100000, dim=0):
            sdf_val_all.append(sdf(pnts))
        sdf_val = torch.cat(sdf_val_all).reshape(-1, self.n_steps)

        tmp = torch.sign(sdf_val) * torch.arange(self.n_steps, 0, -1).cuda().float().reshape((1, self.n_steps))  # Force argmin to return the first min value
        sampler_pts_ind = torch.argmin(tmp, -1)
        sampler_pts[mask_intersect_idx] = points[torch.arange(points.shape[0]), sampler_pts_ind, :]
        sampler_dists[mask_intersect_idx] = pts_intervals[torch.arange(pts_intervals.shape[0]), sampler_pts_ind]

        true_surface_pts = object_mask[sampler_mask]
        net_surface_pts = (sdf_val[torch.arange(sdf_val.shape[0]), sampler_pts_ind] < 0)

        # take points with minimal SDF value for P_out pixels
        p_out_mask = ~(true_surface_pts & net_surface_pts)
        n_p_out = p_out_mask.sum()
        if n_p_out > 0:
            out_pts_idx = torch.argmin(sdf_val[p_out_mask, :], -1)
            sampler_pts[mask_intersect_idx[p_out_mask]] = points[p_out_mask, :, :][torch.arange(n_p_out), out_pts_idx, :]
            sampler_dists[mask_intersect_idx[p_out_mask]] = pts_intervals[p_out_mask, :][torch.arange(n_p_out), out_pts_idx]

        # Get Network object mask
        sampler_net_obj_mask = sampler_mask.clone()
        sampler_net_obj_mask[mask_intersect_idx[~net_surface_pts]] = False

        # Run Secant method on sign transitions to refine the intersection.
        secant_pts = net_surface_pts & true_surface_pts if self.training else net_surface_pts
        n_secant_pts = secant_pts.sum()
        if n_secant_pts > 0:
            # Get secant z predictions: bracket [z_low, z_high] around the
            # first sign change (index-1 is the last positive sample).
            z_high = pts_intervals[torch.arange(pts_intervals.shape[0]), sampler_pts_ind][secant_pts]
            sdf_high = sdf_val[torch.arange(sdf_val.shape[0]), sampler_pts_ind][secant_pts]
            z_low = pts_intervals[secant_pts][torch.arange(n_secant_pts), sampler_pts_ind[secant_pts] - 1]
            sdf_low = sdf_val[secant_pts][torch.arange(n_secant_pts), sampler_pts_ind[secant_pts] - 1]
            cam_loc_secant = cam_loc.unsqueeze(1).repeat(1, num_pixels, 1).reshape((-1, 3))[mask_intersect_idx[secant_pts]]
            ray_directions_secant = ray_directions.reshape((-1, 3))[mask_intersect_idx[secant_pts]]
            z_pred_secant = self.secant(sdf_low, sdf_high, z_low, z_high, cam_loc_secant, ray_directions_secant, sdf)

            # Get points
            sampler_pts[mask_intersect_idx[secant_pts]] = cam_loc_secant + z_pred_secant.unsqueeze(-1) * ray_directions_secant
            sampler_dists[mask_intersect_idx[secant_pts]] = z_pred_secant

        return sampler_pts, sampler_net_obj_mask, sampler_dists

    def secant(self, sdf_low, sdf_high, z_low, z_high, cam_loc, ray_directions, sdf):
        ''' Runs the secant method for interval [z_low, z_high] for n_secant_steps '''

        # Secant estimate of the root of SDF along each ray.
        z_pred = - sdf_low * (z_high - z_low) / (sdf_high - sdf_low) + z_low
        for i in range(self.n_secant_steps):
            p_mid = cam_loc + z_pred.unsqueeze(-1) * ray_directions
            sdf_mid = sdf(p_mid)
            # Positive SDF -> still outside: tighten the lower bracket.
            ind_low = sdf_mid > 0
            if ind_low.sum() > 0:
                z_low[ind_low] = z_pred[ind_low]
                sdf_low[ind_low] = sdf_mid[ind_low]
            # Negative SDF -> inside: tighten the upper bracket.
            ind_high = sdf_mid < 0
            if ind_high.sum() > 0:
                z_high[ind_high] = z_pred[ind_high]
                sdf_high[ind_high] = sdf_mid[ind_high]

            z_pred = - sdf_low * (z_high - z_low) / (sdf_high - sdf_low) + z_low

        return z_pred

    def minimal_sdf_points(self, num_pixels, sdf, cam_loc, ray_directions, mask, min_dis, max_dis):
        ''' Find points with minimal SDF value on rays for P_out pixels '''

        n_mask_points = mask.sum()

        n = self.n_steps
        # steps = torch.linspace(0.0, 1.0,n).cuda()
        # Random (not stratified) depths in [0, 1), rescaled per ray below.
        steps = torch.empty(n).uniform_(0.0, 1.0).cuda()
        mask_max_dis = max_dis[mask].unsqueeze(-1)
        mask_min_dis = min_dis[mask].unsqueeze(-1)
        steps = steps.unsqueeze(0).repeat(n_mask_points, 1) * (mask_max_dis - mask_min_dis) + mask_min_dis

        mask_points = cam_loc.unsqueeze(1).repeat(1, num_pixels, 1).reshape(-1, 3)[mask]
        mask_rays = ray_directions[mask, :]

        mask_points_all = mask_points.unsqueeze(1).repeat(1, n, 1) + steps.unsqueeze(-1) * mask_rays.unsqueeze(
            1).repeat(1, n, 1)
        points = mask_points_all.reshape(-1, 3)

        # Chunked SDF evaluation, same as in ray_sampler.
        mask_sdf_all = []
        for pnts in torch.split(points, 100000, dim=0):
            mask_sdf_all.append(sdf(pnts))

        mask_sdf_all = torch.cat(mask_sdf_all).reshape(-1, n)
        min_vals, min_idx = mask_sdf_all.min(-1)
        min_mask_points = mask_points_all.reshape(-1, n, 3)[torch.arange(0, n_mask_points), min_idx]
        min_mask_dist = steps.reshape(-1, n)[torch.arange(0, n_mask_points), min_idx]

        return min_mask_points, min_mask_dist
+import os + +from utils import rend_util +from model.embedder import * +from model.ray_tracing import RayTracing +from model.sample_network import SampleNetwork + + +def barycentric_coordinates(p, select_vertices): + + a = select_vertices[:, 0, :] + b = select_vertices[:, 1, :] + c = select_vertices[:, 2, :] + # p = point + + v0 = b - a + v1 = c - a + v2 = p - a + d00 = (v0 * v0).sum(axis=1) + d01 = (v0 * v1).sum(axis=1) + d11 = (v1 * v1).sum(axis=1) + d20 = (v2 * v0).sum(axis=1) + d21 = (v2 * v1).sum(axis=1) + denom = d00 * d11 - d01 * d01 + v = (d11 * d20 - d01 * d21) / denom + w = (d00 * d21 - d01 * d20) / denom + u = 1 - v - w + + return np.vstack([u, v, w]).T + +class ImplicitNetwork(nn.Module): + def __init__( + self, + feature_vector_size, + d_in, + d_out, + dims, + geometric_init=True, + bias=1.0, + skip_in=(), + weight_norm=True, + multires=0 + ): + super().__init__() + + dims = [d_in] + dims + [d_out + feature_vector_size] + + self.embed_fn = None + if multires > 0: + embed_fn, input_ch = get_embedder(multires) + self.embed_fn = embed_fn + dims[0] = input_ch + + self.num_layers = len(dims) + self.skip_in = skip_in + + for l in range(0, self.num_layers - 1): + if l + 1 in self.skip_in: + out_dim = dims[l + 1] - dims[0] + else: + out_dim = dims[l + 1] + + lin = nn.Linear(dims[l], out_dim) + + if geometric_init: + if l == self.num_layers - 2: + torch.nn.init.normal_(lin.weight, mean=np.sqrt(np.pi) / np.sqrt(dims[l]), std=0.0001) + torch.nn.init.constant_(lin.bias, -bias) + elif multires > 0 and l == 0: + torch.nn.init.constant_(lin.bias, 0.0) + torch.nn.init.constant_(lin.weight[:, 3:], 0.0) + torch.nn.init.normal_(lin.weight[:, :3], 0.0, np.sqrt(2) / np.sqrt(out_dim)) + elif multires > 0 and l in self.skip_in: + torch.nn.init.constant_(lin.bias, 0.0) + torch.nn.init.normal_(lin.weight, 0.0, np.sqrt(2) / np.sqrt(out_dim)) + torch.nn.init.constant_(lin.weight[:, -(dims[0] - 3):], 0.0) + else: + torch.nn.init.constant_(lin.bias, 0.0) + 
                    # --- continuation of ImplicitNetwork.__init__ (head precedes
                    # this chunk): default geometric init for hidden layers. ---
                    torch.nn.init.normal_(lin.weight, 0.0, np.sqrt(2) / np.sqrt(out_dim))

            if weight_norm:
                lin = nn.utils.weight_norm(lin)

            setattr(self, "lin" + str(l), lin)

        self.softplus = nn.Softplus(beta=100)

    def forward(self, input, compute_grad=False):
        # Positional-encode the input when multires > 0 was configured.
        if self.embed_fn is not None:
            input = self.embed_fn(input)

        x = input

        for l in range(0, self.num_layers - 1):
            lin = getattr(self, "lin" + str(l))

            # Skip connection: concatenate the (embedded) input, scaled to
            # keep the activation variance roughly constant.
            if l in self.skip_in:
                x = torch.cat([x, input], 1) / np.sqrt(2)

            x = lin(x)

            # Softplus on all but the last layer; the head stays linear
            # (channel 0 = SDF, remaining channels = feature vector).
            if l < self.num_layers - 2:
                x = self.softplus(x)

        return x

    def gradient(self, x):
        # d(SDF)/dx via autograd; create_graph=True so second-order terms
        # (eikonal loss) can backprop through it. Returns shape (N, 1, 3).
        x.requires_grad_(True)
        y = self.forward(x)[:,:1]
        d_output = torch.ones_like(y, requires_grad=False, device=y.device)
        gradients = torch.autograd.grad(
            outputs=y,
            inputs=x,
            grad_outputs=d_output,
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]
        return gradients.unsqueeze(1)

class AlbedoNetwork(nn.Module):
    # MLP mapping (embedded point, SDF feature vector) -> RGB albedo in
    # tanh range [-1, 1].
    def __init__(
            self,
            feature_vector_size,
            dims=[512, 512, 512, 512],
            weight_norm=True,
            multires_view=4,
    ):
        super().__init__()

        dims = [3 + feature_vector_size] + dims + [3]
        embedview_fn, input_ch = get_embedder(multires_view)
        self.embedview_fn = embedview_fn
        # Widen the input layer by the extra positional-encoding channels.
        dims[0] += (input_ch - 3)
        self.num_layers = len(dims)

        for l in range(0, self.num_layers - 1):
            out_dim = dims[l + 1]
            lin = nn.Linear(dims[l], out_dim)

            if weight_norm:
                lin = nn.utils.weight_norm(lin)

            setattr(self, "lin" + str(l), lin)

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, points, feature_vectors):

        Mpoints = self.embedview_fn(points)
        x = torch.cat([Mpoints, feature_vectors], dim=-1)

        for l in range(0, self.num_layers - 1):
            lin = getattr(self, "lin" + str(l))

            x = lin(x)

            if l < self.num_layers - 2:
                x = self.relu(x)

        x = self.tanh(x)
        return x

class SpecularNetwork(nn.Module):
    # MLP mapping (embedded normal, embedded view dir) -> scalar specular
    # shading in tanh range [-1, 1].
    def __init__(
            self,
            dims=[256, 256, 256],
            weight_norm=True,
            multires_view=4
    ):
        super().__init__()
        dims = [3 + 3] + dims + [1]

        embedview_fn, input_ch = get_embedder(multires_view)
        self.embedview_fn = embedview_fn
        # Two embedded 3-vectors are concatenated in forward() (view dir and
        # normal), so the extra embedding channels are added twice.
        dims[0] += (input_ch - 3)
        dims[0] += (input_ch - 3)
        self.num_layers = len(dims)

        for l in range(0, self.num_layers - 1):
            out_dim = dims[l + 1]
            lin = nn.Linear(dims[l], out_dim)

            if weight_norm:
                lin = nn.utils.weight_norm(lin)

            setattr(self, "lin" + str(l), lin)

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, normals, view_dirs):

        Mview_dirs = self.embedview_fn(view_dirs)
        Mnormals= self.embedview_fn(normals)
        x = torch.cat([Mview_dirs, Mnormals], dim=-1)

        for l in range(0, self.num_layers - 1):
            lin = getattr(self, "lin" + str(l))

            x = lin(x)

            if l < self.num_layers - 2:
                x = self.relu(x)

        x = self.tanh(x)
        return x
    def optimaize(self):
        # NOTE(review): empty stub (apparent typo of "optimize"); appears
        # unused but is kept to preserve the public interface.
        return

class DiffuseNetwork(nn.Module):
    # MLP mapping an embedded (mesh-resampled) normal -> scalar diffuse
    # shading in tanh range [-1, 1].
    def __init__(
            self,
            dims=[256, 256, 256],
            weight_norm=True,
            multires_view=6,
    ):
        super().__init__()

        dims = [3] + dims + [1]
        embedview_fn, input_ch = get_embedder(multires_view)
        self.embedview_fn = embedview_fn
        dims[0] += (input_ch - 3)
        self.num_layers = len(dims)

        for l in range(0, self.num_layers - 1):
            out_dim = dims[l + 1]
            lin = nn.Linear(dims[l], out_dim)

            if weight_norm:
                lin = nn.utils.weight_norm(lin)

            setattr(self, "lin" + str(l), lin)

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, normals):

        Mnormals = self.embedview_fn(normals)
        x = Mnormals

        for l in range(0, self.num_layers - 1):
            lin = getattr(self, "lin" + str(l))

            x = lin(x)

            if l < self.num_layers - 2:
                x = self.relu(x)

        x = self.tanh(x)
        return x

class IFNetwork(nn.Module):
    # Full PBIDR model: implicit SDF geometry + decomposed shading
    # (diffuse * albedo + specular), supervised with a template mesh.
    def __init__(self, conf, id, datadir):
        super().__init__()
        self.feature_vector_size = conf.get_int('feature_vector_size')
        self.implicit_network = ImplicitNetwork(self.feature_vector_size, **conf.get_config('implicit_network'))
        # self.rendering_network = RenderingNetwork(self.feature_vector_size, **conf.get_config('rendering_network'))

        self.diffuse_network = DiffuseNetwork(**conf.get_config('diffuse_network'))
        self.specular_network = SpecularNetwork(**conf.get_config('specular_network'))
        self.albedo_network = AlbedoNetwork(self.feature_vector_size, **conf.get_config('albedo_network'))

        self.ray_tracer = RayTracing(**conf.get_config('ray_tracer'))
        self.sample_network = SampleNetwork()
        self.object_bounding_sphere = conf.get_float('ray_tracer.object_bounding_sphere')
        # Template mesh used for ray intersection / normal resampling.
        self.mesh = trimesh.load_mesh('{0}/mesh.obj'.format(os.path.join('../data', datadir, 'scan{0}'.format(id))),
                                      process=False, use_embree=True)
        self.faces = self.mesh.faces
        self.vertex_normals = np.array(self.mesh.vertex_normals)
        self.vertices = np.array(self.mesh.vertices)
        print('Loaded Mesh')

    def forward(self, input):

        # Parse model input
        points_predicted = None

        intrinsics = input["intrinsics"]
        uv = input["uv"]
        pose = input["pose"]
        object_mask = input["object_mask"].reshape(-1)

        ray_dirs, cam_loc = rend_util.get_camera_params(uv, pose, intrinsics)
        batch_size, num_pixels, _ = ray_dirs.shape

        # Ray-trace the implicit surface without building an autograd graph.
        self.implicit_network.eval()
        with torch.no_grad():
            points, network_object_mask, dists = self.ray_tracer(sdf=lambda x: self.implicit_network(x)[:, 0],
                                                                 cam_loc=cam_loc,
                                                                 object_mask=object_mask,
                                                                 ray_directions=ray_dirs)
        self.implicit_network.train()

        points = (cam_loc.unsqueeze(1) + dists.reshape(batch_size, num_pixels, 1) * ray_dirs).reshape(-1, 3)
        points_normal = self.implicit_network.gradient(points)
        sdf_output = self.implicit_network(points)[:, 0:1]
        ray_dirs = ray_dirs.reshape(-1, 3)

        # Intersect the same rays with the template mesh on the CPU.
        ray_dirs_np = ray_dirs.cpu().numpy()
        cam_loc_np = np.concatenate([cam_loc.cpu().numpy()] * len(ray_dirs_np), axis=0)
        # points_mesh_ray: may have the more points than surface mask points,
        # Need an Index for the Points_Mesh_Ray
        points_mesh_ray, index_ray, index_tri = self.mesh.ray.intersects_location(ray_origins=cam_loc_np,
                                                                                  ray_directions=ray_dirs_np,
                                                                                  multiple_hits=False)
        # Index ray: total 2048 / ~1200
        # Keep only pixels whose ray actually hits the template mesh.
        MeshRay_mask = torch.tensor([True if i in index_ray else False for i in range(len(cam_loc_np))], dtype=torch.bool).to(points.device)
        network_object_mask = network_object_mask & MeshRay_mask

        if self.training:

            surface_mask = network_object_mask & object_mask

            listA = surface_mask.cpu().detach().numpy()
            A = [int(a) for a in listA]
            AA = [i for i, a in enumerate(A) if a == 1]  # indices where surface_mask is True
            MeshRay_Index = np.array([i for i, a in enumerate(index_ray) if a in AA], dtype=int)

            # Gather the hit triangles and interpolate their vertex normals
            # at the hit points via barycentric weights.
            face_points_index = self.faces[index_tri][MeshRay_Index]
            select_vertex_normals = self.vertex_normals[face_points_index]
            select_vertices = self.vertices[face_points_index]

            points_mesh_ray = points_mesh_ray[MeshRay_Index]
            bcoords = barycentric_coordinates(points_mesh_ray, select_vertices)
            resampled_normals = np.sum(np.expand_dims(bcoords, -1) * select_vertex_normals, 1)

            # Mesh Pull: project mesh points onto the implicit surface along
            # the SDF gradient (one Newton-like step).
            resampled_normals = torch.tensor(resampled_normals).to(points)
            points_mesh_ray = torch.tensor(points_mesh_ray).to(points)
            sdf_points_mesh_ray = self.implicit_network(points_mesh_ray)[:, 0:1]
            g_points_mesh_ray = self.implicit_network.gradient(points_mesh_ray)
            points_predicted = points_mesh_ray - g_points_mesh_ray.squeeze() * sdf_points_mesh_ray

            surface_points = points[surface_mask]
            surface_dists = dists[surface_mask].unsqueeze(-1)
            surface_ray_dirs = ray_dirs[surface_mask]
            surface_cam_loc = cam_loc.unsqueeze(1).repeat(1, num_pixels, 1).reshape(-1, 3)[surface_mask]
            surface_output = sdf_output[surface_mask]
            N = surface_points.shape[0]

            # Sample points for the eikonal loss
            eik_bounding_box = self.object_bounding_sphere
            n_eik_points = batch_size * num_pixels // 2
            eikonal_points = torch.empty(n_eik_points, 3).uniform_(-eik_bounding_box, eik_bounding_box).cuda()
            eikonal_pixel_points = points.clone()
            eikonal_pixel_points = eikonal_pixel_points.detach()
            eikonal_points = torch.cat([eikonal_points, eikonal_pixel_points], 0)

            points_all = torch.cat([surface_points, eikonal_points], dim=0)

            output = self.implicit_network(surface_points)
            surface_sdf_values = output[:N, 0:1].detach()

            g = self.implicit_network.gradient(points_all)
            surface_points_grad = g[:N, 0, :].clone().detach()
            grad_theta = g[N:, 0, :]

            # Differentiable surface points (IDR sample network, eq. 3).
            differentiable_surface_points = self.sample_network(surface_output,
                                                                surface_sdf_values,
                                                                surface_points_grad,
                                                                surface_dists,
                                                                surface_cam_loc,
                                                                surface_ray_dirs)

        else:
            surface_mask = network_object_mask
            differentiable_surface_points = points[surface_mask]
            grad_theta = None

            listA = surface_mask.cpu().detach().numpy()
            A = [int(a) for a in listA]
            AA = [i for i, a in enumerate(A) if a == 1]  # indices where surface_mask is True
            MeshRay_Index = np.array([i for i, a in enumerate(index_ray) if a in AA], dtype=int)

            face_points_index = self.faces[index_tri][MeshRay_Index]
            select_vertex_normals = self.vertex_normals[face_points_index]
            select_vertices = self.vertices[face_points_index]

            points_mesh_ray = points_mesh_ray[MeshRay_Index]
            bcoords = barycentric_coordinates(points_mesh_ray, select_vertices)
            resampled_normals = np.sum(np.expand_dims(bcoords, -1) * select_vertex_normals, 1)
            resampled_normals = torch.tensor(resampled_normals).to(points)

        view = -ray_dirs[surface_mask]

        # Default shading buffers (ones) for non-surface pixels.
        rgb_values = torch.ones_like(points).float().cuda()
        diffuse_values = torch.ones_like(points).float().cuda()
        specular_values = torch.ones_like(points).float().cuda()
        albedo_values = torch.ones_like(points).float().cuda()

        if differentiable_surface_points.shape[0] > 0:

            rgb_values[surface_mask] = self.get_rbg_value(differentiable_surface_points, view, resampled_normals)
            diffuse_values[surface_mask] = self.get_diffuse_value(differentiable_surface_points, view, resampled_normals)

            specular_values[surface_mask] = self.get_specular_value(differentiable_surface_points, view)
            albedo_values[surface_mask] = self.get_albedo_value(differentiable_surface_points, view)

        output = {
            'points': points,
            'points_pre': points_predicted,
            'points_mesh_ray_gt': points[surface_mask],
            'points_mesh_ray_normals': resampled_normals,
            'surface_normals': points_normal[surface_mask].reshape([-1, 3]),

            'rgb_values': rgb_values,
            'diffuse_values': diffuse_values,
            'specular_values': specular_values,
            'albedo_values': albedo_values,

            'sdf_output': sdf_output,
            'network_object_mask': network_object_mask,
            'object_mask': object_mask,
            'grad_theta': grad_theta
        }

        return output

    def get_rbg_value(self, points, view_dirs, diffuse_normals):
        # Combined shading: rgb = diffuse * albedo + specular.
        output = self.implicit_network(points)
        g = self.implicit_network.gradient(points)
        normals = g[:, 0, :]
        feature_vectors = output[:, 1:]

        diffuse_shading = self.diffuse_network(diffuse_normals)
        specular_shading = self.specular_network(normals, view_dirs)
        albedo = self.albedo_network(points, feature_vectors)

        # Map each tanh output from [-1, 1] into [0, 1] before combining.
        diffuse_shading = (diffuse_shading + 1.) / 2.
        specular_shading = (specular_shading + 1.) / 2.
        albedo = (albedo + 1.) / 2.

        rgb_vals = diffuse_shading * albedo + specular_shading
        # Back to the [-1, 1] range used by the rest of the pipeline.
        rgb_vals = (rgb_vals * 2.) - 1.

        return rgb_vals

    def get_diffuse_value(self, points, view_dirs, diffuse_normals):

        # Scalar diffuse shading broadcast to 3 channels.
        diffuse_shading = self.diffuse_network(diffuse_normals)
        return diffuse_shading.expand([-1, 3])

    def get_albedo_value(self, points, view_dirs):
        output = self.implicit_network(points)
        feature_vectors = output[:, 1:]
        albedo = self.albedo_network(points, feature_vectors)

        return albedo

    def get_specular_value(self, points, view_dirs):
        # Scalar specular shading broadcast to 3 channels.
        g = self.implicit_network.gradient(points)
        normals = g[:, 0, :]

        specular_shading = self.specular_network(normals, view_dirs)
        return specular_shading.expand([-1, 3])
+ ''' + + def forward(self, surface_output, surface_sdf_values, surface_points_grad, surface_dists, surface_cam_loc, surface_ray_dirs): + # t -> t(theta) + surface_ray_dirs_0 = surface_ray_dirs.detach() + surface_points_dot = torch.bmm(surface_points_grad.view(-1, 1, 3), + surface_ray_dirs_0.view(-1, 3, 1)).squeeze(-1) + surface_dists_theta = surface_dists - (surface_output - surface_sdf_values) / surface_points_dot + + # t(theta) -> x(theta,c,v) + surface_points_theta_c_v = surface_cam_loc + surface_dists_theta * surface_ray_dirs + + return surface_points_theta_c_v diff --git a/insightface/reconstruction/PBIDR/code/preprocess/get_aux_dataset.py b/insightface/reconstruction/PBIDR/code/preprocess/get_aux_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..63c84c3937fa05446460db9ed72408618c4ddc54 --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/preprocess/get_aux_dataset.py @@ -0,0 +1,158 @@ +import os +import sys +sys.path.append(os.path.abspath('')) +import torch +import argparse +import numpy as np +from pytorch3d.io import load_objs_as_meshes, save_obj,load_obj +from pytorch3d.renderer import ( + look_at_view_transform, + PerspectiveCameras, + # FoVPerspectiveCameras, + PointLights, + # DirectionalLights, + # Materials, + RasterizationSettings, + MeshRenderer, + MeshRasterizer, + # SoftPhongShader, + # SoftSilhouetteShader, + SoftPhongShader, + # TexturesVertex, + Materials +) +from PIL import Image + +print("Start to get aux dataset!") + +if __name__ == '__main__': + + parser = argparse.ArgumentParser("PreProcessing") + parser.add_argument('--gpu', '-g', type=str, default='0',help='GPU') + parser.add_argument('--input', '-i', type=str, default='../raw_data', help='Location of Raw Textured Mesh Dataset') + parser.add_argument('--output', '-o', type=int, required=True, help='New aux dataset') + parser.add_argument('--yaw', type=int, default=15, help='num_views_yaw') + parser.add_argument('--yaw_angle', type=int, default=45, 
                        help='yaw_angle')  # closes the --yaw_angle argument begun above
    parser.add_argument('--pitch', type=int, default=9, help='num_views_pitch')
    parser.add_argument('--pitch_angle', type=int, default=30, help='pitch_angle')
    parser.add_argument('--datapath', type=str, default='../data/', help='Location of code data')

    parser.add_argument('--dataset', '-d', type=str, default='Face', help='FaceTest dataset')

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # Setup: prefer CUDA when available, otherwise fall back to CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.cuda.set_device(device)
    else:
        device = torch.device("cpu")

    # Set paths; the render output dir is recreated from scratch each run.
    DATA_DIR = args.input
    IMAGE_DIR = os.path.join(DATA_DIR, "mesh")

    if os.path.exists(IMAGE_DIR):
        os.system("rm -r " + IMAGE_DIR)
    os.mkdir(IMAGE_DIR)
    obj_filename = os.path.join(DATA_DIR, "mesh.obj")

    if not os.path.exists(os.path.join(args.datapath, args.dataset)):
        os.mkdir(os.path.join(args.datapath, args.dataset))

    # Load obj file
    mesh = load_objs_as_meshes([obj_filename], device=device,load_textures=True)
    print(obj_filename)
    print("Loaded Mesh")

    # the number of different viewpoints from which we want to render the mesh.
    # Hand-built rotation matrices for yaw (Ry) and pitch (Rx) sweeps.
    # NOTE(review): the sign pattern flips the camera to face the mesh —
    # confirm against the pytorch3d camera convention before modifying.
    def Ry(q):
        return np.array([[-np.cos(q * np.pi / 180), 0, -np.sin(q * np.pi / 180)], [0, 1, 0],
                         [np.sin(q * np.pi / 180), 0, -np.cos(q * np.pi / 180)]])
    def Rx(q):
        return np.array([[-1, 0, 0], [0, np.cos(q * np.pi / 180), np.sin(q * np.pi / 180)],
                         [0, np.sin(q * np.pi / 180), -np.cos(q * np.pi / 180)]])

    def get_R_matrix(azim, axis="Ry"):
        # Stack one rotation matrix per requested angle onto the device.
        print("Rotation Martix {}".format(axis))
        aa = []
        if axis == "Ry":
            for q in azim:
                aa.append(Ry(q))
            RRR = torch.tensor(np.array(aa)).to(device)
        else:
            for q in azim:
                aa.append(Rx(q))
            RRR = torch.tensor(np.array(aa)).to(device)
        return RRR

    num_views = args.yaw + args.pitch

    yaw_dim = torch.linspace(-1 * args.yaw_angle, args.yaw_angle, args.yaw)
    pitch_dim = torch.linspace(-1 * args.pitch_angle, args.pitch_angle , args.pitch)

    # Pure ambient light so the renders carry only texture (no shading).
    lights = PointLights(device=device, location=[[0, 50, 100]], ambient_color=((1.0, 1.0, 1.0), ), diffuse_color=((0.0, 0.0, 0.0), ), specular_color=((0.0, 0.0, 0.0), ))
    RRy, TTy = look_at_view_transform(dist=8, elev=0, azim=yaw_dim, up=((0, 1, 0),), device=device)

    # Pitch views reuse the yaw translations with custom Rx rotations.
    TTx = TTy[:args.pitch]
    RRx = get_R_matrix(azim=pitch_dim, axis="Rx")

    Rtotal = torch.cat([RRy, RRx], dim=0)
    Ttotal = torch.cat([TTy, TTx], dim=0)

    cameras = PerspectiveCameras(device=device, focal_length=4500, principal_point=((512, 512),), R=Rtotal, T=Ttotal,
                                 image_size=((1024, 1024),))

    # Single camera used only to initialize the rasterizer/shader; the real
    # per-view cameras are passed to the renderer call below.
    if num_views != 1:
        camera = PerspectiveCameras(device=device, focal_length=4500, principal_point=((512, 512),), R=Rtotal[None, 1, ...],
                                    T=Ttotal[None, 1, ...], image_size=((1024, 1024),))
    else:
        camera = PerspectiveCameras(device=device, focal_length=4500, principal_point=((512, 512),),
                                    R=Rtotal,
                                    T=Ttotal, image_size=((1024, 1024),))

    mymaterials = Materials(device=device, shininess=8)
    raster_settings = RasterizationSettings(
        image_size=1024,
        blur_radius=0.0,
        faces_per_pixel=1,
    )
    renderer = MeshRenderer(
        rasterizer=MeshRasterizer(
            cameras=camera,
            raster_settings=raster_settings
        ),
        shader=SoftPhongShader(
            device=device,
            cameras=camera,
            lights=lights,
            materials=mymaterials,
        )
    )

    # Render the mesh from all viewpoints in one batched call.
    meshes = mesh.extend(num_views)
    target_images = renderer(meshes, cameras=cameras, lights=lights)
    target_rgb = [target_images[i, ..., :3] for i in range(num_views)]
    target_cameras = [PerspectiveCameras(device=device, focal_length=4500, principal_point=((512, 512),), R=Rtotal[None, i, ...],
                                         T=Ttotal[None, i, ...], image_size=((1024, 1024),)) for i in range(num_views)]

    # RGB images
    if not os.path.exists(os.path.join(IMAGE_DIR, 'image')):
        os.mkdir(os.path.join(IMAGE_DIR, 'image'))
    if not os.path.exists(os.path.join(IMAGE_DIR, 'mask')):
        os.mkdir(os.path.join(IMAGE_DIR, 'mask'))

    for i in range(len(target_images)):
        img = Image.fromarray((target_images[i, ..., :3].cpu().numpy() * 255).astype(np.uint8))
        img.save(os.path.join(IMAGE_DIR, 'image/{0}.png'.format('%03d' % int(i+1))))
        # The same RGB render is saved as the "mask": downstream code derives
        # the binary mask by thresholding the white background.
        img.save(os.path.join(IMAGE_DIR, 'mask/{0}.png'.format('%03d' % int(i+1))))
    np.save(os.path.join(IMAGE_DIR,'R.npy'), Rtotal.cpu().numpy())
    np.save(os.path.join(IMAGE_DIR,'T.npy'), Ttotal.cpu().numpy())

    # Copy the rendered scan and the source mesh into the training data tree.
    SCAN_DIR = args.datapath + args.dataset + '/scan' + str(args.output) + "/"
    if os.path.exists(SCAN_DIR):
        os.system("rm -r " + SCAN_DIR)
    os.system("cp -r " + IMAGE_DIR + " " + SCAN_DIR)
    os.system("cp " + DATA_DIR + "/mesh.* " + SCAN_DIR + ".")
    print("Finished")
def get_Ps_from_Faces(R, T):
    """Build a 3x4 camera projection matrix per view: P = K @ [I|0] @ [[R^T, t], [0, 1]].

    R: iterable of (3, 3) rotation matrices; T: iterable of (3,) translations
    (as saved by get_aux_dataset.py into R.npy / T.npy).
    Returns an (N, 3, 4) float64 array of projection matrices.
    """
    # Fixed intrinsics matching the renderer in get_aux_dataset.py
    # (focal_length=4500, principal point (512, 512)).
    intrinsics = np.array([[4500.0, 0.0, 512.0],
                           [0.0, 4500.0, 512.0],
                           [0.0, 0.0, 1.0]])
    projection = np.hstack([np.eye(3), np.zeros((3, 1))])
    bottom = np.array([[0.0, 0.0, 0.0, 1.0]])

    # NOTE: the previous version also accumulated camera locations
    # (-R @ t) into a list that was never used or returned; that dead
    # computation has been removed.
    Ps = []
    for i in range(len(R)):
        # 4x4 extrinsic; R is transposed to go from the stored rotation to
        # the world-to-camera convention expected by P.
        p = np.concatenate([np.concatenate([R[i].T, T[i].reshape(3, 1)], axis=1), bottom], axis=0)
        Ps.append((intrinsics @ projection @ p).astype(np.float64))

    return np.array(Ps)


def get_all_mask_points_white_bg(masks_dir):
    """Collect foreground pixels from white-background mask images.

    Returns (mask_points_all, mask_ims): per image, a (3, K) float array of
    homogeneous pixel coordinates (x, y, 1) of foreground pixels, and the
    stacked boolean masks.
    """
    mask_paths = sorted(utils.glob_imgs(masks_dir))
    mask_points_all = []
    mask_ims = []
    for path in mask_paths:
        img = mpimg.imread(path)
        # Foreground = any pixel that is not near-white in every channel.
        # (Computed once and reused; the original recomputed the threshold.)
        cur_mask = img.max(axis=2) < 0.9
        ys, xs = np.where(cur_mask)
        mask_points_all.append(np.stack((xs, ys, np.ones_like(xs))).astype(float))
        mask_ims.append(cur_mask)
    return mask_points_all, np.array(mask_ims)


def get_normalization(source_dir):
    """Write cameras.npz (world_mat_i / scale_mat_i) for a rendered scan dir."""
    print('Preprocessing', source_dir)

    masks_dir = '{0}/mask'.format(source_dir)
    mask_points_all, masks_all = get_all_mask_points_white_bg(masks_dir)
    number_of_cameras = len(masks_all)
    R = np.load('{0}/R.npy'.format(source_dir))
    T = np.load('{0}/T.npy'.format(source_dir))
    Ps = get_Ps_from_Faces(R, T)
    # Identity normalization: the synthetic scans are already centered/scaled.
    normalization = np.eye(4).astype(np.float32)

    cameras_new = {}
    for i in range(number_of_cameras):
        cameras_new['scale_mat_%d' % i] = normalization
        cameras_new['world_mat_%d' % i] = np.concatenate((Ps[i], np.array([[0, 0, 0, 1.0]])), axis=0).astype(np.float32)

    np.savez('{0}/{1}.npz'.format(source_dir, "cameras"), **cameras_new)
    print(normalization)
    print('--------------------------------------------------------')

    if False:  # for debugging; NOTE(review): references an undefined `all_Xs`
        for i in range(number_of_cameras):
            plt.figure()

            plt.imshow(mpimg.imread('%s/%03d.png' % ('{0}/mask'.format(source_dir), i+1)))
            xy = (Ps[i, :2, :] @ (np.concatenate((np.array(all_Xs), np.ones((len(all_Xs), 1))), axis=1).T)) / (
                Ps[i, 2, :] @ (np.concatenate((np.array(all_Xs), np.ones((len(all_Xs), 1))), axis=1).T))

            plt.plot(xy[0, :], xy[1, :], '*')
            plt.show()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--scan_id', '-i', type=int, default=0, help='data source folder for preprocess')
    parser.add_argument('--dataset', '-d', type=str, default='Face', help='dataset dir')
    opt = parser.parse_args()

    SCAN_DIR = '../data/' + opt.dataset + '/scan' + str(opt.scan_id)
    get_normalization(SCAN_DIR)

    print('Done!')
evaluation/eval.py --conf ./confs/test.conf --scan_id 0 --gpu $GPU --checkpoint 400 --eval_rendering \ No newline at end of file diff --git a/insightface/reconstruction/PBIDR/code/script/fast_train.sh b/insightface/reconstruction/PBIDR/code/script/fast_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..a2fa2da96c7e524b13460d4f18a8c89adeb12714 --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/script/fast_train.sh @@ -0,0 +1,4 @@ +set -ex + +GPU=0 +python training/runner.py --conf ./confs/test.conf --scan_id 0 --gpu $GPU --nepoch 400 \ No newline at end of file diff --git a/insightface/reconstruction/PBIDR/code/training/runner.py b/insightface/reconstruction/PBIDR/code/training/runner.py new file mode 100644 index 0000000000000000000000000000000000000000..eba2cb94759f92e1284006c99547fc276a26e66f --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/training/runner.py @@ -0,0 +1,58 @@ +import sys +sys.path.append('../code') +import argparse +import GPUtil +import torch +import random +import numpy as np + +from training.train import IFTrainRunner + + +def setup_seed(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--batch_size', type=int, default=1, help='input batch size') + parser.add_argument('--nepoch', type=int, default=400, help='number of epochs to train for') + parser.add_argument('--nepoch_freeze', type=int, default=1000, help='number of epochs to train for') + parser.add_argument('--conf', type=str, default='./confs/test.conf') + parser.add_argument('--expname', type=str, default='') + parser.add_argument('--gpu', type=str, default='auto', help='GPU to use [default: GPU auto]') + parser.add_argument('--is_continue', default=False, action="store_true", help='If set, indicates continuing from a previous run.') + 
parser.add_argument('--timestamp', default='latest', type=str, help='The timestamp of the run to be used in case of continuing from a previous run.') + parser.add_argument('--checkpoint', default='latest',type=str,help='The checkpoint epoch number of the run to be used in case of continuing from a previous run.') + parser.add_argument('--train_cameras', default=False, action="store_true", help='If set, optimizing also camera location.') + parser.add_argument('--scan_id', type=int, default=-1, help='If set, taken to be the scan id.') + + opt = parser.parse_args() + + if opt.gpu == "auto": + deviceIDs = GPUtil.getAvailable(order='memory', limit=1, maxLoad=0.5, maxMemory=0.5, includeNan=False, excludeID=[], excludeUUID=[]) + gpu = deviceIDs[0] + else: + gpu = opt.gpu + + setup_seed(0) + + trainrunner = IFTrainRunner(conf=opt.conf, + batch_size=opt.batch_size, + nepochs=opt.nepoch, + nepoch_freeze=opt.nepoch_freeze, + expname=opt.expname, + gpu_index=gpu, + exps_folder_name='exps', + is_continue=opt.is_continue, + timestamp=opt.timestamp, + checkpoint=opt.checkpoint, + scan_id=opt.scan_id, + train_cameras=opt.train_cameras + ) + + trainrunner.run() diff --git a/insightface/reconstruction/PBIDR/code/training/train.py b/insightface/reconstruction/PBIDR/code/training/train.py new file mode 100644 index 0000000000000000000000000000000000000000..9ae66174a6e71b63e77d85a520ac2578cb58317b --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/training/train.py @@ -0,0 +1,284 @@ +import os +from datetime import datetime +from pyhocon import ConfigFactory +import sys +import torch +import torch.nn as nn + +import utils.general as utils +import utils.plots as plt + +class IFTrainRunner(): + def __init__(self,**kwargs): + torch.set_default_dtype(torch.float32) + torch.set_num_threads(1) + + self.conf = ConfigFactory.parse_file(kwargs['conf']) + self.batch_size = kwargs['batch_size'] + + self.nepochs = kwargs['nepochs'] + self.nepoch_freeze = kwargs['nepoch_freeze'] + + 
self.exps_folder_name = kwargs['exps_folder_name'] + self.GPU_INDEX = kwargs['gpu_index'] + self.train_cameras = kwargs['train_cameras'] + + self.expname = self.conf.get_string('train.expname') + kwargs['expname'] + scan_id = kwargs['scan_id'] if kwargs['scan_id'] != -1 else self.conf.get_int('dataset.scan_id', default=-1) + if scan_id != -1: + self.expname = self.expname + '_{0}'.format(scan_id) + + if kwargs['is_continue'] and kwargs['timestamp'] == 'latest': + if os.path.exists(os.path.join('../',kwargs['exps_folder_name'],self.expname)): + timestamps = os.listdir(os.path.join('../',kwargs['exps_folder_name'],self.expname)) + if (len(timestamps)) == 0: + is_continue = False + timestamp = None + else: + timestamp = sorted(timestamps)[-1] + is_continue = True + else: + is_continue = False + timestamp = None + else: + timestamp = kwargs['timestamp'] + is_continue = kwargs['is_continue'] + + utils.mkdir_ifnotexists(os.path.join('../',self.exps_folder_name)) + self.expdir = os.path.join('../', self.exps_folder_name, self.expname) + utils.mkdir_ifnotexists(self.expdir) + self.timestamp = '{:%Y_%m_%d_%H_%M_%S}'.format(datetime.now()) + utils.mkdir_ifnotexists(os.path.join(self.expdir, self.timestamp)) + + self.plots_dir = os.path.join(self.expdir, self.timestamp, 'plots') + utils.mkdir_ifnotexists(self.plots_dir) + + # create checkpoints dirs + self.checkpoints_path = os.path.join(self.expdir, self.timestamp, 'checkpoints') + utils.mkdir_ifnotexists(self.checkpoints_path) + self.model_params_subdir = "ModelParameters" + self.optimizer_params_subdir = "OptimizerParameters" + self.scheduler_params_subdir = "SchedulerParameters" + + utils.mkdir_ifnotexists(os.path.join(self.checkpoints_path, self.model_params_subdir)) + utils.mkdir_ifnotexists(os.path.join(self.checkpoints_path, self.optimizer_params_subdir)) + utils.mkdir_ifnotexists(os.path.join(self.checkpoints_path, self.scheduler_params_subdir)) + + if self.train_cameras: + self.optimizer_cam_params_subdir = 
"OptimizerCamParameters" + self.cam_params_subdir = "CamParameters" + + utils.mkdir_ifnotexists(os.path.join(self.checkpoints_path, self.optimizer_cam_params_subdir)) + utils.mkdir_ifnotexists(os.path.join(self.checkpoints_path, self.cam_params_subdir)) + + os.system("""cp -r {0} "{1}" """.format(kwargs['conf'], os.path.join(self.expdir, self.timestamp, 'runconf.conf'))) + + if (not self.GPU_INDEX == 'ignore'): + os.environ["CUDA_VISIBLE_DEVICES"] = '{0}'.format(self.GPU_INDEX) + + print('shell command : {0}'.format(' '.join(sys.argv))) + + print('Loading data ...') + + dataset_conf = self.conf.get_config('dataset') + if kwargs['scan_id'] != -1: + dataset_conf['scan_id'] = kwargs['scan_id'] + + self.train_dataset = utils.get_class(self.conf.get_string('train.dataset_class'))(self.train_cameras, + **dataset_conf) + + print('Finish loading data ...') + + self.train_dataloader = torch.utils.data.DataLoader(self.train_dataset, + batch_size=self.batch_size, + shuffle=True, + collate_fn=self.train_dataset.collate_fn + ) + self.plot_dataloader = torch.utils.data.DataLoader(self.train_dataset, + batch_size=self.conf.get_int('plot.plot_nimgs'), + shuffle=True, + collate_fn=self.train_dataset.collate_fn + ) + + self.model = utils.get_class(self.conf.get_string('train.model_class'))(conf=self.conf.get_config('model'), \ + id=scan_id, datadir=dataset_conf['data_dir']) + if torch.cuda.is_available(): + self.model.cuda() + + self.loss = utils.get_class(self.conf.get_string('train.loss_class'))(**self.conf.get_config('loss')) + + self.lr = self.conf.get_float('train.learning_rate') + + self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr) + + self.sched_milestones = self.conf.get_list('train.sched_milestones', default=[]) + self.sched_factor = self.conf.get_float('train.sched_factor', default=0.0) + self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, self.sched_milestones, gamma=self.sched_factor) + + # settings for camera optimization + if 
self.train_cameras: + num_images = len(self.train_dataset) + + self.pose_vecs = torch.nn.Embedding(num_images, 7, sparse=True).cuda() + self.pose_vecs.weight.data.copy_(self.train_dataset.get_pose_init()) + self.optimizer_cam = torch.optim.SparseAdam(list(self.pose_vecs.parameters()), self.conf.get_float('train.learning_rate_cam')) + + self.start_epoch = 0 + if is_continue: + old_checkpnts_dir = os.path.join(self.expdir, timestamp, 'checkpoints') + + saved_model_state = torch.load( + os.path.join(old_checkpnts_dir, 'ModelParameters', str(kwargs['checkpoint']) + ".pth")) + self.model.load_state_dict(saved_model_state["model_state_dict"]) + self.start_epoch = saved_model_state['epoch'] + + data = torch.load( + os.path.join(old_checkpnts_dir, 'OptimizerParameters', str(kwargs['checkpoint']) + ".pth")) + self.optimizer.load_state_dict(data["optimizer_state_dict"]) + + data = torch.load( + os.path.join(old_checkpnts_dir, self.scheduler_params_subdir, str(kwargs['checkpoint']) + ".pth")) + self.scheduler.load_state_dict(data["scheduler_state_dict"]) + + if self.train_cameras: + data = torch.load( + os.path.join(old_checkpnts_dir, self.optimizer_cam_params_subdir, str(kwargs['checkpoint']) + ".pth")) + self.optimizer_cam.load_state_dict(data["optimizer_cam_state_dict"]) + + data = torch.load( + os.path.join(old_checkpnts_dir, self.cam_params_subdir, str(kwargs['checkpoint']) + ".pth")) + self.pose_vecs.load_state_dict(data["pose_vecs_state_dict"]) + + self.num_pixels = self.conf.get_int('train.num_pixels') + self.total_pixels = self.train_dataset.total_pixels + self.img_res = self.train_dataset.img_res + self.n_batches = len(self.train_dataloader) + self.plot_freq = self.conf.get_int('train.plot_freq') + self.plot_conf = self.conf.get_config('plot') + + self.alpha_milestones = self.conf.get_list('train.alpha_milestones', default=[]) + self.alpha_factor = self.conf.get_float('train.alpha_factor', default=0.0) + for acc in self.alpha_milestones: + if self.start_epoch > acc: 
+ self.loss.alpha = self.loss.alpha * self.alpha_factor + + def save_checkpoints(self, epoch): + torch.save( + {"epoch": epoch, "model_state_dict": self.model.state_dict()}, + os.path.join(self.checkpoints_path, self.model_params_subdir, str(epoch) + ".pth")) + torch.save( + {"epoch": epoch, "model_state_dict": self.model.state_dict()}, + os.path.join(self.checkpoints_path, self.model_params_subdir, "latest.pth")) + + torch.save( + {"epoch": epoch, "optimizer_state_dict": self.optimizer.state_dict()}, + os.path.join(self.checkpoints_path, self.optimizer_params_subdir, str(epoch) + ".pth")) + torch.save( + {"epoch": epoch, "optimizer_state_dict": self.optimizer.state_dict()}, + os.path.join(self.checkpoints_path, self.optimizer_params_subdir, "latest.pth")) + + torch.save( + {"epoch": epoch, "scheduler_state_dict": self.scheduler.state_dict()}, + os.path.join(self.checkpoints_path, self.scheduler_params_subdir, str(epoch) + ".pth")) + torch.save( + {"epoch": epoch, "scheduler_state_dict": self.scheduler.state_dict()}, + os.path.join(self.checkpoints_path, self.scheduler_params_subdir, "latest.pth")) + + if self.train_cameras: + torch.save( + {"epoch": epoch, "optimizer_cam_state_dict": self.optimizer_cam.state_dict()}, + os.path.join(self.checkpoints_path, self.optimizer_cam_params_subdir, str(epoch) + ".pth")) + torch.save( + {"epoch": epoch, "optimizer_cam_state_dict": self.optimizer_cam.state_dict()}, + os.path.join(self.checkpoints_path, self.optimizer_cam_params_subdir, "latest.pth")) + + torch.save( + {"epoch": epoch, "pose_vecs_state_dict": self.pose_vecs.state_dict()}, + os.path.join(self.checkpoints_path, self.cam_params_subdir, str(epoch) + ".pth")) + torch.save( + {"epoch": epoch, "pose_vecs_state_dict": self.pose_vecs.state_dict()}, + os.path.join(self.checkpoints_path, self.cam_params_subdir, "latest.pth")) + + def run(self): + print("training...") + + for epoch in range(self.start_epoch, self.nepochs + 1): + + if epoch in self.alpha_milestones: + 
self.loss.alpha = self.loss.alpha * self.alpha_factor + + if epoch % 100 == 0 and epoch != 0: + self.save_checkpoints(epoch) + + if epoch % self.plot_freq == 0 and epoch != 0: + self.model.eval() + if self.train_cameras: + self.pose_vecs.eval() + self.train_dataset.change_sampling_idx(-1) + indices, model_input, ground_truth = next(iter(self.plot_dataloader)) + + model_input["intrinsics"] = model_input["intrinsics"].cuda() + model_input["uv"] = model_input["uv"].cuda() + model_input["object_mask"] = model_input["object_mask"].cuda() + # model_input[""] = ground_truth["rgb"].cuda() + + if self.train_cameras: + pose_input = self.pose_vecs(indices.cuda()) + model_input['pose'] = pose_input + else: + model_input['pose'] = model_input['pose'].cuda() + + detail_3dmm, detail_3dmm_subdivision_full = plt.get_displacement_mesh(self.model) + detail_3dmm.export('{0}/Detailed_3dmm_{1}.obj'.format(self.plots_dir, epoch), 'obj') + detail_3dmm_subdivision_full.export('{0}/Subdivide_full_{1}.obj'.format(self.plots_dir, epoch), 'obj') + + self.model.train() + if self.train_cameras: + self.pose_vecs.train() + + self.train_dataset.change_sampling_idx(self.num_pixels) + + if epoch > self.nepoch_freeze: + print("Freeze Diffuse Part...") + self.model.diffuse_network.eval() + self.model.albedo_network.eval() + + for data_index, (indices, model_input, ground_truth) in enumerate(self.train_dataloader): + + model_input["intrinsics"] = model_input["intrinsics"].cuda() + model_input["uv"] = model_input["uv"].cuda() + model_input["object_mask"] = model_input["object_mask"].cuda() + + if self.train_cameras: + pose_input = self.pose_vecs(indices.cuda()) + model_input['pose'] = pose_input + else: + model_input['pose'] = model_input['pose'].cuda() + + model_outputs = self.model(model_input) + loss_output = self.loss(model_outputs, ground_truth) + + loss = loss_output['loss'] + + self.optimizer.zero_grad() + if self.train_cameras: + self.optimizer_cam.zero_grad() + + loss.backward() + + 
+                self.optimizer.step()
+                if self.train_cameras:
+                    self.optimizer_cam.step()
+
+                print(
+                    '{0} [{1}] ({2}/{3}): loss = {4}, rgb_loss = {5}, normal_loss = {6}, reg_loss = {7}, eikonal_loss = {8}, mask_loss = {9}, alpha = {10}, lr = {11}'
+                    .format(self.expname, epoch, data_index, self.n_batches, loss.item(),
+                            loss_output['rgb_loss'].item(),
+                            loss_output['normal_loss'].item(),
+                            loss_output['reg_loss'].item(),
+                            loss_output['eikonal_loss'].item(),
+                            loss_output['mask_loss'].item(),
+                            self.loss.alpha,
+                            self.scheduler.get_last_lr()[0]))  # get_lr() is deprecated and warns when called outside LRScheduler.step()
+
+            self.scheduler.step()
\ No newline at end of file
diff --git a/insightface/reconstruction/PBIDR/code/utils/general.py b/insightface/reconstruction/PBIDR/code/utils/general.py
new file mode 100644
index 0000000000000000000000000000000000000000..f76c7007eee61816978704698e55a5c46c7796bc
--- /dev/null
+++ b/insightface/reconstruction/PBIDR/code/utils/general.py
@@ -0,0 +1,66 @@
+import os
+from glob import glob
+import torch
+
+def mkdir_ifnotexists(directory):
+    # makedirs(exist_ok=True) also creates missing parent dirs and avoids the
+    # exists()/mkdir() race of the naive two-step check.
+    os.makedirs(directory, exist_ok=True)
+
+def get_class(kls):
+    # Resolve a dotted path like 'pkg.mod.Class' to the class object.
+    parts = kls.split('.')
+    module = ".".join(parts[:-1])
+    m = __import__(module)
+    for comp in parts[1:]:
+        m = getattr(m, comp)
+    return m
+
+def glob_imgs(path):
+    # Collect image files under *path* for the supported extensions.
+    imgs = []
+    for ext in ['*.png', '*.jpg', '*.JPEG', '*.JPG']:
+        imgs.extend(glob(os.path.join(path, ext)))
+    return imgs
+
+def split_input(model_input, total_pixels):
+    '''
+    Split the input to fit Cuda memory for large resolution.
+    Can decrease the value of n_pixels in case of cuda out of memory error.
+ ''' + n_pixels = 10000 + split = [] + for i, indx in enumerate(torch.split(torch.arange(total_pixels).cuda(), n_pixels, dim=0)): + data = model_input.copy() + data['uv'] = torch.index_select(model_input['uv'], 1, indx) + data['object_mask'] = torch.index_select(model_input['object_mask'], 1, indx) + split.append(data) + return split + +def split_input_albedo(model_input, total_pixels): + ''' + Split the input to fit Cuda memory for large resolution. + Can decrease the value of n_pixels in case of cuda out of memory error. + ''' + n_pixels = 10000 + split = [] + for i, indx in enumerate(torch.split(torch.arange(total_pixels).cuda(), n_pixels, dim=0)): + data = model_input.copy() + data['uv'] = torch.index_select(model_input['uv'], 1, indx) + data['object_mask'] = torch.index_select(model_input['object_mask'], 1, indx) + data['rgb'] = torch.index_select(model_input['rgb'], 1, indx) + split.append(data) + return split + +def merge_output(res, total_pixels, batch_size): + ''' Merge the split output. 
''' + + model_outputs = {} + for entry in res[0]: + if res[0][entry] is None: + continue + if len(res[0][entry].shape) == 1: + model_outputs[entry] = torch.cat([r[entry].reshape(batch_size, -1, 1) for r in res], + 1).reshape(batch_size * total_pixels) + else: + model_outputs[entry] = torch.cat([r[entry].reshape(batch_size, -1, r[entry].shape[-1]) for r in res], + 1).reshape(batch_size * total_pixels, -1) + + return model_outputs \ No newline at end of file diff --git a/insightface/reconstruction/PBIDR/code/utils/plots.py b/insightface/reconstruction/PBIDR/code/utils/plots.py new file mode 100644 index 0000000000000000000000000000000000000000..fde881e5b525661c13d433470edd3674a45a9a78 --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/utils/plots.py @@ -0,0 +1,424 @@ +import plotly.graph_objs as go +import plotly.offline as offline +import numpy as np +import torch +from skimage import measure +import torchvision +import trimesh +from PIL import Image +from utils import rend_util +import pickle + + +def plot_latent(model, latent, indices, model_outputs ,pose, rgb_gt, path, epoch, img_res, plot_nimgs, max_depth, resolution): + # arrange data to plot + batch_size, num_samples, _ = rgb_gt.shape + + network_object_mask = model_outputs['network_object_mask'] + points = model_outputs['points'].reshape(batch_size, num_samples, 3) + rgb_eval = model_outputs['rgb_values'] + rgb_eval = rgb_eval.reshape(batch_size, num_samples, 3) + + depth = torch.ones(batch_size * num_samples).cuda().float() * max_depth + depth[network_object_mask] = rend_util.get_depth(points, pose).reshape(-1)[network_object_mask] + depth = depth.reshape(batch_size, num_samples, 1) + network_object_mask = network_object_mask.reshape(batch_size,-1) + + cam_loc, cam_dir = rend_util.get_camera_for_plot(pose) + + # plot rendered images + plot_images(rgb_eval, rgb_gt, path, epoch, plot_nimgs, img_res) + + # plot depth maps + plot_depth_maps(depth, path, epoch, plot_nimgs, img_res) + + data = [] + + # plot 
surface + surface_traces = get_surface_trace(path=path, + epoch=epoch, + sdf=lambda x: model.implicit_network(torch.cat([latent.expand(len(x), -1), x], 1))[:, 0], + resolution=resolution + ) + data.append(surface_traces[0]) + + # plot cameras locations + for i, loc, dir in zip(indices, cam_loc, cam_dir): + data.append(get_3D_quiver_trace(loc.unsqueeze(0), dir.unsqueeze(0), name='camera_{0}'.format(i))) + + for i, p, m in zip(indices, points, network_object_mask): + p = p[m] + sampling_idx = torch.randperm(p.shape[0])[:2048] + p = p[sampling_idx, :] + + val = model.implicit_network(torch.cat([latent.expand(len(p), -1), p], 1)) + caption = ["sdf: {0} ".format(v[0].item()) for v in val] + + data.append(get_3D_scatter_trace(p, name='intersection_points_{0}'.format(i), caption=caption)) + + fig = go.Figure(data=data) + scene_dict = dict(xaxis=dict(range=[-3, 3], autorange=False), + yaxis=dict(range=[-3, 3], autorange=False), + zaxis=dict(range=[-3, 3], autorange=False), + aspectratio=dict(x=1, y=1, z=1)) + fig.update_layout(scene=scene_dict, width=1400, height=1400, showlegend=True) + filename = '{0}/surface_{1}.html'.format(path, epoch) + offline.plot(fig, filename=filename, auto_open=False) + + +def plot(model, indices, model_outputs ,pose, rgb_gt, path, epoch, img_res, plot_nimgs, max_depth, resolution): + # arrange data to plot + batch_size, num_samples, _ = rgb_gt.shape + + network_object_mask = model_outputs['network_object_mask'] + points = model_outputs['points'].reshape(batch_size, num_samples, 3) + rgb_eval = model_outputs['rgb_values'] + rgb_eval = rgb_eval.reshape(batch_size, num_samples, 3) + + depth = torch.ones(batch_size * num_samples).cuda().float() * max_depth + depth[network_object_mask] = rend_util.get_depth(points, pose).reshape(-1)[network_object_mask] + depth = depth.reshape(batch_size, num_samples, 1) + network_object_mask = network_object_mask.reshape(batch_size,-1) + + cam_loc, cam_dir = rend_util.get_camera_for_plot(pose) + + # plot rendered 
images + plot_images(rgb_eval, rgb_gt, path, epoch, plot_nimgs, img_res) + + # plot depth maps + plot_depth_maps(depth, path, epoch, plot_nimgs, img_res) + + data = [] + + # plot surface + surface_traces = get_surface_trace(path=path, + epoch=epoch, + sdf=lambda x: model.implicit_network(x)[:, 0], + resolution=resolution + ) + data.append(surface_traces[0]) + + # plot cameras locations + for i, loc, dir in zip(indices, cam_loc, cam_dir): + data.append(get_3D_quiver_trace(loc.unsqueeze(0), dir.unsqueeze(0), name='camera_{0}'.format(i))) + + for i, p, m in zip(indices, points, network_object_mask): + p = p[m] + sampling_idx = torch.randperm(p.shape[0])[:2048] + p = p[sampling_idx, :] + + val = model.implicit_network(p) + caption = ["sdf: {0} ".format(v[0].item()) for v in val] + + data.append(get_3D_scatter_trace(p, name='intersection_points_{0}'.format(i), caption=caption)) + + fig = go.Figure(data=data) + scene_dict = dict(xaxis=dict(range=[-3, 3], autorange=False), + yaxis=dict(range=[-3, 3], autorange=False), + zaxis=dict(range=[-3, 3], autorange=False), + aspectratio=dict(x=1, y=1, z=1)) + fig.update_layout(scene=scene_dict, width=1400, height=1400, showlegend=True) + filename = '{0}/surface_{1}.html'.format(path, epoch) + offline.plot(fig, filename=filename, auto_open=False) + + +def get_3D_scatter_trace(points, name='', size=3, caption=None): + assert points.shape[1] == 3, "3d scatter plot input points are not correctely shaped " + assert len(points.shape) == 2, "3d scatter plot input points are not correctely shaped " + + trace = go.Scatter3d( + x=points[:, 0].cpu(), + y=points[:, 1].cpu(), + z=points[:, 2].cpu(), + mode='markers', + name=name, + marker=dict( + size=size, + line=dict( + width=2, + ), + opacity=1.0, + ), text=caption) + + return trace + + +def get_3D_quiver_trace(points, directions, color='#bd1540', name=''): + assert points.shape[1] == 3, "3d cone plot input points are not correctely shaped " + assert len(points.shape) == 2, "3d cone plot 
input points are not correctely shaped " + assert directions.shape[1] == 3, "3d cone plot input directions are not correctely shaped " + assert len(directions.shape) == 2, "3d cone plot input directions are not correctely shaped " + + trace = go.Cone( + name=name, + x=points[:, 0].cpu(), + y=points[:, 1].cpu(), + z=points[:, 2].cpu(), + u=directions[:, 0].cpu(), + v=directions[:, 1].cpu(), + w=directions[:, 2].cpu(), + sizemode='absolute', + sizeref=0.125, + showscale=False, + colorscale=[[0, color], [1, color]], + anchor="tail" + ) + + return trace + + +def get_surface_trace(path, epoch, sdf, resolution=100, return_mesh=False): + grid = get_grid_uniform(resolution) + points = grid['grid_points'] + + z = [] + for i, pnts in enumerate(torch.split(points, 100000, dim=0)): + z.append(sdf(pnts).detach().cpu().numpy()) + z = np.concatenate(z, axis=0) + + if (not (np.min(z) > 0 or np.max(z) < 0)): + + z = z.astype(np.float32) + + verts, faces, normals, values = measure.marching_cubes_lewiner( + volume=z.reshape(grid['xyz'][1].shape[0], grid['xyz'][0].shape[0], + grid['xyz'][2].shape[0]).transpose([1, 0, 2]), + level=0, + spacing=(grid['xyz'][0][2] - grid['xyz'][0][1], + grid['xyz'][0][2] - grid['xyz'][0][1], + grid['xyz'][0][2] - grid['xyz'][0][1])) + + verts = verts + np.array([grid['xyz'][0][0], grid['xyz'][1][0], grid['xyz'][2][0]]) + + I, J, K = faces.transpose() + + traces = [go.Mesh3d(x=verts[:, 0], y=verts[:, 1], z=verts[:, 2], + i=I, j=J, k=K, name='implicit_surface', + opacity=1.0)] + + meshexport = trimesh.Trimesh(verts, faces, vertex_normals=-normals) + meshexport.export('{0}/surface_{1}.ply'.format(path, epoch), 'ply') + + if return_mesh: + return meshexport + return traces + return None + + +def get_surface_high_res_mesh(sdf, resolution=100): + # get low res mesh to sample point cloud + grid = get_grid_uniform(100) + z = [] + points = grid['grid_points'] + + for i, pnts in enumerate(torch.split(points, 100000, dim=0)): + 
+        z.append(sdf(pnts).detach().cpu().numpy())
+    z = np.concatenate(z, axis=0)
+
+    z = z.astype(np.float32)
+
+    verts, faces, normals, values = measure.marching_cubes_lewiner(  # NOTE(review): removed in scikit-image >= 0.19; use measure.marching_cubes
+        volume=z.reshape(grid['xyz'][1].shape[0], grid['xyz'][0].shape[0],
+                         grid['xyz'][2].shape[0]).transpose([1, 0, 2]),
+        level=0,
+        spacing=(grid['xyz'][0][2] - grid['xyz'][0][1],
+                 grid['xyz'][0][2] - grid['xyz'][0][1],
+                 grid['xyz'][0][2] - grid['xyz'][0][1]))
+
+    verts = verts + np.array([grid['xyz'][0][0], grid['xyz'][1][0], grid['xyz'][2][0]])
+
+    mesh_low_res = trimesh.Trimesh(verts, faces, vertex_normals=-normals)
+    # return mesh_low_res
+
+    components = mesh_low_res.split(only_watertight=False)
+    areas = np.array([c.area for c in components], dtype=float)  # np.float alias was removed in NumPy 1.24; builtin float is the documented replacement
+    mesh_low_res = components[areas.argmax()]
+
+    recon_pc = trimesh.sample.sample_surface(mesh_low_res, 10000)[0]
+    recon_pc = torch.from_numpy(recon_pc).float().cuda()
+
+    # Center and align the recon pc
+    s_mean = recon_pc.mean(dim=0)
+    s_cov = recon_pc - s_mean
+    s_cov = torch.mm(s_cov.transpose(0, 1), s_cov)
+    vecs = torch.eig(s_cov, True)[1].transpose(0, 1)  # NOTE(review): torch.eig removed in torch >= 1.13; s_cov is symmetric PSD, so torch.linalg.eigh(s_cov)[1] fits
+    if torch.det(vecs) < 0:
+        vecs = torch.mm(torch.tensor([[1, 0, 0], [0, 0, 1], [0, 1, 0]]).cuda().float(), vecs)
+    helper = torch.bmm(vecs.unsqueeze(0).repeat(recon_pc.shape[0], 1, 1),
+                       (recon_pc - s_mean).unsqueeze(-1)).squeeze()
+
+    grid_aligned = get_grid(helper.cpu(), resolution)
+
+    grid_points = grid_aligned['grid_points']
+
+    g = []
+    for i, pnts in enumerate(torch.split(grid_points, 100000, dim=0)):
+        g.append(torch.bmm(vecs.unsqueeze(0).repeat(pnts.shape[0], 1, 1).transpose(1, 2),
+                           pnts.unsqueeze(-1)).squeeze() + s_mean)
+    grid_points = torch.cat(g, dim=0)
+
+    # MC to new grid
+    points = grid_points
+    z = []
+    for i, pnts in enumerate(torch.split(points, 100000, dim=0)):
+        z.append(sdf(pnts).detach().cpu().numpy())
+    z = np.concatenate(z, axis=0)
+
+    meshexport = None
+    if (not (np.min(z) > 0 or np.max(z) < 0)):
+
+        z = z.astype(np.float32)
+
+        verts, faces, normals, 
values = measure.marching_cubes_lewiner( + volume=z.reshape(grid_aligned['xyz'][1].shape[0], grid_aligned['xyz'][0].shape[0], + grid_aligned['xyz'][2].shape[0]).transpose([1, 0, 2]), + level=0, + spacing=(grid_aligned['xyz'][0][2] - grid_aligned['xyz'][0][1], + grid_aligned['xyz'][0][2] - grid_aligned['xyz'][0][1], + grid_aligned['xyz'][0][2] - grid_aligned['xyz'][0][1])) + + verts = torch.from_numpy(verts).cuda().float() + verts = torch.bmm(vecs.unsqueeze(0).repeat(verts.shape[0], 1, 1).transpose(1, 2), + verts.unsqueeze(-1)).squeeze() + verts = (verts + grid_points[0]).cpu().numpy() + + meshexport = trimesh.Trimesh(verts, faces, vertex_normals=-normals) + + return meshexport + + +def get_displacement_mesh(model): + + def get_detailed_mesh(input): + origin_mesh = input.copy() + mesh_points = torch.tensor(np.array(origin_mesh.vertices).astype(np.float32)).cuda().requires_grad_(True) + sdfs = model.implicit_network(mesh_points)[:, 0:1] + sdf_np = sdfs.detach().cpu().numpy() + vetices_normal = origin_mesh.vertex_normals + new_vertices = -1.0 * sdf_np * vetices_normal + np.array(origin_mesh.vertices) + new_mesh = trimesh.Trimesh(vertices=new_vertices, faces=origin_mesh.faces, process=False, visual=origin_mesh.visual) + print('Detailed mesh created!') + return new_mesh + + orimesh = model.mesh + subdivision_mesh = orimesh.subdivide() + new_mesh = get_detailed_mesh(orimesh) + submesh_full = get_detailed_mesh(subdivision_mesh) + + return new_mesh, submesh_full + +def get_displacement_animation(model): + + def get_detailed_mesh(aaa_mesh): + origin_mesh = aaa_mesh.copy() + mesh_points = torch.tensor(np.array(origin_mesh.vertices).astype(np.float32)).cuda().requires_grad_(True) + sdfs = model.implicit_network(mesh_points)[:, 0:1] + sdf_np = sdfs.detach().cpu().numpy() + + print('Detailed mesh created!') + return sdf_np + + orimesh = model.mesh + subdi_mesh_full = orimesh.subdivide() + sdf_np0 = get_detailed_mesh(orimesh) + sdf_np1 = get_detailed_mesh(subdi_mesh_full) + + 
return sdf_np0, sdf_np1 + + +def get_NormalMaps(model): + + def get_detailed_mesh(input): + origin_mesh = input.copy() + mesh_points = torch.tensor(np.array(origin_mesh.vertices).astype(np.float32)).cuda().requires_grad_(True) + detailed_normal = model.implicit_network.gradient(mesh_points) + detailed_normal = detailed_normal.squeeze().detach().cpu().numpy() + vetices_normal = np.asarray(origin_mesh.vertex_normals) + return vetices_normal, detailed_normal + + orimesh = model.mesh + smooth1, detail1 = get_detailed_mesh(orimesh) + return smooth1, detail1 + +def get_grid_uniform(resolution): + x = np.linspace(-1.0, 1.0, resolution) + y = x + z = x + + xx, yy, zz = np.meshgrid(x, y, z) + grid_points = torch.tensor(np.vstack([xx.ravel(), yy.ravel(), zz.ravel()]).T, dtype=torch.float) + + return {"grid_points": grid_points.cuda(), + "shortest_axis_length": 2.0, + "xyz": [x, y, z], + "shortest_axis_index": 0} + +def get_grid(points, resolution): + eps = 0.2 + input_min = torch.min(points, dim=0)[0].squeeze().numpy() + input_max = torch.max(points, dim=0)[0].squeeze().numpy() + + bounding_box = input_max - input_min + shortest_axis = np.argmin(bounding_box) + if (shortest_axis == 0): + x = np.linspace(input_min[shortest_axis] - eps, + input_max[shortest_axis] + eps, resolution) + length = np.max(x) - np.min(x) + y = np.arange(input_min[1] - eps, input_max[1] + length / (x.shape[0] - 1) + eps, length / (x.shape[0] - 1)) + z = np.arange(input_min[2] - eps, input_max[2] + length / (x.shape[0] - 1) + eps, length / (x.shape[0] - 1)) + elif (shortest_axis == 1): + y = np.linspace(input_min[shortest_axis] - eps, + input_max[shortest_axis] + eps, resolution) + length = np.max(y) - np.min(y) + x = np.arange(input_min[0] - eps, input_max[0] + length / (y.shape[0] - 1) + eps, length / (y.shape[0] - 1)) + z = np.arange(input_min[2] - eps, input_max[2] + length / (y.shape[0] - 1) + eps, length / (y.shape[0] - 1)) + elif (shortest_axis == 2): + z = np.linspace(input_min[shortest_axis] - 
eps, + input_max[shortest_axis] + eps, resolution) + length = np.max(z) - np.min(z) + x = np.arange(input_min[0] - eps, input_max[0] + length / (z.shape[0] - 1) + eps, length / (z.shape[0] - 1)) + y = np.arange(input_min[1] - eps, input_max[1] + length / (z.shape[0] - 1) + eps, length / (z.shape[0] - 1)) + + xx, yy, zz = np.meshgrid(x, y, z) + grid_points = torch.tensor(np.vstack([xx.ravel(), yy.ravel(), zz.ravel()]).T, dtype=torch.float).cuda() + return {"grid_points": grid_points, + "shortest_axis_length": length, + "xyz": [x, y, z], + "shortest_axis_index": shortest_axis} + +def plot_depth_maps(depth_maps, path, epoch, plot_nrow, img_res): + depth_maps_plot = lin2img(depth_maps, img_res) + + tensor = torchvision.utils.make_grid(depth_maps_plot.repeat(1, 3, 1, 1), + scale_each=True, + normalize=True, + nrow=plot_nrow).cpu().detach().numpy() + tensor = tensor.transpose(1, 2, 0) + scale_factor = 255 + tensor = (tensor * scale_factor).astype(np.uint8) + + img = Image.fromarray(tensor) + img.save('{0}/depth_{1}.png'.format(path, epoch)) + +def plot_images(rgb_points, ground_true, path, epoch, plot_nrow, img_res): + ground_true = (ground_true.cuda() + 1.) / 2. + rgb_points = (rgb_points + 1. ) / 2. 
+ + output_vs_gt = torch.cat((rgb_points, ground_true), dim=0) + output_vs_gt_plot = lin2img(output_vs_gt, img_res) + + tensor = torchvision.utils.make_grid(output_vs_gt_plot, + scale_each=False, + normalize=False, + nrow=plot_nrow).cpu().detach().numpy() + + tensor = tensor.transpose(1, 2, 0) + scale_factor = 255 + tensor = (tensor * scale_factor).astype(np.uint8) + + img = Image.fromarray(tensor) + img.save('{0}/rendering_{1}.png'.format(path, epoch)) + +def lin2img(tensor, img_res): + batch_size, num_samples, channels = tensor.shape + return tensor.permute(0, 2, 1).view(batch_size, channels, img_res[0], img_res[1]) \ No newline at end of file diff --git a/insightface/reconstruction/PBIDR/code/utils/rend_util.py b/insightface/reconstruction/PBIDR/code/utils/rend_util.py new file mode 100644 index 0000000000000000000000000000000000000000..7a255e2d074775b1756544e545b13fda8262eeda --- /dev/null +++ b/insightface/reconstruction/PBIDR/code/utils/rend_util.py @@ -0,0 +1,192 @@ +import numpy as np +import imageio +import skimage +import cv2 +import torch +from torch.nn import functional as F + +def load_rgb(path): + img = imageio.imread(path) + img = skimage.img_as_float32(img) + + # pixel values between [-1,1] + img -= 0.5 + img *= 2. 
+ img = img.transpose(2, 0, 1) + return img + +def load_mask(path): + alpha = imageio.imread(path, as_gray=True) + alpha = skimage.img_as_float32(alpha) + object_mask = alpha > 127.5 + + return object_mask + +def load_mask_white_bg(path): + alpha = imageio.imread(path, as_gray=True) + alpha = skimage.img_as_float32(alpha) + object_mask = alpha < 250.5 + + return object_mask + +def load_K_Rt_from_P(filename, P=None): + if P is None: + lines = open(filename).read().splitlines() + if len(lines) == 4: + lines = lines[1:] + lines = [[x[0], x[1], x[2], x[3]] for x in (x.split(" ") for x in lines)] + P = np.asarray(lines).astype(np.float32).squeeze() + + out = cv2.decomposeProjectionMatrix(P) + K = out[0] + R = out[1] + t = out[2] + + K = K/K[2,2] + intrinsics = np.eye(4) + intrinsics[:3, :3] = K + + pose = np.eye(4, dtype=np.float32) + to_gl = np.eye(3, dtype=np.float32) + to_gl[0, 0] = -1. + to_gl[1, 1] = -1. + pose[:3, :3] = np.dot(R.transpose(), to_gl) + pose[:3,3] = (t[:3] / t[3])[:,0] + + return intrinsics, pose + +def get_camera_params(uv, pose, intrinsics): + if pose.shape[1] == 7: #In case of quaternion vector representation + cam_loc = pose[:, 4:] + R = quat_to_rot(pose[:,:4]) + p = torch.eye(4).repeat(pose.shape[0],1,1).cuda().float() + p[:, :3, :3] = R + p[:, :3, 3] = cam_loc + else: # In case of pose matrix representation + cam_loc = pose[:, :3, 3] + p = pose + + batch_size, num_samples, _ = uv.shape + + depth = torch.ones((batch_size, num_samples)).cuda() + x_cam = uv[:, :, 0].view(batch_size, -1) + y_cam = uv[:, :, 1].view(batch_size, -1) + z_cam = depth.view(batch_size, -1) + + pixel_points_cam = lift(x_cam, y_cam, z_cam, intrinsics=intrinsics) + + # permute for batch matrix product + pixel_points_cam = pixel_points_cam.permute(0, 2, 1) + + world_coords = torch.bmm(p, pixel_points_cam).permute(0, 2, 1)[:, :, :3] + ray_dirs = world_coords - cam_loc[:, None, :] + ray_dirs = F.normalize(ray_dirs, dim=2) + + return ray_dirs, cam_loc + +def 
get_camera_for_plot(pose): + if pose.shape[1] == 7: #In case of quaternion vector representation + cam_loc = pose[:, 4:].detach() + R = quat_to_rot(pose[:,:4].detach()) + else: # In case of pose matrix representation + cam_loc = pose[:, :3, 3] + R = pose[:, :3, :3] + cam_dir = R[:, :3, 2] + return cam_loc, cam_dir + +def lift(x, y, z, intrinsics): + # parse intrinsics + intrinsics = intrinsics.cuda() + fx = intrinsics[:, 0, 0] + fy = intrinsics[:, 1, 1] + cx = intrinsics[:, 0, 2] + cy = intrinsics[:, 1, 2] + sk = intrinsics[:, 0, 1] + + x_lift = (x - cx.unsqueeze(-1) + cy.unsqueeze(-1)*sk.unsqueeze(-1)/fy.unsqueeze(-1) - sk.unsqueeze(-1)*y/fy.unsqueeze(-1)) / fx.unsqueeze(-1) * z + y_lift = (y - cy.unsqueeze(-1)) / fy.unsqueeze(-1) * z + + # homogeneous + return torch.stack((x_lift, y_lift, z, torch.ones_like(z).cuda()), dim=-1) + +def quat_to_rot(q): + batch_size, _ = q.shape + q = F.normalize(q, dim=1) + R = torch.ones((batch_size, 3,3)).cuda() + qr=q[:,0] + qi = q[:, 1] + qj = q[:, 2] + qk = q[:, 3] + R[:, 0, 0]=1-2 * (qj**2 + qk**2) + R[:, 0, 1] = 2 * (qj *qi -qk*qr) + R[:, 0, 2] = 2 * (qi * qk + qr * qj) + R[:, 1, 0] = 2 * (qj * qi + qk * qr) + R[:, 1, 1] = 1-2 * (qi**2 + qk**2) + R[:, 1, 2] = 2*(qj*qk - qi*qr) + R[:, 2, 0] = 2 * (qk * qi-qj * qr) + R[:, 2, 1] = 2 * (qj*qk + qi*qr) + R[:, 2, 2] = 1-2 * (qi**2 + qj**2) + return R + +def rot_to_quat(R): + batch_size, _,_ = R.shape + q = torch.ones((batch_size, 4)).cuda() + + R00 = R[:, 0,0] + R01 = R[:, 0, 1] + R02 = R[:, 0, 2] + R10 = R[:, 1, 0] + R11 = R[:, 1, 1] + R12 = R[:, 1, 2] + R20 = R[:, 2, 0] + R21 = R[:, 2, 1] + R22 = R[:, 2, 2] + + q[:,0]=torch.sqrt(1.0+R00+R11+R22)/2 + q[:, 1]=(R21-R12)/(4*q[:,0]) + q[:, 2] = (R02 - R20) / (4 * q[:, 0]) + q[:, 3] = (R10 - R01) / (4 * q[:, 0]) + return q + +def get_sphere_intersection(cam_loc, ray_directions, r = 1.0): + # Input: n_images x 4 x 4 ; n_images x n_rays x 3 + # Output: n_images * n_rays x 2 (close and far) ; n_images * n_rays + + n_imgs, n_pix, _ = 
ray_directions.shape + + cam_loc = cam_loc.unsqueeze(-1) + ray_cam_dot = torch.bmm(ray_directions, cam_loc).squeeze() + under_sqrt = ray_cam_dot ** 2 - (cam_loc.norm(2,1) ** 2 - r ** 2) + + under_sqrt = under_sqrt.reshape(-1) + mask_intersect = under_sqrt > 0 + + sphere_intersections = torch.zeros(n_imgs * n_pix, 2).cuda().float() + sphere_intersections[mask_intersect] = torch.sqrt(under_sqrt[mask_intersect]).unsqueeze(-1) * torch.Tensor([-1, 1]).cuda().float() + sphere_intersections[mask_intersect] -= ray_cam_dot.reshape(-1)[mask_intersect].unsqueeze(-1) + + sphere_intersections = sphere_intersections.reshape(n_imgs, n_pix, 2) + sphere_intersections = sphere_intersections.clamp_min(0.0) + mask_intersect = mask_intersect.reshape(n_imgs, n_pix) + + return sphere_intersections, mask_intersect + +def get_depth(points, pose): + ''' Retruns depth from 3D points according to camera pose ''' + batch_size, num_samples, _ = points.shape + if pose.shape[1] == 7: # In case of quaternion vector representation + cam_loc = pose[:, 4:] + R = quat_to_rot(pose[:, :4]) + pose = torch.eye(4).unsqueeze(0).repeat(batch_size, 1, 1).cuda().float() + pose[:, :3, 3] = cam_loc + pose[:, :3, :3] = R + + points_hom = torch.cat((points, torch.ones((batch_size, num_samples, 1)).cuda()), dim=2) + + # permute for batch matrix product + points_hom = points_hom.permute(0, 2, 1) + + points_cam = torch.inverse(pose).bmm(points_hom) + depth = points_cam[:, 2, :][:, :, None] + return depth + diff --git a/insightface/reconstruction/PBIDR/figures/overview.jpg b/insightface/reconstruction/PBIDR/figures/overview.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6cceaee18e50adc7af0bd74d12f080bce7737eb0 Binary files /dev/null and b/insightface/reconstruction/PBIDR/figures/overview.jpg differ diff --git a/insightface/reconstruction/PBIDR/requirements.txt b/insightface/reconstruction/PBIDR/requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..be933a08170904c5c4da671f6224a26405af9075 --- /dev/null +++ b/insightface/reconstruction/PBIDR/requirements.txt @@ -0,0 +1,10 @@ +vtk +numpy +opencv-python +scikit-image +scipy +Pillow +argparse +GPUtil +pyhocon +plotly \ No newline at end of file diff --git a/insightface/reconstruction/gaze/README.md b/insightface/reconstruction/gaze/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c33cf9bfb0da69ebd06f3831afa3d5f91224ffe4 --- /dev/null +++ b/insightface/reconstruction/gaze/README.md @@ -0,0 +1,44 @@ +# Generalizing Gaze Estimation with Weak-Supervision from Synthetic Views + +The implementation of [Arxiv paper](https://arxiv.org/abs/2212.02997) for gaze estimation task. + + +## Preparation + +1. Download the [dataset](https://drive.google.com/file/d/1erYIoTCbXk1amofJ6yTGhbpmsovWrrva/view?usp=sharing) and put it under ``data/`` + +2. Download [eyes3d.pkl](https://drive.google.com/file/d/1as7_ew6kEFTHpcrlk8QKvgFJJ8cKzM3q/view?usp=sharing) and put it under ``assets/`` + +3. Download [pretrained checkpoint](https://drive.google.com/file/d/1cqmChXSnTwUpk3jD7JLpZKHOuBLlC3_N/view?usp=sharing) and put it under ``assets/`` + +4. 
Install libraries: + ``` + pip install timm pytorch-lightning==1.8.1 albumentations==1.3.0 + ``` + +## Testing with pre-trained model + + After downloading the pre-trained checkpoint above, + + ``` + python test_gaze.py assets/latest_a.ckpt + ``` + +## Training + + ``` + python trainer_gaze.py + ``` + +## Results + + + + + + + + + + + diff --git a/insightface/reconstruction/gaze/datasets/__init__.py b/insightface/reconstruction/gaze/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/reconstruction/gaze/datasets/augs.py b/insightface/reconstruction/gaze/datasets/augs.py new file mode 100644 index 0000000000000000000000000000000000000000..1e701d9fab544bd7e49a5945378a5f6cbb31ef0a --- /dev/null +++ b/insightface/reconstruction/gaze/datasets/augs.py @@ -0,0 +1,40 @@ +import numpy as np +import albumentations as A +from albumentations.core.transforms_interface import ImageOnlyTransform + +class RectangleBorderAugmentation(ImageOnlyTransform): + + def __init__( + self, + fill_value = 0, + limit = 0.3, + always_apply=False, + p=1.0, + ): + super(RectangleBorderAugmentation, self).__init__(always_apply, p) + assert limit>0.0 and limit<1.0 + self.fill_value = 0 + self.limit = limit + + + def apply(self, image, border_size_limit, **params): + assert len(border_size_limit)==4 + border_size = border_size_limit.copy() + border_size[0] *= image.shape[1] + border_size[2] *= image.shape[1] + border_size[1] *= image.shape[0] + border_size[3] *= image.shape[0] + border_size = border_size.astype(np.int) + image[:,:border_size[0],:] = self.fill_value + image[:border_size[1],:,:] = self.fill_value + image[:,border_size[2]:,:] = self.fill_value + image[border_size[3]:,:,:] = self.fill_value + return image + + def get_params(self): + border_size_limit = np.random.uniform(0.0, self.limit, size=4) + return {'border_size_limit': border_size_limit} + + def 
get_transform_init_args_names(self): + return ('fill_value', 'limit') + diff --git a/insightface/reconstruction/gaze/datasets/dataset_gaze.py b/insightface/reconstruction/gaze/datasets/dataset_gaze.py new file mode 100644 index 0000000000000000000000000000000000000000..fb86a81e682be1326806af629997aa4187395787 --- /dev/null +++ b/insightface/reconstruction/gaze/datasets/dataset_gaze.py @@ -0,0 +1,184 @@ +import os +import os.path as osp +import queue as Queue +import mxnet as mx +import pickle +import threading +import logging +import numpy as np +import insightface +from insightface.utils import face_align +import torch +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms +import cv2 +import albumentations as A +from albumentations.pytorch import ToTensorV2 +from .augs import RectangleBorderAugmentation + +class BackgroundGenerator(threading.Thread): + def __init__(self, generator, local_rank, max_prefetch=6): + super(BackgroundGenerator, self).__init__() + self.queue = Queue.Queue(max_prefetch) + self.generator = generator + self.local_rank = local_rank + self.daemon = True + self.start() + + def run(self): + torch.cuda.set_device(self.local_rank) + for item in self.generator: + self.queue.put(item) + self.queue.put(None) + + def next(self): + next_item = self.queue.get() + if next_item is None: + raise StopIteration + return next_item + + def __next__(self): + return self.next() + + def __iter__(self): + return self + + +class DataLoaderX(DataLoader): + def __init__(self, local_rank, **kwargs): + super(DataLoaderX, self).__init__(**kwargs) + self.stream = torch.cuda.Stream(local_rank) + self.local_rank = local_rank + + def __iter__(self): + self.iter = super(DataLoaderX, self).__iter__() + self.iter = BackgroundGenerator(self.iter, self.local_rank) + self.preload() + return self + + def preload(self): + self.batch = next(self.iter, None) + if self.batch is None: + return None + with torch.cuda.stream(self.stream): + for k in 
range(len(self.batch)): + self.batch[k] = self.batch[k].to(device=self.local_rank, + non_blocking=True) + + def __next__(self): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + if batch is None: + raise StopIteration + self.preload() + return batch + + + +class GazeDataset(Dataset): + def __init__(self, root_dir, is_train): + super(GazeDataset, self).__init__() + + #self.local_rank = local_rank + self.is_train = is_train + self.input_size = 160 + #self.num_kps = 68 + transform_list = [] + if is_train: + transform_list += \ + [ + A.ColorJitter(brightness=0.3, contrast=0.3, p=0.5), + A.ToGray(p=0.1), + A.ISONoise(p=0.1), + A.MedianBlur(blur_limit=(1,7), p=0.1), + A.GaussianBlur(blur_limit=(1,7), p=0.1), + A.MotionBlur(blur_limit=(5,13), p=0.1), + A.ImageCompression(quality_lower=10, quality_upper=90, p=0.05), + A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=30, interpolation=cv2.INTER_LINEAR, + border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0, p=0.6), + #A.HorizontalFlip(p=0.5), + RectangleBorderAugmentation(limit=0.2, fill_value=0, p=0.1), + ] + transform_list += \ + [ + A.geometric.resize.Resize(self.input_size, self.input_size, interpolation=cv2.INTER_LINEAR, always_apply=True), + A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ToTensorV2(), + ] + self.transform = A.ReplayCompose( + transform_list, + keypoint_params=A.KeypointParams(format='xy', remove_invisible=False) + ) + self.root_dir = root_dir + if is_train: + path_imgrec = os.path.join(root_dir, 'train.rec') + path_imgidx = os.path.join(root_dir, 'train.idx') + else: + path_imgrec = os.path.join(root_dir, 'val.rec') + path_imgidx = os.path.join(root_dir, 'val.idx') + self.imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + self.imgidx = np.array(list(self.imgrec.keys)) + logging.info('len:%d'%len(self.imgidx)) + print('!!!len:%d'%len(self.imgidx)) + #self.num_face = 1103 + self.num_eye = 481 + + def __len__(self): + return 
len(self.imgidx) + + def __getitem__(self, index): + idx = self.imgidx[index] + s = self.imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + img = mx.image.imdecode(img).asnumpy() #rgb numpy + y = np.array(header.label, dtype=np.float32).reshape( (-1, 3) ) + #print('!!!', y.shape) + assert y.shape[0]==self.num_eye*2 + eye_l = y[:self.num_eye,:] + eye_r = y[self.num_eye:,:] + mean_z_l = np.mean(eye_l[:32,2]) + mean_z_r = np.mean(eye_r[:32,2]) + std_z_l = np.max(np.abs(eye_l[:32,2])) + std_z_r = np.max(np.abs(eye_r[:32,2])) + eye_l[:,2] -= mean_z_l + eye_r[:,2] -= mean_z_r + eye_l[:,2] /= std_z_l + eye_r[:,2] /= std_z_r + #print('!!!', np.max(eye_l[:,2]), np.min(eye_l[:,2])) + y = np.concatenate( (eye_l, eye_r), axis=0) + #y[:,2] /= 100.0 + + #if self.is_train: + # black_edge = np.random.randint(img.shape[1]//3, size=4) + # if np.random.random()<0.5: + # img[:black_edge[0],:,:] = 0 + # img[black_edge[1]*-1:,:,:] = 0 + # img[:,:black_edge[2],:] = 0 + # img[:,black_edge[3]*-1:,:] = 0 + #label = torch.tensor(y, dtype=torch.float32) + #label = y + kps_xy = [] + kps_z = [] + for i in range(y.shape[0]): + kps_xy.append( (y[i][0], y[i][1]) ) + kps_z.append(y[i][2]) + if self.transform is not None: + #sample = self.transform(image=sample)['image'] + #t = self.transform(image=img, keypoints=label, class_labels=self.class_labels, class_sides=self.class_sides) + t = self.transform(image=img, keypoints=kps_xy) + flipped = False + #print(t.keys()) + #print('!!!flipped:', flipped) + img = t['image'] + label_xy = t['keypoints'] + label_xy = np.array(label_xy, dtype=np.float32) + label_xy /= (self.input_size/2) + label_xy -= 1.0 + label_z = np.array(kps_z, dtype=np.float32).reshape((-1,1)) + #label_z /= (self.input_size/2) + label = np.concatenate( (label_xy, label_z), axis=1) + #label = label.flatten() + label = torch.tensor(label, dtype=torch.float32) + #print('label:', label.shape) + return img, label + diff --git a/insightface/reconstruction/gaze/models.py 
b/insightface/reconstruction/gaze/models.py new file mode 100644 index 0000000000000000000000000000000000000000..9db5dc252d780fed103f1f6322c7ba87d9e4aae2 --- /dev/null +++ b/insightface/reconstruction/gaze/models.py @@ -0,0 +1,75 @@ +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import LearningRateMonitor +from pytorch_lightning.loggers import TensorBoardLogger +import torch +import torch.nn as nn +import timm + +class GazeModel(pl.LightningModule): + def __init__(self, backbone, epoch): + super().__init__() + self.save_hyperparameters() + self.backbone = timm.create_model(backbone, num_classes=481*2*3) + self.epoch = epoch + #self.loss = nn.MSELoss(reduction='mean') + self.loss = nn.L1Loss(reduction='mean') + #self.hard_mining = False + self.hard_mining = False + self.num_face = 1103 + self.num_eye = 481*2 + + def forward(self, x): + # use forward for inference/predictions + y = self.backbone(x) + return y + + def cal_loss(self, y_hat, y, hm=False): + bs = y.size(0) + y_hat = y_hat.view( (bs,-1,3) ) + loss = torch.abs(y_hat - y) #(B,K,3) + loss[:,:,2] *= 0.5 + if hm: + loss = torch.mean(loss, dim=(1,2)) #(B,) + loss, _ = torch.topk(loss, k=int(bs*0.25), largest=True) + #B = len(loss) + #S = int(B*0.5) + #loss, _ = torch.sort(loss, descending=True) + #loss = loss[:S] + loss = torch.mean(loss) * 20.0 + return loss + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.backbone(x) + loss = self.cal_loss(y_hat, y, self.hard_mining) + self.log('train_loss', loss, on_epoch=True) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.backbone(x) + loss = self.cal_loss(y_hat, y) + self.log('val_loss', loss, on_step=True) + + def test_step(self, batch, batch_idx): + x, y = batch + y_hat = self.backbone(x) + loss = self.cal_loss(y_hat, y) + self.log('test_loss', loss) + + def configure_optimizers(self): + #return 
torch.optim.Adam(self.parameters(), lr=0.0002) + opt = torch.optim.SGD(self.parameters(), lr = 0.1, momentum=0.9, weight_decay = 0.0005) + epoch_steps = [int(self.epoch*0.4), int(self.epoch*0.7), int(self.epoch*0.9)] + print('epoch_steps:', epoch_steps) + def lr_step_func(epoch): + return 0.1 ** len([m for m in epoch_steps if m <= epoch]) + scheduler = torch.optim.lr_scheduler.LambdaLR( + optimizer=opt, lr_lambda=lr_step_func) + lr_scheduler = { + 'scheduler': scheduler, + 'name': 'learning_rate', + 'interval':'epoch', + 'frequency': 1} + return [opt], [lr_scheduler] diff --git a/insightface/reconstruction/gaze/test_gaze.py b/insightface/reconstruction/gaze/test_gaze.py new file mode 100644 index 0000000000000000000000000000000000000000..a4ebc9fa36e2f27380640e9d58b4b53b389ae2c6 --- /dev/null +++ b/insightface/reconstruction/gaze/test_gaze.py @@ -0,0 +1,239 @@ + +from models import GazeModel +import sys +import glob +import torch +import os +import os.path as osp +import numpy as np +import cv2 +import os.path as osp +import insightface +from insightface.app import FaceAnalysis +from insightface.utils import face_align +import menpo.io as mio +from menpo.image import Image +from menpo.shape import PointCloud, TriMesh + + +def angles_from_vec(vec): + x, y, z = -vec[2], vec[1], -vec[0] + theta = np.arctan2(y, x) + phi = np.arctan2(np.sqrt(x**2 + y**2), z) - np.pi/2 + theta_x, theta_y = phi, theta + return theta_x, theta_y + +def vec_from_eye(eye, iris_lms_idx): + p_iris = eye[iris_lms_idx] - eye[:32].mean(axis=0) + vec = p_iris.mean(axis=0) + vec /= np.linalg.norm(vec, axis=0) + return vec + +def angles_and_vec_from_eye(eye, iris_lms_idx): + + vec = vec_from_eye(eye, iris_lms_idx) + theta_x, theta_y = angles_from_vec(vec) + return theta_x, theta_y, vec + +def vec_from_angles(rx, ry): + rx = np.deg2rad(rx) + ry = np.deg2rad(ry) + x1 = np.sin(np.pi/2 + rx) * np.cos(ry) + y1 = np.sin(np.pi/2 + rx) * np.sin(ry) + z1 = np.cos(np.pi/2 + rx) + x, y, z = -z1, y1, -x1 + vec = 
np.array([x, y, z]) + vec /= np.linalg.norm(vec, axis=0) + return vec + + +class GazeHandler(): + def __init__(self, ckpt_path, res_eyes_path='assets/eyes3d.pkl'): + R = 12.0 + eyes_mean = mio.import_pickle(res_eyes_path) + idxs481 = eyes_mean['mask481']['idxs'] + self.tri481 = eyes_mean['mask481']['trilist'] + self.iris_idx_481 = eyes_mean['mask481']['idxs_iris'] + + self.mean_l = eyes_mean['left_points'][idxs481][:, [0, 2, 1]] + self.mean_r = eyes_mean['right_points'][idxs481][:, [0, 2, 1]] + + self.num_face = 1103 + self.num_eye = 481 + self.app = FaceAnalysis() + det_size = 320 + self.app.prepare(ctx_id=0, det_size=(det_size, det_size)) + self.input_size = 160 + self.model = GazeModel.load_from_checkpoint(ckpt_path).cuda() + self.model.eval() + + + + def draw_item(self, eimg, item): + #bbox, kps, eye_kps = item + eye_kps = item + #eye_kps[:,2] *= 10.0 + eye_l = eye_kps[:self.num_eye,:] + eye_r = eye_kps[self.num_eye:,:] + for _eye in [eye_l, eye_r]: + tmp = _eye[:,0].copy() + _eye[:,0] = _eye[:,1].copy() + _eye[:,1] = tmp + #img_crp, trf_crp = image.crop_to_pointcloud_proportion(PointCloud(np.concatenate((eye_l[:, :2], eye_r[:, :2]))), 0.4, return_transform=True) + #img_crp.view(1) + #trf_crp.pseudoinverse().apply(TriMesh(eye_l, tri481).with_dims([0, 1])).view(1, marker_size=0.01, line_width=0.1) + #trf_crp.pseudoinverse().apply(PointCloud(eye_l[iris_idx_481]).with_dims([0, 1])).view(1) + #trf_crp.pseudoinverse().apply(TriMesh(eye_r, tri481).with_dims([0, 1])).view(1, marker_size=0.01, line_width=0.1) + #trf_crp.pseudoinverse().apply(PointCloud(eye_r[iris_idx_481]).with_dims([0, 1])).view(1) + + for _eye in [eye_l, eye_r]: + _kps = _eye[self.iris_idx_481,:].astype(np.int) + for l in range(_kps.shape[0]): + color = (0, 255, 0) + cv2.circle(eimg, (_kps[l][1], _kps[l][0]), 4, color, 4) + #print(tri481.shape) + for _tri in self.tri481: + color = (0, 0, 255) + for k in range(3): + ix = _tri[k] + iy = _tri[(k+1)%3] + x = _eye[ix,:2].astype(np.int)[::-1] + y = 
_eye[iy,:2].astype(np.int)[::-1] + cv2.line(eimg, x, y, color, 1) + + theta_x_l, theta_y_l, vec_l = angles_and_vec_from_eye(eye_l, self.iris_idx_481) + theta_x_r, theta_y_r, vec_r = angles_and_vec_from_eye(eye_r, self.iris_idx_481) + gaze_pred = np.array([(theta_x_l + theta_x_r) / 2, (theta_y_l + theta_y_r) / 2]) + + diag = np.sqrt(float(eimg.shape[0]*eimg.shape[1])) + + #img_crp, trf_crp = image.crop_to_pointcloud_proportion(PointCloud(lms[17:-20]), 0.1, return_transform=True) + #diag = img_crp.diagonal() + + eye_pos_left = eye_l[self.iris_idx_481].mean(axis=0)[[0, 1]] + eye_pos_right = eye_r[self.iris_idx_481].mean(axis=0)[[0, 1]] + + ##fig = plt.figure(0) + ##image.view(0, figure_size=(4,4)) + #PointCloud(eye_l[iris_idx_481]).with_dims([0, 1]).view(0, marker_size=3, figure_size=(4,4)) + #PointCloud(eye_r[iris_idx_481]).with_dims([0, 1]).view(0, marker_size=3, figure_size=(4,4)) + + ## pred --- + gaze_pred = np.array([theta_x_l, theta_y_l]) + dx = 0.4*diag * np.sin(gaze_pred[1]) + dy = 0.4*diag * np.sin(gaze_pred[0]) + x = np.array([eye_pos_left[1], eye_pos_left[0]]) + y = x.copy() + y[0] += dx + y[1] += dy + x = x.astype(np.int) + y = y.astype(np.int) + color = (0,255,255) + cv2.line(eimg, x, y, color, 2) + + gaze_pred = np.array([theta_x_r, theta_y_r]) + dx = 0.4*diag * np.sin(gaze_pred[1]) + dy = 0.4*diag * np.sin(gaze_pred[0]) + x = np.array([eye_pos_right[1], eye_pos_right[0]]) + y = x.copy() + y[0] += dx + y[1] += dy + x = x.astype(np.int) + y = y.astype(np.int) + color = (0,255,255) + cv2.line(eimg, x, y, color, 2) + return eimg + + def draw_on(self, eimg, results): + face_sizes = [ (x[0][2] - x[0][0]) for x in results] + max_index = np.argmax(face_sizes) + max_face_size = face_sizes[max_index] + rescale = 300.0 / max_face_size + #print(max_face_size, rescale) + oimg = eimg.copy() + eimg = cv2.resize(eimg, None, fx=rescale, fy=rescale) + for pred in results: + _, _, eye_kps = pred + eye_kps = eye_kps.copy() + eye_kps *= rescale + eimg = 
self.draw_item(eimg, eye_kps) + eimg = cv2.resize(eimg, (oimg.shape[1], oimg.shape[0])) + return eimg + pred_max = results[max_index] + bbox, kps, eye_kps = pred_max + width = bbox[2] - bbox[0] + center = (kps[0]+kps[1]) / 2.0 + #_size = np.abs(kps[1][0] - kps[0][0]) * 1.5 + _size = max(width/1.5, np.abs(kps[1][0] - kps[0][0]) ) * 1.5 + rotate = 0 + _scale = self.input_size / _size + aimg, M = face_align.transform(oimg, center, self.input_size, _scale, rotate) + eye_kps = face_align.trans_points(eye_kps, M) + center_eye_rescale = 4.0 + aimg = cv2.resize(aimg, None, fx=center_eye_rescale, fy=center_eye_rescale) + eye_kps *= center_eye_rescale + aimg = self.draw_item(aimg, eye_kps) + #return aimg + + rimg = np.zeros( (max(eimg.shape[0], aimg.shape[0]), eimg.shape[1]+aimg.shape[1], 3), dtype=np.uint8) + rimg[:eimg.shape[0], :eimg.shape[1], :] = eimg + rimg[:aimg.shape[0], eimg.shape[1]:eimg.shape[1]+aimg.shape[1], :] = aimg + return rimg + + def get(self, img): + results = [] + faces = self.app.get(img) + if len(faces)==0: + return results + for face in faces: + bbox = face.bbox + width = bbox[2] - bbox[0] + kps = face.kps + center = (kps[0]+kps[1]) / 2.0 + #_size = np.abs(kps[1][0] - kps[0][0]) * 1.5 + _size = max(width/1.5, np.abs(kps[1][0] - kps[0][0]) ) * 1.5 + rotate = 0 + _scale = self.input_size / _size + aimg, M = face_align.transform(img, center, self.input_size, _scale, rotate) + #eimg = cv2.resize(aimg, None, fx=R, fy=R) + #cv2.imwrite("outputs/a_%s"%name, aimg) + aimg = cv2.cvtColor(aimg, cv2.COLOR_BGR2RGB) + input = aimg.copy() + input = np.transpose(input, (2, 0, 1)) + input = np.expand_dims(input, 0) + imgs = torch.Tensor(input).cuda() + imgs.div_(255).sub_(0.5).div_(0.5) + opred = self.model(imgs).detach().cpu().numpy().flatten().reshape( (-1, 3) ) + opred[:, 0:2] += 1 + opred[:, 0:2] *= (self.input_size // 2) + #opred[:, 0:2] *= 112 + opred[:,2] *= 10.0 + IM = cv2.invertAffineTransform(M) + pred = face_align.trans_points(opred, IM) + result = (bbox, 
kps, pred) + results.append(result) + return results + + +if __name__ == '__main__': + ckpt_path = sys.argv[1] + handler = GazeHandler(ckpt_path) + output_dir = 'outputs/' + if not osp.exists(output_dir): + os.makedirs(output_dir) + input_dir = 'assets/images' + for imgname in os.listdir(input_dir): + imgpath = osp.join(input_dir, imgname) + img = cv2.imread(imgpath) + print(imgpath, imgname) + if img is None: + continue + results = handler.get(img) + if len(results)==0: + continue + eimg = handler.draw_on(img, results) + oimg = np.concatenate((img, eimg), axis=1) + cv2.imwrite(osp.join(output_dir, "%s"%imgname), oimg) + + + diff --git a/insightface/reconstruction/gaze/trainer_gaze.py b/insightface/reconstruction/gaze/trainer_gaze.py new file mode 100644 index 0000000000000000000000000000000000000000..69294a25651da7226b54109939d906893546f3cd --- /dev/null +++ b/insightface/reconstruction/gaze/trainer_gaze.py @@ -0,0 +1,89 @@ +from argparse import ArgumentParser + +import os +import os.path as osp +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.utils.data import DataLoader +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import LearningRateMonitor +from pytorch_lightning.loggers import TensorBoardLogger +import timm +from datasets.dataset_gaze import GazeDataset, DataLoaderX +from models import GazeModel + + + + + +def cli_main(): + pl.seed_everything(727) + + # ------------ + # args + # ------------ + parser = ArgumentParser() + parser.add_argument('--backbone', default='resnet101d', type=str) + parser.add_argument('--batch_size', default=64, type=int) + parser.add_argument('--epoch', default=16, type=int) + parser.add_argument('--root', default='data/gaze_refine', type=str) + parser.add_argument('--num-gpus', default=8, type=int) + parser.add_argument('--tf32', action='store_true') + parser = pl.Trainer.add_argparse_args(parser) + args = 
parser.parse_args() + + if not args.tf32: + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + else: + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + torch.backends.cudnn.benchmark = True + + # ------------ + # data + # ------------ + train_set = GazeDataset(root_dir=args.root, is_train=True) + val_set = GazeDataset(root_dir=args.root, is_train=False) + print('train data size:', len(train_set)) + + train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=2, pin_memory=True) + val_loader = DataLoader(val_set, batch_size=args.batch_size, shuffle=False) + + # ------------ + # model + # ------------ + model = GazeModel(backbone=args.backbone, epoch=args.epoch) + ckpt_path = 'work_dirs/gaze' + if not os.path.exists(ckpt_path): + os.makedirs(ckpt_path) + + # ------------ + # training + # ------------ + checkpoint_callback = ModelCheckpoint( + monitor='val_loss', + dirpath=ckpt_path, + filename='{epoch:02d}-{val_loss:.6f}', + save_top_k=5, + mode='min', + ) + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer( + gpus = args.num_gpus, + accelerator="gpu", + strategy="ddp", + benchmark=True, + logger=TensorBoardLogger(osp.join(ckpt_path, 'logs')), + callbacks=[checkpoint_callback, lr_monitor], + check_val_every_n_epoch=1, + #progress_bar_refresh_rate=1, + max_epochs=args.epoch, + ) + trainer.fit(model, train_loader, val_loader) + +if __name__ == '__main__': + cli_main() + diff --git a/insightface/reconstruction/jmlr/README.md b/insightface/reconstruction/jmlr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ffb5f48728831c09b80d57637722a289fb76221a --- /dev/null +++ b/insightface/reconstruction/jmlr/README.md @@ -0,0 +1,50 @@ +## Introduction + +JMLR is an efficient high accuracy face reconstruction approach which achieved [Rank-1st](https://tianchi.aliyun.com/competition/entrance/531961/rankingList) of 
+[Perspective Projection Based Monocular 3D Face Reconstruction Challenge](https://tianchi.aliyun.com/competition/entrance/531961/introduction) +of [ECCV-2022 WCPA Workshop](https://sites.google.com/view/wcpa2022). + +Paper in [arXiv](https://arxiv.org/abs/2208.07142). + + +## Method Pipeline + + +jmlr-pipeline + + +## Data preparation + +1. Download the dataset from WCPA organiser and put it at somewhere. + +2. Create `cache_align/` dir and put `flip_index.npy` file under it. + +3. Check `configs/s1.py` and fix the location to yours. + +4. Use ``python rec_builder.py`` to generate cached dataset, which will be used in following steps. + + +## Training + +``` +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -u -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=13334 train.py configs/s1.py +``` + +## Inference Example + +``` +python inference_simple.py +``` + +## Resources + +[flip_index.npy](https://drive.google.com/file/d/1fZ4cRyvQeehwKoMKKSmXUmTx5GEJwyrT/view?usp=sharing) + +[pretrained-model](https://drive.google.com/file/d/1qSpqDDLQfcPeFr2b82IZrK8QC_3lci3l/view?usp=sharing) + +[projection_matrix.txt](https://drive.google.com/file/d/1joiu-V0qEZxil_AHcg_W726nRxE8Q4dm/view?usp=sharing) + +## Results + +jmlr-id + diff --git a/insightface/reconstruction/jmlr/augs.py b/insightface/reconstruction/jmlr/augs.py new file mode 100644 index 0000000000000000000000000000000000000000..2977afbc1929002f69128db1e2077c0ae715023f --- /dev/null +++ b/insightface/reconstruction/jmlr/augs.py @@ -0,0 +1,271 @@ +import numpy as np +import cv2 +import os +import os.path as osp +import albumentations as A +from albumentations.core.transforms_interface import ImageOnlyTransform +from albumentations.pytorch import ToTensorV2 + +class RectangleBorderAugmentation(ImageOnlyTransform): + + def __init__( + self, + fill_value = 0, + fg_limit = (0.7, 0.9), + always_apply=False, + p=1.0, + ): + super(RectangleBorderAugmentation, 
self).__init__(always_apply, p) + #assert limit>0.0 and limit<1.0 + assert isinstance(fg_limit, tuple) + assert fg_limit[1]>fg_limit[0] + self.fill_value = 0 + self.fg_limit = fg_limit + #self.output_size = output_size + + + def apply(self, image, fg, top, left, **params): + assert image.shape[0]==image.shape[1] + oimage = np.ones_like(image) * self.fill_value + f = int(fg*image.shape[0]) + t = int(top*image.shape[0]) + l = int(left*image.shape[1]) + oimage[t:t+f,l:l+f,:] = image[t:t+f,l:l+f,:] + return oimage + + def get_params(self): + fg = np.random.uniform(self.fg_limit[0], self.fg_limit[1]) + top = np.random.uniform(0.0, 1.0-fg) + left = np.random.uniform(0.0, 1.0-fg) + return {'fg': fg, 'top': top, 'left': left} + + def get_transform_init_args_names(self): + return ('fill_value','fg_limit') + +class SunGlassAugmentation(ImageOnlyTransform): + + def __init__( + self, + fill_value = 0, + loc = [ (38, 52), (73, 52) ], + rad_limit = (10, 20), + always_apply=False, + p=1.0, + ): + super(SunGlassAugmentation, self).__init__(always_apply, p) + #assert limit>0.0 and limit<1.0 + assert isinstance(rad_limit, tuple) + self.fill_value = 0 + self.loc = loc + self.rad_limit = rad_limit + + + def apply(self, image, rad, **params): + for i in range(2): + cv2.circle(image, self.loc[i], rad, self.fill_value, -1) + return image + + def get_params(self): + rad = np.random.randint(self.rad_limit[0], self.rad_limit[1]) + return {'rad':rad} + + def get_transform_init_args_names(self): + return ('fill_value', 'loc', 'rad_limit') + +class ForeHeadAugmentation(ImageOnlyTransform): + + def __init__( + self, + height_min = 0.2, + height_max = 0.4, + width_min = 0.5, + always_apply=False, + p=1.0, + ): + super(ForeHeadAugmentation, self).__init__(always_apply, p) + assert height_max > height_min + #assert limit>0.0 and limit<1.0 + self.height_min = height_min + self.height_max = height_max + self.width_min = width_min + + + def apply(self, image, height, width, left, **params): + 
mask_value = np.random.randint(0, 255, size=(int(image.shape[0]*height), int(image.shape[1]*width), 3), dtype=image.dtype) + l = int(image.shape[1]*left) + image[:mask_value.shape[0], l:l+mask_value.shape[1], :] = mask_value + return image + + def get_params(self): + height = np.random.uniform(self.height_min, self.height_max) + width = np.random.uniform(self.width_min, 1.0) + left = np.random.uniform(0.0, 1.0 - width) + return {'height': height, 'width': width, 'left': left} + + def get_transform_init_args_names(self): + return ('height_min', 'height_max','width_min') + + +def get_aug_transform(cfg): + aug_modes = cfg.aug_modes + input_size = cfg.input_size + task = cfg.task + transform_list = [] + is_test = False + if 'test-aug' in aug_modes: + #transform_list.append( + # A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.05, p=0.2) + # ) + transform_list.append( + A.ShiftScaleRotate(shift_limit=0.02, scale_limit=0.05, rotate_limit=5, interpolation=cv2.INTER_LINEAR, + border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0, p=1.0, always_apply=True) + ) + is_test = True + + if '1' in aug_modes: + transform_list.append( + A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.05, p=0.2) + ) + if '1A' in aug_modes: + transform_list.append( + A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.05, p=0.2) + ) + transform_list.append( + A.ShiftScaleRotate(shift_limit=0.02, scale_limit=0.03, rotate_limit=6, interpolation=cv2.INTER_LINEAR, + border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0, p=0.3) + ) + if '2' in aug_modes: + transform_list.append( + A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.05, p=0.2) + ) + transform_list.append( + A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15, interpolation=cv2.INTER_LINEAR, + border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0, p=0.4) + ) + if '3' in aug_modes: + transform_list.append( + 
A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.05, p=0.6) + ) + transform_list.append( + A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=30, interpolation=cv2.INTER_LINEAR, + border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0, p=0.6) + ) + if 'nist1' in aug_modes: + transform_list.append( + A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.05, p=0.2) + ) + transform_list.append( + A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.06, rotate_limit=6, interpolation=cv2.INTER_LINEAR, + border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0, p=0.4) + ) + if 'nist2' in aug_modes: + transform_list.append( + #A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.02, p=0.3) + A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.05, p=0.2) + ) + transform_list.append( + A.ShiftScaleRotate(shift_limit=0.06, scale_limit=0.06, rotate_limit=6, interpolation=cv2.INTER_LINEAR, + border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0, p=0.4) + ) + transform_list.append( + A.OneOf([ + RectangleBorderAugmentation(p=0.5), + ForeHeadAugmentation(p=0.5), + #SunGlassAugmentation(p=0.2), + ], p=0.06) + ) + transform_list.append( + A.ToGray(p=0.05) + ) + transform_list.append( + A.geometric.resize.RandomScale(scale_limit=(0.7, 0.9), interpolation=cv2.INTER_LINEAR, p=0.05) + ) + transform_list.append( + A.ISONoise(p=0.06) + ) + transform_list.append( + A.MedianBlur(blur_limit=(1,7), p=0.05) + ) + transform_list.append( + A.MotionBlur(blur_limit=(5,12), p=0.05) + ) + transform_list.append( + A.ImageCompression(quality_lower=50, quality_upper=80, p=0.05) + ) + if 'prod' in aug_modes: + transform_list.append( + #A.RandomBrightnessContrast(brightness_limit=0.125, contrast_limit=0.125, p=0.2) + A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.02, p=0.3) + ) + transform_list.append( + A.ShiftScaleRotate(shift_limit=0.06, scale_limit=0.1, rotate_limit=10, interpolation=cv2.INTER_LINEAR, + 
border_mode=cv2.BORDER_CONSTANT, value=0, mask_value=0, p=0.4) + ) + transform_list.append( + A.OneOf([ + RectangleBorderAugmentation(p=0.5), + ForeHeadAugmentation(p=0.5), + MaskAugmentation(mask_names=['mask_white', 'mask_blue', 'mask_black', 'mask_green'], mask_probs=[0.4, 0.4, 0.1, 0.1], h_low=0.33, h_high=0.4, p=0.2), + SunGlassAugmentation(p=0.2), + ], p=0.2) + ) + transform_list.append( + A.ToGray(p=0.05) + ) + transform_list.append( + A.geometric.resize.RandomScale(scale_limit=(0.6, 0.9), interpolation=cv2.INTER_LINEAR, p=0.2) + ) + transform_list.append( + A.ISONoise(p=0.1) + ) + transform_list.append( + A.MedianBlur(blur_limit=(1,7), p=0.1) + ) + transform_list.append( + A.MotionBlur(blur_limit=(5,12), p=0.1) + ) + transform_list.append( + A.ImageCompression(quality_lower=30, quality_upper=80, p=0.1) + ) + #if input_size!=112: # TODO!! + # transform_list.append( + # A.geometric.resize.Resize(input_size, input_size, interpolation=cv2.INTER_LINEAR, always_apply=True) + # ) + transform_list += \ + [ + #A.HorizontalFlip(p=0.5), + A.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ToTensorV2(), + ] + #here, the input for A transform is rgb cv2 img + if is_test: + transform = A.ReplayCompose( + transform_list , + keypoint_params=A.KeypointParams(format='xy',remove_invisible=False) + ) + else: + transform = A.Compose( + transform_list, + keypoint_params=A.KeypointParams(format='xy',remove_invisible=False) + ) + return transform + + +if __name__ == "__main__": + tool = MaskRenderer() + tool.prepare(ctx_id=0, det_size=(128,128)) + image = cv2.imread("./test1.png")[:,:,::-1] + mask_image = "mask_blue" + #params = tool.build_params(image) + label = np.load('assets/mask_label.npy') + params = tool.decode_params(label) + #print(params[0][:20]) + mask_out = tool.render_mask(image, mask_image, params, input_is_rgb=True, auto_blend=False)[:,:,::-1] + #print(uv_out.dtype, uv_out.shape) + cv2.imwrite('output_mask.jpg', mask_out) + transform = A.Compose([ + 
MaskAugmentation(mask_names=['mask_white', 'mask_blue', 'mask_black', 'mask_green'], mask_probs=[0.4, 0.4, 0.1, 0.1], h_low=0.33, h_high=0.4, p=1.0), + #MaskAugmentation(p=1.0), + ]) + mask_out = transform(image=image, hlabel=label)["image"][:,:,::-1] + cv2.imwrite('output_mask2.jpg', mask_out) diff --git a/insightface/reconstruction/jmlr/backbones/__init__.py b/insightface/reconstruction/jmlr/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f46a9da2d95bf6b8d8bea9bc529452e2e2b56e5 --- /dev/null +++ b/insightface/reconstruction/jmlr/backbones/__init__.py @@ -0,0 +1 @@ +from .network import get_network diff --git a/insightface/reconstruction/jmlr/backbones/iresnet.py b/insightface/reconstruction/jmlr/backbones/iresnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2103c1e64329859401f08d85716be5f1182c0c32 --- /dev/null +++ b/insightface/reconstruction/jmlr/backbones/iresnet.py @@ -0,0 +1,326 @@ +import torch +from torch import nn +import torch.nn.functional as F +import logging + +__all__ = ['iresnet18', 'iresnet34', 'iresnet50', 'iresnet100', 'iresnet200'] + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, + out_planes, + kernel_size=1, + stride=stride, + bias=False) + + +class IBasicBlock(nn.Module): + expansion = 1 + def __init__(self, inplanes, planes, stride=1, downsample=None, + groups=1, base_width=64, dilation=1, eps=1e-5, dropblock=0.0): + super(IBasicBlock, self).__init__() + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.bn1 
= nn.BatchNorm2d(inplanes, eps=eps) + self.conv1 = conv3x3(inplanes, planes) + self.bn2 = nn.BatchNorm2d(planes, eps=eps) + self.prelu = nn.PReLU(planes) + self.conv2 = conv3x3(planes, planes, stride) + self.bn3 = nn.BatchNorm2d(planes, eps=eps) + self.downsample = downsample + self.stride = stride + self.dbs = None + if dropblock>0.0: + import timm + from timm.layers import DropBlock2d + self.dbs = [DropBlock2d(dropblock, 7), DropBlock2d(dropblock, 7), DropBlock2d(dropblock, 7)] + + def forward(self, x): + identity = x + out = self.bn1(x) + out = self.conv1(out) + if self.dbs is not None: + out = self.dbs[0](out) + out = self.bn2(out) + out = self.prelu(out) + out = self.conv2(out) + if self.dbs is not None: + out = self.dbs[1](out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + if self.dbs is not None: + out = self.dbs[2](out) + return out + + +class IResNet(nn.Module): + def __init__(self, + block, layers, dropout=0.0, num_features=512, input_size=112, zero_init_residual=False, + stem_type='', dropblock = 0.0, kaiming_init=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, fp16=0): + super(IResNet, self).__init__() + self.input_size = input_size + assert self.input_size%16==0 + fc_scale = self.input_size // 16 + self.fc_scale = fc_scale*fc_scale + self.fp16 = fp16 + self.inplanes = 64 + self.dilation = 1 + self.norm_layer = nn.BatchNorm2d + self.act_layer = nn.PReLU + self.eps = 1e-5 + if kaiming_init: + self.eps = 2e-5 + self.stem_type = stem_type + self.dropblock = dropblock + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + if stem_type!='D': + self.conv1 = nn.Conv2d(3, self.inplanes, 
kernel_size=3, stride=1, padding=1, bias=False) + else: + stem_width = self.inplanes // 2 + stem_chs = [stem_width, stem_width] + self.conv1 = nn.Sequential(*[ + nn.Conv2d(3, stem_chs[0], 3, stride=1, padding=1, bias=False), + self.norm_layer(stem_chs[0], eps=self.eps), + self.act_layer(stem_chs[0]), + nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False), + self.norm_layer(stem_chs[1], eps=self.eps), + self.act_layer(stem_chs[1]), + nn.Conv2d(stem_chs[1], self.inplanes, 3, stride=1, padding=1, bias=False)]) + logging.info("iresnet, input_size: %d, fc_scale: %d, dropout: %.2f, stem_type: %s, fp16: %d"%(self.input_size, self.fc_scale, dropout, stem_type, self.fp16)) + logging.info("iresnet, eps: %.6f, dropblock: %.3f, kaiming_init: %d"%(self.eps, self.dropblock, kaiming_init)) + #self.conv1.requires_grad = False + self.bn1 = nn.BatchNorm2d(self.inplanes, eps=self.eps) + #self.bn1.requires_grad = False + self.prelu = nn.PReLU(self.inplanes) + #self.prelu.requires_grad = False + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + #self.layer1.requires_grad = False + self.layer2 = self._make_layer(block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + #self.layer2.requires_grad = False + self.layer3 = self._make_layer(block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1], + dropblock=self.dropblock) + #self.layer3.requires_grad = False + self.layer4 = self._make_layer(block, + 512, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2], + dropblock=self.dropblock) + #self.layer4.requires_grad = False + self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=self.eps) + #self.bn2.requires_grad = False + if dropout>0.0: + self.dropout = nn.Dropout(p=dropout, inplace=True) + else: + self.dropout = None + self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features) + #self.fc.requires_grad = False + self.features = nn.BatchNorm1d(num_features, eps=self.eps) + 
nn.init.constant_(self.features.weight, 1.0) + self.features.weight.requires_grad = False + + #for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # nn.init.xavier_uniform_(m.weight.data) + # if m.bias is not None: + # m.bias.data.zero_() + # elif isinstance(m, nn.BatchNorm2d): + # m.weight.data.fill_(1) + # m.bias.data.zero_() + # elif isinstance(m, nn.BatchNorm1d): + # m.weight.data.fill_(1) + # m.bias.data.zero_() + # elif isinstance(m, nn.Linear): + # nn.init.xavier_uniform_(m.weight.data) + # if m.bias is not None: + # m.bias.data.zero_() + #nn.init.constant_(self.features.weight, 1.0) + #self.features.weight.requires_grad = False + + #for m in self.modules(): + # if kaiming_init: + # if isinstance(m, (nn.Conv2d, nn.Linear)): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # if m.bias is not None: + # nn.init.constant_(m.bias, 0) + # else: + # if isinstance(m, (nn.Conv2d, nn.Linear)): + # nn.init.normal_(m.weight, 0, 0.1) + # if m.bias is not None: + # nn.init.constant_(m.bias, 0) + # if isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + # nn.init.constant_(m.weight, 1) + # nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False, dropblock=0.0): + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + if self.stem_type!='D': + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.BatchNorm2d(planes * block.expansion, eps=self.eps), + ) + else: + #avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d + avg_stride = stride + pool = nn.AvgPool2d(2, avg_stride, ceil_mode=True, count_include_pad=False) + downsample = nn.Sequential(*[ + pool, + conv1x1(self.inplanes, planes * block.expansion, stride=1), + nn.BatchNorm2d(planes * block.expansion, eps=self.eps), + ]) + layers = [] + layers.append( + 
block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, self.eps, dropblock)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block(self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + eps=self.eps, + dropblock=dropblock)) + + return nn.Sequential(*layers) + + def forward(self, x): + #if self.input_size!=112: + # x = F.interpolate(x, [self.input_size, self.input_size], mode='bilinear', align_corners=False) + is_fp16 = self.fp16>0 + with torch.cuda.amp.autocast(is_fp16): + x = self.conv1(x) + x = self.bn1(x) + x = self.prelu(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + if self.fp16<3: + x = self.layer4(x) + x = self.bn2(x) + x = torch.flatten(x, 1) + if self.dropout is not None: + x = self.dropout(x) + if is_fp16: + x = x.float() + if self.fp16>=3: + x = self.layer4(x) + x = self.bn2(x) + x = torch.flatten(x, 1) + if self.dropout is not None: + x = self.dropout(x) + x = self.fc(x) + x = self.features(x) + return x + + +def _iresnet(arch, block, layers, pretrained, progress, **kwargs): + model = IResNet(block, layers, **kwargs) + if pretrained: + raise ValueError() + return model + + +def iresnet18(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet18', IBasicBlock, [2, 2, 2, 2], pretrained, + progress, **kwargs) + + +def iresnet34(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet34', IBasicBlock, [3, 4, 6, 3], pretrained, + progress, **kwargs) + + +def iresnet50(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet50', IBasicBlock, [3, 4, 14, 3], pretrained, + progress, **kwargs) + + +def iresnet100(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet100', IBasicBlock, [3, 13, 30, 3], pretrained, + progress, **kwargs) + +def iresnet120(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet120', IBasicBlock, [3, 16, 
37, 3], pretrained, + progress, **kwargs) + +def iresnet160(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet160', IBasicBlock, [3, 16, 56, 3], pretrained, + progress, **kwargs) + +def iresnet180(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet180', IBasicBlock, [3, 20, 63, 3], pretrained, + progress, **kwargs) + +def iresnet200(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet200', IBasicBlock, [6, 26, 60, 6], pretrained, + progress, **kwargs) + +def iresnet247(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet247', IBasicBlock, [3, 36, 80, 4], pretrained, + progress, **kwargs) + +def iresnet269(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet269', IBasicBlock, [4, 46, 80, 4], pretrained, + progress, **kwargs) + +def iresnet300(pretrained=False, progress=True, **kwargs): + return _iresnet('iresnet300', IBasicBlock, [4, 46, 95, 4], pretrained, + progress, **kwargs) + + +def get_model(name, **kwargs): + if name == "r18": + return iresnet18(False, **kwargs) + elif name == "r34": + return iresnet34(False, **kwargs) + elif name == "r50": + return iresnet50(False, **kwargs) + elif name == "r100": + return iresnet100(False, **kwargs) + elif name == "r200": + return iresnet200(False, **kwargs) + elif name == "r2060": + from .iresnet2060 import iresnet2060 + return iresnet2060(False, **kwargs) + else: + raise ValueError() + diff --git a/insightface/reconstruction/jmlr/backbones/network.py b/insightface/reconstruction/jmlr/backbones/network.py new file mode 100644 index 0000000000000000000000000000000000000000..4f438452f05f2de1e3a92aace38de6cfe53ec0a9 --- /dev/null +++ b/insightface/reconstruction/jmlr/backbones/network.py @@ -0,0 +1,260 @@ +import os +import time +import timm +import glob +import numpy as np +import os.path as osp + +import torch +import torch.distributed as dist +from torch import nn +import torch.nn.functional as F +from .iresnet import get_model as 
arcface_get_model + + +def kaiming_leaky_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + torch.nn.init.kaiming_normal_(m.weight, a=0.2, mode='fan_in', nonlinearity='leaky_relu') + +class CustomMappingNetwork(nn.Module): + def __init__(self, z_dim, map_hidden_dim, map_output_dim): + super().__init__() + + + + self.network = nn.Sequential(nn.Linear(z_dim, map_hidden_dim), + nn.LeakyReLU(0.2, inplace=True), + + nn.Linear(map_hidden_dim, map_hidden_dim), + nn.LeakyReLU(0.2, inplace=True), + + nn.Linear(map_hidden_dim, map_hidden_dim), + nn.LeakyReLU(0.2, inplace=True), + + nn.Linear(map_hidden_dim, map_output_dim)) + + self.network.apply(kaiming_leaky_init) + with torch.no_grad(): + self.network[-1].weight *= 0.25 + + def forward(self, z): + frequencies_offsets = self.network(z) + frequencies = frequencies_offsets[..., :frequencies_offsets.shape[-1]//2] + phase_shifts = frequencies_offsets[..., frequencies_offsets.shape[-1]//2:] + + return frequencies, phase_shifts + +class FiLMLayer(nn.Module): + def __init__(self, input_dim, hidden_dim): + super().__init__() + self.layer = nn.Linear(input_dim, hidden_dim) + + def forward(self, x, freq, phase_shift): + x = self.layer(x) + return torch.sin(freq * x + phase_shift) + +class InstanceNorm(nn.Module): + def __init__(self, epsilon=1e-8): + """ + @notice: avoid in-place ops. 
+ https://discuss.pytorch.org/t/encounter-the-runtimeerror-one-of-the-variables-needed-for-gradient-computation-has-been-modified-by-an-inplace-operation/836/3 + """ + super(InstanceNorm, self).__init__() + self.epsilon = epsilon + + def forward(self, x): + x = x - torch.mean(x, (2, 3), True) + tmp = torch.mul(x, x) # or x ** 2 + tmp = torch.rsqrt(torch.mean(tmp, (2, 3), True) + self.epsilon) + return x * tmp + +class ApplyStyle(nn.Module): + def __init__(self, latent_size, channels): + super(ApplyStyle, self).__init__() + self.linear = nn.Linear(latent_size, channels * 2) + + def forward(self, x, latent): + style = self.linear(latent).unsqueeze(2).unsqueeze(3) #B, 2*c, 1, 1 + gamma, beta = style.chunk(2, 1) + x = gamma * x + beta + return x + +class ResnetBlock_Adain(nn.Module): + def __init__(self, dim, latent_size, padding_type='reflect', activation=nn.ReLU(True)): + super(ResnetBlock_Adain, self).__init__() + + p = 0 + conv1 = [] + if padding_type == 'reflect': + conv1 += [nn.ReflectionPad2d(1)] + elif padding_type == 'replicate': + conv1 += [nn.ReplicationPad2d(1)] + elif padding_type == 'zero': + p = 1 + else: + raise NotImplementedError('padding [%s] is not implemented' % padding_type) + conv1 += [nn.Conv2d(dim, dim, kernel_size=3, padding = p), InstanceNorm()] + self.conv1 = nn.Sequential(*conv1) + self.style1 = ApplyStyle(latent_size, dim) + self.act1 = activation + + p = 0 + conv2 = [] + if padding_type == 'reflect': + conv2 += [nn.ReflectionPad2d(1)] + elif padding_type == 'replicate': + conv2 += [nn.ReplicationPad2d(1)] + elif padding_type == 'zero': + p = 1 + else: + raise NotImplementedError('padding [%s] is not implemented' % padding_type) + conv2 += [nn.Conv2d(dim, dim, kernel_size=3, padding=p), InstanceNorm()] + self.conv2 = nn.Sequential(*conv2) + self.style2 = ApplyStyle(latent_size, dim) + + + def forward(self, x, dlatents_in_slice): + y = self.conv1(x) + y = self.style1(y, dlatents_in_slice) + y = self.act1(y) + y = self.conv2(y) + y = 
self.style2(y, dlatents_in_slice) + out = x + y + return out + +class OneNetwork(nn.Module): + def __init__(self, cfg): + super(OneNetwork, self).__init__() + self.num_verts = cfg.num_verts + self.input_size = cfg.input_size + self.use_eyes = cfg.eyes is not None + kwargs = {} + num_classes = self.num_verts*5 + if cfg.task==1: + num_classes = self.num_verts*3 + elif cfg.task==2: + num_classes = 6 + elif cfg.task==3: + num_classes = self.num_verts*2 + eye_num_classes = 481*2*5 + #if use_eyes: + # num_classes += 481*2*5 + if cfg.network.startswith('resnet'): + kwargs['base_width'] = int(64*cfg.width_mult) + p_num_classes = num_classes + if cfg.no_gap: + p_num_classes = 0 + kwargs['global_pool'] = None + elif cfg.use_arcface: + p_num_classes = 0 + kwargs['global_pool'] = None + elif self.use_eyes: + p_num_classes = 0 + #kwargs['global_pool'] = None + if cfg.network=='resnet_jmlr': + from .resnet import resnet_jmlr + self.net = resnet_jmlr(num_classes = p_num_classes, **kwargs) + if self.use_eyes: + input_dim = 512 #resnet34d + self.nete = resnet_jmlr(num_classes = p_num_classes, **kwargs) + self.fc = nn.Linear(input_dim*2, num_classes+eye_num_classes) + else: + self.net = timm.create_model(cfg.network, num_classes = p_num_classes, **kwargs) + + if cfg.no_gap: + in_channel = self.net.num_features + feat_hw = (self.input_size//32)**2 + mid_channel = 128 + self.no_gap_output = nn.Sequential(*[ + nn.BatchNorm2d(in_channel), + nn.Conv2d(in_channel, mid_channel, 1, stride=1, padding=0, bias=False), + nn.ReLU(), + nn.Flatten(1), + nn.Linear(mid_channel*feat_hw, num_classes)]) + + self.no_gap = cfg.no_gap + self.use_arcface = cfg.use_arcface + if self.use_arcface: + self.neta = arcface_get_model(cfg.arcface_model, input_size=cfg.arcface_input_size) + self.neta.load_state_dict(torch.load(cfg.arcface_ckpt, map_location=torch.device('cpu'))) + self.neta.eval() + self.neta.requires_grad_(False) + input_dim = 512 #resnet34d + z_dim = 512 #arcface_dim + hidden_dim = 256 + self.pool 
= nn.AdaptiveAvgPool2d(1) + self.flatten = nn.Flatten(1) + mlp_act = nn.LeakyReLU + + self.mlp = nn.Sequential(*[ + nn.Linear(z_dim, hidden_dim), + mlp_act(), + nn.Linear(hidden_dim, hidden_dim), + mlp_act(), + nn.Linear(hidden_dim, hidden_dim), + mlp_act(), + nn.Linear(hidden_dim, hidden_dim), + mlp_act(), + nn.Linear(hidden_dim, hidden_dim), + mlp_act(), + nn.Linear(hidden_dim, hidden_dim), + mlp_act(), + nn.Linear(hidden_dim, input_dim), + ]) + style_blocks = [] + for i in range(3): + style_blocks += [ResnetBlock_Adain(input_dim, latent_size=z_dim)] + self.style_blocks = nn.Sequential(*style_blocks) + self.branch2d = nn.Sequential(*[ + nn.Conv2d(input_dim, input_dim, 3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(input_dim), + nn.ReLU(), + ]) + self.fc = nn.Linear(input_dim*2, num_classes) + + + def forward(self, x): + if self.use_arcface: + conv_feat = self.net.forward_features(x) + #input = self.flatten(self.pool(conv_feat)) + xa = F.interpolate(x, [144, 144], mode='bilinear', align_corners=False) + xa = xa[:,:,8:120,16:128] + z = self.neta(xa) + z = self.mlp(z) + + c = conv_feat + for i in range(len(self.style_blocks)): + c = self.style_blocks[i](c, z) + feat3 = c + feat2 = self.branch2d(conv_feat) + conv_feat = torch.cat([feat3, feat2], dim=1) + feat = self.flatten(self.pool(conv_feat)) + pred = self.fc(feat) + + elif self.no_gap: + y = self.net.forward_features(x) + pred = self.no_gap_output(y) + else: + pred = self.net(x) + if self.use_eyes: + eye_w1 = x.shape[3]//8 + eye_w2 = x.shape[3] - eye_w1 + hstep = x.shape[2]//8 + eye_h1 = hstep*2 + eye_h2 = hstep*4 + x_eye = x[:,:,eye_h1:eye_h2,eye_w1:eye_w2] + feate = self.nete(x_eye) + #print(pred.shape, feate.shape) + feat = torch.cat((pred,feate), 1) + pred = self.fc(feat) + return pred + +def get_network(cfg): + if cfg.use_onenetwork: + net = OneNetwork(cfg) + else: + net = timm.create_model(cfg.network, num_classes = 1220*5) + return net + + diff --git a/insightface/reconstruction/jmlr/backbones/resnet.py 
b/insightface/reconstruction/jmlr/backbones/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..6dc9b9faf9da4c45844f5ba377447e3fc7e23cc9 --- /dev/null +++ b/insightface/reconstruction/jmlr/backbones/resnet.py @@ -0,0 +1,453 @@ +"""PyTorch ResNet + +This started as a copy of https://github.com/pytorch/vision 'resnet.py' (BSD-3-Clause) with +additional dropout and dynamic global avg/max pool. + +ResNeXt, SE-ResNeXt, SENet, and MXNet Gluon stem/downsample variants, tiered stems added by Ross Wightman + +Copyright 2019, Ross Wightman +""" +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.models.helpers import build_model_with_cfg +from timm.models.layers import DropBlock2d, DropPath, AvgPool2dSame, BlurPool2d, GroupNorm, create_attn, get_attn, create_classifier + + + +def get_padding(kernel_size, stride, dilation=1): + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +def create_aa(aa_layer, channels, stride=2, enable=True): + if not aa_layer or not enable: + return nn.Identity() + return aa_layer(stride) if issubclass(aa_layer, nn.AvgPool2d) else aa_layer(channels=channels, stride=stride) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__( + self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, + reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): + super(BasicBlock, self).__init__() + + assert cardinality == 1, 'BasicBlock only supports cardinality of 1' + assert base_width == 64, 'BasicBlock does not support changing base width' + first_planes = planes // reduce_first + outplanes = planes * self.expansion + first_dilation = first_dilation or dilation + use_aa = aa_layer is not None and (stride == 2 or 
first_dilation != dilation) + + self.conv1 = nn.Conv2d( + inplanes, first_planes, kernel_size=3, stride=1 if use_aa else stride, padding=first_dilation, + dilation=first_dilation, bias=False) + self.bn1 = norm_layer(first_planes) + self.drop_block = drop_block() if drop_block is not None else nn.Identity() + self.act1 = act_layer(inplace=True) + self.aa = create_aa(aa_layer, channels=first_planes, stride=stride, enable=use_aa) + + self.conv2 = nn.Conv2d( + first_planes, outplanes, kernel_size=3, padding=dilation, dilation=dilation, bias=False) + self.bn2 = norm_layer(outplanes) + + self.se = create_attn(attn_layer, outplanes) + + self.act2 = act_layer(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.drop_path = drop_path + + def zero_init_last(self): + nn.init.zeros_(self.bn2.weight) + + def forward(self, x): + shortcut = x + + x = self.conv1(x) + x = self.bn1(x) + x = self.drop_block(x) + x = self.act1(x) + x = self.aa(x) + + x = self.conv2(x) + x = self.bn2(x) + + if self.se is not None: + x = self.se(x) + + if self.drop_path is not None: + x = self.drop_path(x) + + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act2(x) + + return x + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__( + self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, + reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): + super(Bottleneck, self).__init__() + + width = int(math.floor(planes * (base_width / 64)) * cardinality) + first_planes = width // reduce_first + outplanes = planes * self.expansion + first_dilation = first_dilation or dilation + use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation) + + self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False) + self.bn1 = norm_layer(first_planes) 
+ self.act1 = act_layer(inplace=True) + + self.conv2 = nn.Conv2d( + first_planes, width, kernel_size=3, stride=1 if use_aa else stride, + padding=first_dilation, dilation=first_dilation, groups=cardinality, bias=False) + self.bn2 = norm_layer(width) + self.drop_block = drop_block() if drop_block is not None else nn.Identity() + self.act2 = act_layer(inplace=True) + self.aa = create_aa(aa_layer, channels=width, stride=stride, enable=use_aa) + + self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False) + self.bn3 = norm_layer(outplanes) + + self.se = create_attn(attn_layer, outplanes) + + self.act3 = act_layer(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.drop_path = drop_path + + def zero_init_last(self): + nn.init.zeros_(self.bn3.weight) + + def forward(self, x): + shortcut = x + + x = self.conv1(x) + x = self.bn1(x) + x = self.act1(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.drop_block(x) + x = self.act2(x) + x = self.aa(x) + + x = self.conv3(x) + x = self.bn3(x) + + if self.se is not None: + x = self.se(x) + + if self.drop_path is not None: + x = self.drop_path(x) + + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act3(x) + + return x + + +def downsample_conv( + in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): + norm_layer = norm_layer or nn.BatchNorm2d + kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size + first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1 + p = get_padding(kernel_size, stride, first_dilation) + + return nn.Sequential(*[ + nn.Conv2d( + in_channels, out_channels, kernel_size, stride=stride, padding=p, dilation=first_dilation, bias=False), + norm_layer(out_channels) + ]) + + +def downsample_avg( + in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): + norm_layer = norm_layer or 
nn.BatchNorm2d + avg_stride = stride if dilation == 1 else 1 + if stride == 1 and dilation == 1: + pool = nn.Identity() + else: + avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d + pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) + + return nn.Sequential(*[ + pool, + nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False), + norm_layer(out_channels) + ]) + + +def drop_blocks(drop_prob=0.): + return [ + None, None, + partial(DropBlock2d, drop_prob=drop_prob, block_size=5, gamma_scale=0.25) if drop_prob else None, + partial(DropBlock2d, drop_prob=drop_prob, block_size=3, gamma_scale=1.00) if drop_prob else None] + + +def make_blocks( + block_fn, channels, block_repeats, inplanes, reduce_first=1, output_stride=32, + down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs): + stages = [] + feature_info = [] + net_num_blocks = sum(block_repeats) + net_block_idx = 0 + net_stride = 4 + dilation = prev_dilation = 1 + for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))): + stage_name = f'layer{stage_idx + 1}' # never liked this name, but weight compat requires it + stride = 1 if stage_idx == 0 else 2 + if net_stride >= output_stride: + dilation *= stride + stride = 1 + else: + net_stride *= stride + + downsample = None + if stride != 1 or inplanes != planes * block_fn.expansion: + down_kwargs = dict( + in_channels=inplanes, out_channels=planes * block_fn.expansion, kernel_size=down_kernel_size, + stride=stride, dilation=dilation, first_dilation=prev_dilation, norm_layer=kwargs.get('norm_layer')) + downsample = downsample_avg(**down_kwargs) if avg_down else downsample_conv(**down_kwargs) + + block_kwargs = dict(reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs) + blocks = [] + for block_idx in range(num_blocks): + downsample = downsample if block_idx == 0 else None + stride = stride if 
block_idx == 0 else 1 + block_dpr = drop_path_rate * net_block_idx / (net_num_blocks - 1) # stochastic depth linear decay rule + blocks.append(block_fn( + inplanes, planes, stride, downsample, first_dilation=prev_dilation, + drop_path=DropPath(block_dpr) if block_dpr > 0. else None, **block_kwargs)) + prev_dilation = dilation + inplanes = planes * block_fn.expansion + net_block_idx += 1 + + stages.append((stage_name, nn.Sequential(*blocks))) + feature_info.append(dict(num_chs=inplanes, reduction=net_stride, module=stage_name)) + + return stages, feature_info + + +class ResNet(nn.Module): + """ResNet / ResNeXt / SE-ResNeXt / SE-Net + + This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that + * have > 1 stride in the 3x3 conv layer of bottleneck + * have conv-bn-act ordering + + This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s + variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the + 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default. 
+ + ResNet variants (the same modifications can be used in SE/ResNeXt models as well): + * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b + * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64) + * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample + * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample + * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128) + * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample + * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample + + ResNeXt + * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths + * same c,d, e, s variants as ResNet can be enabled + + SE-ResNeXt + * normal - 7x7 stem, stem_width = 64 + * same c, d, e, s variants as ResNet can be enabled + + SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64, + reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block + + Parameters + ---------- + block : Block, class for the residual block. Options are BasicBlockGl, BottleneckGl. + layers : list of int, number of layers in each block + num_classes : int, default 1000, number of classification classes. + in_chans : int, default 3, number of input (color) channels. + output_stride : int, default 32, output stride of the network, 32, 16, or 8. + global_pool : str, Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax' + cardinality : int, default 1, number of convolution groups for 3x3 conv in Bottleneck. + base_width : int, default 64, factor determining bottleneck channels. 
`planes * base_width / 64 * cardinality` + stem_width : int, default 64, number of channels in stem convolutions + stem_type : str, default '' + The type of stem: + * '', default - a single 7x7 conv with a width of stem_width + * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2 + * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2 + block_reduce_first : int, default 1 + Reduction factor for first convolution output width of residual blocks, 1 for all archs except senets, where 2 + down_kernel_size : int, default 1, kernel size of residual block downsample path, 1x1 for most, 3x3 for senets + avg_down : bool, default False, use average pooling for projection skip connection between stages/downsample. + act_layer : nn.Module, activation layer + norm_layer : nn.Module, normalization layer + aa_layer : nn.Module, anti-aliasing layer + drop_rate : float, default 0. Dropout probability before classifier, for training + """ + + def __init__( + self, block, layers, num_classes=1000, in_chans=3, output_stride=32, global_pool='avg', + cardinality=1, base_width=64, stem_width=64, stem_type='', replace_stem_pool=False, block_reduce_first=1, + down_kernel_size=1, avg_down=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, + drop_rate=0.0, drop_path_rate=0., drop_block_rate=0., zero_init_last=True, block_args=None, channels=[64, 128, 256, 512]): + super(ResNet, self).__init__() + block_args = block_args or dict() + assert output_stride in (8, 16, 32) + self.num_classes = num_classes + self.drop_rate = drop_rate + self.grad_checkpointing = False + + # Stem + deep_stem = 'deep' in stem_type + inplanes = stem_width * 2 if deep_stem else 64 + if deep_stem: + stem_chs = (stem_width, stem_width) + if 'tiered' in stem_type: + stem_chs = (3 * (stem_width // 4), stem_width) + self.conv1 = nn.Sequential(*[ + nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False), + 
norm_layer(stem_chs[0]), + act_layer(inplace=True), + nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False), + norm_layer(stem_chs[1]), + act_layer(inplace=True), + nn.Conv2d(stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False)]) + else: + self.conv1 = nn.Conv2d(in_chans, inplanes, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = norm_layer(inplanes) + self.act1 = act_layer(inplace=True) + self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')] + + # Stem pooling. The name 'maxpool' remains for weight compatibility. + if replace_stem_pool: + self.maxpool = nn.Sequential(*filter(None, [ + nn.Conv2d(inplanes, inplanes, 3, stride=1 if aa_layer else 2, padding=1, bias=False), + create_aa(aa_layer, channels=inplanes, stride=2) if aa_layer is not None else None, + norm_layer(inplanes), + act_layer(inplace=True) + ])) + else: + if aa_layer is not None: + if issubclass(aa_layer, nn.AvgPool2d): + self.maxpool = aa_layer(2) + else: + self.maxpool = nn.Sequential(*[ + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + aa_layer(channels=inplanes, stride=2)]) + else: + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + # Feature Blocks + #channels = [64, 128, 256, 512] + stage_modules, stage_feature_info = make_blocks( + block, channels, layers, inplanes, cardinality=cardinality, base_width=base_width, + output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down, + down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, + drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args) + for stage in stage_modules: + self.add_module(*stage) # layer1, layer2, etc + self.feature_info.extend(stage_feature_info) + + # Head (Pooling and Classifier) + self.num_features = 512 * block.expansion + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + 
self.init_weights(zero_init_last=zero_init_last) + + @torch.jit.ignore + def init_weights(self, zero_init_last=True): + for n, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + if zero_init_last: + for m in self.modules(): + if hasattr(m, 'zero_init_last'): + m.zero_init_last() + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict(stem=r'^conv1|bn1|maxpool', blocks=r'^layer(\d+)' if coarse else r'^layer(\d+)\.(\d+)') + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self, name_only=False): + return 'fc' if name_only else self.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.act1(x) + x = self.maxpool(x) + + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq([self.layer1, self.layer2, self.layer3, self.layer4], x, flatten=True) + else: + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate: + x = F.dropout(x, p=float(self.drop_rate), training=self.training) + return x if pre_logits else self.fc(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +#def _create_resnet(variant, pretrained=False, **kwargs): +# return build_model_with_cfg(ResNet, variant, pretrained, **kwargs) + + + +def resnet34d(pretrained=False, **kwargs): + """Constructs a ResNet-34-D model. 
+ """ + model_args = dict( + block=BasicBlock, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return ResNet(**model_args) + + +def resnet_jmlr(pretrained=False, **kwargs): + model_args = dict( + block=BasicBlock, layers=[5, 3, 4, 2], stem_width=32, stem_type='deep', avg_down=True, channels=[64,160,272,512], **kwargs) + #return _create_resnet('resnet34d', pretrained, **model_args) + return ResNet(**model_args) + diff --git a/insightface/reconstruction/jmlr/configs/base.py b/insightface/reconstruction/jmlr/configs/base.py new file mode 100644 index 0000000000000000000000000000000000000000..cd2c29fa3f19141a1c0a708b3e1d640afd585762 --- /dev/null +++ b/insightface/reconstruction/jmlr/configs/base.py @@ -0,0 +1,105 @@ +from easydict import EasyDict as edict +import numpy as np + +config = edict() +config.embedding_size = 512 +config.sample_rate = 1 +config.fp16 = 0 +config.tf32 = False +config.backbone_wd = None +config.batch_size = 128 +config.clip_grad = None +config.dropout = 0.0 +#config.warmup_epoch = -1 +config.loss = 'cosface' +config.margin = 0.4 +config.hard_margin = False +config.network = 'r50' +config.prelu = True +config.stem_type = '' +config.dropblock = 0.0 +config.output = None +config.input_size = 112 +config.width_mult = 1.0 +config.kaiming_init = True +config.use_se = False +config.aug_modes = [] +config.checkpoint_segments = [1, 1, 1, 1] + +config.sampling_id = True +config.id_sampling_ratio = None +metric_loss = edict() +metric_loss.enable = False +metric_loss.lambda_n = 0.0 +metric_loss.lambda_c = 0.0 +metric_loss.lambda_t = 0.0 +metric_loss.margin_c = 1.0 +metric_loss.margin_t = 1.0 +metric_loss.margin_n = 0.4 +config.metric_loss = metric_loss + +config.opt = 'sgd' +config.lr = 0.1 # when batch size is 512 +config.momentum = 0.9 +config.weight_decay = 5e-4 +config.fc_mom = 0.9 + +config.warmup_epochs = 0 +config.max_warmup_steps = 6000 +config.num_epochs = 24 + + +config.resume = False +config.resume_path = None 
+config.resume_from = None + +config.save_every_epochs = True + +config.lr_func = None +config.lr_epochs = None +config.save_pfc = False +config.save_onnx = False +config.save_opt = False + +config.label_6dof_mean = np.array([-0.018197, -0.017891, 0.025348, -0.005368, 0.001176, -0.532206], dtype=np.float32) # mean of pitch, yaw, roll, tx, ty, tz +config.label_6dof_std = np.array([0.314015, 0.271809, 0.081881, 0.022173, 0.048839, 0.065444], dtype=np.float32) # std of pitch, yaw, roll, tx, ty, tz + +config.num_verts = 1220 +config.flipindex_file = 'cache_align/flip_index.npy' +config.enable_flip = True +config.verts3d_central_index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 956, 975, 1022, 1041, 1047, 1048, 1049] + +config.task = 0 +config.ckpt = None +config.loss_hard = False +config.sampling_hard = False +config.loss_pip = False +config.net_stride = 32 +config.loss_bone3d = False +config.loss_bone2d = False + +config.lossw_verts3d = 8.0 +config.lossw_verts2d = 16.0 +config.lossw_bone3d = 10.0 +config.lossw_bone2d = 10.0 +config.lossw_project = 10.0 +config.lossw_eyes3d = 8.0 +config.lossw_eyes2d = 16.0 + +config.align_face = False +config.no_gap = False + +config.use_trainval = False + +config.project_loss = False + +config.use_onenetwork = True + +config.use_rtloss = False + + +config.use_arcface = False + + +config.eyes = None + diff --git a/insightface/reconstruction/jmlr/configs/s1.py b/insightface/reconstruction/jmlr/configs/s1.py new file mode 100644 index 0000000000000000000000000000000000000000..95d77387a72167a283d32b28c1c81383c8bca400 --- /dev/null +++ b/insightface/reconstruction/jmlr/configs/s1.py @@ -0,0 +1,54 @@ +from easydict import EasyDict as edict + +config = edict() + +config.dataset = "wcpa" +config.root_dir = '/data/insightface/wcpa' +config.cache_dir = './cache_align' +#config.num_classes = 617970 +#config.num_classes = 2000000 
+#config.num_classes = 80000000 +#config.val_targets = ["lfw", "cfp_fp", "agedb_30"] +#config.val_targets = ["lfw"] +#config.val_targets = [] +config.verbose = 20000 + +#config.network = 'resnet34d' +config.network = 'resnet_jmlr' +config.input_size = 256 +#config.width_mult = 1.0 +#config.dropout = 0.0 +#config.loss = 'cosface' +#config.embedding_size = 512 +#config.sample_rate = 0.2 +config.fp16 = 0 +config.tf32 = True +config.weight_decay = 5e-4 +config.batch_size = 64 +config.lr = 0.1 # lr when batch size is 512 + +config.aug_modes = ['1'] + +config.num_epochs = 40 +config.warmup_epochs = 1 +config.max_warmup_steps = 1000 + +#def lr_step_func(epoch): +# return ((epoch + 1) / (4 + 1)) ** 2 if epoch < -1 else 0.1 ** len( +# [m for m in [20, 30, 38] if m - 1 <= epoch]) +#config.lr_func = lr_step_func + +config.task = 0 +config.save_every_epochs = False + + +config.lossw_verts3d = 16.0 + +config.align_face = True + +config.use_trainval = True +#config.use_rtloss = True + +config.loss_bone3d = True +config.lossw_bone3d = 2.0 + diff --git a/insightface/reconstruction/jmlr/configs/s2.py b/insightface/reconstruction/jmlr/configs/s2.py new file mode 100644 index 0000000000000000000000000000000000000000..89fa69babae0eb25656796faa980984d94e2bb62 --- /dev/null +++ b/insightface/reconstruction/jmlr/configs/s2.py @@ -0,0 +1,57 @@ +from easydict import EasyDict as edict + +config = edict() + +config.dataset = "wcpa" +config.root_dir = '/data/insightface/wcpa' +config.cache_dir = './cache_align_eyes' +#config.num_classes = 617970 +#config.num_classes = 2000000 +#config.num_classes = 80000000 +#config.val_targets = ["lfw", "cfp_fp", "agedb_30"] +#config.val_targets = ["lfw"] +#config.val_targets = [] +config.verbose = 20000 + +#config.network = 'resnet34d' +config.network = 'resnet_jmlr' +config.input_size = 256 +#config.width_mult = 1.0 +#config.dropout = 0.0 +#config.loss = 'cosface' +#config.embedding_size = 512 +#config.sample_rate = 0.2 +config.fp16 = 0 +config.tf32 = True 
+config.weight_decay = 5e-4 +config.batch_size = 64 +config.lr = 0.1 # lr when batch size is 512 + +config.aug_modes = ['1'] + +config.num_epochs = 40 +config.warmup_epochs = 1 +config.max_warmup_steps = 1000 + +#def lr_step_func(epoch): +# return ((epoch + 1) / (4 + 1)) ** 2 if epoch < -1 else 0.1 ** len( +# [m for m in [20, 30, 38] if m - 1 <= epoch]) +#config.lr_func = lr_step_func + +config.task = 0 +config.save_every_epochs = False + + +config.lossw_verts3d = 16.0 + +config.align_face = True + +config.use_trainval = True +#config.use_rtloss = True + +config.loss_bone3d = False +config.lossw_bone3d = 2.0 + +config.eyes = {'root': './eyes_raw'} +config.enable_flip = False + diff --git a/insightface/reconstruction/jmlr/dataset.py b/insightface/reconstruction/jmlr/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..5f82855f30906868804af70d6dd815f7a72dab35 --- /dev/null +++ b/insightface/reconstruction/jmlr/dataset.py @@ -0,0 +1,811 @@ +import numbers +import os +import os.path as osp +import pickle +import queue as Queue +import threading +import logging +import numbers +import math +import pandas as pd +from scipy.spatial.transform import Rotation + +import mxnet as mx +from pathlib import Path +import numpy as np +import torch +from torch.utils.data import DataLoader, Dataset +from skimage import transform as sktrans +import cv2 +import albumentations as A +from albumentations.pytorch import ToTensorV2 +from augs import * + + +def Rt26dof(R_t, degrees=False): + yaw_gt, pitch_gt, roll_gt = Rotation.from_matrix(R_t[:3, :3].T).as_euler('yxz', degrees=degrees) + label_euler = np.array([pitch_gt, yaw_gt, roll_gt]) + label_translation = R_t[3, :3] + label_6dof = np.concatenate([label_euler, label_translation]) + return label_6dof + + +def gen_target_pip(target, target_map, target_local_x, target_local_y): + map_channel, map_height, map_width = target_map.shape + target = target.reshape(-1, 2) + assert map_channel == target.shape[0] + + for 
i in range(map_channel): + mu_x = int(math.floor(target[i][0] * map_width)) + mu_y = int(math.floor(target[i][1] * map_height)) + mu_x = max(0, mu_x) + mu_y = max(0, mu_y) + mu_x = min(mu_x, map_width-1) + mu_y = min(mu_y, map_height-1) + target_map[i, mu_y, mu_x] = 1 + shift_x = target[i][0] * map_width - mu_x + shift_y = target[i][1] * map_height - mu_y + target_local_x[i, mu_y, mu_x] = shift_x + target_local_y[i, mu_y, mu_x] = shift_y + + + return target_map, target_local_x, target_local_y + +def get_tris(cfg): + import trimesh + data_root = Path(cfg.root_dir) + obj_path = data_root / 'resources/example.obj' + mesh = trimesh.load(obj_path, process=False) + verts_template = np.array(mesh.vertices, dtype=np.float32) + tris = np.array(mesh.faces, dtype=np.int32) + #print(verts_template.shape, tris.shape) + return tris + + +class BackgroundGenerator(threading.Thread): + def __init__(self, generator, local_rank, max_prefetch=6): + super(BackgroundGenerator, self).__init__() + self.queue = Queue.Queue(max_prefetch) + self.generator = generator + self.local_rank = local_rank + self.daemon = True + self.start() + + def run(self): + torch.cuda.set_device(self.local_rank) + for item in self.generator: + self.queue.put(item) + self.queue.put(None) + + def next(self): + next_item = self.queue.get() + if next_item is None: + raise StopIteration + return next_item + + def __next__(self): + return self.next() + + def __iter__(self): + return self + + +class DataLoaderX(DataLoader): + def __init__(self, local_rank, **kwargs): + super(DataLoaderX, self).__init__(**kwargs) + self.stream = torch.cuda.Stream(local_rank) + self.local_rank = local_rank + + def __iter__(self): + self.iter = super(DataLoaderX, self).__iter__() + self.iter = BackgroundGenerator(self.iter, self.local_rank) + self.preload() + return self + + def preload(self): + self.batch = next(self.iter, None) + if self.batch is None: + return None + with torch.cuda.stream(self.stream): + for k in 
range(len(self.batch)): + self.batch[k] = self.batch[k].to(device=self.local_rank, + non_blocking=True) + + def __next__(self): + torch.cuda.current_stream().wait_stream(self.stream) + batch = self.batch + if batch is None: + raise StopIteration + self.preload() + return batch + + +class FaceDataset(Dataset): + def __init__(self, cfg, is_train=True, is_test=False, local_rank=0): + super(FaceDataset, self).__init__() + + + self.data_root = Path(cfg.root_dir) + self.input_size = cfg.input_size + self.transform = get_aug_transform(cfg) + self.local_rank = local_rank + self.is_test = is_test + txt_path = self.data_root / 'resources/projection_matrix.txt' + self.M_proj = np.loadtxt(txt_path, dtype=np.float32) + if is_test: + data_root = Path(cfg.root_dir) + csv_path = data_root / 'list/WCPA_track2_test.csv' + self.df = pd.read_csv(csv_path, dtype={'subject_id': str, 'facial_action': str, 'img_id': str}) + else: + if is_train: + self.df = pd.read_csv(osp.join(cfg.cache_dir, 'train_list.csv'), dtype={'subject_id': str, 'facial_action': str, 'img_id': str}) + else: + self.df = pd.read_csv(osp.join(cfg.cache_dir, 'val_list.csv'), dtype={'subject_id': str, 'facial_action': str, 'img_id': str}) + self.label_6dof_mean = [-0.018197, -0.017891, 0.025348, -0.005368, 0.001176, -0.532206] # mean of pitch, yaw, roll, tx, ty, tz + self.label_6dof_std = [0.314015, 0.271809, 0.081881, 0.022173, 0.048839, 0.065444] # std of pitch, yaw, roll, tx, ty, tz + self.align_face = cfg.align_face + if not self.align_face: + self.dst_pts = np.float32([ + [0, 0], + [0, cfg.input_size- 1], + [cfg.input_size- 1, 0] + ]) + else: + dst_pts = np.array([ + [38.2946, 51.6963], + [73.5318, 51.5014], + [56.0252, 71.7366], + [41.5493, 92.3655], + [70.7299, 92.2041] ], dtype=np.float32 ) + + new_size = 144 + dst_pts[:,0] += ((new_size-112)//2) + dst_pts[:,1] += 8 + dst_pts[:,:] *= (self.input_size/float(new_size)) + self.dst_pts = dst_pts + + if local_rank==0: + 
logging.info('data_transform_list:%s'%self.transform) + logging.info('len:%d'%len(self.df)) + self.is_test_aug = False + self.eye_dataset = None + if cfg.eyes is not None: + from eye_dataset import EyeDataset + self.eye_dataset = EyeDataset(cfg.eyes['root']) + + def set_test_aug(self): + if not self.is_test_aug: + from easydict import EasyDict as edict + cfg = edict() + cfg.aug_modes = ['test-aug'] + cfg.input_size = self.input_size + cfg.task = 0 + self.transform = get_aug_transform(cfg) + self.is_test_aug = True + + def get_names(self, index): + subject_id = self.df['subject_id'][index] + facial_action = self.df['facial_action'][index] + img_id = self.df['img_id'][index] + return subject_id, facial_action, img_id + + def __getitem__(self, index): + subject_id = self.df['subject_id'][index] + facial_action = self.df['facial_action'][index] + img_id = self.df['img_id'][index] + + img_path = self.data_root / 'image' / subject_id / facial_action / f'{img_id}_ar.jpg' + npz_path = self.data_root / 'info' / subject_id / facial_action / f'{img_id}_info.npz' + txt_path = self.data_root / '68landmarks' / subject_id / facial_action / f'{img_id}_68landmarks.txt' + #if not osp.exists(img_path): + # continue + + #print(img_path) + img_raw = cv2.imread(str(img_path)) + #if img_raw is None: + # print('XXX ERR:', img_path) + img_raw = cv2.cvtColor(img_raw, cv2.COLOR_BGR2RGB) + #print(img_raw.shape) + img_h, img_w, _ = img_raw.shape + pts68 = np.loadtxt(txt_path, dtype=np.int32) + + x_min, y_min = pts68.min(axis=0) + x_max, y_max = pts68.max(axis=0) + x_center = (x_min + x_max) / 2 + y_center = (y_min + y_max) / 2 + w, h = x_max - x_min, y_max - y_min + + + if not self.align_face: + size = max(w, h) + ss = np.array([0.75, 0.75, 0.85, 0.65]) # predefined expand size + + left = x_center - ss[0] * size + right = x_center + ss[1] * size + top = y_center - ss[2] * size + bottom = y_center + ss[3] * size + + src_pts = np.float32([ + [left, top], + [left, bottom], + [right, top] + ]) + 
tform = cv2.getAffineTransform(src_pts, self.dst_pts) + else: + src_pts = np.float32([ + (pts68[36] + pts68[39])/2, + (pts68[42] + pts68[45])/2, + pts68[30], + pts68[48], + pts68[54] + ]) + tf = sktrans.SimilarityTransform() + tf.estimate(src_pts, self.dst_pts) + tform = tf.params[0:2,:] + + img_local = cv2.warpAffine(img_raw, tform, (self.input_size,)*2, flags=cv2.INTER_CUBIC) + fake_points2d = np.ones( (1,2), dtype=np.float32) * (self.input_size//2) + + #tform_inv = cv2.invertAffineTransform(tform) + #img_global = cv2.warpAffine(img_local, tform_inv, (img_w, img_h), borderValue=0.0) + #img_global = cv2.resize(img_global, (self.input_size, self.input_size)) + if self.transform is not None: + t = self.transform(image=img_local, keypoints=fake_points2d) + img_local = t['image'] + if self.is_test_aug: + height, width = img_local.shape[:2] + for trans in t["replay"]["transforms"]: + if trans['__class_fullname__']=='ShiftScaleRotate' and trans['applied']: + param = trans['params'] + dx, dy, angle, scale = param['dx'], param['dy'], param['angle'], param['scale'] + center = (width / 2, height / 2) + matrix = cv2.getRotationMatrix2D(center, angle, scale) + matrix[0, 2] += dx * width + matrix[1, 2] += dy * height + new_matrix = np.identity(3) + new_matrix[:2,:3] = matrix + old_tform = np.identity(3) + old_tform[:2,:3] = tform + #new_tform = np.dot(old_tform, new_matrix) + new_tform = np.dot(new_matrix, old_tform) + #print('label_tform:') + #print(label_tform.flatten()) + #print(new_matrix.flatten()) + #print(new_tform.flatten()) + tform = new_tform[:2,:3] + break + #print('trans param:', param) + #img_global = self.transform(image=img_global)['image'] + + tform_tensor = torch.tensor(tform, dtype=torch.float32) + d = {'img_local': img_local, 'tform': tform_tensor} + if self.eye_dataset is not None: + eye_key = str(Path('image') / subject_id / facial_action / f'{img_id}_ar.jpg') + #print(eye_key) + eyel, eyer = self.eye_dataset.get(eye_key, to_homo=True) + if eyel is not 
None: + #print(eye_key, el_inv.shape, er_inv.shape) + d['eye_world_left'] = torch.tensor(eyel, dtype=torch.float32) + d['eye_world_right'] = torch.tensor(eyer, dtype=torch.float32) + if not self.is_test: + M = np.load(npz_path) + #yaw_gt, pitch_gt, roll_gt = Rotation.from_matrix(M['R_t'][:3, :3].T).as_euler('yxz', degrees=False) + #label_euler = np.array([pitch_gt, yaw_gt, roll_gt]) + #label_translation = M['R_t'][3, :3] + #label_6dof = np.concatenate([label_euler, label_translation]) + #label_6dof = (label_6dof - self.label_6dof_mean) / self.label_6dof_std + #label_6dof_tensor = torch.tensor(label_6dof, dtype=torch.float32) + #label_verts = M['verts'] * 10.0 # roughly [-1, 1] + #label_verts_tensor = torch.tensor(label_verts, dtype=torch.float32) + #return img_local, label_verts_tensor, label_6dof_tensor + label_verts_tensor = torch.tensor(M['verts'], dtype=torch.float32) + label_Rt_tensor = torch.tensor(M['R_t'], dtype=torch.float32) + d['verts'] = label_verts_tensor + d['rt'] = label_Rt_tensor + #return img_local, img_global, label_verts_tensor, label_Rt_tensor, tform_tensor + #return img_local, label_verts_tensor, label_Rt_tensor, tform_tensor + else: + #return img_local, img_global, tform_tensor + index_tensor = torch.tensor(index, dtype=torch.long) + d['index'] = index_tensor + #return img_local, tform_tensor, index_tensor + return d + + + def __len__(self): + return len(self.df) + +class MXFaceDataset(Dataset): + def __init__(self, cfg, is_train=True, norm_6dof=True, degrees_6dof=False, local_rank=0): + super(MXFaceDataset, self).__init__() + + + self.is_train = is_train + self.data_root = Path(cfg.root_dir) + self.input_size = cfg.input_size + self.transform = get_aug_transform(cfg) + self.local_rank = local_rank + self.use_trainval = cfg.use_trainval + self.use_eye = cfg.eyes is not None + if is_train: + #self.df = pd.read_csv(osp.join(cfg.cache_dir, 'train_list.csv'), dtype={'subject_id': str, 'facial_action': str, 'img_id': str}) + path_imgrec = 
os.path.join(cfg.cache_dir, 'train.rec') + path_imgidx = os.path.join(cfg.cache_dir, 'train.idx') + self.imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + self.imgidx = list(self.imgrec.keys) + self.imggroup = [0] * len(self.imgidx) + self.size_train = len(self.imgidx) + if self.use_trainval: + assert not cfg.sampling_hard + path_imgrec = os.path.join(cfg.cache_dir, 'val.rec') + path_imgidx = os.path.join(cfg.cache_dir, 'val.idx') + self.imgrec2 = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + imgidx2 = list(self.imgrec2.keys) + self.imggroup += [1] * len(imgidx2) + self.imgidx += imgidx2 + else: + #self.df = pd.read_csv(osp.join(cfg.cache_dir, 'val_list.csv'), dtype={'subject_id': str, 'facial_action': str, 'img_id': str}) + path_imgrec = os.path.join(cfg.cache_dir, 'val.rec') + path_imgidx = os.path.join(cfg.cache_dir, 'val.idx') + self.imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') + self.imgidx = list(self.imgrec.keys) + self.imggroup = [0] * len(self.imgidx) + self.imgidx = np.array(self.imgidx) + self.imggroup = np.array(self.imggroup) + if cfg.sampling_hard and is_train: + meta = np.load(os.path.join(cfg.cache_dir, 'train.meta.npy')) + assert meta.shape[0]==len(self.imgidx) + new_imgidx = [] + for i in range(len(self.imgidx)): + idx = self.imgidx[i] + assert i==idx + pose = np.abs(meta[i,:2]) + #repeat = np.sum(pose>=35)*3+1 + if np.max(pose)<15: + repeat = 2 + else: + repeat = 1 + new_imgidx += [idx]*repeat + if local_rank==0: + print('new-imgidx:', len(self.imgidx), len(new_imgidx)) + self.imgidx = np.array(new_imgidx) + self.label_6dof_mean = [-0.018197, -0.017891, 0.025348, -0.005368, 0.001176, -0.532206] # mean of pitch, yaw, roll, tx, ty, tz + self.label_6dof_std = [0.314015, 0.271809, 0.081881, 0.022173, 0.048839, 0.065444] # std of pitch, yaw, roll, tx, ty, tz + txt_path = self.data_root / 'resources/projection_matrix.txt' + self.M_proj = np.loadtxt(txt_path, dtype=np.float32) + self.M1 = 
np.array([ + [400.0, 0, 0, 0], + [ 0, 400.0, 0, 0], + [ 0, 0, 1, 0], + [400.0, 400.0, 0, 1] + ]) + self.dst_pts = np.float32([ + [0, 0], + [0, cfg.input_size- 1], + [cfg.input_size- 1, 0] + ]) + self.norm_6dof = norm_6dof + self.degrees_6dof = degrees_6dof + self.task = cfg.task + self.num_verts = cfg.num_verts + self.loss_pip = cfg.loss_pip + self.net_stride = 32 + if local_rank==0: + logging.info('data_transform_list:%s'%self.transform) + logging.info('len:%d'%len(self.imgidx)) + logging.info('glen:%d'%len(self.imggroup)) + self.is_test_aug = False + + self.enable_flip = cfg.enable_flip + self.flipindex = cfg.flipindex.copy() + self.verts3d_central_index = cfg.verts3d_central_index + self.eye_dataset = None + self.use_eye = False + if cfg.eyes is not None: + #from eye_dataset import EyeDataset + #self.eye_dataset = EyeDataset(cfg.eyes['root'], load_data=False) + self.use_eye = True + + def set_test_aug(self): + if not self.is_test_aug: + from easydict import EasyDict as edict + cfg = edict() + cfg.aug_modes = ['test-aug'] + cfg.input_size = self.input_size + cfg.task = 0 + self.transform = get_aug_transform(cfg) + self.is_test_aug = True + + def __getitem__(self, index): + idx = self.imgidx[index] + group = self.imggroup[index] + if group==0: + imgrec = self.imgrec + elif group==1: + imgrec = self.imgrec2 + elif group==2: + imgrec = self.imgrec3 + + s = imgrec.read_idx(idx) + header, img = mx.recordio.unpack(s) + hlabel = header.label + img = mx.image.imdecode(img).asnumpy() #rgb numpy + + label_verts = np.array(hlabel[:1220*3], dtype=np.float32).reshape(-1,3) + label_Rt = np.array(hlabel[1220*3:1220*3+16], dtype=np.float32).reshape(4,4) + label_tform = np.array(hlabel[1220*3+16:1220*3+16+6], dtype=np.float32).reshape(2,3) + label_6dof = Rt26dof(label_Rt, self.degrees_6dof) + if self.norm_6dof: + label_6dof = (label_6dof - self.label_6dof_mean) / self.label_6dof_std + label_6dof_tensor = torch.tensor(label_6dof, dtype=torch.float32) + el_inv = None + er_inv = 
None + if self.use_eye: + a = 1220*3+16+6 + el_inv = np.array(hlabel[a:a+481*3], dtype=np.float32).reshape(-1,3) + a+=481*3 + er_inv = np.array(hlabel[a:a+481*3], dtype=np.float32).reshape(-1,3) + #el_inv = torch.tensor(el_inv, dtype=torch.float32) + #er_inv = torch.tensor(er_inv, dtype=torch.float32) + #eye_verts = [el_inv, er_inv] + eye_verts = np.concatenate( (el_inv, er_inv), axis=0 ) + + #img_local = None + img_raw = None + #if self.task==0 or self.task==2: + # img_raw = img[:,self.input_size:,:] + #if self.task==0 or self.task==1 or self.task==3: + # img_local = img[:,:self.input_size,:] + assert img.shape[0]==img.shape[1] and img.shape[0]>=self.input_size + if img.shape[0]>self.input_size: + scale = float(self.input_size) / img.shape[0] + #print('scale:', scale) + #src_pts = np.float32([ + # [0, 0], + # [0, 799], + # [799, 0] + #]) + #tform = cv2.getAffineTransform(src_pts, self.dst_pts) + #new_tform = np.identity(3) + #new_tform[:2,:3] = tform + #label_tform = np.dot(new_tform, label_tform.T).T + + src_pts = np.float32([ + [0, 0, 1], + [0, 799, 1], + [799, 0, 1] + ]) + dst_pts = np.dot(label_tform, src_pts.T).T + dst_pts *= scale + dst_pts = dst_pts.copy() + src_pts = src_pts[:,:2].copy() + #print('index:', index) + #print(src_pts.shape, dst_pts.shape) + #print(label_tform.shape) + #print(src_pts.dtype) + #print(dst_pts.dtype) + tform = cv2.getAffineTransform(src_pts, dst_pts) + label_tform = tform + + img = cv2.resize(img, (self.input_size, self.input_size)) + + img_local = img + need_points2d = (self.task==0 or self.task==3) + + if need_points2d: + ones = np.ones([label_verts.shape[0], 1]) + verts_homo = np.concatenate([label_verts, ones], axis=1) + verts = verts_homo @ label_Rt @ self.M_proj @ self.M1 + w_ = verts[:, [3]] + verts = verts / w_ + points2d = verts[:, :3] + points2d[:, 1] = 800.0 - points2d[:, 1] + verts2d = points2d[:,:2].copy() + points2d[:,2] = 1.0 + points2d = np.dot(label_tform, points2d.T).T + else: + points2d = np.ones( (1,2), 
dtype=np.float32) * (self.input_size//2) + if self.use_eye: + verts_homo = eye_verts + if verts_homo.shape[1] == 3: + ones = np.ones([verts_homo.shape[0], 1]) + verts_homo = np.concatenate([verts_homo, ones], axis=1) + verts_out = verts_homo @ label_Rt @ self.M_proj @ self.M1 + w_ = verts_out[:, [3]] + verts_out = verts_out / w_ + _points2d = verts_out[:, :3] + _points2d[:, 1] = 800.0 - _points2d[:, 1] + _points2d[:,2] = 1.0 + _points2d = np.dot(label_tform, _points2d.T).T + eye_points = _points2d + #if img.shape[0]!=self.input_size: + # assert img.shape[0]>self.input_size + #img = cv2.resize(img, (self.input_size, self.input_size)) + #scale = float(self.input_size) / img.shape[0] + #points2d *= scale + + if self.transform is not None: + if img_raw is not None: + img_raw = self.transform(image=img_raw, keypoints=points2d)['image'] + if img_local is not None: + height, width = img_local.shape[:2] + x = self.transform(image=img_local, keypoints=points2d) + img_local = x['image'] + points2d = x['keypoints'] + points2d = np.array(points2d, dtype=np.float32) + if self.is_test_aug: + for trans in x["replay"]["transforms"]: + if trans['__class_fullname__']=='ShiftScaleRotate' and trans['applied']: + param = trans['params'] + dx, dy, angle, scale = param['dx'], param['dy'], param['angle'], param['scale'] + center = (width / 2, height / 2) + matrix = cv2.getRotationMatrix2D(center, angle, scale) + matrix[0, 2] += dx * width + matrix[1, 2] += dy * height + new_matrix = np.identity(3) + new_matrix[:2,:3] = matrix + old_tform = np.identity(3) + old_tform[:2,:3] = label_tform + #new_tform = np.dot(old_tform, new_matrix) + new_tform = np.dot(new_matrix, old_tform) + #print('label_tform:') + #print(label_tform.flatten()) + #print(new_matrix.flatten()) + #print(new_tform.flatten()) + label_tform = new_tform[:2,:3] + break + #print('trans param:', param) + + + if self.loss_pip: + target_map = np.zeros((self.num_verts, int(self.input_size/self.net_stride), 
int(self.input_size/self.net_stride))) + target_local_x = np.zeros((self.num_verts, int(self.input_size/self.net_stride), int(self.input_size/self.net_stride))) + target_local_y = np.zeros((self.num_verts, int(self.input_size/self.net_stride), int(self.input_size/self.net_stride))) + target = points2d / self.input_size + target_map, target_local_x, target_local_y = gen_target_pip(target, target_map, target_local_x, target_local_y) + target_map_tensor = torch.tensor(target_map, dtype=torch.float32) + target_x_tensor = torch.tensor(target_local_x, dtype=torch.float32) + target_y_tensor = torch.tensor(target_local_y, dtype=torch.float32) + d['pip_map'] = target_map_tensor + d['pip_x'] = target_x_tensor + d['pip_y'] = target_y_tensor + + if self.is_train and self.enable_flip and np.random.random()<0.5: + #if self.local_rank==0: + # print('XXX:', label_verts[:5,:2]) + img_local = img_local.flip([2]) + x_of_central = 0.0 + #x_of_central = label_verts[self.verts3d_central_index,0] + #x_of_central = np.mean(x_of_central) + label_verts = label_verts[self.flipindex,:] + label_verts[:,0] -= x_of_central + label_verts[:,0] *= -1.0 + label_verts[:,0] += x_of_central + + if need_points2d: + flipped_p2d = points2d[self.flipindex,:].copy() + flipped_p2d[:,0] = self.input_size - 1 - flipped_p2d[:,0] + points2d = flipped_p2d + if self.use_eye: + flipped_p2d = eye_points[self.flipindex,:].copy() + flipped_p2d[:,0] = self.input_size - 1 - flipped_p2d[:,0] + eye_points = flipped_p2d + label_verts_tensor = torch.tensor(label_verts*10.0, dtype=torch.float32) + d = {} + d['img_local'] = img_local + d['verts'] = label_verts_tensor + d['6dof'] = label_6dof_tensor + d['rt'] = torch.tensor(label_Rt, dtype=torch.float32) + if need_points2d: + points2d = points2d / (self.input_size//2) - 1.0 + points2d_tensor = torch.tensor(points2d, dtype=torch.float32) + d['points2d'] = points2d_tensor + if self.use_eye: + d['eye_verts'] = torch.tensor(eye_verts, dtype=torch.float32) + eye_points = eye_points 
/ (self.input_size//2) - 1.0 + eye_points_tensor = torch.tensor(eye_points, dtype=torch.float32) + d['eye_points'] = eye_points_tensor + + loss_weight = 1.0 + if group!=0: + loss_weight = 0.0 + loss_weight_tensor = torch.tensor(loss_weight, dtype=torch.float32) + d['loss_weight'] = loss_weight_tensor + label_tform_tensor = torch.tensor(label_tform, dtype=torch.float32) + d['tform'] = label_tform_tensor + + #if img_local is None: + # image = (img_raw,) + #elif img_raw is None: + # image = (img_local,) + #else: + # image = (img_local,img_raw) + #ret = image + (label_verts_tensor, label_6dof_tensor, points2d_tensor) + if not self.is_train: + idx_tensor = torch.tensor([idx], dtype=torch.long) + d['idx'] = idx_tensor + d['verts2d'] = torch.tensor(verts2d, dtype=torch.float32) + return d + + + def __len__(self): + return len(self.imgidx) + +def test_dataset1(cfg): + cfg.task = 0 + is_train = False + center_axis = [] + dataset = MXFaceDataset(cfg, is_train=is_train, norm_6dof=False, local_rank=0) + for i in range(len(dataset.flipindex)): + if i==dataset.flipindex[i]: + center_axis.append(i) + print(center_axis) + #dataset.transform = None + print('total:', len(dataset)) + total = len(dataset) + #total = 50 + list_6dof = [] + all_mean_xs = [] + for idx in range(total): + #img_local, img_raw, label_verts, label_6dof, = dataset[idx] + #img_local, img_raw, label_verts, label_6dof, points2d, tform, data_idx = dataset[idx] + #img_local, label_verts, label_6dof, points2d, tform, data_idx = dataset[idx] + d = dataset[idx] + img_local = d['img_local'] + label_verts = d['verts'] + label_6dof = d['6dof'] + points2d = d['points2d'] + label_verts = label_verts.numpy() + label_6dof = label_6dof.numpy() + points2d = points2d.numpy() + #print(img_local.shape, label_verts.shape, label_6dof.shape, points2d.shape) + verts3d = label_verts / 10.0 + xs = [] + for c in center_axis: + _x = verts3d[c,0] + xs.append(_x) + _std = np.std(xs) + print(xs) + print(_std) + #print(np.mean(xs)) + 
all_mean_xs.append(np.mean(xs)) + if idx%100==0: + print('processing:', idx, np.mean(all_mean_xs)) + #print(label_verts[:3,:], label_6dof) + #list_6dof.append(label_6dof) + #print(image.__class__, label_verts.__class__) + #label = list(label_verts.numpy().flatten()) + list(label_6dof.numpy().flatten()) + #points2d = label_verts2[:,:2] + #points2d = (points2d+1) * 128.0 + #img_local = img_local.numpy() + #img_local = (img_local+1.0) * 128.0 + #draw = img_local.astype(np.uint8).transpose( (1,2,0) )[:,:,::-1].copy() + #for i in range(points2d.shape[0]): + # pt = points2d[i].astype(np.int) + # cv2.circle(draw, pt, 2, (255,0,0), 2) + ##output_path = "outputs/%d_%.3f_%.3f_%.3f.jpg"%(idx, label_6dof[0], label_6dof[1], label_6dof[2]) + #output_path = "outputs/%06d.jpg"%(idx) + #cv2.imwrite(output_path, draw) + #list_6dof = np.array(list_6dof) + #print('MEAN:') + #print(np.mean(list_6dof, axis=0)) + +def test_loader1(cfg): + cfg.task = 0 + is_train = True + dataset = MXFaceDataset(cfg, is_train=is_train, norm_6dof=False, local_rank=0) + loader = DataLoader(dataset, batch_size=64, shuffle=True) + for index, d in enumerate(loader): + #img_local = d['img_local'] + label_verts = d['verts'] + points2d = d['points2d'] + tform = d['tform'] + label_verts /= 10.0 + points2d = (points2d + 1.0) * (cfg.input_size//2) + tform = tform.numpy() + verts = label_verts.numpy() + points2d = points2d.numpy() + print(verts.shape, points2d.shape, tform.shape) + np.save("temp/verts3d.npy", verts) + np.save("temp/points2d.npy", points2d) + np.save("temp/tform.npy", tform) + break + +def test_facedataset1(cfg): + cfg.task = 0 + cfg.input_size = 512 + dataset = FaceDataset(cfg, is_train=True, local_rank=0) + for idx in range(100000): + img_local, label_verts, label_Rt, tform = dataset[idx] + label_Rt = label_Rt.numpy() + if label_Rt[0,0]>1.0: + print(idx, label_Rt.shape) + print(label_Rt) + break + +def test_arcface(cfg): + cfg.task = 0 + is_train = True + dataset = MXFaceDataset(cfg, 
is_train=is_train, norm_6dof=False, local_rank=0) + loader = DataLoader(dataset, batch_size=1, shuffle=True) + for index, d in enumerate(loader): + img = d['img_local'].numpy() + img /= 2.0 + img += 0.5 + img *= 255.0 + img = img[0] + img = img.transpose( (1,2,0) ) + img = img.astype(np.uint8) + img = cv2.resize(img, (144,144)) + img = img[:,:,::-1] + img = img[8:120,16:128,:] + print(img.shape) + cv2.imwrite("temp/arc_%d.jpg"%index, img) + #np.save("temp/verts3d.npy", verts) + #np.save("temp/points2d.npy", points2d) + #np.save("temp/tform.npy", tform) + if index>100: + break + +def test_dataset2(cfg): + cfg.task = 0 + is_train = False + center_axis = [] + dataset = MXFaceDataset(cfg, is_train=is_train, norm_6dof=False, local_rank=0) + for i in range(len(dataset.flipindex)): + if i==dataset.flipindex[i]: + center_axis.append(i) + print(center_axis) + #dataset.transform = None + print('total:', len(dataset)) + total = len(dataset) + total = 50 + list_6dof = [] + all_mean_xs = [] + for idx in range(total): + d = dataset[idx] + img_local = d['img_local'] + label_verts = d['verts'] + label_6dof = d['6dof'] + points2d = d['points2d'] + label_verts = label_verts.numpy() + label_6dof = label_6dof.numpy() + points2d = points2d.numpy() + eye_points = d['eye_points'].numpy() + eye_verts = d['eye_verts'].numpy() + print(eye_verts[:5,:]) + #print(img_local.shape, label_verts.shape, label_6dof.shape, points2d.shape) + verts3d = label_verts / 10.0 + #print(label_verts[:3,:], label_6dof) + #list_6dof.append(label_6dof) + #print(image.__class__, label_verts.__class__) + #label = list(label_verts.numpy().flatten()) + list(label_6dof.numpy().flatten()) + #points2d = label_verts2[:,:2] + points2d = (points2d+1) * 128.0 + eye_points = (eye_points+1) * 128.0 + img_local = img_local.numpy() + img_local = (img_local+1.0) * 128.0 + draw = img_local.astype(np.uint8).transpose( (1,2,0) )[:,:,::-1].copy() + for i in range(points2d.shape[0]): + pt = points2d[i].astype(np.int) + 
cv2.circle(draw, pt, 2, (255,0,0), 2) + for i in range(eye_points.shape[0]): + pt = eye_points[i].astype(np.int) + cv2.circle(draw, pt, 2, (0,255,0), 2) + ##output_path = "outputs/%d_%.3f_%.3f_%.3f.jpg"%(idx, label_6dof[0], label_6dof[1], label_6dof[2]) + output_path = "outputs/%06d.jpg"%(idx) + cv2.imwrite(output_path, draw) + #list_6dof = np.array(list_6dof) + #print('MEAN:') + #print(np.mean(list_6dof, axis=0)) + +if __name__ == "__main__": + from utils.utils_config import get_config + #cfg = get_config('configs/r0_a1.py') + cfg = get_config('configs/s2') + #test_loader1(cfg) + #test_facedataset1(cfg) + #test_arcface(cfg) + test_dataset2(cfg) + + diff --git a/insightface/reconstruction/jmlr/eye_dataset.py b/insightface/reconstruction/jmlr/eye_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ecc929d21a76eb66d1211097d3a54d04d7b69ddf --- /dev/null +++ b/insightface/reconstruction/jmlr/eye_dataset.py @@ -0,0 +1,71 @@ +import os +import os.path as osp +import numpy as np + +import menpo.io as mio + +def project_shape_in_image(verts, R_t, M_proj, M1): + verts_homo = verts + if verts_homo.shape[1] == 3: + ones = np.ones([verts_homo.shape[0], 1]) + verts_homo = np.concatenate([verts_homo, ones], axis=1) + verts_out = verts_homo @ R_t @ M_proj @ M1 + w_ = verts_out[:, [3]] + verts_out = verts_out / w_ + return verts_out + +class EyeDataset(): + def __init__(self, root, load_data=True): + eyes_info = mio.import_pickle(osp.join(root,'eyes3d.pkl')) + idxs481 = eyes_info['mask481']['idxs'] + tri481 = eyes_info['mask481']['trilist'] + self.iris_idx_481 = eyes_info['mask481']['idxs_iris'] + eyel_template = eyes_info['left_points'][idxs481] + eyer_template = eyes_info['right_points'][idxs481] + eyel_template_homo = np.append(eyel_template, np.ones((eyel_template.shape[0],1)), axis=1) + eyer_template_homo = np.append(eyer_template, np.ones((eyer_template.shape[0],1)), axis=1) + points = mio.import_pickle(osp.join(root,'eyespoints.pkl')) + 
self.homol = eyel_template_homo.T + self.homor = eyer_template_homo.T + if load_data: + self.worldl = {} + self.worldr = {} + #vector_norm = 0.035 + for k in points: + p = k.find('/') + newk = k[p+1:] + value = points[k] + #el_inv = (value['left'] @ eyel_template_homo.T).T + #er_inv = (value['right'] @ eyer_template_homo.T).T + #print('V:', value['left'][:5,:]) + #print('E:', el_inv[:5,:]) + # gaze vector of left eye in world space + #gl_vector = el_inv[iris_idx_481].mean(axis=0) - el_inv[-1] + #gl_vector = (gl_vector / np.linalg.norm(gl_vector)) * vector_norm + #gl_point = el_inv[iris_idx_481].mean(axis=0) + gl_vector + ## gaze vector of right eye in world space + #gr_vector = er_inv[iris_idx_481].mean(axis=0) - er_inv[-1] + #gr_vector = (gr_vector / np.linalg.norm(gr_vector)) * vector_norm + #gr_point = er_inv[iris_idx_481].mean(axis=0) + gr_vector + #self.world[newk] = (el_inv, er_inv, gl_point, gr_point) + self.worldl[newk] = value['left'] + self.worldr[newk] = value['right'] + #print(self.points.keys()) + + def get(self, key, to_homo=False): + if key not in self.worldl: + return None, None + left = self.worldl[key] + right = self.worldr[key] + if to_homo: + left = (left @ self.homol).T + right = (right @ self.homor).T + return left, right + + def to_homo(self, eyel, eyer): + eyel = (eyel @ self.homol).T + eyer = (eyer @ self.homor).T + return eyel, eyer + + + diff --git a/insightface/reconstruction/jmlr/flops.py b/insightface/reconstruction/jmlr/flops.py new file mode 100644 index 0000000000000000000000000000000000000000..605465db2c90b76869fa3ed7d585f11056a6f5cf --- /dev/null +++ b/insightface/reconstruction/jmlr/flops.py @@ -0,0 +1,25 @@ +from ptflops import get_model_complexity_info +import os +import argparse +from utils.utils_config import get_config +from backbones import get_network + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='JMLR FLOPs') + parser.add_argument('config', type=str, help='input config file') + args = 
parser.parse_args() + args = parser.parse_args() + cfg = get_config(args.config) + #backbone = get_model(cfg.network, num_features=cfg.embedding_size, input_size=cfg.input_size, dropout=cfg.dropout, stem_type=cfg.stem_type, fp16=0) + net = get_network(cfg) + macs, params = get_model_complexity_info( + net, (3, cfg.input_size, cfg.input_size), as_strings=True, + print_per_layer_stat=True, verbose=True) + print(macs) + print(params) + + # from torch import distributed + # distributed.AllreduceOptions + # distributed.AllreduceCoalescedOptions + # distributed.all_reduce diff --git a/insightface/reconstruction/jmlr/gen_dataset_meta.py b/insightface/reconstruction/jmlr/gen_dataset_meta.py new file mode 100644 index 0000000000000000000000000000000000000000..108cdfe8d0dbaaae77fc174ca1e33a235c92f026 --- /dev/null +++ b/insightface/reconstruction/jmlr/gen_dataset_meta.py @@ -0,0 +1,35 @@ +import pickle +import numpy as np +import os +import os.path as osp +import glob +import argparse +import cv2 +import time +import datetime +import pickle +import sklearn +import mxnet as mx +from utils.utils_config import get_config +from dataset import MXFaceDataset, Rt26dof + +if __name__ == "__main__": + cfg = get_config('configs/s1.py') + cfg.task = 0 + save_path = os.path.join(cfg.cache_dir, 'train.meta') + assert not osp.exists(save_path) + dataset = MXFaceDataset(cfg, is_train=True, norm_6dof=False, degrees_6dof=True, local_rank=0) + #dataset.transform = None + print('total:', len(dataset)) + total = len(dataset) + meta = np.zeros( (total, 3), dtype=np.float32 ) + for idx in range(total): + #image, label_verts, label_6dof = dataset[idx] + #img_raw, img_local, label_verts, label_Rt, tform = dataset[idx] + img, label_verts, label_6dof, label_points2d, _, _ = dataset[idx] + pose = label_6dof.numpy()[:3] + print(idx, pose) + meta[idx] = pose + + np.save(save_path, meta) + diff --git a/insightface/reconstruction/jmlr/inference_simple.py 
b/insightface/reconstruction/jmlr/inference_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..de4d90612a23d0c439a24236be19e3e047b9adff --- /dev/null +++ b/insightface/reconstruction/jmlr/inference_simple.py @@ -0,0 +1,322 @@ + +import os +import time +import timm +import glob +import numpy as np +import os.path as osp +import cv2 + +import torch +import torch.distributed as dist +from torch import nn +from pathlib import Path +from backbones import get_network +from skimage import transform as sktrans +from scipy.spatial.transform import Rotation + +def batch_euler2matrix(batch_euler): + n = batch_euler.shape[0] + assert batch_euler.shape[1] == 3 + batch_matrix = np.zeros([n, 3, 3], dtype=np.float32) + + for i in range(n): + pitch, yaw, roll = batch_euler[i] + R = Rotation.from_euler('yxz', [yaw, pitch, roll], degrees=False).as_matrix().T + batch_matrix[i] = R + + return batch_matrix + +def euler2matrix(euler): + assert len(euler)==3 + matrix = np.zeros([3, 3], dtype=np.float32) + + pitch, yaw, roll = euler + R = Rotation.from_euler('yxz', [yaw, pitch, roll], degrees=False).as_matrix().T + matrix = R + return matrix + +def Rt_from_6dof(pred_6dof): + assert pred_6dof.ndim==1 or pred_6dof.ndim==2 + if pred_6dof.ndim==1: + R_pred = euler2matrix(pred_6dof[:3]) + t_pred = pred_6dof[-3:] + return R_pred, t_pred + else: + R_pred = batch_euler2matrix(pred_6dof[:,:3]) + t_pred = pred_6dof[:,-3:].reshape(-1,1,3) + return R_pred, t_pred + +def solver_rigid(pts_3d , pts_2d , camera_matrix): + # pts_3d Nx3 + # pts_2d Nx2 + # camera_matrix 4x4 + dist_coeffs = np.zeros((4,1)) + pts_3d = pts_3d.copy() + pts_2d = pts_2d.copy() + #print(pts_3d.shape, pts_3d.dtype, pts_2d.shape, pts_2d.dtype) + success, rotation_vector, translation_vector = cv2.solvePnP(pts_3d, pts_2d, camera_matrix, dist_coeffs, flags=0) + assert success + R, _ = cv2.Rodrigues(rotation_vector) + R = R.T + R[:,1:3] *= -1 + T = translation_vector.flatten() + T[1:] *= -1 + + return R,T + + 
+class JMLRInference(nn.Module): + def __init__(self, cfg, local_rank=0): + super(JMLRInference, self).__init__() + backbone = get_network(cfg) + if cfg.ckpt is None: + ckpts = list(glob.glob(osp.join(cfg.output, "backbone*.pth"))) + backbone_pth = sorted(ckpts)[-1] + else: + backbone_pth = cfg.ckpt + if local_rank==0: + print(backbone_pth) + backbone_ckpt = torch.load(backbone_pth, map_location=torch.device(local_rank)) + if 'model' in backbone_ckpt: + backbone_ckpt = backbone_ckpt['model'] + backbone.load_state_dict(backbone_ckpt) + backbone.eval() + backbone.requires_grad_(False) + self.backbone = backbone + self.num_verts = cfg.num_verts + self.input_size = cfg.input_size + self.flipindex = cfg.flipindex.copy() + self.data_root = Path(cfg.root_dir) + txt_path = self.data_root / 'resources/projection_matrix.txt' + self.M_proj = np.loadtxt(txt_path, dtype=np.float32) + M1 = np.array([ + [400.0, 0, 0, 0], + [ 0, 400.0, 0, 0], + [ 0, 0, 1, 0], + [400.0, 400.0, 0, 1] + ]) + self.M1 = M1 + camera_matrix = self.M_proj @ M1 + camera_matrix = camera_matrix[:3,:3].T + camera_matrix[0,2] = 400 + camera_matrix[1,2] = 400 + self.camera_matrix = camera_matrix.copy() + self.use_eyes = False + if cfg.eyes is not None: + self.use_eyes = True + from eye_dataset import EyeDataset + eye_dataset = EyeDataset(cfg.eyes['root'], load_data=False) + self.iris_idx_481 = eye_dataset.iris_idx_481 + + + def project_shape_in_image(self, verts, R_t): + verts_homo = verts + if verts_homo.shape[1] == 3: + ones = np.ones([verts_homo.shape[0], 1]) + verts_homo = np.concatenate([verts_homo, ones], axis=1) + verts_out = verts_homo @ R_t @ self.M_proj @ self.M1 + w_ = verts_out[:, [3]] + verts_out = verts_out / w_ + return verts_out + + def set_raw_image_size(self, width, height): + w = width / 2.0 + h = height / 2.0 + M1 = np.array([ + [w, 0, 0, 0], + [ 0, h, 0, 0], + [ 0, 0, 1, 0], + [w, h, 0, 1] + ]) + camera_matrix = self.M_proj @ M1 + camera_matrix = camera_matrix[:3,:3].T + camera_matrix[0,2] 
= w + camera_matrix[1,2] = h + self.camera_matrix = camera_matrix + self.raw_width = width + self.raw_height = height + + + def forward(self, img_local, is_flip=False): + if is_flip: + img_local = img_local.flip([3]) + pred = self.backbone(img_local) + pred1 = pred[:,:1220*3] + pred2 = pred[:,1220*3:1220*5] + meta = {'flip': is_flip} + if not self.use_eyes: + return pred1, pred2, meta + else: + eye_verts = pred[:,1220*5:1220*5+481*2*3] + eye_points = pred[:,1220*5+481*2*3:] + return pred1, pred2, meta, eye_verts, eye_points + + + def convert_verts(self, pred1, meta): + is_flip = meta['flip'] + pred1 = pred1.cpu().numpy() + pred1 = pred1[:,:1220*3] + pred_verts = pred1.reshape(-1,1220,3) / 10.0 + if is_flip: + pred_verts = pred_verts[:,self.flipindex,:] + pred_verts[:,:,0] *= -1.0 + return pred_verts + + def convert_2d(self, pred2, tforms, meta): + is_flip = meta['flip'] + tforms = tforms.cpu().numpy() + pred2 = pred2.cpu().numpy() + B = pred2.shape[0] + points2d = (pred2.reshape(B,-1,2)+1.0) * self.input_size//2 + if is_flip: + points2d = points2d[:,self.flipindex,:] + points2d[:,:,0] = self.input_size - 1 - points2d[:,:,0] + #B = points2d.shape[0] + points2de = np.ones( (points2d.shape[0], points2d.shape[1], 3), dtype=points2d.dtype) + points2de[:,:,:2] = points2d + verts2d = np.zeros((B,points2d.shape[1],2), dtype=np.float32) + for n in range(B): + tform = tforms[n] + tform_inv = cv2.invertAffineTransform(tform) + _points2d = np.dot(tform_inv, points2de[n].T).T + verts2d[n] = _points2d + #return verts2d, points2d + return verts2d + + + def convert_eyes(self, eye_verts3d, eye_verts2d, R_t, tforms): + meta = {'flip': False} + eye_verts3d = eye_verts3d.cpu().numpy().reshape(-1, 481*2, 3)[0] + eye_verts2d = self.convert_2d(eye_verts2d, tforms, meta)[0] + el_inv = eye_verts3d[:481,:] + er_inv = eye_verts3d[481:,:] + v_el = eye_verts2d[:481,:] + v_er = eye_verts2d[481:,:] + vector_norm = 0.035 + # gaze vector of left eye in world space + gl_vector = 
el_inv[self.iris_idx_481].mean(axis=0) - el_inv[-1] + gl_vector = (gl_vector / np.linalg.norm(gl_vector)) * vector_norm + gl_point = el_inv[self.iris_idx_481].mean(axis=0) + gl_vector + # gaze vector of right eye in world space + gr_vector = er_inv[self.iris_idx_481].mean(axis=0) - er_inv[-1] + gr_vector = (gr_vector / np.linalg.norm(gr_vector)) * vector_norm + gr_point = er_inv[self.iris_idx_481].mean(axis=0) + gr_vector + g_el = self.project_shape_in_image(gl_point[None, :], R_t) + g_er = self.project_shape_in_image(gr_point[None, :], R_t) + g_el = g_el[:, :3].copy() + g_el[:, 1] = self.raw_height - g_el[:, 1] + g_er = g_er[:, :3].copy() + g_er[:, 1] = self.raw_height - g_er[:, 1] + pt1_l = v_el[self.iris_idx_481][:, [0, 1]].mean(axis=0).astype(np.int32) + pt2_l = g_el[0, [0, 1]].astype(np.int32) + pt1_r = v_er[self.iris_idx_481][:, [0, 1]].mean(axis=0).astype(np.int32) + pt2_r = g_er[0, [0, 1]].astype(np.int32) + return eye_verts3d, eye_verts2d, (pt1_l, pt2_l), (pt1_r, pt2_r) + + def solve(self, verts3d, verts2d): + print(verts3d.shape, verts2d.shape) + B = verts3d.shape[0] + R = np.zeros([B, 3, 3], dtype=np.float32) + t = np.zeros([B, 1, 3], dtype=np.float32) + for n in range(B): + _R, _t = solver_rigid(verts3d[n], verts2d[n], self.camera_matrix) + R[n] = _R + t[n,0] = _t + return R, t + + def solve_one(self, verts3d, verts2d): + R, t = solver_rigid(verts3d, verts2d, self.camera_matrix) + return R, t + + +def get(net, img, keypoints): + dst_pts = np.array([ + [38.2946, 51.6963], + [73.5318, 51.5014], + [56.0252, 71.7366], + [41.5493, 92.3655], + [70.7299, 92.2041] ], dtype=np.float32 ) + input_size = 256 + local_rank = 0 + + new_size = 144 + dst_pts[:,0] += ((new_size-112)//2) + dst_pts[:,1] += 8 + dst_pts[:,:] *= (input_size/float(new_size)) + tf = sktrans.SimilarityTransform() + tf.estimate(keypoints, dst_pts) + tform = tf.params[0:2,:] + img = cv2.warpAffine(img, tform, (input_size,)*2) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = np.transpose(img, 
(2, 0, 1)) + img = torch.from_numpy(img).unsqueeze(0).float() + img.div_(255).sub_(0.5).div_(0.5) + img_local = img.to(local_rank) + with torch.no_grad(): + if not net.use_eyes: + pred1, pred2, meta = net(img_local, is_flip=False) + else: + pred1, pred2, meta, eye_verts, eye_points = net(img_local, is_flip=False) + pred_verts = net.convert_verts(pred1, meta) + tform = torch.from_numpy(tform.reshape(1,2,3)) + pred_verts2d = net.convert_2d(pred2, tform, meta) + verts = pred_verts[0] + verts2d = pred_verts2d[0] + R, t = net.solve_one(verts, verts2d) + if not net.use_eyes: + return verts, verts2d + else: + R_t = np.zeros( (4,4), dtype=np.float32) + R_t[:3,:3] = R + R_t[3,:3] = t + R_t[3,3] = 1.0 + eye_verts, eye_verts2d, gaze_l, gaze_r = net.convert_eyes(eye_verts, eye_points, R_t, tform) + return verts, verts2d, eye_verts, eye_verts2d, gaze_l, gaze_r + + +if __name__ == "__main__": + import argparse + from utils.utils_config import get_config + from insightface.app import FaceAnalysis + parser = argparse.ArgumentParser(description='JMLR inference') + #parser.add_argument('config', type=str, help='config file') + config_file = 'configs/s1.py' + #config_file = 'configs/s2.py' + args = parser.parse_args() + cfg = get_config(config_file) + cfg2 = None + local_rank = 0 + #img = cv2.imread('sample.jpg') + net = JMLRInference(cfg, local_rank) + net = net.to(local_rank) + net.eval() + app = FaceAnalysis(allowed_modules='detection') + app.prepare(ctx_id=0, det_size=(640,640), det_thresh=0.5) + index = -1 + for img_path in glob.glob('/data/insightface/wcpa/image/222714/01_LeftToRight_Neutral/*.jpg'): + index+=1 + img = cv2.imread(img_path) + if index==0: + net.set_raw_image_size(img.shape[1], img.shape[0]) + draw = img.copy() + faces = app.get(img) + for face in faces: + if not net.use_eyes: + verts3d, verts2d = get(net, img, face.kps) + else: + verts3d, verts2d, eye_verts3d, eye_verts2d, gaze_l, gaze_r = get(net, img, face.kps) + #print(verts3d.shape, verts2d.shape, R.shape, 
t.shape, eye_verts3d.shape, eye_verts2d.shape) + for i in range(verts2d.shape[0]): + pt = verts2d[i].astype(np.int32) + cv2.circle(draw, pt, 2, (255,0,0), 2) + #eye_verts2d = eye_verts2d[:481,:] + if net.use_eyes: + for i in range(eye_verts2d.shape[0]): + pt = eye_verts2d[i].astype(np.int32) + cv2.circle(draw, pt, 2, (0,255,0), 2) + for gaze in [gaze_l, gaze_r]: + pt1, pt2 = gaze + cv2.arrowedLine(draw, pt1, pt2, [0, 0, 255], 10) + cv2.imwrite('./outputs/%04d.jpg'%index, draw) + diff --git a/insightface/reconstruction/jmlr/losses.py b/insightface/reconstruction/jmlr/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..8659f0c911d411b8671478fb1f433a575afcf2df --- /dev/null +++ b/insightface/reconstruction/jmlr/losses.py @@ -0,0 +1,111 @@ +import torch +from torch import nn +import torch.nn.functional as F +import kornia +import numpy as np + +#def loss_l1(a, b): + #_loss = torch.abs(a - b) + #_loss = torch.mean(_loss, dim=1) + ##if epoch>4 and cfg.loss_hard: + ## _loss, _ = torch.topk(_loss, k=int(cfg.batch_size*0.3)) + #_loss = torch.mean(_loss) + #return _loss + + + +def loss_pip(outputs_map, outputs_local_x, outputs_local_y, labels_map, labels_local_x, labels_local_y): + + tmp_batch, tmp_channel, tmp_height, tmp_width = outputs_map.size() + labels_map = labels_map.view(tmp_batch*tmp_channel, -1) + labels_max_ids = torch.argmax(labels_map, 1) + labels_max_ids = labels_max_ids.view(-1, 1) + + #print('TTT:', outputs_local_x.shape, tmp_batch, tmp_channel) + + outputs_local_x = outputs_local_x.reshape(tmp_batch*tmp_channel, -1) + outputs_local_x_select = torch.gather(outputs_local_x, 1, labels_max_ids) + outputs_local_y = outputs_local_y.reshape(tmp_batch*tmp_channel, -1) + outputs_local_y_select = torch.gather(outputs_local_y, 1, labels_max_ids) + + labels_local_x = labels_local_x.view(tmp_batch*tmp_channel, -1) + labels_local_x_select = torch.gather(labels_local_x, 1, labels_max_ids) + labels_local_y = 
labels_local_y.view(tmp_batch*tmp_channel, -1) + labels_local_y_select = torch.gather(labels_local_y, 1, labels_max_ids) + + labels_map = labels_map.view(tmp_batch, tmp_channel, tmp_height, tmp_width) + loss_map = F.mse_loss(outputs_map, labels_map) + loss_x = F.l1_loss(outputs_local_x_select, labels_local_x_select) + loss_y = F.l1_loss(outputs_local_y_select, labels_local_y_select) + return loss_map, loss_x, loss_y + +def eye_like(x: torch.Tensor, n: int) -> torch.Tensor: + return torch.eye(n, n, dtype=x.dtype, device=x.device).unsqueeze(0).repeat(x.shape[0], 1, 1) + +class ProjectLoss(nn.Module): + + def __init__(self,M_proj): + super(ProjectLoss, self).__init__() + img_w = 800 + img_h = 800 + M1 = np.array([ + [img_w/2, 0, 0, 0], + [ 0, img_h/2, 0, 0], + [ 0, 0, 1, 0], + [img_w/2, img_h/2, 0, 1] + ]) + M = M_proj @ M1 + M = M.astype(np.float32) + self.register_buffer('M', torch.from_numpy(M)) + + camera_matrix = M[:3,:3].T.copy() + camera_matrix[0,2] = 400 + camera_matrix[1,2] = 400 + camera_matrix[2,2] = 1 + intrinsics = np.array([camera_matrix]).astype(np.float64) + self.register_buffer('intrinsics', torch.from_numpy(intrinsics)) + + + self.eps = 1e-5 + #self.projector = Reprojector(img_w,img_h,M_proj) + #self.solver = PnPSolver(self.projector.M.numpy()) + #self.loss_fn = torch.nn.MSELoss(reduce=False, size_average=False) + #self.loss_fn = torch.nn.MSELoss() + self.loss_fn = torch.nn.L1Loss() + + + def forward(self,verts3d, points2d, affine): + # pred_2d_lmks Batch*N*2 + # verts Batch*N*3 + ones = torch.ones([points2d.shape[0] , points2d.shape[1], 1],device=points2d.device) + verts_homo = torch.cat((points2d, ones), 2) + K = eye_like(affine,3) + K[:,:2,:3] = affine + inv_k = torch.linalg.inv(K) + inv_k@verts_homo.permute(0,2,1) + points2d_inv = inv_k@verts_homo.permute(0,2,1) + points2d_inv = points2d_inv.permute(0,2,1)[:,:,:2] + + intrinsics = self.intrinsics.repeat([verts3d.shape[0],1,1 ]) + #print(verts3d.double().shape) + #print(points2d.double().shape) + 
class PolyScheduler(_LRScheduler):
    """Polynomial-decay learning-rate schedule with linear warmup.

    For the first ``warmup_steps`` iterations the LR ramps linearly from 0
    to ``base_lr``; afterwards it decays as
    ``base_lr * (1 - progress) ** power`` with ``power = 2``, reaching 0 at
    ``max_steps``.  ``step()`` is expected once per iteration.
    """

    def __init__(self, optimizer, base_lr, max_steps, warmup_steps, last_epoch=-1):
        self.base_lr = base_lr
        self.warmup_lr_init = 0.0001  # LR reported before the first step()
        self.max_steps: int = max_steps
        self.warmup_steps: int = warmup_steps
        self.power = 2
        # Fix: the deprecated positional `verbose` argument (removed in
        # recent torch) is no longer passed; default behavior is identical.
        super(PolyScheduler, self).__init__(optimizer, last_epoch)

    def get_warmup_lr(self):
        """Linear ramp: base_lr scaled by warmup progress."""
        ramp = self.base_lr * float(self.last_epoch) / float(self.warmup_steps)
        return [ramp for _ in self.optimizer.param_groups]

    def get_lr(self):
        if self.last_epoch == -1:
            return [self.warmup_lr_init for _ in self.optimizer.param_groups]
        if self.last_epoch < self.warmup_steps:
            return self.get_warmup_lr()
        # Quadratic polynomial decay over the post-warmup portion.
        progress = float(self.last_epoch - self.warmup_steps) / \
            float(self.max_steps - self.warmup_steps)
        decayed = self.base_lr * pow(1 - progress, self.power)
        return [decayed for _ in self.optimizer.param_groups]
class StepScheduler(_LRScheduler):
    """Step-decay learning-rate schedule with linear warmup.

    After the warmup ramp, the LR is ``base_lr * 0.1 ** k`` where ``k`` is
    the number of milestones in ``lr_steps`` already passed.
    """

    def __init__(self, optimizer, base_lr, lr_steps, warmup_steps, last_epoch=-1):
        self.base_lr = base_lr
        self.warmup_lr_init = 0.0001  # LR reported before the first step()
        self.lr_steps = lr_steps      # iteration milestones for 10x decay
        self.warmup_steps: int = warmup_steps
        # Fix: the deprecated positional `verbose` argument (removed in
        # recent torch) is no longer passed; default behavior is identical.
        super(StepScheduler, self).__init__(optimizer, last_epoch)

    def get_warmup_lr(self):
        """Linear ramp: base_lr scaled by warmup progress."""
        ramp = self.base_lr * float(self.last_epoch) / float(self.warmup_steps)
        return [ramp for _ in self.optimizer.param_groups]

    def get_lr(self):
        if self.last_epoch == -1:
            return [self.warmup_lr_init for _ in self.optimizer.param_groups]
        if self.last_epoch < self.warmup_steps:
            return self.get_warmup_lr()
        passed = len([m for m in self.lr_steps if m <= self.last_epoch])
        alpha = 0.1 ** passed
        return [self.base_lr * alpha for _ in self.optimizer.param_groups]


def get_scheduler(opt, cfg):
    """Build the LR scheduler selected by cfg.

    Priority: cfg.lr_func (LambdaLR) > cfg.lr_steps (StepScheduler) >
    polynomial decay (PolyScheduler).  Steps/warmup counts are expected to
    already be in iterations (cfg.total_steps / cfg.warmup_steps).
    """
    if cfg.lr_func is not None:
        return torch.optim.lr_scheduler.LambdaLR(
            optimizer=opt, lr_lambda=cfg.lr_func)
    if cfg.lr_steps is None:
        return PolyScheduler(
            optimizer=opt,
            base_lr=cfg.lr,
            max_steps=cfg.total_steps,
            warmup_steps=cfg.warmup_steps,
        )
    return StepScheduler(
        optimizer=opt,
        base_lr=cfg.lr,
        lr_steps=cfg.lr_steps,
        warmup_steps=cfg.warmup_steps,
    )
class RecBuilder():
    """Incrementally writes images and labels into an MXNet IndexedRecordIO
    dataset (``train.rec``/``train.idx`` or ``val.rec``/``val.idx`` under
    ``path``).

    NOTE(review): depends on ``mx`` (mxnet), ``np``, ``os`` and ``osp``
    imported at module level; the writer is never explicitly closed.
    """

    def __init__(self, path, image_size=(112, 112), is_train=True):
        # Destination directory for the .rec/.idx pair.
        self.path = path
        # NOTE(review): stored but not used inside this class.
        self.image_size = image_size
        self.widx = 0        # next record (image) index to write
        self.wlabel = 0      # next class label assigned by add()
        self.max_label = -1  # highest label written so far
        #assert not osp.exists(path), '%s exists' % path
        if is_train:
            rec_file = osp.join(path, 'train.rec')
            idx_file = osp.join(path, 'train.idx')
        else:
            rec_file = osp.join(path, 'val.rec')
            idx_file = osp.join(path, 'val.idx')
        #assert not osp.exists(rec_file), '%s exists' % rec_file
        if not osp.exists(path):
            os.makedirs(path)
        self.writer = mx.recordio.MXIndexedRecordIO(idx_file,
                                                    rec_file,
                                                    'w')
        # One {'image_index', 'image_classes'} dict per record written by add().
        self.meta = []

    def add(self, imgs):
        """Write a list of images under one auto-assigned class label.

        ndarray images must be BGR (cv2 convention); they are JPEG-encoded.
        Returns the label that was assigned to this group.
        """
        #!!! img should be BGR!!!!
        #assert label >= 0
        #assert label > self.last_label
        assert len(imgs) > 0
        label = self.wlabel
        for img in imgs:
            idx = self.widx
            image_meta = {'image_index': idx, 'image_classes': [label]}
            header = mx.recordio.IRHeader(0, label, idx, 0)
            if isinstance(img, np.ndarray):
                # Raw BGR array: JPEG-encode into the record payload.
                s = mx.recordio.pack_img(header,img,quality=95,img_fmt='.jpg')
            else:
                # Pre-encoded / non-image payload: pack bytes as-is.
                s = mx.recordio.pack(header, img)
            self.writer.write_idx(idx, s)
            self.meta.append(image_meta)
            self.widx += 1
        self.max_label = label
        self.wlabel += 1
        return label


    def add_image(self, img, label):
        """Write a single image with a caller-chosen label.

        Unlike add(), this performs no label bookkeeping (wlabel/meta are
        untouched) and uses JPEG quality 100.
        """
        #!!! img should be BGR!!!!
        #assert label >= 0
        #assert label > self.last_label
        idx = self.widx
        header = mx.recordio.IRHeader(0, label, idx, 0)
        if isinstance(img, np.ndarray):
            s = mx.recordio.pack_img(header,img,quality=100,img_fmt='.jpg')
        else:
            s = mx.recordio.pack(header, img)
        self.writer.write_idx(idx, s)
        self.widx += 1

    def close(self):
        # NOTE(review): only prints counters; the underlying record writer
        # is not closed here — confirm callers rely on process exit.
        print('stat:', self.widx, self.wlabel)
def main(args):
    """JMLR distributed training entry point.

    Expects torchrun-style env vars (WORLD_SIZE/RANK); falls back to a
    single-process NCCL group when they are absent.  ``args.config`` names a
    file under configs/ consumed by get_config().
    """
    cfg = get_config(args.config)
    # TF32 matmul/conv toggles (Ampere+); cfg.tf32 controls both knobs.
    if not cfg.tf32:
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False
    else:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    try:
        world_size = int(os.environ['WORLD_SIZE'])
        rank = int(os.environ['RANK'])
        dist.init_process_group('nccl')
    except KeyError:
        # Not launched by a distributed launcher: single-process group.
        world_size = 1
        rank = 0
        dist.init_process_group(backend='nccl',
                                init_method="tcp://127.0.0.1:12584",
                                rank=rank, world_size=world_size)

    local_rank = args.local_rank
    torch.cuda.set_device(local_rank)

    # Bug fix: was `rank is 0` (identity test on an int — a CPython
    # small-int-cache accident); use equality.
    if not os.path.exists(cfg.output) and rank == 0:
        os.makedirs(cfg.output)
    else:
        time.sleep(2)  # give rank 0 a head start creating the directory

    log_root = logging.getLogger()
    init_logging(log_root, rank, cfg.output)
    if rank == 0:
        logging.info(args)
        logging.info(cfg)
        print(cfg.flipindex.shape, cfg.flipindex[400:410])

    train_set = MXFaceDataset(cfg=cfg, is_train=True, local_rank=local_rank)
    # Derive iteration-based schedule settings from epoch-based config.
    cfg.num_images = len(train_set)
    cfg.world_size = world_size
    total_batch_size = cfg.batch_size * cfg.world_size
    epoch_steps = cfg.num_images // total_batch_size
    cfg.warmup_steps = epoch_steps * cfg.warmup_epochs
    if cfg.max_warmup_steps > 0:
        cfg.warmup_steps = min(cfg.max_warmup_steps, cfg.warmup_steps)
    cfg.total_steps = epoch_steps * cfg.num_epochs
    if cfg.lr_epochs is not None:
        cfg.lr_steps = [m * epoch_steps for m in cfg.lr_epochs]
    else:
        cfg.lr_steps = None

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set, shuffle=True)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_set, batch_size=cfg.batch_size,
        sampler=train_sampler, num_workers=4, pin_memory=False, drop_last=True)

    net = get_network(cfg).to(local_rank)

    if cfg.resume:
        try:
            # Resume from the lexicographically-latest backbone checkpoint.
            ckpts = list(glob.glob(osp.join(cfg.resume_path, "backbone*.pth")))
            backbone_pth = sorted(ckpts)[-1]
            backbone_ckpt = torch.load(backbone_pth, map_location=torch.device(local_rank))
            net.load_state_dict(backbone_ckpt['model'])
            if rank == 0:
                logging.info("backbone resume successfully! %s" % backbone_pth)
        except (FileNotFoundError, KeyError, IndexError, RuntimeError):
            logging.info("resume fail!!")
            raise RuntimeError("backbone resume failed")

    net = torch.nn.parallel.DistributedDataParallel(
        module=net, broadcast_buffers=False, device_ids=[local_rank])
    net.train()

    if cfg.opt == 'sgd':
        opt = torch.optim.SGD(
            params=[{"params": net.parameters()}],
            lr=cfg.lr, momentum=0.9, weight_decay=cfg.weight_decay)
    elif cfg.opt == 'adam':
        opt = torch.optim.Adam(
            params=[{"params": net.parameters()}],
            lr=cfg.lr)
    elif cfg.opt == 'adamw':
        opt = torch.optim.AdamW(
            params=[{"params": net.parameters()}],
            lr=cfg.lr, weight_decay=cfg.weight_decay)

    scheduler = get_scheduler(opt, cfg)
    if cfg.resume:
        if rank == 0:
            logging.info(opt)
        # Bug fix: this loop referenced the undefined name `opt_pfc`
        # (NameError whenever cfg.resume was set); reset the real optimizer.
        for g in opt.param_groups:
            for key in ['lr', 'initial_lr']:
                g[key] = cfg.lr

    start_epoch = 0
    total_step = cfg.total_steps
    if rank == 0:
        logging.info(opt)
        logging.info("Total Step is: %d" % total_step)

    # Running averages per loss term; extra keys are added lazily below.
    loss = {
        'Loss': AverageMeter(),
    }

    global_step = 0
    grad_amp = None
    if cfg.fp16 > 0:
        if cfg.fp16 == 1:
            grad_amp = torch.cuda.amp.grad_scaler.GradScaler(growth_interval=100)
        elif cfg.fp16 == 2:
            grad_amp = MaxClipGradScaler(64, 1024, growth_interval=200)
        elif cfg.fp16 == 3:
            grad_amp = MaxClipGradScaler(4, 8, growth_interval=200)
        else:
            # Bug fix: was `assert 'fp16 mode not set'`, an always-true
            # assert on a non-empty string — it could never fire.
            raise ValueError('unsupported fp16 mode: %s' % cfg.fp16)

    callback_checkpoint = CallBackModelCheckpoint(rank, cfg)
    callback_checkpoint(global_step, net, opt)  # initial checkpoint at step 0
    callback_logging = CallBackLogging(50, rank, total_step, cfg.batch_size, world_size, None)

    tris = get_tris(cfg)
    tri_index = torch.tensor(tris, dtype=torch.long).to(local_rank)
    use_eyes = cfg.eyes is not None

    for epoch in range(start_epoch, cfg.num_epochs):
        train_sampler.set_epoch(epoch)
        for step, value in enumerate(train_loader):
            global_step += 1
            img = value['img_local'].to(local_rank)
            dloss = {}
            assert cfg.task == 0
            label_verts = value['verts'].to(local_rank)
            label_points2d = value['points2d'].to(local_rank)
            preds = net(img)

            # Network emits one flat vector per sample: 1220 mesh verts
            # (3D + 2D), plus 2x481 eye verts (3D + 2D) when enabled.
            if use_eyes:
                pred_verts, pred_points2d, pred_eye_verts, pred_eye_points = preds.split(
                    [1220 * 3, 1220 * 2, 481 * 2 * 3, 481 * 2 * 2], dim=1)
                pred_eye_verts = pred_eye_verts.view(cfg.batch_size, 481 * 2, 3)
                pred_eye_points = pred_eye_points.view(cfg.batch_size, 481 * 2, 2)
            else:
                pred_verts, pred_points2d = preds.split([1220 * 3, 1220 * 2], dim=1)
            pred_verts = pred_verts.view(cfg.batch_size, 1220, 3)
            pred_points2d = pred_points2d.view(cfg.batch_size, 1220, 2)

            if not cfg.use_rtloss:
                loss1 = F.l1_loss(pred_verts, label_verts)
            else:
                # Compare vertices after applying the ground-truth rigid
                # transform (verts are scaled by 1/10 around the matmul).
                label_Rt = value['rt'].to(local_rank)
                _ones = torch.ones([pred_verts.shape[0], 1220, 1],
                                   device=pred_verts.device)
                pred_verts = torch.cat([pred_verts / 10, _ones], dim=2)
                pred_verts = torch.bmm(pred_verts, label_Rt) * 10.0
                label_verts = torch.cat([label_verts / 10, _ones], dim=2)
                label_verts = torch.bmm(label_verts, label_Rt) * 10.0
                loss1 = F.l1_loss(pred_verts, label_verts)

            loss2 = F.l1_loss(pred_points2d, label_points2d)
            loss3d = loss1 * cfg.lossw_verts3d
            loss2d = loss2 * cfg.lossw_verts2d
            dloss['Loss'] = loss3d + loss2d
            dloss['Loss3D'] = loss3d
            dloss['Loss2D'] = loss2d

            if use_eyes:
                label_eye_verts = value['eye_verts'].to(local_rank)
                label_eye_points = value['eye_points'].to(local_rank)
                loss3 = F.l1_loss(pred_eye_verts, label_eye_verts)
                loss4 = F.l1_loss(pred_eye_points, label_eye_points)
                loss3 = loss3 * cfg.lossw_eyes3d
                loss4 = loss4 * cfg.lossw_eyes2d
                dloss['Loss'] += loss3
                dloss['Loss'] += loss4
                dloss['LossEye3d'] = loss3
                dloss['LossEye2d'] = loss4

            if cfg.loss_bone3d:
                # Edge-length ("bone") consistency over mesh triangles, 3D.
                bone_losses = []
                for i in range(3):
                    pred_verts_x = pred_verts[:, tri_index[:, i % 3], :]
                    pred_verts_y = pred_verts[:, tri_index[:, (i + 1) % 3], :]
                    label_verts_x = label_verts[:, tri_index[:, i % 3], :]
                    label_verts_y = label_verts[:, tri_index[:, (i + 1) % 3], :]
                    dist_pred = torch.norm(pred_verts_x - pred_verts_y,
                                           p=2, dim=-1, keepdim=False)
                    dist_label = torch.norm(label_verts_x - label_verts_y,
                                            p=2, dim=-1, keepdim=False)
                    bone_losses.append(F.l1_loss(dist_pred, dist_label) * cfg.lossw_bone3d)
                _loss = sum(bone_losses)
                dloss['Loss'] += _loss
                dloss['LossBone3d'] = _loss

            if cfg.loss_bone2d:
                # Same edge-length consistency on the projected 2D points.
                bone_losses = []
                for i in range(3):
                    pred_points2d_x = pred_points2d[:, tri_index[:, i % 3], :]
                    pred_points2d_y = pred_points2d[:, tri_index[:, (i + 1) % 3], :]
                    label_points2d_x = label_points2d[:, tri_index[:, i % 3], :]
                    label_points2d_y = label_points2d[:, tri_index[:, (i + 1) % 3], :]
                    dist_pred = torch.norm(pred_points2d_x - pred_points2d_y,
                                           p=2, dim=-1, keepdim=False)
                    dist_label = torch.norm(label_points2d_x - label_points2d_y,
                                            p=2, dim=-1, keepdim=False)
                    bone_losses.append(F.l1_loss(dist_pred, dist_label) * cfg.lossw_bone2d)
                _loss = sum(bone_losses)
                dloss['Loss'] += _loss
                dloss['LossBone2d'] = _loss

            iter_loss = dloss['Loss']

            if cfg.fp16 > 0:
                grad_amp.scale(iter_loss).backward()
                grad_amp.unscale_(opt)
                if cfg.fp16 < 2:
                    torch.nn.utils.clip_grad_norm_(net.parameters(), 5)
                grad_amp.step(opt)
                grad_amp.update()
            else:
                iter_loss.backward()
                opt.step()
            opt.zero_grad()

            # Custom schedulers step once per iteration; a LambdaLR built
            # from cfg.lr_func steps once per epoch below.
            if cfg.lr_func is None:
                scheduler.step()

            with torch.no_grad():
                loss['Loss'].update(iter_loss.item(), 1)
                for k in dloss:
                    if k == 'Loss':
                        continue
                    v = dloss[k].item()
                    if k not in loss:
                        loss[k] = AverageMeter()
                    loss[k].update(v, 1)
            callback_logging(global_step, loss, epoch, cfg.fp16, grad_amp, opt)

        if cfg.lr_func is not None:
            scheduler.step()

    callback_checkpoint(9999, net, opt)  # final checkpoint
    dist.destroy_process_group()


if __name__ == "__main__":
    torch.backends.cudnn.benchmark = True
    parser = argparse.ArgumentParser(description='JMLR Training')
    parser.add_argument('config', type=str, help='config file')
    parser.add_argument('--local_rank', type=int, default=0, help='local_rank')
    args_ = parser.parse_args()
    main(args_)
class _MultiDeviceReplicator(object):
    """
    Lazily serves copies of a tensor to requested devices. Copies are cached per-device.
    """

    def __init__(self, master_tensor: torch.Tensor) -> None:
        # The master copy must live on a CUDA device; per-device replicas
        # are created on demand in get().
        assert master_tensor.is_cuda
        self.master = master_tensor
        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}

    def get(self, device) -> torch.Tensor:
        """Return (and cache) a copy of the master tensor on `device`."""
        retval = self._per_device_tensors.get(device, None)
        if retval is None:
            retval = self.master.to(device=device, non_blocking=True, copy=True)
            self._per_device_tensors[device] = retval
        return retval


class MaxClipGradScaler(GradScaler):
    """AMP GradScaler whose loss scale is clamped to `max_scale`.

    NOTE(review): relies on GradScaler private internals (`_scale`,
    `_enabled`, `_lazy_init_scale_growth_tracker`), so it is coupled to the
    torch version it was written against — verify on torch upgrades.
    """

    def __init__(self, init_scale, max_scale: float, growth_interval=100):
        GradScaler.__init__(self, init_scale=init_scale, growth_interval=growth_interval)
        self.max_scale = max_scale

    def scale_clip(self):
        """Clamp the current scale to max_scale; freeze growth at the cap."""
        if self.get_scale() == self.max_scale:
            self.set_growth_factor(1)
        elif self.get_scale() < self.max_scale:
            # Below the cap: allow the scale to keep doubling.
            self.set_growth_factor(2)
        elif self.get_scale() > self.max_scale:
            # Overshot (e.g. loaded state): force back down to the cap.
            self._scale.fill_(self.max_scale)
            self.set_growth_factor(1)

    def scale(self, outputs):
        """
        Multiplies ('scales') a tensor or list of tensors by the scale factor.

        Returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, outputs are returned
        unmodified.

        Arguments:
            outputs (Tensor or iterable of Tensors): Outputs to scale.
        """
        if not self._enabled:
            return outputs
        # Enforce the cap before applying the scale factor.
        self.scale_clip()
        # Short-circuit for the common case.
        if isinstance(outputs, torch.Tensor):
            assert outputs.is_cuda
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            assert self._scale is not None
            return outputs * self._scale.to(device=outputs.device, non_blocking=True)

        # Invoke the more complex machinery only if we're treating multiple outputs.
        stash: List[_MultiDeviceReplicator] = []  # holds a reference that can be overwritten by apply_scale

        def apply_scale(val):
            # Recursively scale tensors inside (possibly nested) iterables.
            if isinstance(val, torch.Tensor):
                assert val.is_cuda
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    assert self._scale is not None
                    stash.append(_MultiDeviceReplicator(self._scale))
                return val * stash[0].get(val.device)
            elif isinstance(val, container_abcs.Iterable):
                iterable = map(apply_scale, val)
                # Preserve list/tuple container types; other iterables stay lazy.
                if isinstance(val, list) or isinstance(val, tuple):
                    return type(val)(iterable)
                else:
                    return iterable
            else:
                raise ValueError("outputs must be a Tensor or an iterable of Tensors")
        return apply_scale(outputs)
class CallBackLogging(object):
    """Rank-0 periodic training logger: throughput, losses, LR, ETA, memory.

    `loss` passed to __call__ maps name -> AverageMeter-like object exposing
    .avg and .reset().  Requires `psutil` at module level for memory stats.
    """

    def __init__(self, frequent, rank, total_step, batch_size, world_size, writer=None):
        self.frequent: int = frequent    # log every `frequent` global steps
        self.rank: int = rank
        self.time_start = time.time()
        self.total_step: int = total_step
        self.batch_size: int = batch_size
        self.world_size: int = world_size
        self.writer = writer             # optional tensorboard-style writer

        self.init = False                # True after the first logging tick
        self.tic = 0

    def __call__(self, global_step, loss, epoch, fp16, grad_scaler, opt):
        # Bug fix: was `self.rank is 0` — identity comparison on an int,
        # which only works via CPython's small-int cache; use equality.
        if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0:
            if self.init:
                try:
                    speed: float = self.frequent * self.batch_size / (time.time() - self.tic)
                    speed_total = speed * self.world_size
                except ZeroDivisionError:
                    speed_total = float('inf')

                # ETA from the average wall-time per completed step.
                time_now = (time.time() - self.time_start) / 3600.0
                time_total = time_now / ((global_step + 1) / self.total_step)
                time_for_end = time_total - time_now
                lr = opt.param_groups[0]['lr']
                if self.writer is not None:
                    self.writer.add_scalar('time_for_end', time_for_end, global_step)
                mem = psutil.virtual_memory()
                mem_used = mem.used / (1024 ** 3)  # GiB
                loss_str = " ".join("%s:%.4f" % (k, v.avg) for k, v in loss.items())
                if fp16:
                    msg = "Speed %.2f samples/sec %s Epoch: %d Global Step: %d LR: %.8f " \
                          "Fp16 Grad Scale: %2.f Required: %.1f hours MemUsed: %.3f" % (
                              speed_total, loss_str, epoch, global_step, lr,
                              grad_scaler.get_scale(), time_for_end, mem_used
                          )
                else:
                    msg = "Speed %.2f samples/sec %s Epoch: %d Global Step: %d LR: %.8f Required: %.1f hours MemUsed: %.3f" % (
                        speed_total, loss_str, epoch, global_step, lr, time_for_end, mem_used
                    )
                logging.info(msg)
                for v in loss.values():
                    v.reset()
                self.tic = time.time()
            else:
                # First tick only arms the timer so speed is meaningful.
                self.init = True
                self.tic = time.time()


class CallBackModelCheckpoint(object):
    """Rank-0 checkpoint writer for the backbone (and optionally optimizer)."""

    def __init__(self, rank, cfg):
        self.rank = rank
        self.output = cfg.output      # checkpoint directory
        #self.save_pfc = cfg.save_pfc
        #self.save_onnx = cfg.save_onnx
        self.save_opt = cfg.save_opt  # also persist optimizer state if True

    def __call__(self, epoch, backbone, opt_backbone):
        if self.rank == 0:
            path_module = os.path.join(self.output, "backbone_ep%04d.pth" % epoch)
            if self.save_opt:
                data = {
                    'model': backbone.module.state_dict(),
                    'optimizer': opt_backbone.state_dict(),
                }
            else:
                data = backbone.module.state_dict()
            torch.save(data, path_module)
            logging.info("Pytorch Model Saved in '{}'".format(path_module))
class AverageMeter(object):
    """Computes and stores the average and current value."""

    def __init__(self):
        self.val = None    # most recent value
        self.avg = None    # running weighted average
        self.sum = None    # weighted sum of values
        self.count = None  # total weight seen so far
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record `val` with weight `n` and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def init_logging(log_root, rank, models_root):
    """Attach file + stdout handlers to `log_root` on rank 0 only.

    The log file is written to `models_root`/training.log; non-zero ranks
    are left untouched so only one process writes logs.
    """
    # Bug fix: was `rank is 0` (identity test on an int); use equality.
    if rank == 0:
        log_root.setLevel(logging.INFO)
        formatter = logging.Formatter("Training: %(asctime)s-%(message)s")
        handler_file = logging.FileHandler(os.path.join(models_root, "training.log"))
        handler_stream = logging.StreamHandler(sys.stdout)
        handler_file.setFormatter(formatter)
        handler_stream.setFormatter(formatter)
        log_root.addHandler(handler_file)
        log_root.addHandler(handler_stream)
        log_root.info('rank_id: %d' % rank)
def l2_distance(a, b):
    """Mean per-point Euclidean distance between two batches of point sets.

    Parameters
    ----------
    a, b : np.ndarray of shape (batch, num_points, dim)

    Returns
    -------
    np.ndarray of shape (batch,)
        Mean point-to-point L2 distance for each sample.
    """
    # Bug fix: the original also computed `np.sqrt(np.sum(np.square(a-b),
    # axis=1))` into a local that was never used — dead work, removed.
    return np.sqrt(((a - b) ** 2).sum(axis=2)).mean(axis=1)
torch.nn.parallel.DistributedDataParallel( + # module=net, broadcast_buffers=False, device_ids=[local_rank]) + sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False) + loader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + shuffle=False, + pin_memory=True, + num_workers=3, + drop_last=False, + ) + num_epochs = 1 + all_pred_verts = torch.zeros((len(dataset),1220,3), requires_grad=False).to(local_rank) + all_pred_R = torch.zeros((len(dataset),3,3), requires_grad=False).to(local_rank) + all_pred_t = torch.zeros((len(dataset),1,3), requires_grad=False).to(local_rank) + all_pred_verts2d = torch.zeros((len(dataset),1220,2), requires_grad=False).to(local_rank) + + all_label_verts = torch.zeros((len(dataset),1220,3), requires_grad=False).to(local_rank) + all_label_R = torch.zeros((len(dataset),3,3), requires_grad=False).to(local_rank) + all_label_t = torch.zeros((len(dataset),1,3), requires_grad=False).to(local_rank) + all_label_verts2d = torch.zeros((len(dataset),1220,2), requires_grad=False).to(local_rank) + all_weight = 0.0 + FLIPS = [False, True] if cfg.enable_flip else [False] + #FLIPS = [False] + if local_rank==0: + print('FLIPS:', FLIPS) + for epoch in range(num_epochs): + weight = 1.0 + + if epoch>0: + dataset.set_test_aug() + weight = 0.6 + all_weight += weight + #all_distance = torch.zeros((len(dataset),), requires_grad=False).to(local_rank) + diff_R = [] + diff_t = [] + sampler.set_epoch(epoch) + for idx, sample in enumerate(loader): + img_local = sample['img_local'] + label_verts = sample['verts'] + tform = sample['tform'] + label_6dof = sample['6dof'] + data_idx = sample['idx'] + label_verts2d = sample['verts2d'] + img_local = img_local.to(local_rank) + pred_verts, pred_verts2d, pred_points2d = [], [], [] + for is_flip in FLIPS: + with torch.no_grad(): + #pred_verts, R_pred, t_pred = infer.forward(img_local, img_raw, tform) + #pred1, pred2 = net(img_local.to(local_rank), img_raw.to(local_rank)) 
+ pred1, pred2, meta = net(img_local, is_flip=is_flip) + _pred_verts = net.convert_verts(pred1, meta) + pred_verts.append(_pred_verts) + _pred_verts2d, _pred_points2d = net.convert_2d(pred2, tform, meta) + pred_verts2d.append(_pred_verts2d) + pred_points2d.append(_pred_points2d) + pred_verts = sum(pred_verts) / len(pred_verts) + pred_verts2d = sum(pred_verts2d) / len(pred_verts2d) + pred_points2d = sum(pred_points2d) / len(pred_points2d) + R_pred, t_pred = net.solve(pred_verts, pred_verts2d) + label_6dof = label_6dof.cpu().numpy() + label_6dof = label_6dof * cfg.label_6dof_std.reshape(1, 6) + cfg.label_6dof_mean.reshape(1,6) + + R_label, t_label = Rt_from_6dof(label_6dof) + diff_R.append(np.mean(np.abs(R_pred - R_label))) + diff_t.append(np.mean(np.abs(t_pred - t_label))) + #distance = torch.tensor(distance, dtype=torch.float32, requires_grad=False).to(local_rank) + data_idx = data_idx.view(-1) + #all_distance[data_idx] = distance + label_verts = label_verts.view(-1,1220,3) / 10.0 + if epoch==0: + all_label_verts[data_idx,:,:] = label_verts.to(local_rank) + all_label_R[data_idx,:,:] = torch.tensor(R_label).to(local_rank) + all_label_t[data_idx,:,:] = torch.tensor(t_label).to(local_rank) + all_label_verts2d[data_idx,:,:] = label_verts2d.to(local_rank) + all_pred_verts[data_idx,:,:] += torch.tensor(pred_verts).to(local_rank) * weight + #all_pred_R[data_idx,:,:] += torch.tensor(R_pred).to(local_rank) * weight + #all_pred_t[data_idx,:,:] += torch.tensor(t_pred).to(local_rank) * weight + all_pred_verts2d[data_idx,:,:] += torch.tensor(pred_verts2d).to(local_rank) * weight + if idx%20==0 and local_rank==0: + print('processing-epoch-idx:', epoch, idx) + #print('distance:', distance.shape, distance.cpu().numpy().mean()) + print('diff_R:', np.mean(diff_R)) + print('diff_t:', np.mean(diff_t)) + + dist.all_reduce(all_label_verts, op=dist.ReduceOp.SUM) + dist.all_reduce(all_label_verts2d, op=dist.ReduceOp.SUM) + dist.all_reduce(all_label_R, op=dist.ReduceOp.SUM) + 
dist.all_reduce(all_label_t, op=dist.ReduceOp.SUM) + + dist.all_reduce(all_pred_verts, op=dist.ReduceOp.SUM) + dist.all_reduce(all_pred_verts2d, op=dist.ReduceOp.SUM) + #dist.all_reduce(all_pred_R, op=dist.ReduceOp.SUM) + #dist.all_reduce(all_pred_t, op=dist.ReduceOp.SUM) + #dist.all_reduce(all_distance, op=dist.ReduceOp.SUM) + if local_rank==0: + label_verts = all_label_verts.cpu().numpy() + label_verts2d = all_label_verts2d.cpu().numpy() + R_label = all_label_R.cpu().numpy() + t_label = all_label_t.cpu().numpy() + + pred_verts = all_pred_verts.cpu().numpy() / all_weight + #R_pred = all_pred_R.cpu().numpy() / all_weight + #t_pred = all_pred_t.cpu().numpy() / all_weight + pred_verts2d = all_pred_verts2d.cpu().numpy() / all_weight + R_pred, t_pred = net.solve(pred_verts, pred_verts2d) + #R_pred, t_pred = net.solve(pred_verts, label_verts2d) + #R_pred, t_pred = net.solve(label_verts, pred_verts2d) + + + X1 = label_verts @ R_label + t_label + X2 = pred_verts @ R_pred + t_pred + X3 = label_verts @ R_pred + t_pred + X4 = pred_verts @ R_label + t_label + distance = l2_distance(X1, X2) + l2_distance(X1, X3) + 10.0*l2_distance(X1,X4) + distance *= 1000.0 + + print('top20 distance:', np.mean(distance[:20])) + + + score = np.mean(distance) + print('epoch distance:', epoch, score) + with open(os.path.join(cfg.output, 'val.txt'), 'w') as f: + f.write("%f\n"%score) + +if __name__ == "__main__": + #torch.backends.cudnn.benchmark = True + parser = argparse.ArgumentParser(description='JMLR validation') + parser.add_argument('config', type=str, help='config file') + parser.add_argument('--local_rank', type=int, default=0, help='local_rank') + args_ = parser.parse_args() + main(args_) + diff --git a/insightface/reconstruction/ostec/.gitignore b/insightface/reconstruction/ostec/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..15a12ee9a1db34998c352f8cfc999939310e4fa6 --- /dev/null +++ b/insightface/reconstruction/ostec/.gitignore @@ -0,0 +1,8 @@ 
+models/* +!models/topology +.idea/* +*.so +/.ipynb_checkpoints/* +results/* +.stylegan2-cache/* +samples/* diff --git a/insightface/reconstruction/ostec/.gitmodules b/insightface/reconstruction/ostec/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..672473ea09a2d0f4e204d29f6ba39ae71dd69698 --- /dev/null +++ b/insightface/reconstruction/ostec/.gitmodules @@ -0,0 +1,3 @@ +[submodule "external/deep3dfacerecon"] + path = external/deep3dfacerecon + url = https://github.com/barisgecer/Deep3DFaceRecon_pytorch diff --git a/insightface/reconstruction/ostec/README.md b/insightface/reconstruction/ostec/README.md new file mode 100644 index 0000000000000000000000000000000000000000..02e912265cbb5154b98a28e6a270a442f40e9d3a --- /dev/null +++ b/insightface/reconstruction/ostec/README.md @@ -0,0 +1,210 @@ +# [OSTeC: One-Shot Texture Complition](https://openaccess.thecvf.com/content/CVPR2021/html/Gecer_OSTeC_One-Shot_Texture_Completion_CVPR_2021_paper.html) +#### [CVPR 2021] +[![arXiv Prepring](https://img.shields.io/badge/arXiv-Preprint-lightgrey?logo=arxiv)](https://arxiv.org/abs/2012.15370) +[![License: CC BY-NC-SA 4.0](https://img.shields.io/badge/License-CC%20BY--NC--SA%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc-sa/4.0/) + + + [Baris Gecer](http://barisgecer.github.io) 1,2, [Jiankang Deng](https://jiankangdeng.github.io/) 1,2, & [Stefanos Zafeiriou](https://wp.doc.ic.ac.uk/szafeiri/) 1,2 +
+ 1 Imperial College London +
+ 2 Huawei CBG +
+ +

+ + + + + + + +

+

+ +## Abstract + +The last few years have witnessed the great success of non-linear generative models in synthesizing high-quality photorealistic face images. Many recent 3D facial texture reconstruction and pose manipulation from a single image approaches still rely on large and clean face datasets to train image-to-image Generative Adversarial Networks (GANs). Yet the collection of such a large scale high-resolution 3D texture dataset is still very costly and difficult to maintain age/ethnicity balance. Moreover, regression-based approaches suffer from generalization to the in-the-wild conditions and are unable to fine-tune to a target-image. In this work, we propose an unsupervised approach for one-shot 3D facial texture completion that does not require large-scale texture datasets, but rather harnesses the knowledge stored in 2D face generators. The proposed approach rotates an input image in 3D and fill-in the unseen regions by reconstructing the rotated image in a 2D face generator, based on the visible parts. Finally, we stitch the most visible textures at different angles in the UV image-plane. Further, we frontalize the target image by projecting the completed texture into the generator. The qualitative and quantitative experiments demonstrate that the completed UV textures and frontalized images are of high quality, resembles the original identity, can be used to train a texture GAN model for 3DMM fitting and improve pose-invariant face recognition. + +## Overview + +

+Overview of the method. The proposed approach iteratively optimizes the texture UV-maps for different re-rendered images with their masks. At the end of each optimization, generated images are used to acquire partial UV images by dense landmarks. Finally, the completed UV images are fed to the next iteration for progressive texture building. +
+ + +## Requirements +**This implementation is only tested under Ubuntu environment with Nvidia GPUs and CUDA 10.0 and CuDNN-7.0 installed.** + +## Installation +### 1. Clone the repository and set up a conda environment as follows: +``` +git clone https://github.com/barisgecer/OSTeC --recursive +cd OSTeC +conda env create -f environment.yml -n ostec +source activate ostec +``` + +### 2. Installation of Deep3DFaceRecon_pytorch +- **2.a.** Install Nvdiffrast library: +``` +cd external/deep3dfacerecon/nvdiffrast # ./OSTeC/external/deep3dfacerecon/nvdiffrast +pip install . +``` +- **2.b.** Install Arcface Pytorch: +``` +cd .. # ./OSTeC/external/deep3dfacerecon/ +git clone https://github.com/deepinsight/insightface.git +cp -r ./insightface/recognition/arcface_torch/ ./models/ +``` + +- **2.c.** Prepare prerequisite models: Deep3DFaceRecon_pytorch method uses [Basel Face Model 2009 (BFM09)](https://faces.dmi.unibas.ch/bfm/main.php?nav=1-0&id=basel_face_model) to represent 3d faces. Get access to BFM09 using this [link](https://faces.dmi.unibas.ch/bfm/main.php?nav=1-2&id=downloads). After getting the access, download "01_MorphableModel.mat". In addition, we use an Expression Basis provided by [Guo et al.](https://github.com/Juyong/3DFace). Download the Expression Basis (Exp_Pca.bin) using this [link (google drive)](https://drive.google.com/file/d/1bw5Xf8C12pWmcMhNEu6PtsYVZkVucEN6/view?usp=sharing). Organize all files into the following structure: +``` +OSTeC +│ +└─── external + │ + └─── deep3dfacerecon + │ + └─── BFM + │ + └─── 01_MorphableModel.mat + │ + └─── Exp_Pca.bin + | + └─── ... 
+``` +- **2.d.** Deep3DFaceRecon_pytorch provides a model trained on a combination of [CelebA](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html), +[LFW](http://vis-www.cs.umass.edu/lfw/), [300WLP](http://www.cbsr.ia.ac.cn/users/xiangyuzhu/projects/3DDFA/main.htm), +[IJB-A](https://www.nist.gov/programs-projects/face-challenges), [LS3D-W](https://www.adrianbulat.com/face-alignment), and [FFHQ](https://github.com/NVlabs/ffhq-dataset) datasets. Download the pre-trained model using this [link (google drive)](https://drive.google.com/drive/folders/1liaIxn9smpudjjqMaWWRpP0mXRW_qRPP?usp=sharing) and organize the directory into the following structure: +``` +OSTeC +│ +└─── external + │ + └─── deep3dfacerecon + │ + └─── checkpoints + │ + └─── face_recon + │ + └─── epoch_latest.pth + +``` +### 3. Download Face Recognition \& Landmark Detection \& VGG \& Style-Encoder models +- Download the models here: https://drive.google.com/file/d/1TBoNt55vleRkMZaT9XKt6oNQmo8hkN-Q/view?usp=sharing + +- And place it under 'models' directory like the following: +``` +OSTeC +│ +└─── models + │ + └─── resnet_18_20191231.h5 + │ + └─── vgg16_zhang_perceptual.pkl + │ + └─── alignment + │ . + │ . + │ + └─── fr_models + . + . + +``` + +### 4. Download Topology info files +- Download the topology files here: https://drive.google.com/file/d/1mvb2uDMPNGL1MlBgP6Op00gPdEMQUUWb/view?usp=sharing + +- And place it under 'models/topology' directory like the following: +``` +OSTeC +│ +└─── models + │ . + │ . + │ . + │ + └─── topology + │ + └─── trilist.pkl + │ + └─── tcoords.pkl + . + . + +``` + + + +### 5. 
Download Face Segmentation models +- Download the Graphonomy model here: https://drive.google.com/file/d/1eUe18HoH05p0yFUd_sN6GXdTj82aW0m9/view?usp=sharing +(If the link doesn't work for some reason check the original [Graphonomy](https://github.com/Gaoyiminggithub/Graphonomy) github page and download 'CIHP trained model') + +- And place it under 'models' directory like the following: +``` +OSTeC +│ +└─── models + │ + └─── Graphonomy + │ + └─── inference.pth +``` + + + +## Usage +- Run ```python run_ostec.py --source_dir [source_dir] --save_dir [save_dir] [-f] -i [iterations (default 200)] -m [soft|hard|auto]``` +- Modes (-m or --mode): + * soft: keep the original texture for visible parts (recommended when the input image is high resolution, near-frontal, and non-occluded.) + * hard: generate all + * auto: soft for frontal, hard for profile images + +## More Results + +

+

+
+ +## License +- The source code shared here is protected under Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) License which does **NOT** allow commercial use. To view a copy of this license, see LICENSE +- Copyright (c) 2020, Baris Gecer. All rights reserved. +- This work is made available under the CC BY-NC-SA 4.0. + + +## Acknowledgement +- Our projection relies on NVIDIA's [StyleGANv2](https://github.com/NVlabs/stylegan2) +- Thanks [@jiankangdeng](https://jiankangdeng.github.io/) for providing Face Recognition and Landmark Detection models +- We use [MTCNN](https://github.com/ipazc/mtcnn) for face detection +- We use [Graphonomy](https://github.com/Gaoyiminggithub/Graphonomy) for face segmentation (i.e. to exclude hairs, occlusion) +- 3D face reconstruction has been originally solved by [GANFit](https://github.com/barisgecer/GANFit). However, since it is commercialized and will not be public, I had to re-implement the ports for [Deep3DFaceRecon_pytorch](https://github.com/sicxu/Deep3DFaceRecon_pytorch). +- We initialize StyleGAN parameters by [Style-Encoder](https://github.com/rolux/stylegan2encoder/issues/2) (by [@rolux](https://github.com/rolux), [@pbaylies](https://github.com/pbaylies)). +- Thanks [Zhang et al.](https://richzhang.github.io/PerceptualSimilarity/) for VGG16 model + +## Citation +If you find this work is useful for your research, please cite our paper: + +``` +@InProceedings{Gecer_2021_CVPR, + author = {Gecer, Baris and Deng, Jiankang and Zafeiriou, Stefanos}, + title = {OSTeC: One-Shot Texture Completion}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2021}, + pages = {7628-7638} +} +``` +
diff --git a/insightface/reconstruction/ostec/core/arcface_handler.py b/insightface/reconstruction/ostec/core/arcface_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..a223a5f46090be3274717be480154a263d0434ea --- /dev/null +++ b/insightface/reconstruction/ostec/core/arcface_handler.py @@ -0,0 +1,116 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. +# To view a copy of this license, see LICENSE + +import tensorflow as tf +import numpy as np +from external import arcface50 +from skimage import transform as trans + + +def align_arcface(image, landmarks): + """ + Aligns 'image' with its corresponding 'landmarks' to a predefined template + with similarity transformation. This is the tensorflow implementation of + the default alignment procedure of ArcFace + Args: + image: a 4D float32 numpy array with shape [batch_size, image_height, + image_width, 3]. + landmarks: 68 iBug landmark points of 'image'. [batch_size, 68, 2] + + Returns: + 4-D float32 numpy array with shape [batch_size, 112, 112, 3]. 
Contains + aligned version of 'image' + + """ + + image_size = (112, 112) + dst = np.array([ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 92.2041]], dtype=np.float32) + if image_size[1] == 112: + dst[:, 0] += 8.0 + # dst = dst[:, ::-1] + landmark5 = tf.stack([(landmarks[:, 36] + landmarks[:, 39]) / 2, + (landmarks[:, 42] + landmarks[:, 45]) / 2, + landmarks[:, 30], + landmarks[:, 48], + landmarks[:, 54]], 1) + + def py_similarity_transform(src, dst): + tform = trans.SimilarityTransform() + Ms = np.zeros([0, 3, 3], dtype=np.float32) + for s in src: + tform.estimate(s, dst) + Ms = np.concatenate([Ms,[tform.params]],0) + return Ms + + M = tf.py_func(py_similarity_transform, [landmark5, tf.constant(dst, 'float32')], tf.double, stateful=False) + M.set_shape([image.get_shape().as_list()[0],3, 3]) + + aligned = tf.contrib.image.transform(image, tf.cast(tf.contrib.image.matrices_to_flat_transforms(tf.map_fn(tf.linalg.inv,M)), 'float32'),interpolation='BILINEAR') + + return aligned[:, 0:image_size[0], 0:image_size[1], :] + +def get_input_features(image, landmarks): + """Extract features from a face recongnition networks including + intermadiate activations. This function first align the image and + then call identity_features() + """ + image_aligned = align_arcface(image, landmarks) + + emb_norm, content, embedding, vars = identity_features(image_aligned,'id_features/') + return emb_norm, vars, image_aligned #, content, embedding], image_aligned + +def identity_features(input, name): + """Extract features from a face recongnition networks including + intermadiate activations. 
+ """ + with tf.variable_scope(name, reuse=True): + input, embedding, content = arcface50.KitModel('models/fr_models/arcface50.npy', input * 255) + emb_norm = tf.nn.l2_normalize(embedding, 1) + vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name) + return emb_norm, content, embedding, vars + +class Arcface_Handler: + def __init__(self): + self.img_ph = tf.placeholder(dtype=tf.float32, name='img_ph', shape=[None, 256, 256, 3]) + self.lms_ph = tf.placeholder(dtype=tf.float32, name='lms_ph', shape=[None, 68, 2]) + aligned_img = align_arcface(self.img_ph, self.lms_ph) + aligned_img.set_shape([None, 112, 112, 3]) + self.emb_norm, _, _, vars = identity_features(aligned_img, 'input_id_features') + var_init = tf.variables_initializer(vars) + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + self.sess = tf.Session(config=config) + self.sess.run(var_init) + + def get_identity_features(self, img, lms): + lms = (lms / img.shape[::-1]) * [256, 256] + img = img.resize([256, 256]) + return self.sess.run(self.emb_norm, {self.img_ph: [img.pixels_with_channels_at_back()], self.lms_ph:[lms]}) + +def identity_features_numpy(image, landmarks, return_aligned=False): + src_img = np.array([image]) + src_lms = np.array([landmarks]) + src_img = tf.constant(src_img,tf.float32) + src_lms = tf.constant(src_lms,tf.float32) + aligned_img = align_arcface(src_img,src_lms) + emb_norm, _, _, vars = identity_features(aligned_img, 'input_id_features') + var_init = tf.variables_initializer(vars) + with tf.Session() as sess: + sess.run(var_init) + features = sess.run(emb_norm) + if return_aligned: + aligned = sess.run(aligned_img) + + tf.reset_default_graph() + if return_aligned: + return features, aligned + else: + return features + diff --git a/insightface/reconstruction/ostec/core/config.py b/insightface/reconstruction/ostec/core/config.py new file mode 100644 index 0000000000000000000000000000000000000000..66d9fd02a846cf88060bbd07e89af4e9f55ac3ca --- /dev/null +++ 
b/insightface/reconstruction/ostec/core/config.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. +# To view a copy of this license, see LICENSE + +import argparse + +def split_to_batches(l, n): + for i in range(0, len(l), n): + yield l[i:i + n] + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + +parser = argparse.ArgumentParser(description='Find latent representation of reference images using perceptual loss') + +parser.add_argument('-m', '--mode',default='hard', choices=['soft', 'auto', 'hard'], + help='Soft: keep original texture, Hard: generate all, auto: soft for frontal, hard for profile') +parser.add_argument('-f', '--frontalize', action='store_true', help='Run frontalization at the end') +parser.add_argument('-p', '--pickle', action='store_true', help='Save pickle with everything') +parser.add_argument('-g', '--ganfit', action='store_true', help='Reconstruction from GANFit is a must. If not raised, it is automatic: GANFit rec. 
if pickle found, Deep3DRecon otherwise.') +parser.add_argument('--iterations_frontalize', default=300, help='Number of optimization steps for each batch', type=int) + +parser.add_argument('--load_last', default='', help='Start with embeddings from directory') +parser.add_argument('--dlatent_avg', default='', help='Use dlatent from file specified here for truncation instead of dlatent_avg from Gs') +parser.add_argument('--model_url', default='gdrive:networks/stylegan2-ffhq-config-f.pkl', help='Fetch a StyleGAN model to train on from this URL') # default='gdrive:networks/stylegan2-ffhq-config-f.pkl' +parser.add_argument('--model_res', default=1024, help='The dimension of images in the StyleGAN model', type=int) +parser.add_argument('--batch_size', default=1, help='Batch size for generator and perceptual model', type=int) +parser.add_argument('--optimizer', default='ggt', help='Optimization algorithm used for optimizing dlatents') + +# Perceptual model params +parser.add_argument('--vgg_url', default='models/vgg16_zhang_perceptual.pkl', help='Fetch VGG model on from this URL') # default='https://drive.google.com/uc?id=1N2-m9qszOeVC9Tq77WxsLnuWwOedQiD2' +parser.add_argument('--image_size', default=256, help='Size of images for perceptual model', type=int) +parser.add_argument('--resnet_image_size', default=224, help='Size of images for the Resnet model', type=int) +parser.add_argument('--lr', default=0.25, help='Learning rate for perceptual model', type=float) +parser.add_argument('--decay_rate', default=0.9, help='Decay rate for learning rate', type=float) +parser.add_argument('-i', '--iterations', default=200, help='Number of optimization steps for each batch', type=int) +parser.add_argument('--decay_steps', default=4, help='Decay steps for learning rate decay (as a percent of iterations)', type=float) +parser.add_argument('--early_stopping', default=True, help='Stop early once training stabilizes', type=str2bool, nargs='?', const=True) 
+parser.add_argument('--early_stopping_threshold', default=0.5, help='Stop after this threshold has been reached', type=float) +parser.add_argument('--early_stopping_patience', default=10, help='Number of iterations to wait below threshold', type=int) +parser.add_argument('--load_effnet', default='data/finetuned_effnet.h5', help='Model to load for EfficientNet approximation of dlatents') +parser.add_argument('--load_resnet', default='models/resnet_18_20191231.h5', help='Model to load for ResNet approximation of dlatents') +parser.add_argument('--use_preprocess_input', default=True, help='Call process_input() first before using feed forward net', type=str2bool, nargs='?', const=True) +parser.add_argument('--use_best_loss', default=True, help='Output the lowest loss value found as the solution', type=str2bool, nargs='?', const=True) +parser.add_argument('--average_best_loss', default=0.25, help='Do a running weighted average with the previous best dlatents found', type=float) +parser.add_argument('--sharpen_input', default=True, help='Sharpen the input images', type=str2bool, nargs='?', const=True) +parser.add_argument('--landmark_model', default='./models/alignment/3D84/model.ckpt-277538', help='Landmark model path') + +# Loss function options +parser.add_argument('--use_vgg_loss', default=0.4, help='Use VGG perceptual loss; 0 to disable, > 0 to scale.', type=float) +parser.add_argument('--use_vgg_layer', default=9, help='Pick which VGG layer to use.', type=int) +parser.add_argument('--use_pixel_loss', default=1.5, help='Use logcosh image pixel loss; 0 to disable, > 0 to scale.', type=float) +parser.add_argument('--use_mssim_loss', default=200, help='Use MS-SIM perceptual loss; 0 to disable, > 0 to scale.', type=float) +parser.add_argument('--use_lpips_loss', default=100, help='Use LPIPS perceptual loss; 0 to disable, > 0 to scale.', type=float) +parser.add_argument('--use_l1_penalty', default=0.5, help='Use L1 penalty on latents; 0 to disable, > 0 to scale.', 
type=float) +parser.add_argument('--use_discriminator_loss', default=0.5, help='Use trained discriminator to evaluate realism.', type=float) +parser.add_argument('--use_adaptive_loss', default=False, help='Use the adaptive robust loss function from Google Research for pixel and VGG feature loss.', type=str2bool, nargs='?', const=True) +parser.add_argument('--use_landmark_loss', default=200, help='Use landmark loss; 0 to disable, > 0 to scale.', type=float) +parser.add_argument('--use_id_loss', default=10, help='Use landmark loss; 0 to disable, > 0 to scale.', type=float) +parser.add_argument('--use_id_loss_frontalize', default=100, help='Use landmark loss; 0 to disable, > 0 to scale.', type=float) + +# Generator params +parser.add_argument('--randomize_noise', default=False, help='Add noise to dlatents during optimization', type=str2bool, nargs='?', const=True) +parser.add_argument('--tile_dlatents', default=False, help='Tile dlatents to use a single vector at each scale', type=str2bool, nargs='?', const=True) +parser.add_argument('--clipping_threshold', default=2.0, help='Stochastic clipping of gradient values outside of this threshold', type=float) + +# Masking params +parser.add_argument('--load_mask', default=False, help='Load segmentation masks', type=str2bool, nargs='?', const=True) +parser.add_argument('--face_mask', default=True, help='Generate a mask for predicting only the face area', type=str2bool, nargs='?', const=True) +parser.add_argument('--use_grabcut', default=True, help='Use grabcut algorithm on the face mask to better segment the foreground', type=str2bool, nargs='?', const=True) +parser.add_argument('--scale_mask', default=1.4, help='Look over a wider section of foreground for grabcut', type=float) +parser.add_argument('--composite_mask', default=False, help='Merge the unmasked area back into the generated image', type=str2bool, nargs='?', const=True) +parser.add_argument('--composite_blur', default=8, help='Size of blur filter to smoothly 
composite the images', type=int) + +# Video params +parser.add_argument('--video_dir', default='videos', help='Directory for storing training videos') +parser.add_argument('--output_video', default=False, help='Generate videos of the optimization process', type=bool) +parser.add_argument('--video_codec', default='MJPG', help='FOURCC-supported video codec name') +parser.add_argument('--video_frame_rate', default=24, help='Video frames per second', type=int) +parser.add_argument('--video_size', default=512, help='Video size in pixels', type=int) +parser.add_argument('--video_skip', default=1, help='Only write every n frames (1 = write every frame)', type=int) + + +def get_config(): + args, other_args = parser.parse_known_args() + args.decay_steps *= 0.01 * args.iterations # Calculate steps as a percent of total iterations + + return args, other_args \ No newline at end of file diff --git a/insightface/reconstruction/ostec/core/generator_model.py b/insightface/reconstruction/ostec/core/generator_model.py new file mode 100644 index 0000000000000000000000000000000000000000..83be6248e555e4f456a3cdfc9f6c6bfa8f5de6a6 --- /dev/null +++ b/insightface/reconstruction/ostec/core/generator_model.py @@ -0,0 +1,142 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. 
+# To view a copy of this license, see LICENSE + +import math +import tensorflow as tf +import numpy as np +import external.stylegan2.dnnlib.tflib as tflib +from functools import partial + + +def create_stub(name, batch_size): + return tf.constant(0, dtype='float32', shape=(batch_size, 0)) + + +def create_variable_for_generator(name, batch_size, tiled_dlatent, model_scale=18, tile_size = 1): + if tiled_dlatent: + low_dim_dlatent = tf.get_variable('learnable_dlatents', + shape=(batch_size, tile_size, 512), + dtype='float32', + initializer=tf.initializers.random_normal()) + return tf.tile(low_dim_dlatent, [1, model_scale // tile_size, 1]) + else: + return tf.get_variable('learnable_dlatents', + shape=(batch_size, model_scale, 512), + dtype='float32', + initializer=tf.initializers.random_normal()) + + +class Generator: + def __init__(self, model, batch_size, custom_input=None, clipping_threshold=2, tiled_dlatent=False, model_res=1024, randomize_noise=False): + self.batch_size = batch_size + self.tiled_dlatent=tiled_dlatent + self.model_scale = int(2*(math.log(model_res,2)-1)) # For example, 1024 -> 18 + + if tiled_dlatent: + self.initial_dlatents = np.zeros((self.batch_size, 512)) + model.components.synthesis.run(np.zeros((self.batch_size, self.model_scale, 512)), + randomize_noise=randomize_noise, minibatch_size=self.batch_size, + custom_inputs=[partial(create_variable_for_generator, batch_size=batch_size, tiled_dlatent=True), + partial(create_stub, batch_size=batch_size)], + structure='fixed') + else: + self.initial_dlatents = np.zeros((self.batch_size, self.model_scale, 512)) + if custom_input is not None: + model.components.synthesis.run(self.initial_dlatents, + randomize_noise=randomize_noise, minibatch_size=self.batch_size, + custom_inputs=[partial(custom_input.eval(), batch_size=batch_size), partial(create_stub, batch_size=batch_size)], + structure='fixed') + else: + model.components.synthesis.run(self.initial_dlatents, + randomize_noise=randomize_noise, 
minibatch_size=self.batch_size, + custom_inputs=[partial(create_variable_for_generator, batch_size=batch_size, tiled_dlatent=False, model_scale=self.model_scale), + partial(create_stub, batch_size=batch_size)], + structure='fixed') + + self.dlatent_avg_def = model.get_var('dlatent_avg') + self.reset_dlatent_avg() + self.sess = tf.get_default_session() + self.graph = tf.get_default_graph() + + self.dlatent_variable = next(v for v in tf.global_variables() if 'learnable_dlatents' in v.name) + self._assign_dlatent_ph = tf.placeholder(tf.float32, name="assign_dlatent_ph") + self._assign_dlantent = tf.assign(self.dlatent_variable, self._assign_dlatent_ph) + self.set_dlatents(self.initial_dlatents) + + def get_tensor(name): + try: + return self.graph.get_tensor_by_name(name) + except KeyError: + return None + + self.generator_output = get_tensor('G_synthesis_1/_Run/concat:0') + if self.generator_output is None: + self.generator_output = get_tensor('G_synthesis_1/_Run/concat/concat:0') + if self.generator_output is None: + self.generator_output = get_tensor('G_synthesis_1/_Run/concat_1/concat:0') + # If we loaded only Gs and didn't load G or D, then scope "G_synthesis_1" won't exist in the graph. 
+ if self.generator_output is None: + self.generator_output = get_tensor('G_synthesis/_Run/concat:0') + if self.generator_output is None: + self.generator_output = get_tensor('G_synthesis/_Run/concat/concat:0') + if self.generator_output is None: + self.generator_output = get_tensor('G_synthesis/_Run/concat_1/concat:0') + if self.generator_output is None: + for op in self.graph.get_operations(): + print(op) + raise Exception("Couldn't find G_synthesis_1/_Run/concat tensor output") + self.generated_image = tflib.convert_images_to_uint8(self.generator_output, nchw_to_nhwc=True, uint8_cast=False) + self.generated_image_uint8 = tf.saturate_cast(self.generated_image, tf.uint8) + + # Implement stochastic clipping similar to what is described in https://arxiv.org/abs/1702.04782 + # (Slightly different in that the latent space is normal gaussian here and was uniform in [-1, 1] in that paper, + # so we clip any vector components outside of [-2, 2]. It seems fine, but I haven't done an ablation check.) 
+ clipping_mask = tf.math.logical_or(self.dlatent_variable > clipping_threshold, self.dlatent_variable < -clipping_threshold) + clipped_values = tf.where(clipping_mask, tf.random_normal(shape=self.dlatent_variable.shape), self.dlatent_variable) + self.stochastic_clip_op = tf.assign(self.dlatent_variable, clipped_values) + + def reset_dlatents(self): + self.set_dlatents(self.initial_dlatents) + + def set_dlatents(self, dlatents): + if self.tiled_dlatent: + if (dlatents.shape != (self.batch_size, 512)) and (dlatents.shape[1] != 512): + dlatents = np.mean(dlatents, axis=1) + if (dlatents.shape != (self.batch_size, 512)): + dlatents = np.vstack([dlatents, np.zeros((self.batch_size-dlatents.shape[0], 512))]) + assert (dlatents.shape == (self.batch_size, 512)) + else: + if (dlatents.shape[1] > self.model_scale): + dlatents = dlatents[:,:self.model_scale,:] + if (isinstance(dlatents.shape[0], int)): + if (dlatents.shape != (self.batch_size, self.model_scale, 512)): + dlatents = np.vstack([dlatents, np.zeros((self.batch_size-dlatents.shape[0], self.model_scale, 512))]) + assert (dlatents.shape == (self.batch_size, self.model_scale, 512)) + self.sess.run([self._assign_dlantent], {self._assign_dlatent_ph: dlatents}) + return + else: + self._assign_dlantent = tf.assign(self.dlatent_variable, dlatents) + return + self.sess.run([self._assign_dlantent], {self._assign_dlatent_ph: dlatents}) + + def stochastic_clip_dlatents(self): + self.sess.run(self.stochastic_clip_op) + + def get_dlatents(self): + return self.sess.run(self.dlatent_variable) + + def get_dlatent_avg(self): + return self.dlatent_avg + + def set_dlatent_avg(self, dlatent_avg): + self.dlatent_avg = dlatent_avg + + def reset_dlatent_avg(self): + self.dlatent_avg = self.dlatent_avg_def + + def generate_images(self, dlatents=None): + if dlatents is not None: + self.set_dlatents(dlatents) + return self.sess.run(self.generated_image_uint8) \ No newline at end of file diff --git 
def tf_heatmap_to_lms(heatmap):
    """Collapse per-landmark heatmaps into peak (row, col) coordinates.

    For each channel, the row index comes from reducing over the width
    axis and taking an argmax over height; the column index is the
    symmetric reduction over height. The two index tensors are stacked
    and transposed so landmarks come out shaped (batch, n_landmarks, 2).
    NOTE(review): assumes NHWC heatmaps, one channel per landmark —
    consistent with how ``lms_heatmap_prediction`` is consumed, but
    confirm against the detector's output layout.
    """
    row_idx = tf.argmax(tf.reduce_max(heatmap, axis=2), axis=1)
    col_idx = tf.argmax(tf.reduce_max(heatmap, axis=1), axis=1)
    stacked = tf.stack([row_idx, col_idx])
    return tf.transpose(tf.to_float(stacked), perm=[1, 2, 0])
class Face:
    """Bundle of per-subject reconstruction state passed between the
    Operator's texture-completion stages.

    Holds the textured mesh in several variants, the source UV texture
    and visibility (view-angle) maps, identity features, and the
    blending mode. ``mode == 'auto'`` resolves to ``'hard'`` for
    profile faces and ``'soft'`` otherwise.
    """

    def __init__(self, tmesh,
                 tmesh_masked,
                 tmesh_rotated,
                 img_uv_src,
                 angle_uv_src,
                 angle_uv_list,
                 img_uv_list,
                 view_angle_src,
                 id_features,
                 exclude_mask,
                 is_profile,
                 mode):
        self.tmesh = tmesh
        self.tmesh_masked = tmesh_masked
        self.tmesh_rotated = tmesh_rotated
        self.img_uv_src = img_uv_src
        self.angle_uv_src = angle_uv_src
        self.angle_uv_list = angle_uv_list
        self.img_uv_list = img_uv_list
        self.view_angle_src = view_angle_src
        self.id_features = id_features
        self.is_profile = is_profile
        self.exclude_mask = exclude_mask

        # Profile faces hide too much of the source texture, so 'auto'
        # falls back to the more aggressive 'hard' regime for them.
        if mode == 'auto':
            self.mode = 'hard' if is_profile else 'soft'
        else:
            self.mode = mode

    def rotation_dict(self):
        """Target head poses ([pitch, yaw, roll] degrees) per view key.

        'hard' mode additionally re-synthesizes a frontal view.
        """
        shared_views = {
            'bottom': [-30, 0, 0],
            'bottom_left': [-15, -30, 0],
            'bottom_right': [-15, 30, 0],
            'left': [5, -40, 0],
            'right': [5, 40, 0],
        }
        if self.mode == 'soft':
            return shared_views
        if self.mode == 'hard':
            return dict({'front': [0, 0, 0]}, **shared_views)
        raise Exception('Unknown mode!')

    def coef_dict(self):
        """Visibility-weighting coefficient per view key.

        'src' scales the original-image contribution: dominant in
        'soft' mode (2), nearly ignored in 'hard' mode (0.1).
        """
        shared_coefs = {
            'bottom': 1.3,
            'bottom_left': 1,
            'bottom_right': 1,
            'left': 1,
            'right': 1,
        }
        if self.mode == 'soft':
            shared_coefs['src'] = 2
            return shared_coefs
        if self.mode == 'hard':
            return dict({'front': 2}, **shared_coefs, src=0.1)
        raise Exception('Unknown mode!')
mio.import_pickle('models/topology/tcoords_full.pkl') + self.tcoords = mio.import_pickle('models/topology/tcoords_alex.pkl') + self.mask = mio.import_pickle('models/topology/mask_full2crop.pkl') | True + self.tight_mask = mio.import_pickle('models/topology/mask_full2tightcrop.pkl') + self.template = mio.import_pickle('models/topology/all_all_all_crop_mean.pkl') + self.lms_ind = mio.import_pickle('models/topology/all_all_all_lands_ids.pkl') + self.img_shape = [1024, 1024] # 2048 + + self.uv_shape = [1024, 1536] + uv_mesh = self.tcoords.copy().points[:, ::-1] + uv_mesh[:, 0] = 1 - uv_mesh[:, 0] + uv_mesh *= self.uv_shape + self.uv_mesh = np.concatenate([uv_mesh, uv_mesh[:, 0:1] * 0], 1) + self.uv_trilist = mio.import_pickle('models/topology/trilist_full.pkl') #self.template.trilist + + self.args = args + self.mode = args.mode # 'soft', 'hard', 'auto' + + self.arcface_handler = Arcface_Handler() + self.projector = Projection_Handler(args) + + def render_uv_image(self, generated, tcoords): + uv_tmesh = TexturedTriMesh(self.uv_mesh, tcoords, generated, trilist=self.uv_trilist) + bcs = rasterize_barycentric_coordinate_images(uv_tmesh, self.uv_shape) + img = rasterize_mesh_from_barycentric_coordinate_images(uv_tmesh, *bcs) + img.pixels = np.clip(img.pixels, 0.0, 1.0) + return img + + def render_colored_image(self, view_angle_trg, return_visibility=False): + uv_cmesh = ColouredTriMesh(self.uv_mesh, trilist=self.uv_trilist, colours=np.tile(view_angle_trg, [3, 1]).T) + bcs = rasterize_barycentric_coordinate_images(uv_cmesh, self.uv_shape) + img = rasterize_mesh_from_barycentric_coordinate_images(uv_cmesh, *bcs) + img.pixels = np.clip(img.pixels, -1.0, 1.0) + if return_visibility: + visible = np.sum(bcs[0].pixels, axis=0) != 0 + return img, visible + else: + return img + + def camera_tri_angle_src(self, tmesh):#, pose_angle_deg=[0, 0, 0], cam_dist=-4.5): + camera_direction = -tmesh.points / np.tile(np.linalg.norm(tmesh.points, axis=1), [3, 1]).T + view_angle = 
np.sum(camera_direction * tmesh.vertex_normals(), 1) + return view_angle + + def camera_tri_angle(self, tmesh, pose_angle_deg=[0, 0, 0], cam_dist=-4.5): + rot_z = rotation_z(pose_angle_deg[2]) + rot_y = rotation_y(-pose_angle_deg[1]) + rot_x = rotation_x(pose_angle_deg[0]) + rotation = rot_z.compose_before(rot_y).compose_before(rot_x) + + translation = Translation([0, 0, +cam_dist]) + camera = rotation.compose_before(translation) + + cam_mesh = camera.apply(tmesh) + camera_direction = -cam_mesh.points / np.tile(np.linalg.norm(cam_mesh.points, axis=1), [3, 1]).T + view_angle = np.sum(camera_direction * cam_mesh.vertex_normals(), 1) + return view_angle + + def create_syn(self, face, trg_angle=[0, 0, 0], include_mask=None): + view_angle_trg = self.camera_tri_angle(face.tmesh, pose_angle_deg=trg_angle) + + im, projected_mesh = rasterize_image(face.tmesh, self.img_shape, pose_angle_deg=trg_angle, cam_dist=4.5) + + # fill_mask = include_mask.astype(np.bool) + fill_mask = ((view_angle_trg < face.view_angle_src) | (face.view_angle_src > 0.4)) & self.tight_mask + if include_mask is not None: + fill_mask = fill_mask | include_mask.astype(np.bool) + if face.exclude_mask is not None: + tcoord_sampling = np.round(self.tcoords.points[:,::-1] * face.exclude_mask.shape).astype(np.int) + fill_mask[self.mask] = fill_mask[self.mask] & ~face.exclude_mask[face.exclude_mask.shape[0] - tcoord_sampling[:, 0], tcoord_sampling[:, 1]] + + mask_mesh = ColouredTriMesh(face.tmesh.points, trilist=face.tmesh.trilist, colours=np.tile(fill_mask, [3, 1]).T) + mask = rasterize_image(mask_mesh, self.img_shape,pose_angle_deg=trg_angle, cam_dist=4.5)[0] + + return im, projected_mesh[:, :2], mask + + + def create_align_syn(self, face, trg_angle=[0, 0, 0], include_mask=None): + im, projected_mesh, mask = self.create_syn(face, trg_angle, include_mask) + imgs, masks, transformation_params = align_im2stylegan(im_menpo2PIL(im), #im_menpo2PIL(mask), + im_menpo2PIL(self.extend_mask(im, mask)), + 
projected_mesh[self.lms_ind][:,::-1]) + aligned_meshes = align_mesh2stylegan(projected_mesh, transformation_params) + landmarks = aligned_meshes[self.lms_ind] + landmarks[:,1] = 1 - landmarks[:,1] + + heatmaps = generate_heatmap.generate_heatmaps(width=self.args.model_res, + height=self.args.model_res, + points=landmarks*self.args.model_res, + sigma=25) + + landmarks = landmarks[:,::-1] + aligned_meshes = aligned_meshes[self.mask] + + return imgs, masks, heatmaps, aligned_meshes + + def get_tmesh(self, im, reconstruction_dict, face_mask): + id_features = self.arcface_handler.get_identity_features(im, reconstruction_dict['dense_lms'][self.lms_ind]) + + _, yaw_angle, _ = reconstruction_dict['euler_angles'] + is_profile = np.abs(yaw_angle* 180 / np.pi)>30 + visibility_threshold = 0.4 + dense_lms = reconstruction_dict['dense_lms'] / im.shape[::-1] + dense_lms[:, 1] = 1 - dense_lms[:, 1] + + im_masked = np.array(im_menpo2PIL(im)) + mask_landmarks = np.ones_like(im_masked[:,:,0]) + if face_mask is not None: + im_masked = im_masked * np.repeat(np.expand_dims(np.array(face_mask,np.bool),2),3,2) + mask_landmarks *= np.array(face_mask, np.uint8) + + im_masked = fill_UV(im_PIL2menpo(im_masked)) + im_masked.pixels = np.concatenate([im_masked.pixels, np.expand_dims(mask_landmarks,0)],0) + img_uv_src = self.render_uv_image(im_masked, dense_lms[self.mask]) + mask_landmarks = img_uv_src.pixels[3]<0.5 + img_uv_src.pixels = img_uv_src.pixels[0:3] + img_uv_src = fill_UV(img_uv_src) + if is_profile: + mask_landmarks = binary_dilation(mask_landmarks,disk(5)) + visibility_threshold = 0.6 + img_uv_src.pixels[:,mask_landmarks] = 0 + tcoords = self.tcoords_full.copy() + tcoords.points[self.mask] = self.tcoords.points + + tmesh = TexturedTriMesh(reconstruction_dict['vertices'], tcoords.points, img_uv_src, + trilist=reconstruction_dict['trilist']) + tmesh_masked = tmesh.from_mask(self.mask) + tmesh_rotated = TexturedTriMesh(reconstruction_dict['vertices_rotated'], tmesh.tcoords.points, 
tmesh.texture, + trilist=tmesh.trilist) + view_angle_src = self.camera_tri_angle_src(tmesh_rotated) + view_angle_src_masked = view_angle_src[self.mask] + view_angle_src_masked[~self.tight_mask[self.mask]] = -1 # Only take tight crop from the original image + angle_uv_src, visible = self.render_colored_image(view_angle_src_masked, return_visibility=True) + angle_uv_src.pixels[:,~visible | mask_landmarks] = -1.0 + + mask = angle_uv_src.pixels[0] < visibility_threshold + mask = ~remove_small_holes(~mask, area_threshold=100000) + + if is_profile and self.mode=='soft': + mask = binary_dilation(mask, disk(10)) + img_uv_src.pixels[:, mask] = 0 + angle_uv_src.pixels[:, mask] = -1 + + img_uv_src_flipped = img_uv_src.mirror(1) + angle_uv_src_flipped = angle_uv_src.mirror(1) + temp = img_uv_src_flipped.pixels + pad = int((16 / 1024) * self.uv_shape[1]) + img_uv_src_flipped.pixels = np.concatenate( + [np.zeros([temp.shape[0], temp.shape[1], pad]), temp[:, :, :-pad]], 2) + temp = angle_uv_src_flipped.pixels + angle_uv_src_flipped.pixels = np.concatenate( + [np.zeros([temp.shape[0], temp.shape[1], pad]), temp[:, :, :-pad]], 2) + + + img_uv_src_flipped = fill_UV(img_uv_src_flipped) + img_uv_src = fill_UV(img_uv_src) + mask_flipped = (angle_uv_src_flipped.pixels[0] > visibility_threshold) & mask + mask_flipped = remove_small_holes(mask_flipped, area_threshold=100000) + # mask_flipped = binary_dilation(mask_flipped, disk(15)) + angle_uv_src.pixels = mask_flipped * angle_uv_src_flipped.pixels + (1 - mask_flipped) * angle_uv_src.pixels + + mask_all = mask_flipped.astype(int).copy() + mask_all[~mask_flipped & ~mask] = 2 + mask_all = fill_UV(Image(np.tile(mask_all, [3, 1, 1]))).pixels[0] + + mask_flipped_g = gaussian(mask_all == 1, sigma=30, multichannel=True, mode='reflect') + mask_flipped_inv_g = gaussian(mask_all == 2, sigma=30, multichannel=True, mode='reflect') + + img_uv_src.pixels = mask_flipped_g * img_uv_src_flipped.pixels + mask_flipped_inv_g * img_uv_src.pixels + # 
img_uv_src.pixels[:,mask_flipped] = img_uv_src_flipped.pixels[:,mask_flipped] + + mask = (angle_uv_src.pixels[0] < visibility_threshold) + mask = ~remove_small_holes(~mask, area_threshold=100000) + # mask = binary_dilation(mask, disk(15)) + img_uv_src.pixels[:, mask] = 0 + angle_uv_src.pixels[:, mask] = -1 + + img_uv_src = fill_UV(img_uv_src) + tmesh.texture = img_uv_src + tmesh_rotated.texture = img_uv_src + + face = Face( + tmesh=tmesh, + tmesh_masked=tmesh_masked, + tmesh_rotated=tmesh_rotated, + img_uv_src=img_uv_src, + angle_uv_src=angle_uv_src, + angle_uv_list=[], + img_uv_list=[], + view_angle_src=view_angle_src, + id_features=id_features, + exclude_mask=mask, + is_profile=is_profile, + mode=self.mode + ) + + face.angle_uv_list = [np.clip(angle_uv_src.pixels * face.coef_dict()['src'],-1,1)] + face.img_uv_list = [fill_UV(img_uv_src).pixels] + + return face + + def extend_mask(self, im, mask): + # closed_mask = binary_dilation(mask.pixels[0].astype(np.bool), disk(10)) + # extended_mask = ((np.sum(im.pixels, 0) == 0) & (closed_mask & ~mask.pixels[0].astype(np.bool))) | mask.pixels[0].astype(np.bool) + # im_filled = remove_small_holes(np.sum(im.pixels, 0) > 0, area_threshold=1000) + # border = binary_dilation(im_filled, disk(10)) & ~binary_erosion(im_filled, disk(10)) + return mask #Image(extended_mask)# | border) + + def run_iteration(self, face, key, trg_angle): + + imgs, masks, heatmaps, aligned_meshes = self.create_align_syn(face, trg_angle, face.uv_blending[key]) + + # Run Optimizer + generated_imgs, generated_dlatents = self.projector.run_projection({key: imgs}, + {key: masks}, + {key: heatmaps}, + face.id_features) + + img_uv = self.render_uv_image(im_PIL2menpo(generated_imgs[key]), aligned_meshes) + img_uv =fill_UV(img_uv) + img_uv = uv_color_normalize(face.img_uv_src, face.angle_uv_src, img_uv, Image(face.angle_uv_list[len(face.img_uv_list)])) + + face.img_uv_list.append(img_uv.pixels) + + final_uv, _ = uv_stiching(face.img_uv_list, 
face.angle_uv_list[:len(face.img_uv_list)], 40) + results_dict = { + 'generated_imgs': generated_imgs[key], + 'generated_dlatents': generated_dlatents[key], + 'imgs': imgs, + 'masks': masks, + 'aligned_meshes': aligned_meshes, + 'img_uv': img_uv, + 'final_uv': final_uv + } + + face.img_uv_src = final_uv + face.tmesh.texture = final_uv + face.tmesh.tcoords = self.tcoords_full.copy() + face.tmesh.tcoords.points[self.mask] = self.tcoords.points + + return face, results_dict + + def run(self, im, reconstruction_dict, face_mask=None): + start = time.time() + print('Preprocessing...', end=" ") + # GANFit compatibility + if not 'vertices' in reconstruction_dict: # GANFit + reconstruction_dict['vertices'] = reconstruction_dict['tmesh'].points + reconstruction_dict['trilist'] = reconstruction_dict['tmesh'].trilist + if not 'vertices_rotated' in reconstruction_dict: # GANFit + reconstruction_dict['vertices_rotated'] = apply_camera_only3d(reconstruction_dict['vertices'], reconstruction_dict['camera_params']) + if not 'euler_angles' in reconstruction_dict: # GANFit + reconstruction_dict['euler_angles'] = get_pose(reconstruction_dict['camera_params']) + + # Prepare Textured Trimesh with visible part of the face + face = self.get_tmesh(im, reconstruction_dict, face_mask) + + img_uv_src = face.img_uv_src.copy() + angle_uv_src = face.angle_uv_src.copy() + print('Done in %.2f secs' % (time.time() - start)) + + # Prepare view angle maps + start = time.time() + print('Building a Visibility Index...', end=" ") + angle_uv = {} + key_list = ['src'] + angle_uv_list = [np.clip(angle_uv_src.pixels * face.coef_dict()['src'],-1,1)] + + view_angle_src_full = self.camera_tri_angle_src(face.tmesh_rotated) + tcoord_sampling = np.round(self.tcoords.points*angle_uv_src.shape).astype(np.int) + view_angle_src_full[self.mask] = angle_uv_src.pixels[0, angle_uv_src.shape[0] - tcoord_sampling[:, 1], tcoord_sampling[:, 0]] + view_angle_src_full[~self.tight_mask] = -1 # Only take tight crop from the 
original image + + angle_list = [np.clip(view_angle_src_full * face.coef_dict()['src'],-1,1)] + dummy_im = im_menpo2PIL(img_uv_src) + + # For each view calculate angles towards the camera (Visibility scores) + for key, trg_angle in face.rotation_dict().items(): + view_angle_trg = self.camera_tri_angle(face.tmesh, pose_angle_deg=trg_angle) + view_angle_trg = np.clip(view_angle_trg * face.coef_dict()[key],-1,1) + _, projected_mesh = rasterize_image(face.tmesh, self.img_shape, pose_angle_deg=trg_angle, + cam_dist=4.5) + _, _, transformation_params = align_im2stylegan(dummy_im, dummy_im, + projected_mesh[self.lms_ind, :2][:, ::-1]) + aligned_meshes = align_mesh2stylegan(projected_mesh[:, :2], transformation_params) + out_of_plane = ((aligned_meshes[:, 0] > 1) | + (aligned_meshes[:, 1] > 1) | + (aligned_meshes[:, 0] < 0) | + (aligned_meshes[:, 1] < 0)) + + view_angle_trg[out_of_plane] = -1 + angle_list.append(view_angle_trg) + angle_uv[key] = self.render_colored_image(view_angle_trg[self.mask]) + angle_uv_list.append(angle_uv[key].pixels) + key_list.append(key) + + # Building a Visibility Index + max_ind = np.argmax(angle_list, axis=0) + max_ind_one_hot = np.zeros((max_ind.size, max_ind.max() + 1)) + max_ind_one_hot[np.arange(max_ind.size), max_ind.flatten()] = 1 + max_ind_one_hot = max_ind_one_hot.reshape(max_ind.shape + (-1,)) + mask_out_all = np.max(angle_list,axis=0) ==-1 + max_ind_one_hot[mask_out_all,:] = 0 + + uv_blending = {} + for i, key in enumerate(key_list): + uv_blending[key] = np.zeros(max_ind_one_hot[:,i].shape,np.float) + for j in range(i): + uv_blending[key] += max_ind_one_hot[:,j] + uv_blending[key] = np.clip(uv_blending[key],0, 1) + + face.uv_blending = uv_blending + face.angle_uv_list = angle_uv_list + print('Done in %.2f secs' % (time.time() - start)) + + # Projecting for each of the predefined views + start = time.time() + print('Projecting...') + results_dict = {} + for key, trg_angle in face.rotation_dict().items(): + face, results_dict[key] = 
self.run_iteration(face, key, trg_angle) + + final_uv = results_dict[key]['final_uv'] + print('Done in %.2f secs' % (time.time() - start)) + + if self.args.frontalize: + start = time.time() + print('Frontalizing...') + imgs = {} + masks = {} + heatmaps = {} + self.projector.perceptual_model.assign_placeholder('id_loss', self.args.use_id_loss_frontalize) + + imgs['frontal'], masks['frontal'], heatmaps['frontal'], _ = self.create_align_syn(face, trg_angle=[0, 0, 0], include_mask=face.uv_blending[key]) + generated_imgs, generated_dlatents = self.projector.run_projection(imgs, masks, heatmaps, face.id_features, iterations= self.args.iterations_frontalize) + results_dict['frontal'] = im_PIL2menpo(generated_imgs['frontal']) + results_dict['frontalize'] = [imgs, masks, heatmaps, face.id_features] + print('Done in %.2f secs' % (time.time() - start)) + + return final_uv, results_dict diff --git a/insightface/reconstruction/ostec/core/perceptual_model.py b/insightface/reconstruction/ostec/core/perceptual_model.py new file mode 100644 index 0000000000000000000000000000000000000000..599bcf659f65e5a300cae759cca1481012f3f944 --- /dev/null +++ b/insightface/reconstruction/ostec/core/perceptual_model.py @@ -0,0 +1,304 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. 
def unpack_bz2(src_path):
    """Decompress a ``.bz2`` archive next to itself.

    Parameters
    ----------
    src_path : str
        Path to a bzip2-compressed file. The destination path is formed
        by stripping the last 4 characters (the ``.bz2`` suffix).

    Returns
    -------
    str
        Path of the decompressed output file.
    """
    dst_path = src_path[:-4]
    # Context managers close both handles even on error — the original
    # leaked the BZ2File handle.
    with bz2.BZ2File(src_path) as src, open(dst_path, 'wb') as fp:
        fp.write(src.read())
    return dst_path
self.sess = tf.get_default_session() if sess is None else sess + K.set_session(self.sess) + self.epsilon = 0.00000001 + self.args = args + self.lr = args.lr + self.decay_rate = args.decay_rate + self.decay_steps = args.decay_steps + self.img_size = args.image_size + self.layer = args.use_vgg_layer + self.vgg_loss = args.use_vgg_loss + self.face_mask = args.face_mask + if (self.layer <= 0 or self.vgg_loss <= self.epsilon): + self.vgg_loss = None + self.pixel_loss = args.use_pixel_loss + if (self.pixel_loss <= self.epsilon): + self.pixel_loss = None + self.mssim_loss = args.use_mssim_loss + if (self.mssim_loss <= self.epsilon): + self.mssim_loss = None + self.lpips_loss = args.use_lpips_loss + if (self.lpips_loss <= self.epsilon): + self.lpips_loss = None + self.l1_penalty = args.use_l1_penalty + if (self.l1_penalty <= self.epsilon): + self.l1_penalty = None + self.adaptive_loss = args.use_adaptive_loss + self.sharpen_input = args.sharpen_input + self.batch_size = batch_size + if perc_model is not None and self.lpips_loss is not None: + self.perc_model = perc_model + else: + self.perc_model = None + self.ref_img = None + self.ref_weight = None + self.ref_heatmaps = None + self.perceptual_model = None + self.ref_img_features = None + self.features_weight = None + self.loss = None + self.discriminator_loss = args.use_discriminator_loss + if (self.discriminator_loss <= self.epsilon): + self.discriminator_loss = None + if self.discriminator_loss is not None: + self.discriminator = None + self.stub = create_stub(batch_size) + self.landmark_loss = args.use_landmark_loss + + + + def add_placeholder(self, var_name): + var_val = getattr(self, var_name) + setattr(self, var_name + "_placeholder", tf.placeholder(var_val.dtype, shape=var_val.get_shape())) + setattr(self, var_name + "_op", var_val.assign(getattr(self, var_name + "_placeholder"))) + + def assign_placeholder(self, var_name, var_val): + self.sess.run(getattr(self, var_name + "_op"), {getattr(self, var_name + 
"_placeholder"): var_val}) + + def build_perceptual_model(self, generator, discriminator=None): + # Learning rate + global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step") + incremented_global_step = tf.assign_add(global_step, 1) + self._reset_global_step = tf.assign(global_step, 0) + self.learning_rate = tf.train.exponential_decay(self.lr, incremented_global_step, + self.decay_steps, self.decay_rate, staircase=True) + self.sess.run([self._reset_global_step]) + + if self.discriminator_loss is not None: + self.discriminator = discriminator + + generated_image_tensor = generator.generated_image + generated_image = tf.image.resize_nearest_neighbor(generated_image_tensor, + (self.img_size, self.img_size), align_corners=True) + self.generated_img = generated_image + self.ref_img = tf.get_variable('ref_img', shape=generated_image.shape, + dtype='float32', initializer=tf.initializers.zeros()) + self.ref_weight = tf.get_variable('ref_weight', shape=generated_image.shape, + dtype='float32', initializer=tf.initializers.zeros()) + self.add_placeholder("ref_img") + self.add_placeholder("ref_weight") + + if (self.vgg_loss is not None): + vgg16 = VGG16(include_top=False, input_shape=(self.img_size, self.img_size, 3)) + self.perceptual_model = Model(vgg16.input, vgg16.layers[self.layer].output) + generated_img_features = self.perceptual_model(preprocess_input(self.ref_weight * generated_image)) + dummy_im = np.zeros([self.args.batch_size, self.img_size, self.img_size, 3],np.float32) + self.perceptual_model.predict_on_batch(dummy_im) + self.ref_img_features = tf.get_variable('ref_img_features', shape=generated_img_features.shape, + dtype='float32', initializer=tf.initializers.zeros()) + self.features_weight = tf.get_variable('features_weight', shape=generated_img_features.shape, + dtype='float32', initializer=tf.initializers.zeros()) + self.sess.run([self.features_weight.initializer, self.features_weight.initializer]) + 
self.add_placeholder("ref_img_features") + self.add_placeholder("features_weight") + + landmark_model = Landmark_Handler(self.args, self.sess, generated_image/255) + landmark_model.load_model() + ibug84to68_ind = list(range(0, 33, 2)) + list(range(33, 84)) + self.generated_heatmaps = tf.gather(landmark_model.lms_heatmap_prediction, ibug84to68_ind, axis=3) + self.generated_landmarks = tf.gather(landmark_model.pts_predictions, ibug84to68_ind, axis=1) + + self.ref_heatmaps = tf.get_variable('ref_heatmaps', shape=self.generated_heatmaps.shape, + dtype='float32', initializer=tf.initializers.zeros()) + self.add_placeholder("ref_heatmaps") + + self.generated_id, vars, _ = arcface_handler.get_input_features(generated_image / 255, self.generated_landmarks[:, :, ::-1]) + self.init_id_vars = tf.variables_initializer(vars) + + self.org_features = tf.get_variable('org_features', shape=self.generated_id.shape, + dtype='float32', initializer=tf.initializers.zeros()) + self.add_placeholder("org_features") + self.id_loss = tf.get_variable('id_loss', shape=(), + dtype='float32', initializer=tf.initializers.zeros()) + self.add_placeholder('id_loss') + + if self.perc_model is not None and self.lpips_loss is not None: + img1 = tflib.convert_images_from_uint8(self.ref_weight * self.ref_img, nhwc_to_nchw=True) + img2 = tflib.convert_images_from_uint8(self.ref_weight * generated_image, nhwc_to_nchw=True) + + self.loss = 0 + # L1 loss on VGG16 features + if (self.vgg_loss is not None): + if self.adaptive_loss: + self.loss += self.vgg_loss * tf_custom_adaptive_loss(self.features_weight * self.ref_img_features, self.features_weight * generated_img_features) + else: + self.loss += self.vgg_loss * tf_custom_logcosh_loss(self.features_weight * self.ref_img_features, self.features_weight * generated_img_features) + # + logcosh loss on image pixels + if (self.pixel_loss is not None): + if self.adaptive_loss: + self.loss += self.pixel_loss * tf_custom_adaptive_rgb_loss(self.ref_weight * 
self.ref_img, self.ref_weight * generated_image) + else: + self.loss += self.pixel_loss * tf_custom_logcosh_loss(self.ref_weight * self.ref_img, self.ref_weight * generated_image) + # + MS-SIM loss on image pixels + if (self.mssim_loss is not None): + self.loss += self.mssim_loss * tf.math.reduce_mean(1-tf.image.ssim_multiscale(self.ref_weight * self.ref_img, self.ref_weight * generated_image, 1)) + # + extra perceptual loss on image pixels + if self.perc_model is not None and self.lpips_loss is not None: + self.loss += self.lpips_loss * tf.math.reduce_mean(self.perc_model.get_output_for(img1, img2)) + # + L1 penalty on dlatent weights + if self.l1_penalty is not None: + self.loss += self.l1_penalty * 512 * tf.math.reduce_mean(tf.math.abs(generator.dlatent_variable-generator.get_dlatent_avg())) + # discriminator loss (realism) + if self.discriminator_loss is not None: + self.loss += self.discriminator_loss * tf.math.reduce_mean(self.discriminator.get_output_for( + tflib.convert_images_from_uint8(generated_image_tensor, nhwc_to_nchw=True), self.stub)) + # - discriminator_network.get_output_for(tflib.convert_images_from_uint8(ref_img, nhwc_to_nchw=True), stub) + if self.landmark_loss is not None: + self.loss += self.landmark_loss * tf.math.reduce_mean(tf.reduce_sum(tf.pow(self.ref_heatmaps - self.generated_heatmaps, 2), 2)) + if self.id_loss is not None: + self.id_loss_comp = tf.losses.cosine_distance(self.generated_id, self.org_features, 1) + self.loss += self.id_loss * self.id_loss_comp + + # Define Optimizer + vars_to_optimize = generator.dlatent_variable + vars_to_optimize = vars_to_optimize if isinstance(vars_to_optimize, list) else [vars_to_optimize] + if self.args.optimizer == 'lbfgs': + self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.loss, var_list=vars_to_optimize, method='L-BFGS-B', options={'maxiter': self.args.iterations}) + else: + if self.args.optimizer == 'ggt': + self.optimizer = 
tf.contrib.opt.GGTOptimizer(learning_rate=self.learning_rate) + else: + self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) + self.min_op = self.optimizer.minimize(self.loss, var_list=[vars_to_optimize]) + self.sess.run(tf.variables_initializer(self.optimizer.variables())) + #min_op = optimizer.minimize(self.sess) + #optim_results = tfp.optimizer.lbfgs_minimize(make_val_and_grad_fn(get_loss), initial_position=vars_to_optimize, num_correction_pairs=10, tolerance=1e-8) + K.manual_variable_initialization(True) + self.sess.graph.finalize() # Graph is read-only after this statement. + + + def set_reference_images(self, images_PIL, masks_PIL, heatmaps, id_features): + assert(len(images_PIL) != 0 and len(images_PIL) <= self.batch_size) + loaded_image = load_images(images_PIL, self.img_size, sharpen=self.sharpen_input) + loaded_mask = load_images(masks_PIL, self.img_size, sharpen=self.sharpen_input, im_type='L') + heatmaps = np.transpose(np.array(heatmaps), [0, 2, 3, 1]) + input_size = np.array(heatmaps).shape[2] + output_size = int(self.ref_heatmaps.shape[1]) + bin_size = input_size // output_size + loaded_heatmaps = heatmaps.reshape((heatmaps.shape[0], output_size, bin_size, + output_size, bin_size, 68)).max(4).max(2) + image_features = None + if self.perceptual_model is not None: + image_features = self.perceptual_model.predict_on_batch(preprocess_input(np.array(loaded_image))) + weight_mask = np.ones(self.features_weight.shape) + + if self.face_mask: + image_mask = np.zeros(self.ref_weight.shape) + for (i, (im, mask)) in enumerate(zip(loaded_image, loaded_mask)): + try: + mask = np.array(mask)/255 + mask = np.expand_dims(mask,axis=-1) + mask = np.ones(im.shape,np.float32) * mask #? 
+ except Exception as e: + print("Exception in mask handling for mask") + traceback.print_exc() + mask = np.ones(im.shape[:2],np.uint8) + mask = np.ones(im.shape,np.float32) * np.expand_dims(mask,axis=-1) + image_mask[i] = mask + img = None + else: + image_mask = np.ones(self.ref_weight.shape) + + if len(images_PIL) != self.batch_size: + if image_features is not None: + features_space = list(self.features_weight.shape[1:]) + existing_features_shape = [len(images_PIL)] + features_space + empty_features_shape = [self.batch_size - len(images_PIL)] + features_space + existing_examples = np.ones(shape=existing_features_shape) + empty_examples = np.zeros(shape=empty_features_shape) + weight_mask = np.vstack([existing_examples, empty_examples]) + image_features = np.vstack([image_features, np.zeros(empty_features_shape)]) + + images_space = list(self.ref_weight.shape[1:]) + existing_images_space = [len(images_PIL)] + images_space + empty_images_space = [self.batch_size - len(images_PIL)] + images_space + existing_images = np.ones(shape=existing_images_space) + empty_images = np.zeros(shape=empty_images_space) + image_mask = image_mask * np.vstack([existing_images, empty_images]) + loaded_image = np.vstack([loaded_image, np.zeros(empty_images_space)]) + + if image_features is not None: + self.assign_placeholder("features_weight", weight_mask) + self.assign_placeholder("ref_img_features", image_features) + self.assign_placeholder("ref_weight", image_mask) + self.assign_placeholder("ref_img", loaded_image) + self.assign_placeholder("org_features", id_features) + self.assign_placeholder("ref_heatmaps", loaded_heatmaps) + + def optimize(self, vars_to_optimize, iterations=200): + self.sess.run(self._reset_global_step) + self.sess.run(self.init_id_vars) + fetch_ops = [self.min_op, self.loss, self.id_loss_comp, self.learning_rate] + for _ in range(iterations): + if self.args.optimizer == 'lbfgs': + self.optimizer.minimize(self.sess, fetches=[vars_to_optimize, self.loss]) + yield 
{"loss":self.loss.eval()} + else: + _, loss, id_loss, lr = self.sess.run(fetch_ops) + yield {"loss":loss,"id_loss":id_loss,"lr":lr} diff --git a/insightface/reconstruction/ostec/core/projection_handler.py b/insightface/reconstruction/ostec/core/projection_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..d6098d8cb1a9ee8e7b1220637f827fb0ebf66e33 --- /dev/null +++ b/insightface/reconstruction/ostec/core/projection_handler.py @@ -0,0 +1,199 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. +# To view a copy of this license, see LICENSE + +import os +import argparse +import pickle +from tqdm.auto import tqdm +import PIL.Image +from PIL import ImageFilter +import numpy as np +import external.stylegan2.dnnlib.tflib as tflib +from external.stylegan2 import pretrained_networks +from core.generator_model import Generator +from core.perceptual_model import PerceptualModel, load_images +import external.stylegan2.dnnlib +from keras.models import load_model +from keras.applications.resnet50 import preprocess_input + + +def split_to_batches(l, n): + for i in range(0, len(l), n): + yield l[i:i + n] + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +class Projection_Handler(): + def __init__(self, args): + self.args = args + + # Initialize generator and perceptual model + tflib.init_tf() + generator_network, discriminator_network, Gs_network = pretrained_networks.load_networks(args.model_url) + self.generator = Generator(Gs_network, args.batch_size, randomize_noise=args.randomize_noise) + + if (args.dlatent_avg != ''): + self.generator.set_dlatent_avg(np.load(args.dlatent_avg)) + + perc_model = None + if (args.use_lpips_loss > 0.00000001): + if 
external.stylegan2.dnnlib.util.is_url(args.vgg_url): + stream = external.stylegan2.dnnlib.util.open_url(args.vgg_url, cache_dir='../.stylegan2-cache') + else: + stream = open(args.vgg_url, 'rb') + with stream as f: + perc_model = pickle.load(f) + + + self.perceptual_model = PerceptualModel(args, perc_model=perc_model, batch_size=args.batch_size) + + self.ff_model = None + if (self.ff_model is None): + if os.path.exists(self.args.load_resnet): + from keras.applications.resnet50 import preprocess_input + print("Loading ResNet Model:") + self.ff_model = load_model(self.args.load_resnet) + # self.ff_model._make_predict_function() + dummy_im = np.zeros([args.batch_size, args.resnet_image_size, args.resnet_image_size, 3], np.uint8) + self.ff_model.predict(preprocess_input(dummy_im)) + if (self.ff_model is None): + if os.path.exists(self.args.load_effnet): + from efficientnet import preprocess_input + print("Loading EfficientNet Model:") + self.ff_model = load_model(self.args.load_effnet) + + self.perceptual_model.build_perceptual_model(self.generator, discriminator_network) + self.perceptual_model.assign_placeholder('id_loss', args.use_id_loss) + + def run_projection(self, input_images, masks, heatmaps, id_features, iterations=None): + n_iteration = self.args.iterations + if iterations is not None: + n_iteration = iterations + return_imgs = {} + return_dlatents = {} + # Optimize (only) dlatents by minimizing perceptual loss between reference and generated images in feature space + for names in split_to_batches(list(input_images.keys()), self.args.batch_size): + #split_to_batches(list(input_images.keys()), self.args.batch_size): + #tqdm(split_to_batches(list(input_images.keys()), self.args.batch_size), + #total=len(input_images) // self.args.batch_size): + # tqdm._instances.clear() + images_batch = [input_images[x] for x in names] + masks_batch = [masks[x] for x in names] + heatmaps_batch = [heatmaps[x] for x in names] + # if args.output_video: + # video_out = {} + # for 
name in names: + # video_out[name] = cv2.VideoWriter(os.path.join(args.video_dir, f'{name}.avi'), + # cv2.VideoWriter_fourcc(*args.video_codec), args.video_frame_rate, + # (args.video_size, args.video_size)) + + ## REGRESSION + dlatents = None + if (self.args.load_last != ''): # load previous dlatents for initialization + for name in names: + dl = np.expand_dims(np.load(os.path.join(self.args.load_last, f'{name}.npy')), axis=0) + if (dlatents is None): + dlatents = dl + else: + dlatents = np.vstack((dlatents, dl)) + else: + if (self.ff_model is not None): # predict initial dlatents with ResNet model + if (self.args.use_preprocess_input): + dlatents = self.ff_model.predict( + preprocess_input(load_images(images_batch, image_size=self.args.resnet_image_size))) + else: + dlatents = self.ff_model.predict(load_images(images_batch, image_size=self.args.resnet_image_size)) + if dlatents is not None: + self.generator.set_dlatents(dlatents) + + ## OPTIMIZATION + self.perceptual_model.set_reference_images(images_batch, masks_batch, heatmaps_batch, id_features) + + op = self.perceptual_model.optimize(self.generator.dlatent_variable, iterations=n_iteration) + pbar = tqdm(op, leave=False, total=n_iteration) + vid_count = 0 + best_loss = None + best_dlatent = None + avg_loss_count = 0 + if self.args.early_stopping: + avg_loss = prev_loss = None + + for loss_dict in pbar: + if self.args.early_stopping: # early stopping feature + if prev_loss is not None: + if avg_loss is not None: + avg_loss = 0.5 * avg_loss + (prev_loss - loss_dict["loss"]) + if avg_loss < self.args.early_stopping_threshold: # count while under threshold; else reset + avg_loss_count += 1 + else: + avg_loss_count = 0 + if avg_loss_count > self.args.early_stopping_patience: # stop once threshold is reached + print("") + break + else: + avg_loss = prev_loss - loss_dict["loss"] + pbar.set_description( + " ".join(names) + ": " + "; ".join(["{} {:.4f}".format(k, v) for k, v in loss_dict.items()])) + if best_loss is 
None or loss_dict["loss"] < best_loss: + if best_dlatent is None or self.args.average_best_loss <= 0.00000001: + best_dlatent = self.generator.get_dlatents() + else: + best_dlatent = 0.25 * best_dlatent + 0.75 * self.generator.get_dlatents() + if self.args.use_best_loss: + self.generator.set_dlatents(best_dlatent) + best_loss = loss_dict["loss"] + # if self.args.output_video and (vid_count % self.args.video_skip == 0): + # batch_frames = self.generator.generate_images() + # for i, name in enumerate(names): + # video_frame = PIL.Image.fromarray(batch_frames[i], 'RGB').resize( + # (self.args.video_size, self.args.video_size), PIL.Image.LANCZOS) + # video_out[name].write(cv2.cvtColor(np.array(video_frame).astype('uint8'), cv2.COLOR_RGB2BGR)) + self.generator.stochastic_clip_dlatents() + prev_loss = loss_dict["loss"] + if not self.args.use_best_loss: + best_loss = prev_loss + # pbar.set_postfix(loss="{:.4f}".format(best_loss)) + print(" ".join(names), " Loss {:.4f}".format(best_loss)) + + # if self.args.output_video: + # for name in names: + # video_out[name].release() + + # Generate images from found dlatents and save them + if self.args.use_best_loss: + self.generator.set_dlatents(best_dlatent) + + generated_images = self.generator.generate_images() + generated_dlatents = self.generator.get_dlatents() + for img_array, dlatent, img_path, img_name in zip(generated_images, generated_dlatents, images_batch, + names): + mask_img = None + if self.args.composite_mask and (self.args.load_mask or self.args.face_mask): + _, im_name = os.path.split(img_path) + mask_img = os.path.join(self.args.mask_dir, f'{im_name}') + if self.args.composite_mask and mask_img is not None and os.path.isfile(mask_img): + orig_img = PIL.Image.open(img_path).convert('RGB') + width, height = orig_img.size + imask = PIL.Image.open(mask_img).convert('L').resize((width, height)) + imask = imask.filter(ImageFilter.GaussianBlur(self.args.composite_blur)) + mask = np.array(imask) / 255 + mask = 
np.expand_dims(mask, axis=-1) + img_array = mask * np.array(img_array) + (1.0 - mask) * np.array(orig_img) + img_array = img_array.astype(np.uint8) + img = PIL.Image.fromarray(img_array, 'RGB') + return_imgs[img_name] = img + return_dlatents[img_name] = dlatent + + self.generator.reset_dlatents() + return return_imgs, return_dlatents diff --git a/insightface/reconstruction/ostec/environment.yml b/insightface/reconstruction/ostec/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..7854c7e9fca3ad65d963ef1a526d2bd8161724da --- /dev/null +++ b/insightface/reconstruction/ostec/environment.yml @@ -0,0 +1,216 @@ +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=4.5=1_gnu + - aiocontextvars=0.2.2=py_0 + - alsa-lib=1.2.3=h516909a_0 + - apptools=5.1.0=pyh44b312d_0 + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.17.1=h7f98852_1 + - ca-certificates=2021.9.30=h06a4308_1 + - cairo=1.16.0=h6cf1ce9_1008 + - certifi=2020.12.5=py36h06a4308_0 + - cloudpickle=2.0.0=pyhd8ed1ab_0 + - configobj=5.0.6=py_0 + - contextvars=2.4=py_0 + - curl=7.78.0=hea6ffbf_0 + - cycler=0.10.0=py_2 + - cytoolz=0.11.0=py36h8f6f2f9_3 + - cyvlfeat=0.7.0=py36h0280710_0 + - dask-core=2.25.0=py_0 + - dbus=1.13.6=h48d8840_2 + - decorator=4.4.2=py_0 + - double-conversion=3.1.5=h9c3ff4c_2 + - eigen=3.3.9=h4bd325d_1 + - enum34=1.1.10=py36h9f0ad1d_2 + - envisage=6.0.1=pyhd8ed1ab_0 + - expat=2.4.1=h9c3ff4c_0 + - ffmpeg=4.3.2=hca11adc_0 + - fontconfig=2.13.1=hba837de_1005 + - freetype=2.10.4=h0708190_1 + - gettext=0.19.8.1=h0b5b191_1005 + - gl2ps=1.4.2=h0708190_0 + - glcontext=2.3.4=py36hc4f0c31_0 + - glew=2.1.0=h9c3ff4c_2 + - glib=2.68.3=h9c3ff4c_0 + - glib-tools=2.68.3=h9c3ff4c_0 + - gmp=6.2.1=h58526e2_0 + - gnutls=3.6.13=h85f3911_1 + - graphite2=1.3.13=h58526e2_1001 + - gst-plugins-base=1.18.4=hf529b03_2 + - gstreamer=1.18.4=h76c114f_2 + - h5py=2.10.0=py36hd6299e0_1 + - harfbuzz=2.8.2=h83ec7ef_0 + - hdf4=4.2.15=h10796ff_3 + - 
hdf5=1.10.6=nompi_h6a2412b_1114 + - icu=68.1=h58526e2_0 + - imagecodecs-lite=2019.12.3=py36h92226af_3 + - imageio=2.9.0=py_0 + - immutables=0.15=py36h8f6f2f9_0 + - importlib-metadata=4.8.1=py36h5fab9bb_0 + - importlib_metadata=4.8.1=hd8ed1ab_0 + - importlib_resources=5.2.2=pyhd8ed1ab_0 + - jasper=1.900.1=h07fcdf6_1006 + - jbig=2.1=h7f98852_2003 + - jpeg=9d=h36c2ea0_0 + - jsoncpp=1.9.4=h4bd325d_3 + - kiwisolver=1.3.1=py36h605e78d_1 + - krb5=1.19.2=hcc1bbae_0 + - lame=3.100=h7f98852_1001 + - lcms2=2.12=hddcbb42_0 + - ld_impl_linux-64=2.35.1=h7274673_9 + - lerc=2.2.1=h9c3ff4c_0 + - libblas=3.9.0=11_linux64_openblas + - libcblas=3.9.0=11_linux64_openblas + - libclang=11.1.0=default_ha53f305_1 + - libcurl=7.78.0=h2574ce0_0 + - libdeflate=1.7=h7f98852_5 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=h516909a_1 + - libevent=2.1.10=hcdb4288_3 + - libffi=3.3=he6710b0_2 + - libgcc-ng=9.3.0=h5101ec6_17 + - libgfortran-ng=11.2.0=h69a702a_11 + - libgfortran5=11.2.0=h5c6108e_11 + - libglib=2.68.3=h3e27bee_0 + - libglu=9.0.0=he1b5a44_1001 + - libgomp=9.3.0=h5101ec6_17 + - libiconv=1.16=h516909a_0 + - liblapack=3.9.0=11_linux64_openblas + - liblapacke=3.9.0=11_linux64_openblas + - libllvm11=11.1.0=hf817b99_2 + - libnetcdf=4.8.0=nompi_hcd642e3_103 + - libnghttp2=1.43.0=h812cca2_0 + - libogg=1.3.4=h7f98852_1 + - libopenblas=0.3.17=pthreads_h8fe5266_1 + - libopencv=4.5.2=py36hb84549a_1 + - libopus=1.3.1=h7f98852_1 + - libpng=1.6.37=h21135ba_2 + - libpq=13.3=hd57d9b9_0 + - libprotobuf=3.16.0=h780b84a_0 + - libssh2=1.9.0=ha56f1ee_6 + - libstdcxx-ng=9.3.0=hd4cf53a_17 + - libtheora=1.1.1=h7f98852_1005 + - libtiff=4.3.0=hf544144_1 + - libuuid=2.32.1=h7f98852_1000 + - libvorbis=1.3.7=h9c3ff4c_0 + - libwebp-base=1.2.0=h7f98852_2 + - libxcb=1.13=h7f98852_1003 + - libxkbcommon=1.0.3=he3ba5ed_0 + - libxml2=2.9.12=h72842e0_0 + - libzip=1.8.0=h4de3113_0 + - loguru=0.5.3=py36h5fab9bb_2 + - lz4-c=1.9.3=h9c3ff4c_1 + - menpo=0.11.0=py36h355b2fd_0 + - menpo3d=0.8.3=py36h355b2fd_0 + - 
moderngl=5.6.4=py36h284efc9_0 + - mysql-common=8.0.25=ha770c72_2 + - mysql-libs=8.0.25=hfa10184_2 + - ncurses=6.2=he6710b0_1 + - nettle=3.6=he412f7d_0 + - networkx=2.5.1=pyhd8ed1ab_0 + - nspr=4.30=h9c3ff4c_0 + - nss=3.67=hb5efdd6_0 + - olefile=0.46=pyh9f0ad1d_1 + - opencv=4.5.2=py36h5fab9bb_1 + - openh264=2.1.1=h780b84a_0 + - openjpeg=2.4.0=hb52868f_1 + - openssl=1.1.1l=h7f8727e_0 + - pathlib=1.0.1=py36h5fab9bb_4 + - pcre=8.45=h9c3ff4c_0 + - pillow=8.3.1=py36h676a545_0 + - pip=21.0.1=py36h06a4308_0 + - pixman=0.40.0=h36c2ea0_0 + - proj=7.2.0=h277dcde_2 + - pthread-stubs=0.4=h36c2ea0_1001 + - pugixml=1.11.4=h9c3ff4c_0 + - py-opencv=4.5.2=py36hcb3619a_1 + - pyface=7.3.0=pyh44b312d_1 + - pygments=2.10.0=pyhd8ed1ab_0 + - pyparsing=2.4.7=pyh9f0ad1d_0 + - pyqt=5.12.3=py36h5fab9bb_7 + - pyqt-impl=5.12.3=py36h7ec31b9_7 + - pyqt5-sip=4.19.18=py36hc4f0c31_7 + - pyqtchart=5.12=py36h7ec31b9_7 + - pyqtwebengine=5.12.1=py36h7ec31b9_7 + - python=3.6.13=h12debd9_1 + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python_abi=3.6=2_cp36m + - pywavelets=1.1.1=py36h92226af_3 + - qt=5.12.9=hda022c4_4 + - readline=8.1=h27cfd23_0 + - scikit-image=0.17.2=py36h284efc9_4 + - scipy=1.5.3=py36h9e8f40b_0 + - setuptools=58.0.4=py36h06a4308_0 + - six=1.16.0=pyh6c4a22f_0 + - sqlite=3.36.0=hc218d9a_0 + - tbb=2020.2=h4bd325d_4 + - tbb-devel=2020.2=h4bd325d_4 + - tifffile=2019.7.26.2=py36_0 + - tk=8.6.11=h1ccaba5_0 + - toolz=0.11.1=py_0 + - tornado=6.1=py36h8f6f2f9_1 + - traits=6.2.0=py36h8f6f2f9_0 + - traitsui=7.2.0=pyhd8ed1ab_0 + - typing_extensions=3.10.0.2=pyha770c72_0 + - utfcpp=3.2.1=ha770c72_0 + - vlfeat=0.9.20=h14c3975_1002 + - vtk=9.0.1=no_osmesa_py36hfa3a401_109 + - wheel=0.37.0=pyhd3eb1b0_1 + - x264=1!161.3030=h7f98852_1 + - xorg-kbproto=1.0.7=h7f98852_1002 + - xorg-libice=1.0.10=h7f98852_0 + - xorg-libsm=1.2.3=hd9c2040_1000 + - xorg-libx11=1.7.2=h7f98852_0 + - xorg-libxau=1.0.9=h7f98852_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xorg-libxext=1.3.4=h7f98852_1 + - 
xorg-libxrender=0.9.10=h7f98852_1003 + - xorg-libxt=1.2.1=h7f98852_2 + - xorg-renderproto=0.11.1=h7f98852_1002 + - xorg-xextproto=7.3.0=h7f98852_1002 + - xorg-xproto=7.0.31=h7f98852_1007 + - xz=5.2.5=h7b6447c_0 + - yaml=0.2.5=h516909a_0 + - zipp=3.6.0=pyhd8ed1ab_0 + - zlib=1.2.11=h7b6447c_3 + - zstd=1.5.0=ha95c52a_0 + - pip: + - absl-py==0.14.1 + - astor==0.8.1 + - cached-property==1.5.2 + - charset-normalizer==2.0.7 + - cmake==3.21.3 + - dataclasses==0.8 + - dlib==19.22.1 + - dominate==2.6.0 + - future==0.18.2 + - gast==0.5.2 + - google-pasta==0.2.0 + - grpcio==1.41.0 + - idna==3.3 + - imutils==0.5.4 + - keras==2.3.0 + - keras-applications==1.0.8 + - keras-preprocessing==1.0.5 + - kornia==0.5.5 + - markdown==3.3.4 + - matplotlib==3.1.3 + - mayavi==4.7.2 + - numpy==1.16.1 + - protobuf==3.18.1 + - pyyaml==6.0 + - requests==2.26.0 + - tensorboard==1.14.0 + - tensorflow-estimator==1.14.0 + - tensorflow-gpu==1.14.0 + - termcolor==1.1.0 + - torch==1.6.0 + - torchvision==0.7.0 + - tqdm==4.62.3 + - trimesh==3.9.20 + - urllib3==1.26.7 + - werkzeug==2.0.2 + - wrapt==1.13.2 diff --git a/insightface/reconstruction/ostec/external/arcface50.py b/insightface/reconstruction/ostec/external/arcface50.py new file mode 100644 index 0000000000000000000000000000000000000000..638fca08b4830e73f2412cb31feb29e30f224619 --- /dev/null +++ b/insightface/reconstruction/ostec/external/arcface50.py @@ -0,0 +1,297 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. 
+# To view a copy of this license, see LICENSE + +import tensorflow as tf + +__weights_dict = dict() + +is_train = False + +def load_weights(weight_file): + import numpy as np + + if weight_file == None: + return + + try: + weights_dict = np.load(weight_file).item() + except: + weights_dict = np.load(weight_file, encoding='bytes').item() + + return weights_dict + + +def KitModel(weight_file = None, data = tf.placeholder(tf.float32, shape = (None, 112, 112, 3), name = 'data')): + global __weights_dict + __weights_dict = load_weights(weight_file) + + minusscalar0_second = tf.constant(__weights_dict['minusscalar0_second']['value'], name='minusscalar0_second') + + mulscalar0_second = tf.constant(__weights_dict['mulscalar0_second']['value'], name='mulscalar0_second') + minusscalar0 = data - minusscalar0_second + mulscalar0 = minusscalar0 * mulscalar0_second + conv0_pad = tf.pad(mulscalar0, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + conv0 = convolution(conv0_pad, group=1, strides=[1, 1], padding='VALID', name='conv0') + bn0 = batch_normalization(conv0, variance_epsilon=1.9999999494757503e-05, name='bn0') + relu0 = prelu(bn0, name='relu0') + stage1_unit1_bn1 = batch_normalization(relu0, variance_epsilon=1.9999999494757503e-05, name='stage1_unit1_bn1') + stage1_unit1_conv1sc = convolution(relu0, group=1, strides=[2, 2], padding='VALID', name='stage1_unit1_conv1sc') + stage1_unit1_conv1_pad = tf.pad(stage1_unit1_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage1_unit1_conv1 = convolution(stage1_unit1_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage1_unit1_conv1') + stage1_unit1_sc = batch_normalization(stage1_unit1_conv1sc, variance_epsilon=1.9999999494757503e-05, name='stage1_unit1_sc') + stage1_unit1_bn2 = batch_normalization(stage1_unit1_conv1, variance_epsilon=1.9999999494757503e-05, name='stage1_unit1_bn2') + stage1_unit1_relu1 = prelu(stage1_unit1_bn2, name='stage1_unit1_relu1') + stage1_unit1_conv2_pad = tf.pad(stage1_unit1_relu1, paddings 
= [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage1_unit1_conv2 = convolution(stage1_unit1_conv2_pad, group=1, strides=[2, 2], padding='VALID', name='stage1_unit1_conv2') + stage1_unit1_bn3 = batch_normalization(stage1_unit1_conv2, variance_epsilon=1.9999999494757503e-05, name='stage1_unit1_bn3') + plus0 = stage1_unit1_bn3 + stage1_unit1_sc + stage1_unit2_bn1 = batch_normalization(plus0, variance_epsilon=1.9999999494757503e-05, name='stage1_unit2_bn1') + stage1_unit2_conv1_pad = tf.pad(stage1_unit2_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage1_unit2_conv1 = convolution(stage1_unit2_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage1_unit2_conv1') + stage1_unit2_bn2 = batch_normalization(stage1_unit2_conv1, variance_epsilon=1.9999999494757503e-05, name='stage1_unit2_bn2') + stage1_unit2_relu1 = prelu(stage1_unit2_bn2, name='stage1_unit2_relu1') + stage1_unit2_conv2_pad = tf.pad(stage1_unit2_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage1_unit2_conv2 = convolution(stage1_unit2_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage1_unit2_conv2') + stage1_unit2_bn3 = batch_normalization(stage1_unit2_conv2, variance_epsilon=1.9999999494757503e-05, name='stage1_unit2_bn3') + plus1 = stage1_unit2_bn3 + plus0 + stage1_unit3_bn1 = batch_normalization(plus1, variance_epsilon=1.9999999494757503e-05, name='stage1_unit3_bn1') + stage1_unit3_conv1_pad = tf.pad(stage1_unit3_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage1_unit3_conv1 = convolution(stage1_unit3_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage1_unit3_conv1') + stage1_unit3_bn2 = batch_normalization(stage1_unit3_conv1, variance_epsilon=1.9999999494757503e-05, name='stage1_unit3_bn2') + stage1_unit3_relu1 = prelu(stage1_unit3_bn2, name='stage1_unit3_relu1') + stage1_unit3_conv2_pad = tf.pad(stage1_unit3_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage1_unit3_conv2 = convolution(stage1_unit3_conv2_pad, group=1, strides=[1, 1], 
padding='VALID', name='stage1_unit3_conv2') + stage1_unit3_bn3 = batch_normalization(stage1_unit3_conv2, variance_epsilon=1.9999999494757503e-05, name='stage1_unit3_bn3') + plus2 = stage1_unit3_bn3 + plus1 + stage2_unit1_bn1 = batch_normalization(plus2, variance_epsilon=1.9999999494757503e-05, name='stage2_unit1_bn1') + stage2_unit1_conv1sc = convolution(plus2, group=1, strides=[2, 2], padding='VALID', name='stage2_unit1_conv1sc') + stage2_unit1_conv1_pad = tf.pad(stage2_unit1_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage2_unit1_conv1 = convolution(stage2_unit1_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage2_unit1_conv1') + stage2_unit1_sc = batch_normalization(stage2_unit1_conv1sc, variance_epsilon=1.9999999494757503e-05, name='stage2_unit1_sc') + stage2_unit1_bn2 = batch_normalization(stage2_unit1_conv1, variance_epsilon=1.9999999494757503e-05, name='stage2_unit1_bn2') + stage2_unit1_relu1 = prelu(stage2_unit1_bn2, name='stage2_unit1_relu1') + stage2_unit1_conv2_pad = tf.pad(stage2_unit1_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage2_unit1_conv2 = convolution(stage2_unit1_conv2_pad, group=1, strides=[2, 2], padding='VALID', name='stage2_unit1_conv2') + stage2_unit1_bn3 = batch_normalization(stage2_unit1_conv2, variance_epsilon=1.9999999494757503e-05, name='stage2_unit1_bn3') + plus3 = stage2_unit1_bn3 + stage2_unit1_sc + stage2_unit2_bn1 = batch_normalization(plus3, variance_epsilon=1.9999999494757503e-05, name='stage2_unit2_bn1') + stage2_unit2_conv1_pad = tf.pad(stage2_unit2_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage2_unit2_conv1 = convolution(stage2_unit2_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage2_unit2_conv1') + stage2_unit2_bn2 = batch_normalization(stage2_unit2_conv1, variance_epsilon=1.9999999494757503e-05, name='stage2_unit2_bn2') + stage2_unit2_relu1 = prelu(stage2_unit2_bn2, name='stage2_unit2_relu1') + stage2_unit2_conv2_pad = tf.pad(stage2_unit2_relu1, paddings = [[0, 
0], [1, 1], [1, 1], [0, 0]]) + stage2_unit2_conv2 = convolution(stage2_unit2_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage2_unit2_conv2') + stage2_unit2_bn3 = batch_normalization(stage2_unit2_conv2, variance_epsilon=1.9999999494757503e-05, name='stage2_unit2_bn3') + plus4 = stage2_unit2_bn3 + plus3 + stage2_unit3_bn1 = batch_normalization(plus4, variance_epsilon=1.9999999494757503e-05, name='stage2_unit3_bn1') + stage2_unit3_conv1_pad = tf.pad(stage2_unit3_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage2_unit3_conv1 = convolution(stage2_unit3_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage2_unit3_conv1') + stage2_unit3_bn2 = batch_normalization(stage2_unit3_conv1, variance_epsilon=1.9999999494757503e-05, name='stage2_unit3_bn2') + stage2_unit3_relu1 = prelu(stage2_unit3_bn2, name='stage2_unit3_relu1') + stage2_unit3_conv2_pad = tf.pad(stage2_unit3_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage2_unit3_conv2 = convolution(stage2_unit3_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage2_unit3_conv2') + stage2_unit3_bn3 = batch_normalization(stage2_unit3_conv2, variance_epsilon=1.9999999494757503e-05, name='stage2_unit3_bn3') + plus5 = stage2_unit3_bn3 + plus4 + stage2_unit4_bn1 = batch_normalization(plus5, variance_epsilon=1.9999999494757503e-05, name='stage2_unit4_bn1') + stage2_unit4_conv1_pad = tf.pad(stage2_unit4_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage2_unit4_conv1 = convolution(stage2_unit4_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage2_unit4_conv1') + stage2_unit4_bn2 = batch_normalization(stage2_unit4_conv1, variance_epsilon=1.9999999494757503e-05, name='stage2_unit4_bn2') + stage2_unit4_relu1 = prelu(stage2_unit4_bn2, name='stage2_unit4_relu1') + stage2_unit4_conv2_pad = tf.pad(stage2_unit4_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage2_unit4_conv2 = convolution(stage2_unit4_conv2_pad, group=1, strides=[1, 1], padding='VALID', 
name='stage2_unit4_conv2') + stage2_unit4_bn3 = batch_normalization(stage2_unit4_conv2, variance_epsilon=1.9999999494757503e-05, name='stage2_unit4_bn3') + plus6 = stage2_unit4_bn3 + plus5 + stage3_unit1_bn1 = batch_normalization(plus6, variance_epsilon=1.9999999494757503e-05, name='stage3_unit1_bn1') + stage3_unit1_conv1sc = convolution(plus6, group=1, strides=[2, 2], padding='VALID', name='stage3_unit1_conv1sc') + stage3_unit1_conv1_pad = tf.pad(stage3_unit1_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit1_conv1 = convolution(stage3_unit1_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit1_conv1') + stage3_unit1_sc = batch_normalization(stage3_unit1_conv1sc, variance_epsilon=1.9999999494757503e-05, name='stage3_unit1_sc') + stage3_unit1_bn2 = batch_normalization(stage3_unit1_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit1_bn2') + stage3_unit1_relu1 = prelu(stage3_unit1_bn2, name='stage3_unit1_relu1') + stage3_unit1_conv2_pad = tf.pad(stage3_unit1_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit1_conv2 = convolution(stage3_unit1_conv2_pad, group=1, strides=[2, 2], padding='VALID', name='stage3_unit1_conv2') + stage3_unit1_bn3 = batch_normalization(stage3_unit1_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit1_bn3') + plus7 = stage3_unit1_bn3 + stage3_unit1_sc + stage3_unit2_bn1 = batch_normalization(plus7, variance_epsilon=1.9999999494757503e-05, name='stage3_unit2_bn1') + stage3_unit2_conv1_pad = tf.pad(stage3_unit2_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit2_conv1 = convolution(stage3_unit2_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit2_conv1') + stage3_unit2_bn2 = batch_normalization(stage3_unit2_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit2_bn2') + stage3_unit2_relu1 = prelu(stage3_unit2_bn2, name='stage3_unit2_relu1') + stage3_unit2_conv2_pad = tf.pad(stage3_unit2_relu1, paddings = [[0, 0], [1, 1], [1, 1], 
[0, 0]]) + stage3_unit2_conv2 = convolution(stage3_unit2_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit2_conv2') + stage3_unit2_bn3 = batch_normalization(stage3_unit2_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit2_bn3') + plus8 = stage3_unit2_bn3 + plus7 + stage3_unit3_bn1 = batch_normalization(plus8, variance_epsilon=1.9999999494757503e-05, name='stage3_unit3_bn1') + stage3_unit3_conv1_pad = tf.pad(stage3_unit3_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit3_conv1 = convolution(stage3_unit3_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit3_conv1') + stage3_unit3_bn2 = batch_normalization(stage3_unit3_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit3_bn2') + stage3_unit3_relu1 = prelu(stage3_unit3_bn2, name='stage3_unit3_relu1') + stage3_unit3_conv2_pad = tf.pad(stage3_unit3_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit3_conv2 = convolution(stage3_unit3_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit3_conv2') + stage3_unit3_bn3 = batch_normalization(stage3_unit3_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit3_bn3') + plus9 = stage3_unit3_bn3 + plus8 + stage3_unit4_bn1 = batch_normalization(plus9, variance_epsilon=1.9999999494757503e-05, name='stage3_unit4_bn1') + stage3_unit4_conv1_pad = tf.pad(stage3_unit4_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit4_conv1 = convolution(stage3_unit4_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit4_conv1') + stage3_unit4_bn2 = batch_normalization(stage3_unit4_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit4_bn2') + stage3_unit4_relu1 = prelu(stage3_unit4_bn2, name='stage3_unit4_relu1') + stage3_unit4_conv2_pad = tf.pad(stage3_unit4_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit4_conv2 = convolution(stage3_unit4_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit4_conv2') + 
stage3_unit4_bn3 = batch_normalization(stage3_unit4_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit4_bn3') + plus10 = stage3_unit4_bn3 + plus9 + stage3_unit5_bn1 = batch_normalization(plus10, variance_epsilon=1.9999999494757503e-05, name='stage3_unit5_bn1') + stage3_unit5_conv1_pad = tf.pad(stage3_unit5_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit5_conv1 = convolution(stage3_unit5_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit5_conv1') + stage3_unit5_bn2 = batch_normalization(stage3_unit5_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit5_bn2') + stage3_unit5_relu1 = prelu(stage3_unit5_bn2, name='stage3_unit5_relu1') + stage3_unit5_conv2_pad = tf.pad(stage3_unit5_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit5_conv2 = convolution(stage3_unit5_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit5_conv2') + stage3_unit5_bn3 = batch_normalization(stage3_unit5_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit5_bn3') + plus11 = stage3_unit5_bn3 + plus10 + stage3_unit6_bn1 = batch_normalization(plus11, variance_epsilon=1.9999999494757503e-05, name='stage3_unit6_bn1') + stage3_unit6_conv1_pad = tf.pad(stage3_unit6_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit6_conv1 = convolution(stage3_unit6_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit6_conv1') + stage3_unit6_bn2 = batch_normalization(stage3_unit6_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit6_bn2') + stage3_unit6_relu1 = prelu(stage3_unit6_bn2, name='stage3_unit6_relu1') + stage3_unit6_conv2_pad = tf.pad(stage3_unit6_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit6_conv2 = convolution(stage3_unit6_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit6_conv2') + stage3_unit6_bn3 = batch_normalization(stage3_unit6_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit6_bn3') + plus12 = 
stage3_unit6_bn3 + plus11 + stage3_unit7_bn1 = batch_normalization(plus12, variance_epsilon=1.9999999494757503e-05, name='stage3_unit7_bn1') + stage3_unit7_conv1_pad = tf.pad(stage3_unit7_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit7_conv1 = convolution(stage3_unit7_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit7_conv1') + stage3_unit7_bn2 = batch_normalization(stage3_unit7_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit7_bn2') + stage3_unit7_relu1 = prelu(stage3_unit7_bn2, name='stage3_unit7_relu1') + stage3_unit7_conv2_pad = tf.pad(stage3_unit7_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit7_conv2 = convolution(stage3_unit7_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit7_conv2') + stage3_unit7_bn3 = batch_normalization(stage3_unit7_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit7_bn3') + plus13 = stage3_unit7_bn3 + plus12 + stage3_unit8_bn1 = batch_normalization(plus13, variance_epsilon=1.9999999494757503e-05, name='stage3_unit8_bn1') + stage3_unit8_conv1_pad = tf.pad(stage3_unit8_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit8_conv1 = convolution(stage3_unit8_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit8_conv1') + stage3_unit8_bn2 = batch_normalization(stage3_unit8_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit8_bn2') + stage3_unit8_relu1 = prelu(stage3_unit8_bn2, name='stage3_unit8_relu1') + stage3_unit8_conv2_pad = tf.pad(stage3_unit8_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit8_conv2 = convolution(stage3_unit8_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit8_conv2') + stage3_unit8_bn3 = batch_normalization(stage3_unit8_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit8_bn3') + plus14 = stage3_unit8_bn3 + plus13 + stage3_unit9_bn1 = batch_normalization(plus14, variance_epsilon=1.9999999494757503e-05, 
name='stage3_unit9_bn1') + stage3_unit9_conv1_pad = tf.pad(stage3_unit9_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit9_conv1 = convolution(stage3_unit9_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit9_conv1') + stage3_unit9_bn2 = batch_normalization(stage3_unit9_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit9_bn2') + stage3_unit9_relu1 = prelu(stage3_unit9_bn2, name='stage3_unit9_relu1') + stage3_unit9_conv2_pad = tf.pad(stage3_unit9_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit9_conv2 = convolution(stage3_unit9_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit9_conv2') + stage3_unit9_bn3 = batch_normalization(stage3_unit9_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit9_bn3') + plus15 = stage3_unit9_bn3 + plus14 + stage3_unit10_bn1 = batch_normalization(plus15, variance_epsilon=1.9999999494757503e-05, name='stage3_unit10_bn1') + stage3_unit10_conv1_pad = tf.pad(stage3_unit10_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit10_conv1 = convolution(stage3_unit10_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit10_conv1') + stage3_unit10_bn2 = batch_normalization(stage3_unit10_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit10_bn2') + stage3_unit10_relu1 = prelu(stage3_unit10_bn2, name='stage3_unit10_relu1') + stage3_unit10_conv2_pad = tf.pad(stage3_unit10_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit10_conv2 = convolution(stage3_unit10_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit10_conv2') + stage3_unit10_bn3 = batch_normalization(stage3_unit10_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit10_bn3') + plus16 = stage3_unit10_bn3 + plus15 + stage3_unit11_bn1 = batch_normalization(plus16, variance_epsilon=1.9999999494757503e-05, name='stage3_unit11_bn1') + stage3_unit11_conv1_pad = tf.pad(stage3_unit11_bn1, paddings = [[0, 0], [1, 1], [1, 
1], [0, 0]]) + stage3_unit11_conv1 = convolution(stage3_unit11_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit11_conv1') + stage3_unit11_bn2 = batch_normalization(stage3_unit11_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit11_bn2') + stage3_unit11_relu1 = prelu(stage3_unit11_bn2, name='stage3_unit11_relu1') + stage3_unit11_conv2_pad = tf.pad(stage3_unit11_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit11_conv2 = convolution(stage3_unit11_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit11_conv2') + stage3_unit11_bn3 = batch_normalization(stage3_unit11_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit11_bn3') + plus17 = stage3_unit11_bn3 + plus16 + stage3_unit12_bn1 = batch_normalization(plus17, variance_epsilon=1.9999999494757503e-05, name='stage3_unit12_bn1') + stage3_unit12_conv1_pad = tf.pad(stage3_unit12_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit12_conv1 = convolution(stage3_unit12_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit12_conv1') + stage3_unit12_bn2 = batch_normalization(stage3_unit12_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit12_bn2') + stage3_unit12_relu1 = prelu(stage3_unit12_bn2, name='stage3_unit12_relu1') + stage3_unit12_conv2_pad = tf.pad(stage3_unit12_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit12_conv2 = convolution(stage3_unit12_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit12_conv2') + stage3_unit12_bn3 = batch_normalization(stage3_unit12_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit12_bn3') + plus18 = stage3_unit12_bn3 + plus17 + stage3_unit13_bn1 = batch_normalization(plus18, variance_epsilon=1.9999999494757503e-05, name='stage3_unit13_bn1') + stage3_unit13_conv1_pad = tf.pad(stage3_unit13_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit13_conv1 = convolution(stage3_unit13_conv1_pad, group=1, 
strides=[1, 1], padding='VALID', name='stage3_unit13_conv1') + stage3_unit13_bn2 = batch_normalization(stage3_unit13_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit13_bn2') + stage3_unit13_relu1 = prelu(stage3_unit13_bn2, name='stage3_unit13_relu1') + stage3_unit13_conv2_pad = tf.pad(stage3_unit13_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit13_conv2 = convolution(stage3_unit13_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit13_conv2') + stage3_unit13_bn3 = batch_normalization(stage3_unit13_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit13_bn3') + plus19 = stage3_unit13_bn3 + plus18 + stage3_unit14_bn1 = batch_normalization(plus19, variance_epsilon=1.9999999494757503e-05, name='stage3_unit14_bn1') + stage3_unit14_conv1_pad = tf.pad(stage3_unit14_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit14_conv1 = convolution(stage3_unit14_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit14_conv1') + stage3_unit14_bn2 = batch_normalization(stage3_unit14_conv1, variance_epsilon=1.9999999494757503e-05, name='stage3_unit14_bn2') + stage3_unit14_relu1 = prelu(stage3_unit14_bn2, name='stage3_unit14_relu1') + stage3_unit14_conv2_pad = tf.pad(stage3_unit14_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage3_unit14_conv2 = convolution(stage3_unit14_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage3_unit14_conv2') + stage3_unit14_bn3 = batch_normalization(stage3_unit14_conv2, variance_epsilon=1.9999999494757503e-05, name='stage3_unit14_bn3') + plus20 = stage3_unit14_bn3 + plus19 + stage4_unit1_bn1 = batch_normalization(plus20, variance_epsilon=1.9999999494757503e-05, name='stage4_unit1_bn1') + stage4_unit1_conv1sc = convolution(plus20, group=1, strides=[2, 2], padding='VALID', name='stage4_unit1_conv1sc') + stage4_unit1_conv1_pad = tf.pad(stage4_unit1_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage4_unit1_conv1 = 
convolution(stage4_unit1_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage4_unit1_conv1') + stage4_unit1_sc = batch_normalization(stage4_unit1_conv1sc, variance_epsilon=1.9999999494757503e-05, name='stage4_unit1_sc') + stage4_unit1_bn2 = batch_normalization(stage4_unit1_conv1, variance_epsilon=1.9999999494757503e-05, name='stage4_unit1_bn2') + stage4_unit1_relu1 = prelu(stage4_unit1_bn2, name='stage4_unit1_relu1') + stage4_unit1_conv2_pad = tf.pad(stage4_unit1_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage4_unit1_conv2 = convolution(stage4_unit1_conv2_pad, group=1, strides=[2, 2], padding='VALID', name='stage4_unit1_conv2') + stage4_unit1_bn3 = batch_normalization(stage4_unit1_conv2, variance_epsilon=1.9999999494757503e-05, name='stage4_unit1_bn3') + plus21 = stage4_unit1_bn3 + stage4_unit1_sc + stage4_unit2_bn1 = batch_normalization(plus21, variance_epsilon=1.9999999494757503e-05, name='stage4_unit2_bn1') + stage4_unit2_conv1_pad = tf.pad(stage4_unit2_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage4_unit2_conv1 = convolution(stage4_unit2_conv1_pad, group=1, strides=[1, 1], padding='VALID', name='stage4_unit2_conv1') + stage4_unit2_bn2 = batch_normalization(stage4_unit2_conv1, variance_epsilon=1.9999999494757503e-05, name='stage4_unit2_bn2') + stage4_unit2_relu1 = prelu(stage4_unit2_bn2, name='stage4_unit2_relu1') + stage4_unit2_conv2_pad = tf.pad(stage4_unit2_relu1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage4_unit2_conv2 = convolution(stage4_unit2_conv2_pad, group=1, strides=[1, 1], padding='VALID', name='stage4_unit2_conv2') + stage4_unit2_bn3 = batch_normalization(stage4_unit2_conv2, variance_epsilon=1.9999999494757503e-05, name='stage4_unit2_bn3') + plus22 = stage4_unit2_bn3 + plus21 + stage4_unit3_bn1 = batch_normalization(plus22, variance_epsilon=1.9999999494757503e-05, name='stage4_unit3_bn1') + stage4_unit3_conv1_pad = tf.pad(stage4_unit3_bn1, paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]) + stage4_unit3_conv1 = 
def convolution(input, name, group, **kwargs):
    """Build a (possibly grouped) convolution layer from serialized weights.

    Weights are looked up in the module-level ``__weights_dict`` by layer
    ``name``; the module-level ``is_train`` flag controls whether the
    created variables are trainable.  Extra keyword arguments (strides,
    padding, ...) are forwarded to ``tf.nn.convolution``.
    """
    kernel = tf.Variable(__weights_dict[name]['weights'],
                         trainable=is_train, name=name + "_weight")
    if group == 1:
        out = tf.nn.convolution(input, kernel, **kwargs)
    else:
        # Grouped convolution: split kernel and input along the channel
        # axis, convolve each slice independently, then concatenate.
        kernel_slices = tf.split(kernel, num_or_size_splits=group, axis=-1)
        input_slices = tf.split(input, num_or_size_splits=group, axis=-1)
        out = tf.concat(
            [tf.nn.convolution(x, k, **kwargs)
             for x, k in zip(input_slices, kernel_slices)],
            axis=-1)

    # Bias is optional in the serialized weight dict.
    if 'bias' in __weights_dict[name]:
        bias = tf.Variable(__weights_dict[name]['bias'],
                           trainable=is_train, name=name + "_bias")
        out = out + bias
    return out


def prelu(input, name):
    """Parametric ReLU whose slope is loaded from ``__weights_dict``."""
    gamma = tf.Variable(__weights_dict[name]['gamma'],
                        name=name + "_gamma", trainable=is_train)
    # max(0, x) keeps the positive part; gamma scales the negative part.
    return tf.maximum(0.0, input) + gamma * tf.minimum(0.0, input)


def batch_normalization(input, name, **kwargs):
    """Inference-time batch normalization using serialized statistics.

    ``bias`` (offset) and ``scale`` are optional in the weight dict; when
    absent, ``tf.nn.batch_normalization`` receives None for them.
    """
    mean = tf.Variable(__weights_dict[name]['mean'],
                       name=name + "_mean", trainable=is_train)
    variance = tf.Variable(__weights_dict[name]['var'],
                           name=name + "_var", trainable=is_train)
    offset = (tf.Variable(__weights_dict[name]['bias'],
                          name=name + "_bias", trainable=is_train)
              if 'bias' in __weights_dict[name] else None)
    scale = (tf.Variable(__weights_dict[name]['scale'],
                         name=name + "_scale", trainable=is_train)
             if 'scale' in __weights_dict[name] else None)
    return tf.nn.batch_normalization(input, mean, variance, offset, scale,
                                     name=name, **kwargs)
+# To view a copy of this license, see LICENSE + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +import cv2 +import os +import inspect + + +class Face_Detector(object): + def __init__(self, gpuid = -1): + self.minsize = 40 # minimum size of face + self.threshold = [0.6, 0.7, 0.7] # three steps's threshold + self.factor = 0.709 # scale factor + + current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + + with tf.device('/'+('cpu' if gpuid<0 else 'gpu')+':'+('0' if gpuid<0 else str(gpuid))): + with tf.Graph().as_default(): + sess = tf.Session(config= tf.ConfigProto(device_count = {'GPU': 0 if gpuid<0 else 1})) + with sess.as_default(): + self.pnet, self.rnet, self.onet = create_detector(sess, '{}/mtcnn'.format(current_dir)) + print('MTCNN loaded') + + def face_detection(self,img): + + bounding_boxes, points = detect_face(img, self.minsize, + self.pnet, self.rnet, self.onet, self.threshold, + self.factor) + return bounding_boxes, points.reshape([2,-1]).T + + +def layer(op): + '''Decorator for composable network layers.''' + + def layer_decorated(self, *args, **kwargs): + # Automatically set a name if not provided. + name = kwargs.setdefault('name', self.get_unique_name(op.__name__)) + # Figure out the layer inputs. + if len(self.terminals) == 0: + raise RuntimeError('No input variables found for layer %s.' % name) + elif len(self.terminals) == 1: + layer_input = self.terminals[0] + else: + layer_input = list(self.terminals) + # Perform the operation and get the output. + layer_output = op(self, layer_input, *args, **kwargs) + # Add to layer LUT. + self.layers[name] = layer_output + # This output is now the input for the next layer. + self.feed(layer_output) + # Return self for chained calls. 
class Network(object):
    """Minimal graph-builder base class for the MTCNN sub-networks.

    Subclasses describe their topology in setup() with the @layer-decorated
    ops; weights are loaded afterwards from .npy dumps via load().
    """

    def __init__(self, inputs, trainable=True):
        # The input nodes for this network.
        self.inputs = inputs
        # The current list of terminal nodes.
        self.terminals = []
        # Mapping from layer names to layers.
        self.layers = dict(inputs)
        # If true, the resulting variables are set as trainable.
        self.trainable = trainable

        self.setup()

    def setup(self):
        '''Construct the network. '''
        raise NotImplementedError('Must be implemented by the subclass.')

    def load(self, data_path, session, ignore_missing=False):
        '''Load network weights.
        data_path: The path to the numpy-serialized network weights
        session: The current TensorFlow session
        ignore_missing: If true, serialized weights for missing layers are ignored.
        '''
        # allow_pickle=True is required on NumPy >= 1.16.3: these weight
        # files are pickled Python dicts, which np.load rejects by default.
        data_dict = np.load(data_path, encoding='latin1',
                            allow_pickle=True).item()  # pylint: disable=no-member
        for op_name in data_dict:
            with tf.variable_scope(op_name, reuse=True):
                for param_name, data in data_dict[op_name].items():
                    try:
                        var = tf.get_variable(param_name)
                        session.run(var.assign(data))
                    except ValueError:
                        # Variable not in the graph; optionally tolerated.
                        if not ignore_missing:
                            raise

    def feed(self, *args):
        '''Set the input(s) for the next operation by replacing the terminal nodes.
        The arguments can be either layer names or the actual layers.
        '''
        assert len(args) != 0
        self.terminals = []
        for fed_layer in args:
            if isinstance(fed_layer, str):
                try:
                    fed_layer = self.layers[fed_layer]
                except KeyError:
                    raise KeyError('Unknown layer name fed: %s' % fed_layer)
            self.terminals.append(fed_layer)
        return self

    def get_output(self):
        '''Returns the current network output.'''
        return self.terminals[-1]

    def get_unique_name(self, prefix):
        '''Returns an index-suffixed unique name for the given prefix.
        This is used for auto-generating layer names based on the type-prefix.
        '''
        ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1
        return '%s_%d' % (prefix, ident)

    def make_var(self, name, shape):
        '''Creates a new TensorFlow variable.'''
        return tf.get_variable(name, shape, trainable=self.trainable)

    def validate_padding(self, padding):
        '''Verifies that the padding is one of the supported ones.'''
        assert padding in ('SAME', 'VALID')

    @layer
    def conv(self,
             inp,
             k_h,
             k_w,
             c_o,
             s_h,
             s_w,
             name,
             relu=True,
             padding='SAME',
             group=1,
             biased=True):
        '''2-D convolution layer (optionally grouped, biased, ReLU-activated).'''
        # Verify that the padding is acceptable.
        self.validate_padding(padding)
        # Get the number of channels in the input.
        c_i = inp.get_shape()[-1]
        # Verify that the grouping parameter is valid.
        assert c_i % group == 0
        assert c_o % group == 0
        # Convolution for a given input and kernel.
        convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)
        with tf.variable_scope(name) as scope:
            kernel = self.make_var('weights', shape=[k_h, k_w, c_i // group, c_o])
            # This is the common-case: convolve without further complications.
            output = convolve(inp, kernel)
            # Add the biases.
            if biased:
                biases = self.make_var('biases', [c_o])
                output = tf.nn.bias_add(output, biases)
            if relu:
                # ReLU non-linearity.
                output = tf.nn.relu(output, name=scope.name)
            return output

    @layer
    def prelu(self, inp, name):
        '''Parametric ReLU with a learned per-channel slope.'''
        with tf.variable_scope(name):
            i = inp.get_shape().as_list()
            alpha = self.make_var('alpha', shape=(i[-1]))
            output = tf.nn.relu(inp) + tf.multiply(alpha, -tf.nn.relu(-inp))
        return output

    @layer
    def max_pool(self, inp, k_h, k_w, s_h, s_w, name, padding='SAME'):
        '''Max-pooling layer.'''
        self.validate_padding(padding)
        return tf.nn.max_pool(inp,
                              ksize=[1, k_h, k_w, 1],
                              strides=[1, s_h, s_w, 1],
                              padding=padding,
                              name=name)

    @layer
    def fc(self, inp, num_out, name, relu=True):
        '''Fully-connected layer; flattens spatial input first.'''
        with tf.variable_scope(name):
            input_shape = inp.get_shape()
            if input_shape.ndims == 4:
                # The input is spatial. Vectorize it first.
                dim = 1
                for d in input_shape[1:].as_list():
                    dim *= d
                feed_in = tf.reshape(inp, [-1, dim])
            else:
                feed_in, dim = (inp, input_shape[-1].value)
            weights = self.make_var('weights', shape=[dim, num_out])
            biases = self.make_var('biases', [num_out])
            op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b
            fc = op(feed_in, weights, biases, name=name)
            return fc

    @layer
    def softmax(self, target, axis, name=None):
        '''Multi-dimensional softmax along `axis`.
        Refer to https://github.com/tensorflow/tensorflow/issues/210:
        the native softmax only supports batch_size x dimension.
        '''
        max_axis = tf.reduce_max(target, axis, keepdims=True)
        target_exp = tf.exp(target - max_axis)
        normalize = tf.reduce_sum(target_exp, axis, keepdims=True)
        softmax = tf.div(target_exp, normalize, name)
        return softmax


class PNet(Network):
    '''Stage 1: fully-convolutional proposal network.'''

    def setup(self):
        (self.feed('data')  # pylint: disable=no-value-for-parameter, no-member
         .conv(3, 3, 10, 1, 1, padding='VALID', relu=False, name='conv1')
         .prelu(name='PReLU1')
         .max_pool(2, 2, 2, 2, name='pool1')
         .conv(3, 3, 16, 1, 1, padding='VALID', relu=False, name='conv2')
         .prelu(name='PReLU2')
         .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv3')
         .prelu(name='PReLU3')
         .conv(1, 1, 2, 1, 1, relu=False, name='conv4-1')
         .softmax(3, name='prob1'))

        (self.feed('PReLU3')  # pylint: disable=no-value-for-parameter
         .conv(1, 1, 4, 1, 1, relu=False, name='conv4-2'))


class RNet(Network):
    '''Stage 2: 24x24 refinement network.'''

    def setup(self):
        (self.feed('data')  # pylint: disable=no-value-for-parameter, no-member
         .conv(3, 3, 28, 1, 1, padding='VALID', relu=False, name='conv1')
         .prelu(name='prelu1')
         .max_pool(3, 3, 2, 2, name='pool1')
         .conv(3, 3, 48, 1, 1, padding='VALID', relu=False, name='conv2')
         .prelu(name='prelu2')
         .max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
         .conv(2, 2, 64, 1, 1, padding='VALID', relu=False, name='conv3')
         .prelu(name='prelu3')
         .fc(128, relu=False, name='conv4')
         .prelu(name='prelu4')
         .fc(2, relu=False, name='conv5-1')
         .softmax(1, name='prob1'))

        (self.feed('prelu4')  # pylint: disable=no-value-for-parameter
         .fc(4, relu=False, name='conv5-2'))


class ONet(Network):
    '''Stage 3: 48x48 output network (scores, boxes, landmarks).'''

    def setup(self):
        (self.feed('data')  # pylint: disable=no-value-for-parameter, no-member
         .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1')
         .prelu(name='prelu1')
         .max_pool(3, 3, 2, 2, name='pool1')
         .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2')
         .prelu(name='prelu2')
         .max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
         .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3')
         .prelu(name='prelu3')
         .max_pool(2, 2, 2, 2, name='pool3')
         .conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4')
         .prelu(name='prelu4')
         .fc(256, relu=False, name='conv5')
         .prelu(name='prelu5')
         .fc(2, relu=False, name='conv6-1')
         .softmax(1, name='prob1'))

        (self.feed('prelu5')  # pylint: disable=no-value-for-parameter
         .fc(4, relu=False, name='conv6-2'))

        (self.feed('prelu5')  # pylint: disable=no-value-for-parameter
         .fc(10, relu=False, name='conv6-3'))


def create_detector(sess, model_path):
    '''Build the three MTCNN stages in `sess`, load their weights from
    model_path/cas{1,2,3}.npy, and return (pnet, rnet, onet) feed functions.'''
    with tf.variable_scope('pnet'):
        data = tf.placeholder(tf.float32, (None, None, None, 3), 'input')
        pnet = PNet({'data': data})
        pnet.load(os.path.join(model_path, 'cas1.npy'), sess)
    with tf.variable_scope('rnet'):
        data = tf.placeholder(tf.float32, (None, 24, 24, 3), 'input')
        rnet = RNet({'data': data})
        rnet.load(os.path.join(model_path, 'cas2.npy'), sess)
    with tf.variable_scope('onet'):
        data = tf.placeholder(tf.float32, (None, 48, 48, 3), 'input')
        onet = ONet({'data': data})
        onet.load(os.path.join(model_path, 'cas3.npy'), sess)

    pnet_fun = lambda img: sess.run(('pnet/conv4-2/BiasAdd:0', 'pnet/prob1:0'),
                                    feed_dict={'pnet/input:0': img})
    rnet_fun = lambda img: sess.run(('rnet/conv5-2/conv5-2:0', 'rnet/prob1:0'),
                                    feed_dict={'rnet/input:0': img})
    onet_fun = lambda img: sess.run(('onet/conv6-2/conv6-2:0', 'onet/conv6-3/conv6-3:0', 'onet/prob1:0'),
                                    feed_dict={'onet/input:0': img})
    return pnet_fun, rnet_fun, onet_fun
def detect_face(img, minsize, pnet, rnet, onet, threshold, factor):
    """Run the full three-stage MTCNN cascade on one image.

    img: H x W x 3 image array
    minsize: minimum face size in pixels
    pnet, rnet, onet: stage feed-functions produced by create_detector()
    threshold: [th1, th2, th3], per-stage score thresholds
    factor: image-pyramid scale factor
    Returns (total_boxes, points): rows of [x1 y1 x2 y2 score] and a
    10 x N landmark array (5 x-coordinates then 5 y-coordinates per face).
    """
    factor_count = 0
    total_boxes = np.empty((0, 9))
    points = []
    h = img.shape[0]
    w = img.shape[1]
    minl = np.amin([h, w])
    m = 12.0 / minsize
    minl = minl * m
    # Create the scale pyramid: shrink until the smaller side reaches the
    # 12-pixel PNet receptive field.
    scales = []
    while minl >= 12:
        scales += [m * np.power(factor, factor_count)]
        minl = minl * factor
        factor_count += 1

    # First stage: run PNet over every pyramid level.
    for scale in scales:
        hs = int(np.ceil(h * scale))
        ws = int(np.ceil(w * scale))
        im_data = imresample(img, (hs, ws))
        im_data = (im_data - 127.5) * 0.0078125
        img_x = np.expand_dims(im_data, 0)
        img_y = np.transpose(img_x, (0, 2, 1, 3))
        out = pnet(img_y)
        out0 = np.transpose(out[0], (0, 2, 1, 3))
        out1 = np.transpose(out[1], (0, 2, 1, 3))

        boxes, _ = generateBoundingBox(out1[0, :, :, 1].copy(),
                                       out0[0, :, :, :].copy(),
                                       scale, threshold[0])

        # Intra-scale NMS.
        pick = nms(boxes.copy(), 0.5, 'Union')
        if boxes.size > 0 and pick.size > 0:
            boxes = boxes[pick, :]
            total_boxes = np.append(total_boxes, boxes, axis=0)

    numbox = total_boxes.shape[0]
    if numbox > 0:
        # Cross-scale NMS, then apply PNet's regression offsets and square up.
        pick = nms(total_boxes.copy(), 0.7, 'Union')
        total_boxes = total_boxes[pick, :]
        regw = total_boxes[:, 2] - total_boxes[:, 0]
        regh = total_boxes[:, 3] - total_boxes[:, 1]
        qq1 = total_boxes[:, 0] + total_boxes[:, 5] * regw
        qq2 = total_boxes[:, 1] + total_boxes[:, 6] * regh
        qq3 = total_boxes[:, 2] + total_boxes[:, 7] * regw
        qq4 = total_boxes[:, 3] + total_boxes[:, 8] * regh
        total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:, 4]]))
        total_boxes = rerec(total_boxes.copy())
        total_boxes[:, 0:4] = np.fix(total_boxes[:, 0:4]).astype(np.int32)
        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)

    numbox = total_boxes.shape[0]
    if numbox > 0:
        # Second stage: re-score candidate crops with RNet at 24x24.
        tempimg = np.zeros((24, 24, 3, numbox))
        for k in range(0, numbox):
            tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
            tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = img[y[k] - 1:ey[k], x[k] - 1:ex[k], :]
            if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
                tempimg[:, :, :, k] = imresample(tmp, (24, 24))
            else:
                # Degenerate crop: abort with an empty, well-formed result.
                # (The original `return np.empty()` raised TypeError.)
                return np.empty((0, 5)), np.empty((10, 0))
        tempimg = (tempimg - 127.5) * 0.0078125
        tempimg1 = np.transpose(tempimg, (3, 1, 0, 2))
        out = rnet(tempimg1)
        out0 = np.transpose(out[0])
        out1 = np.transpose(out[1])
        score = out1[1, :]
        ipass = np.where(score > threshold[1])
        total_boxes = np.hstack([total_boxes[ipass[0], 0:4].copy(),
                                 np.expand_dims(score[ipass].copy(), 1)])
        mv = out0[:, ipass[0]]
        if total_boxes.shape[0] > 0:
            pick = nms(total_boxes, 0.7, 'Union')
            total_boxes = total_boxes[pick, :]
            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:, pick]))
            total_boxes = rerec(total_boxes.copy())

    numbox = total_boxes.shape[0]
    if numbox > 0:
        # Third stage: final scores, box regression, and landmarks from ONet.
        total_boxes = np.fix(total_boxes).astype(np.int32)
        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)
        tempimg = np.zeros((48, 48, 3, numbox))
        for k in range(0, numbox):
            tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
            tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = img[y[k] - 1:ey[k], x[k] - 1:ex[k], :]
            if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
                tempimg[:, :, :, k] = imresample(tmp, (48, 48))
            else:
                # Same fix as in the second stage.
                return np.empty((0, 5)), np.empty((10, 0))
        tempimg = (tempimg - 127.5) * 0.0078125
        tempimg1 = np.transpose(tempimg, (3, 1, 0, 2))
        out = onet(tempimg1)
        out0 = np.transpose(out[0])
        out1 = np.transpose(out[1])
        out2 = np.transpose(out[2])
        score = out2[1, :]
        points = out1
        ipass = np.where(score > threshold[2])
        points = points[:, ipass[0]]
        total_boxes = np.hstack([total_boxes[ipass[0], 0:4].copy(),
                                 np.expand_dims(score[ipass].copy(), 1)])
        mv = out0[:, ipass[0]]

        w = total_boxes[:, 2] - total_boxes[:, 0] + 1
        h = total_boxes[:, 3] - total_boxes[:, 1] + 1
        # Map normalized landmark coordinates back into image space.
        points[0:5, :] = np.tile(w, (5, 1)) * points[0:5, :] + np.tile(total_boxes[:, 0], (5, 1)) - 1
        points[5:10, :] = np.tile(h, (5, 1)) * points[5:10, :] + np.tile(total_boxes[:, 1], (5, 1)) - 1
        if total_boxes.shape[0] > 0:
            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv))
            pick = nms(total_boxes.copy(), 0.7, 'Min')
            total_boxes = total_boxes[pick, :]
            points = points[:, pick]

    return total_boxes, points


def box_regression(img, onet, total_boxes, threshold):
    """Refine externally supplied boxes with ONet only (no PNet/RNet stages).

    img: input image
    onet: ONet feed-function from create_detector()
    total_boxes: rows of [x1 y1 x2 y2 score]
    threshold: ONet score threshold (typically 0.7)
    Returns (total_boxes, points) like detect_face().
    """
    points = []
    h = img.shape[0]
    w = img.shape[1]

    numbox = total_boxes.shape[0]
    if numbox > 0:
        total_boxes = rerec(total_boxes)
        total_boxes = np.fix(total_boxes).astype(np.int32)
        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)
        tempimg = np.zeros((48, 48, 3, numbox))
        for k in range(0, numbox):
            tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
            tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = img[y[k] - 1:ey[k], x[k] - 1:ex[k], :]
            if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
                tempimg[:, :, :, k] = imresample(tmp, (48, 48))
            else:
                # Same empty-result fix as detect_face().
                return np.empty((0, 5)), np.empty((10, 0))
        tempimg = (tempimg - 127.5) * 0.0078125
        tempimg1 = np.transpose(tempimg, (3, 1, 0, 2))
        out = onet(tempimg1)
        out0 = np.transpose(out[0])
        out1 = np.transpose(out[1])
        out2 = np.transpose(out[2])
        score = out2[1, :]
        points = out1
        ipass = np.where(score > threshold)
        points = points[:, ipass[0]]
        total_boxes = np.hstack([total_boxes[ipass[0], 0:4].copy(),
                                 np.expand_dims(score[ipass].copy(), 1)])
        mv = out0[:, ipass[0]]

        w = total_boxes[:, 2] - total_boxes[:, 0] + 1
        h = total_boxes[:, 3] - total_boxes[:, 1] + 1
        points[0:5, :] = np.tile(w, (5, 1)) * points[0:5, :] + np.tile(total_boxes[:, 0], (5, 1)) - 1
        points[5:10, :] = np.tile(h, (5, 1)) * points[5:10, :] + np.tile(total_boxes[:, 1], (5, 1)) - 1
        if total_boxes.shape[0] > 0:
            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv))
            pick = nms(total_boxes.copy(), 0.7, 'Min')
            total_boxes = total_boxes[pick, :]
            points = points[:, pick]

    return total_boxes, points
def bbreg(boundingbox, reg):
    """Calibrate bounding boxes with regression offsets.

    reg holds per-box offsets expressed as fractions of the box size;
    modifies and returns `boundingbox`.
    """
    # A (1, 1, h, w)-shaped regression output collapses to (h, w).
    if reg.shape[1] == 1:
        reg = np.reshape(reg, (reg.shape[2], reg.shape[3]))

    w = boundingbox[:, 2] - boundingbox[:, 0] + 1
    h = boundingbox[:, 3] - boundingbox[:, 1] + 1
    b1 = boundingbox[:, 0] + reg[:, 0] * w
    b2 = boundingbox[:, 1] + reg[:, 1] * h
    b3 = boundingbox[:, 2] + reg[:, 2] * w
    b4 = boundingbox[:, 3] + reg[:, 3] * h
    boundingbox[:, 0:4] = np.transpose(np.vstack([b1, b2, b3, b4]))
    return boundingbox


def generateBoundingBox(imap, reg, scale, t):
    """Turn PNet's score heatmap into candidate boxes.

    imap: face-probability heatmap; reg: 4-channel regression map;
    scale: pyramid scale the maps were computed at; t: score threshold.
    Returns (boxes, reg) with boxes as [q1y q1x q2y q2x score reg...] rows.
    """
    stride = 2
    cellsize = 12

    imap = np.transpose(imap)
    dx1 = np.transpose(reg[:, :, 0])
    dy1 = np.transpose(reg[:, :, 1])
    dx2 = np.transpose(reg[:, :, 2])
    dy2 = np.transpose(reg[:, :, 3])
    y, x = np.where(imap >= t)
    if y.shape[0] == 1:
        dx1 = np.flipud(dx1)
        dy1 = np.flipud(dy1)
        dx2 = np.flipud(dx2)
        dy2 = np.flipud(dy2)
    score = imap[(y, x)]
    reg = np.transpose(np.vstack([dx1[(y, x)], dy1[(y, x)], dx2[(y, x)], dy2[(y, x)]]))
    if reg.size == 0:
        reg = np.empty((0, 3))
    bb = np.transpose(np.vstack([y, x]))
    # Map heatmap cells back to image coordinates at this scale.
    q1 = np.fix((stride * bb + 1) / scale)
    q2 = np.fix((stride * bb + cellsize - 1 + 1) / scale)
    boundingbox = np.hstack([q1, q2, np.expand_dims(score, 1), reg])
    return boundingbox, reg


def nms(boxes, threshold, method):
    """Greedy non-maximum suppression.

    boxes: rows of [x1 y1 x2 y2 score]; method: 'Min' divides the
    intersection by the smaller area, anything else uses IoU ('Union').
    Returns indices of the kept boxes, best-scoring first.
    """
    if boxes.size == 0:
        return np.empty((0, 3))
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    s = boxes[:, 4]
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    I = np.argsort(s)
    pick = np.zeros_like(s, dtype=np.int16)
    counter = 0
    while I.size > 0:
        i = I[-1]
        pick[counter] = i
        counter += 1
        idx = I[0:-1]
        xx1 = np.maximum(x1[i], x1[idx])
        yy1 = np.maximum(y1[i], y1[idx])
        xx2 = np.minimum(x2[i], x2[idx])
        yy2 = np.minimum(y2[i], y2[idx])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        # Fixed: compare strings with '==', not 'is' — identity on string
        # literals relies on CPython interning and raises a SyntaxWarning
        # on modern interpreters.
        if method == 'Min':
            o = inter / np.minimum(area[i], area[idx])
        else:
            o = inter / (area[i] + area[idx] - inter)
        I = I[np.where(o <= threshold)]
    pick = pick[0:counter]
    return pick


def pad(total_boxes, w, h):
    """Compute padded crop coordinates so every box becomes a valid crop.

    Returns per-box destination (dy:edy, dx:edx) and source (y:ey, x:ex)
    ranges (1-based, MATLAB-style) plus the crop sizes (tmpw, tmph).
    """
    tmpw = (total_boxes[:, 2] - total_boxes[:, 0] + 1).astype(np.int32)
    tmph = (total_boxes[:, 3] - total_boxes[:, 1] + 1).astype(np.int32)
    numbox = total_boxes.shape[0]

    dx = np.ones((numbox), dtype=np.int32)
    dy = np.ones((numbox), dtype=np.int32)
    edx = tmpw.copy().astype(np.int32)
    edy = tmph.copy().astype(np.int32)

    x = total_boxes[:, 0].copy().astype(np.int32)
    y = total_boxes[:, 1].copy().astype(np.int32)
    ex = total_boxes[:, 2].copy().astype(np.int32)
    ey = total_boxes[:, 3].copy().astype(np.int32)

    # Clip boxes that extend past the right/bottom image border.
    tmp = np.where(ex > w)
    edx[tmp] = np.expand_dims(-ex[tmp] + w + tmpw[tmp], 0)
    ex[tmp] = w

    tmp = np.where(ey > h)
    edy[tmp] = np.expand_dims(-ey[tmp] + h + tmph[tmp], 0)
    ey[tmp] = h

    # Clip boxes that extend past the left/top image border.
    tmp = np.where(x < 1)
    dx[tmp] = np.expand_dims(2 - x[tmp], 0)
    x[tmp] = 1

    tmp = np.where(y < 1)
    dy[tmp] = np.expand_dims(2 - y[tmp], 0)
    y[tmp] = 1

    return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph


def rerec(bboxA):
    """Convert boxes to squares centered on the original boxes (in place)."""
    h = bboxA[:, 3] - bboxA[:, 1]
    w = bboxA[:, 2] - bboxA[:, 0]
    l = np.maximum(w, h)
    bboxA[:, 0] = bboxA[:, 0] + w * 0.5 - l * 0.5
    bboxA[:, 1] = bboxA[:, 1] + h * 0.5 - l * 0.5
    bboxA[:, 2:4] = bboxA[:, 0:2] + np.transpose(np.tile(l, (2, 1)))
    return bboxA


def imresample(img, sz):
    """Resize `img` to (height, width) = sz with OpenCV area interpolation."""
    im_data = cv2.resize(img, (sz[1], sz[0]), interpolation=cv2.INTER_AREA)
    return im_data
import torch
from PIL import Image
from torchvision.transforms import transforms

from FaceHairMask import deeplab_xception_transfer
from FaceHairMask.graphonomy_inference import inference

import numpy as np
import cv2


def preprocess(image, size=256, normalize=1):
    """Resize a PIL image (size x size, or 1024 x 1024 when size is None),
    convert it to a tensor, and optionally normalize it to [-1, 1]."""
    target = (1024, 1024) if size is None else (size, size)
    image = transforms.Resize(target)(image)
    image = transforms.ToTensor()(image)
    if normalize is not None:
        image = transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])(image)
    return image


def postProcess(faceMask, hairMask):
    """Move both masks to CPU and convert CHW tensors to HWC numpy arrays."""
    hairMask = hairMask.cpu().permute(1, 2, 0).detach().numpy()
    faceMask = faceMask.cpu().permute(1, 2, 0).detach().numpy()
    return faceMask, hairMask


class MaskExtractor:
    """Extracts face and hair masks with the Graphonomy parsing network."""

    def __init__(self):
        # Hair/face segmentation network (Graphonomy inference checkpoint).
        self.net = deeplab_xception_transfer.deeplab_xception_transfer_projection_savemem(
            n_classes=20, hidden_layers=128, source_classes=7)
        stateDict = torch.load("models/Graphonomy/inference.pth")
        self.net.load_source_model(stateDict)
        self.net.to("cuda")
        self.net.eval()

    def processInput4(self, image):
        """Preprocess a PIL image and move it to the GPU as a 1-image batch."""
        return preprocess(image, size=256, normalize=1).unsqueeze(0).to("cuda")

    def getMask(self, image):
        """Run inference; returns (hairMask, faceMask) as HWC numpy arrays."""
        batch = self.processInput4(image)
        _, hairMask, faceMask = inference(net=self.net, img=batch, device="cuda")
        faceMask, hairMask = postProcess(faceMask, hairMask)
        return hairMask, faceMask

    def main(self, image):
        """Full pipeline from a project image object (exposing
        pixels_with_channels_at_back(); presumably a menpo image — TODO
        confirm) to cleaned hair/face masks at the original resolution."""
        image = (image.pixels_with_channels_at_back()[:, :, ::-1] * 255).astype('uint8')
        pil = Image.fromarray(image)
        hairMask, faceMask = self.getMask(pil)
        # Resize the 256x256 masks back to the source resolution.
        full_size = (pil.size[1], pil.size[0])
        hairMask = transforms.Resize(full_size)(
            Image.fromarray((hairMask[:, :, 0] * 255).astype('uint8')))
        faceMask = transforms.Resize(full_size)(
            Image.fromarray((faceMask[:, :, 0] * 255).astype('uint8')))

        # Morphology: shrink the face mask, grow the hair mask, then carve
        # the (grown) hair region out of the face mask.
        hairMask = np.array(hairMask) / 255
        faceMask = np.array(faceMask) / 255
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (10, 10))
        faceMask = cv2.erode(faceMask, kernel, iterations=1)
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (35, 35))
        hairMask = cv2.dilate(hairMask, kernel, iterations=1)
        faceMask = faceMask * (1 - hairMask)

        return hairMask, faceMask
# ---- FaceHairMask/ParsingModel.py (BiSeNet building blocks) ----
import torch
import torch.nn as nn
import torch.nn.functional as F

from ParsingResnet import Resnet18


class ConvBNReLU(nn.Module):
    """Conv2d (no bias) -> BatchNorm2d -> ReLU."""

    def __init__(self, in_chan, out_chan, ks=3, stride=1, padding=1, *args, **kwargs):
        super(ConvBNReLU, self).__init__()
        self.conv = nn.Conv2d(in_chan, out_chan, kernel_size=ks,
                              stride=stride, padding=padding, bias=False)
        self.bn = nn.BatchNorm2d(out_chan)
        self.init_weight()

    def forward(self, x):
        return F.relu(self.bn(self.conv(x)))

    def init_weight(self):
        # Kaiming init on conv weights; zero any conv bias.
        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, a=1)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)


class BiSeNetOutput(nn.Module):
    """Segmentation head: ConvBNReLU then a 1x1 conv producing class logits."""

    def __init__(self, in_chan, mid_chan, n_classes, *args, **kwargs):
        super(BiSeNetOutput, self).__init__()
        self.conv = ConvBNReLU(in_chan, mid_chan, ks=3, stride=1, padding=1)
        self.conv_out = nn.Conv2d(mid_chan, n_classes, kernel_size=1, bias=False)
        self.init_weight()

    def forward(self, x):
        return self.conv_out(self.conv(x))

    def init_weight(self):
        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, a=1)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def get_params(self):
        """Split params into weight-decay (conv/linear weights) and no-decay groups."""
        wd_params, nowd_params = [], []
        for _, module in self.named_modules():
            if isinstance(module, (nn.Linear, nn.Conv2d)):
                wd_params.append(module.weight)
                if module.bias is not None:
                    nowd_params.append(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nowd_params.extend(module.parameters())
        return wd_params, nowd_params


class AttentionRefinementModule(nn.Module):
    """Channel attention: global-avg-pool -> 1x1 conv -> BN -> sigmoid gate."""

    def __init__(self, in_chan, out_chan, *args, **kwargs):
        super(AttentionRefinementModule, self).__init__()
        self.conv = ConvBNReLU(in_chan, out_chan, ks=3, stride=1, padding=1)
        self.conv_atten = nn.Conv2d(out_chan, out_chan, kernel_size=1, bias=False)
        self.bn_atten = nn.BatchNorm2d(out_chan)
        self.sigmoid_atten = nn.Sigmoid()
        self.init_weight()

    def forward(self, x):
        feat = self.conv(x)
        atten = F.avg_pool2d(feat, feat.size()[2:])  # global average pool
        atten = self.sigmoid_atten(self.bn_atten(self.conv_atten(atten)))
        return torch.mul(feat, atten)

    def init_weight(self):
        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, a=1)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)


class ContextPath(nn.Module):
    """BiSeNet context path over a ResNet-18 backbone with ARM refinement."""

    def __init__(self, *args, **kwargs):
        super(ContextPath, self).__init__()
        self.resnet = Resnet18()
        self.arm16 = AttentionRefinementModule(256, 128)
        self.arm32 = AttentionRefinementModule(512, 128)
        self.conv_head32 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1)
        self.conv_head16 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1)
        self.conv_avg = ConvBNReLU(512, 128, ks=1, stride=1, padding=0)
        self.init_weight()

    def forward(self, x):
        H0, W0 = x.size()[2:]
        feat8, feat16, feat32 = self.resnet(x)
        H8, W8 = feat8.size()[2:]
        H16, W16 = feat16.size()[2:]
        H32, W32 = feat32.size()[2:]

        # Global context injected at 1/32 resolution.
        avg = F.avg_pool2d(feat32, feat32.size()[2:])
        avg = self.conv_avg(avg)
        avg_up = F.interpolate(avg, (H32, W32), mode='nearest')

        feat32_sum = self.arm32(feat32) + avg_up
        feat32_up = F.interpolate(feat32_sum, (H16, W16), mode='nearest')
        feat32_up = self.conv_head32(feat32_up)

        feat16_sum = self.arm16(feat16) + feat32_up
        feat16_up = F.interpolate(feat16_sum, (H8, W8), mode='nearest')
        feat16_up = self.conv_head16(feat16_up)

        return feat8, feat16_up, feat32_up  # x8, x8, x16

    def init_weight(self):
        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, a=1)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def get_params(self):
        wd_params, nowd_params = [], []
        for _, module in self.named_modules():
            if isinstance(module, (nn.Linear, nn.Conv2d)):
                wd_params.append(module.weight)
                if module.bias is not None:
                    nowd_params.append(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nowd_params.extend(module.parameters())
        return wd_params, nowd_params


# Not used by BiSeNet below — the ResNet 1/8 feature replaces it.
class SpatialPath(nn.Module):
    """Original BiSeNet spatial path: three stride-2 convs plus a 1x1 projection."""

    def __init__(self, *args, **kwargs):
        super(SpatialPath, self).__init__()
        self.conv1 = ConvBNReLU(3, 64, ks=7, stride=2, padding=3)
        self.conv2 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1)
        self.conv3 = ConvBNReLU(64, 64, ks=3, stride=2, padding=1)
        self.conv_out = ConvBNReLU(64, 128, ks=1, stride=1, padding=0)
        self.init_weight()

    def forward(self, x):
        feat = self.conv1(x)
        feat = self.conv2(feat)
        feat = self.conv3(feat)
        return self.conv_out(feat)

    def init_weight(self):
        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, a=1)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def get_params(self):
        wd_params, nowd_params = [], []
        for _, module in self.named_modules():
            if isinstance(module, (nn.Linear, nn.Conv2d)):
                wd_params.append(module.weight)
                if module.bias is not None:
                    nowd_params.append(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nowd_params.extend(module.parameters())
        return wd_params, nowd_params
class FeatureFusionModule(nn.Module):
    """Fuse spatial and context features with a squeeze-excite style gate."""

    def __init__(self, in_chan, out_chan, *args, **kwargs):
        super(FeatureFusionModule, self).__init__()
        self.convblk = ConvBNReLU(in_chan, out_chan, ks=1, stride=1, padding=0)
        self.conv1 = nn.Conv2d(out_chan, out_chan // 4, kernel_size=1,
                               stride=1, padding=0, bias=False)
        self.conv2 = nn.Conv2d(out_chan // 4, out_chan, kernel_size=1,
                               stride=1, padding=0, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.sigmoid = nn.Sigmoid()
        self.init_weight()

    def forward(self, fsp, fcp):
        feat = self.convblk(torch.cat([fsp, fcp], dim=1))
        atten = F.avg_pool2d(feat, feat.size()[2:])
        atten = self.sigmoid(self.conv2(self.relu(self.conv1(atten))))
        # Residual attention: gated features added back to the fused features.
        return torch.mul(feat, atten) + feat

    def init_weight(self):
        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, a=1)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def get_params(self):
        wd_params, nowd_params = [], []
        for _, module in self.named_modules():
            if isinstance(module, (nn.Linear, nn.Conv2d)):
                wd_params.append(module.weight)
                if module.bias is not None:
                    nowd_params.append(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nowd_params.extend(module.parameters())
        return wd_params, nowd_params


class BiSeNet(nn.Module):
    """BiSeNet without an explicit spatial path: the ResNet 1/8 feature is
    reused as the spatial feature. Returns three logits maps upsampled to
    input resolution (main + two auxiliary heads)."""

    def __init__(self, n_classes, *args, **kwargs):
        super(BiSeNet, self).__init__()
        self.cp = ContextPath()
        # The spatial path is intentionally omitted (see forward()).
        self.ffm = FeatureFusionModule(256, 256)
        self.conv_out = BiSeNetOutput(256, 256, n_classes)
        self.conv_out16 = BiSeNetOutput(128, 64, n_classes)
        self.conv_out32 = BiSeNetOutput(128, 64, n_classes)
        self.init_weight()

    def forward(self, x):
        H, W = x.size()[2:]
        feat_res8, feat_cp8, feat_cp16 = self.cp(x)  # res3b1 feature returned
        feat_sp = feat_res8  # res3b1 feature stands in for the spatial path
        feat_fuse = self.ffm(feat_sp, feat_cp8)

        feat_out = self.conv_out(feat_fuse)
        feat_out16 = self.conv_out16(feat_cp8)
        feat_out32 = self.conv_out32(feat_cp16)

        feat_out = F.interpolate(feat_out, (H, W), mode='bilinear', align_corners=True)
        feat_out16 = F.interpolate(feat_out16, (H, W), mode='bilinear', align_corners=True)
        feat_out32 = F.interpolate(feat_out32, (H, W), mode='bilinear', align_corners=True)
        return feat_out, feat_out16, feat_out32

    def init_weight(self):
        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, a=1)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def get_params(self):
        """Four groups: backbone wd/no-wd, then lr-multiplied wd/no-wd for heads."""
        wd_params, nowd_params = [], []
        lr_mul_wd_params, lr_mul_nowd_params = [], []
        for _, child in self.named_children():
            child_wd, child_nowd = child.get_params()
            if isinstance(child, (FeatureFusionModule, BiSeNetOutput)):
                lr_mul_wd_params += child_wd
                lr_mul_nowd_params += child_nowd
            else:
                wd_params += child_wd
                nowd_params += child_nowd
        return wd_params, nowd_params, lr_mul_wd_params, lr_mul_nowd_params


# ---- FaceHairMask/ParsingResnet.py ----
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as modelzoo

resnet18_url = 'https://download.pytorch.org/models/resnet18-5c106cde.pth'


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    """Standard ResNet basic block with optional 1x1 downsample shortcut."""

    def __init__(self, in_chan, out_chan, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(in_chan, out_chan, stride)
        self.bn1 = nn.BatchNorm2d(out_chan)
        self.conv2 = conv3x3(out_chan, out_chan)
        self.bn2 = nn.BatchNorm2d(out_chan)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        if in_chan != out_chan or stride != 1:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_chan, out_chan, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_chan),
            )

    def forward(self, x):
        residual = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))
        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(shortcut + residual)


def create_layer_basic(in_chan, out_chan, bnum, stride=1):
    """Stack `bnum` BasicBlocks; only the first applies the given stride."""
    layers = [BasicBlock(in_chan, out_chan, stride=stride)]
    layers.extend(BasicBlock(out_chan, out_chan, stride=1) for _ in range(bnum - 1))
    return nn.Sequential(*layers)


class Resnet18(nn.Module):
    """ResNet-18 feature extractor returning 1/8, 1/16 and 1/32 features.

    NOTE: __init__ downloads ImageNet weights via model_zoo (network required).
    """

    def __init__(self):
        super(Resnet18, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = create_layer_basic(64, 64, bnum=2, stride=1)
        self.layer2 = create_layer_basic(64, 128, bnum=2, stride=2)
        self.layer3 = create_layer_basic(128, 256, bnum=2, stride=2)
        self.layer4 = create_layer_basic(256, 512, bnum=2, stride=2)
        self.init_weight()

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.maxpool(x)
        x = self.layer1(x)
        feat8 = self.layer2(x)      # 1/8
        feat16 = self.layer3(feat8)  # 1/16
        feat32 = self.layer4(feat16)  # 1/32
        return feat8, feat16, feat32

    def init_weight(self):
        # Load ImageNet-pretrained weights, skipping the fc classifier.
        state_dict = modelzoo.load_url(resnet18_url)
        self_state_dict = self.state_dict()
        for k, v in state_dict.items():
            if 'fc' in k:
                continue
            self_state_dict.update({k: v})
        self.load_state_dict(self_state_dict)

    def get_params(self):
        wd_params, nowd_params = [], []
        for _, module in self.named_modules():
            if isinstance(module, (nn.Linear, nn.Conv2d)):
                wd_params.append(module.weight)
                if module.bias is not None:
                    nowd_params.append(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nowd_params.extend(module.parameters())
        return wd_params, nowd_params


# ---- FaceHairMask/deeplab_xception.py (header) ----
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
import torch.utils.model_zoo as model_zoo
from torch.nn.parameter import Parameter


class SeparableConv2d(nn.Module):
    """Depthwise conv followed by a 1x1 pointwise conv (no norm/activation)."""

    def __init__(self, inplanes, planes, kernel_size=3, stride=1,
                 padding=0, dilation=1, bias=False):
        super(SeparableConv2d, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, inplanes, kernel_size, stride,
                               padding, dilation, groups=inplanes, bias=bias)
        self.pointwise = nn.Conv2d(inplanes, planes, 1, 1, 0, 1, 1, bias=bias)

    def forward(self, x):
        return self.pointwise(self.conv1(x))


def fixed_padding(inputs, kernel_size, rate):
    """TensorFlow-style 'SAME' padding for a dilated kernel."""
    kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1)
    pad_total = kernel_size_effective - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    return F.pad(inputs, (pad_beg, pad_end, pad_beg, pad_end))
class SeparableConv2d_aspp(nn.Module):
    """Separable conv with BN + ReLU after both the depthwise and pointwise stages."""

    def __init__(self, inplanes, planes, kernel_size=3, stride=1,
                 dilation=1, bias=False, padding=0):
        super(SeparableConv2d_aspp, self).__init__()
        self.depthwise = nn.Conv2d(inplanes, inplanes, kernel_size, stride,
                                   padding, dilation, groups=inplanes, bias=bias)
        self.depthwise_bn = nn.BatchNorm2d(inplanes)
        self.pointwise = nn.Conv2d(inplanes, planes, 1, 1, 0, 1, 1, bias=bias)
        self.pointwise_bn = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Explicit `padding` is used instead of fixed_padding here.
        out = self.relu(self.depthwise_bn(self.depthwise(x)))
        out = self.relu(self.pointwise_bn(self.pointwise(out)))
        return out


class Decoder_module(nn.Module):
    """One decoder step: a 3x3 separable conv (BN+ReLU) at the given rate."""

    def __init__(self, inplanes, planes, rate=1):
        super(Decoder_module, self).__init__()
        self.atrous_convolution = SeparableConv2d_aspp(
            inplanes, planes, 3, stride=1, dilation=rate, padding=1)

    def forward(self, x):
        return self.atrous_convolution(x)


class ASPP_module(nn.Module):
    """Atrous separable conv branch for rates > 1 (rate == 1 is rejected)."""

    def __init__(self, inplanes, planes, rate):
        super(ASPP_module, self).__init__()
        if rate == 1:
            # rate-1 branches must use ASPP_module_rate0 instead.
            raise RuntimeError()
        self.atrous_convolution = SeparableConv2d_aspp(
            inplanes, planes, 3, stride=1, dilation=rate, padding=rate)

    def forward(self, x):
        return self.atrous_convolution(x)


class ASPP_module_rate0(nn.Module):
    """1x1 conv + BN + ReLU ASPP branch; only rate == 1 is supported."""

    def __init__(self, inplanes, planes, rate=1):
        super(ASPP_module_rate0, self).__init__()
        if rate != 1:
            raise RuntimeError()
        self.atrous_convolution = nn.Conv2d(inplanes, planes, kernel_size=1,
                                            stride=1, padding=0, dilation=rate,
                                            bias=False)
        self.bn = nn.BatchNorm2d(planes, eps=1e-5, affine=True)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.relu(self.bn(self.atrous_convolution(x)))


class SeparableConv2d_same(nn.Module):
    """Separable conv with 'SAME' (fixed) padding and BN after each stage, no ReLU."""

    def __init__(self, inplanes, planes, kernel_size=3, stride=1,
                 dilation=1, bias=False, padding=0):
        super(SeparableConv2d_same, self).__init__()
        self.depthwise = nn.Conv2d(inplanes, inplanes, kernel_size, stride,
                                   padding, dilation, groups=inplanes, bias=bias)
        self.depthwise_bn = nn.BatchNorm2d(inplanes)
        self.pointwise = nn.Conv2d(inplanes, planes, 1, 1, 0, 1, 1, bias=bias)
        self.pointwise_bn = nn.BatchNorm2d(planes)

    def forward(self, x):
        x = fixed_padding(x, self.depthwise.kernel_size[0],
                          rate=self.depthwise.dilation[0])
        x = self.depthwise_bn(self.depthwise(x))
        x = self.pointwise_bn(self.pointwise(x))
        return x


class Block(nn.Module):
    """Xception block: a stack of separable convs with a (possibly projected) skip.

    NOTE(review): for planes != inplanes the skip conv hard-codes stride=2
    (overridden to 1 when is_last) regardless of `stride` — preserved as-is.
    """

    def __init__(self, inplanes, planes, reps, stride=1, dilation=1,
                 start_with_relu=True, grow_first=True, is_last=False):
        super(Block, self).__init__()

        if planes != inplanes or stride != 1:
            self.skip = nn.Conv2d(inplanes, planes, 1, stride=2, bias=False)
            if is_last:
                self.skip = nn.Conv2d(inplanes, planes, 1, stride=1, bias=False)
            self.skipbn = nn.BatchNorm2d(planes)
        else:
            self.skip = None

        self.relu = nn.ReLU(inplace=True)
        rep = []
        filters = inplanes
        if grow_first:
            rep.append(self.relu)
            rep.append(SeparableConv2d_same(inplanes, planes, 3,
                                            stride=1, dilation=dilation))
            filters = planes

        for _ in range(reps - 1):
            rep.append(self.relu)
            rep.append(SeparableConv2d_same(filters, filters, 3,
                                            stride=1, dilation=dilation))

        if not grow_first:
            rep.append(self.relu)
            rep.append(SeparableConv2d_same(inplanes, planes, 3,
                                            stride=1, dilation=dilation))

        if not start_with_relu:
            rep = rep[1:]

        if stride != 1:
            rep.append(self.relu)
            rep.append(SeparableConv2d_same(planes, planes, 3,
                                            stride=2, dilation=dilation))

        if is_last:
            rep.append(self.relu)
            rep.append(SeparableConv2d_same(planes, planes, 3,
                                            stride=1, dilation=dilation))

        self.rep = nn.Sequential(*rep)

    def forward(self, inp):
        x = self.rep(inp)
        if self.skip is not None:
            skip = self.skipbn(self.skip(inp))
        else:
            skip = inp
        x += skip
        return x


class Block2(nn.Module):
    """Variant of Block that also returns the pre-downsampling feature map.

    NOTE(review): forward always uses block2_lastconv, which only exists when
    stride != 1 — this class is only usable with stride != 1 (preserved as-is).
    """

    def __init__(self, inplanes, planes, reps, stride=1, dilation=1,
                 start_with_relu=True, grow_first=True, is_last=False):
        super(Block2, self).__init__()

        if planes != inplanes or stride != 1:
            self.skip = nn.Conv2d(inplanes, planes, 1, stride=stride, bias=False)
            self.skipbn = nn.BatchNorm2d(planes)
        else:
            self.skip = None

        self.relu = nn.ReLU(inplace=True)
        rep = []
        filters = inplanes
        if grow_first:
            rep.append(self.relu)
            rep.append(SeparableConv2d_same(inplanes, planes, 3,
                                            stride=1, dilation=dilation))
            filters = planes

        for _ in range(reps - 1):
            rep.append(self.relu)
            rep.append(SeparableConv2d_same(filters, filters, 3,
                                            stride=1, dilation=dilation))

        if not grow_first:
            rep.append(self.relu)
            rep.append(SeparableConv2d_same(inplanes, planes, 3,
                                            stride=1, dilation=dilation))

        if not start_with_relu:
            rep = rep[1:]

        if stride != 1:
            self.block2_lastconv = nn.Sequential(
                self.relu,
                SeparableConv2d_same(planes, planes, 3, stride=2, dilation=dilation),
            )

        if is_last:
            rep.append(SeparableConv2d_same(planes, planes, 3, stride=1))

        self.rep = nn.Sequential(*rep)

    def forward(self, inp):
        x = self.rep(inp)
        low_middle = x.clone()  # feature before the final stride-2 conv
        x1 = self.block2_lastconv(x)
        if self.skip is not None:
            skip = self.skipbn(self.skip(inp))
        else:
            skip = inp
        x1 += skip
        return x1, low_middle
class Xception(nn.Module):
    """Modified Aligned Xception backbone for DeepLabv3+.

    Returns (high-level feature, low-level 1/4 feature). Output stride `os`
    must be 16 or 8; it controls the entry-block-3 stride and the dilation
    rates of the middle/exit flows.
    """

    def __init__(self, inplanes=3, os=16, pretrained=False):
        super(Xception, self).__init__()

        if os == 16:
            entry_block3_stride = 2
            middle_block_rate = 1
            exit_block_rates = (1, 2)
        elif os == 8:
            entry_block3_stride = 1
            middle_block_rate = 2
            exit_block_rates = (2, 4)
        else:
            raise NotImplementedError

        # Entry flow
        self.conv1 = nn.Conv2d(inplanes, 32, 3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(64)

        self.block1 = Block(64, 128, reps=2, stride=2, start_with_relu=False)
        self.block2 = Block2(128, 256, reps=2, stride=2,
                             start_with_relu=True, grow_first=True)
        self.block3 = Block(256, 728, reps=2, stride=entry_block3_stride,
                            start_with_relu=True, grow_first=True)

        # Middle flow: sixteen identical blocks (block4 .. block19).
        # Built in a loop; attribute names and registration order match the
        # original per-attribute assignments exactly.
        for idx in range(4, 20):
            setattr(self, "block%d" % idx,
                    Block(728, 728, reps=3, stride=1,
                          dilation=middle_block_rate,
                          start_with_relu=True, grow_first=True))

        # Exit flow
        self.block20 = Block(728, 1024, reps=2, stride=1,
                             dilation=exit_block_rates[0],
                             start_with_relu=True, grow_first=False,
                             is_last=True)
        self.conv3 = SeparableConv2d_aspp(1024, 1536, 3, stride=1,
                                          dilation=exit_block_rates[1],
                                          padding=exit_block_rates[1])
        self.conv4 = SeparableConv2d_aspp(1536, 1536, 3, stride=1,
                                          dilation=exit_block_rates[1],
                                          padding=exit_block_rates[1])
        self.conv5 = SeparableConv2d_aspp(1536, 2048, 3, stride=1,
                                          dilation=exit_block_rates[1],
                                          padding=exit_block_rates[1])

        if pretrained:
            # Downloads ImageNet Xception weights (network required).
            self.__load_xception_pretrained()

    def forward(self, x):
        # Entry flow
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.block1(x)
        x, low_level_feat = self.block2(x)  # low-level 1/4 feature for decoder
        x = self.block3(x)

        # Middle flow
        for idx in range(4, 20):
            x = getattr(self, "block%d" % idx)(x)

        # Exit flow
        x = self.block20(x)
        x = self.relu(self.conv3(x))
        x = self.relu(self.conv4(x))
        x = self.relu(self.conv5(x))

        return x, low_level_feat

    def __init_weight(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                torch.nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def __load_xception_pretrained(self):
        """Remap Cadene's 12-block Xception checkpoint onto this 19-block layout."""
        pretrain_dict = model_zoo.load_url(
            "http://data.lip6.fr/cadene/pretrainedmodels/xception-b5690688.pth"
        )
        model_dict = {}
        state_dict = self.state_dict()

        for k, v in pretrain_dict.items():
            if k in state_dict:
                if "pointwise" in k:
                    # Checkpoint stores pointwise weights without spatial dims.
                    v = v.unsqueeze(-1).unsqueeze(-1)
                if k.startswith("block12"):
                    model_dict[k.replace("block12", "block20")] = v
                elif k.startswith("block11"):
                    # Replicate block11 weights across the extra middle blocks.
                    model_dict[k.replace("block11", "block12")] = v
                    model_dict[k.replace("block11", "block13")] = v
                    model_dict[k.replace("block11", "block14")] = v
                    model_dict[k.replace("block11", "block15")] = v
                    model_dict[k.replace("block11", "block16")] = v
                    model_dict[k.replace("block11", "block17")] = v
                    model_dict[k.replace("block11", "block18")] = v
                    model_dict[k.replace("block11", "block19")] = v
                elif k.startswith("conv3"):
                    model_dict[k] = v
                elif k.startswith("bn3"):
                    model_dict[k] = v
                    model_dict[k.replace("bn3", "bn4")] = v
                elif k.startswith("conv4"):
                    model_dict[k.replace("conv4", "conv5")] = v
                elif k.startswith("bn4"):
                    model_dict[k.replace("bn4", "bn5")] = v
                else:
                    model_dict[k] = v
        state_dict.update(model_dict)
        self.load_state_dict(state_dict)
class DeepLabv3_plus(nn.Module):
    """DeepLabv3+ with a modified aligned Xception backbone.

    forward(input) returns per-pixel logits of shape
    (N, n_classes, H, W) matching the input spatial size.
    """

    def __init__(self, nInputChannels=3, n_classes=21, os=16,
                 pretrained=False, _print=True):
        if _print:
            print("Constructing DeepLabv3+ model...")
            print("Number of classes: {}".format(n_classes))
            print("Output stride: {}".format(os))
            print("Number of Input Channels: {}".format(nInputChannels))
        super(DeepLabv3_plus, self).__init__()

        # Atrous backbone
        self.xception_features = Xception(nInputChannels, os, pretrained)

        # ASPP rates; os == 8 is deliberately not supported yet (rates are
        # computed but the branch still raises, as in the original).
        if os == 16:
            rates = [1, 6, 12, 18]
        elif os == 8:
            rates = [1, 12, 24, 36]
            raise NotImplementedError
        else:
            raise NotImplementedError

        self.aspp1 = ASPP_module_rate0(2048, 256, rate=rates[0])
        self.aspp2 = ASPP_module(2048, 256, rate=rates[1])
        self.aspp3 = ASPP_module(2048, 256, rate=rates[2])
        self.aspp4 = ASPP_module(2048, 256, rate=rates[3])

        self.relu = nn.ReLU()

        self.global_avg_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Conv2d(2048, 256, 1, stride=1, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(),
        )

        self.concat_projection_conv1 = nn.Conv2d(1280, 256, 1, bias=False)
        self.concat_projection_bn1 = nn.BatchNorm2d(256)

        # adopt [1x1, 48] for low-level channel reduction.
        self.feature_projection_conv1 = nn.Conv2d(256, 48, 1, bias=False)
        self.feature_projection_bn1 = nn.BatchNorm2d(48)

        self.decoder = nn.Sequential(Decoder_module(304, 256),
                                     Decoder_module(256, 256))
        self.semantic = nn.Conv2d(256, n_classes, kernel_size=1, stride=1)

    def forward(self, input):
        x, low_level_features = self.xception_features(input)
        x1 = self.aspp1(x)
        x2 = self.aspp2(x)
        x3 = self.aspp3(x)
        x4 = self.aspp4(x)
        x5 = self.global_avg_pool(x)
        # FIX: F.upsample is deprecated; F.interpolate is the drop-in
        # replacement (file already uses interpolate elsewhere).
        x5 = F.interpolate(x5, size=x4.size()[2:], mode="bilinear",
                           align_corners=True)

        x = torch.cat((x1, x2, x3, x4, x5), dim=1)
        x = self.relu(self.concat_projection_bn1(self.concat_projection_conv1(x)))

        low_level_features = self.feature_projection_conv1(low_level_features)
        low_level_features = self.feature_projection_bn1(low_level_features)
        low_level_features = self.relu(low_level_features)

        x = F.interpolate(x, size=low_level_features.size()[2:],
                          mode="bilinear", align_corners=True)
        x = torch.cat((x, low_level_features), dim=1)
        x = self.decoder(x)
        x = self.semantic(x)
        x = F.interpolate(x, size=input.size()[2:], mode="bilinear",
                          align_corners=True)
        return x

    def freeze_bn(self):
        """Put every backbone BatchNorm into eval mode (freeze running stats)."""
        for m in self.xception_features.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

    def freeze_totally_bn(self):
        """Put every BatchNorm in the whole model into eval mode."""
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

    def freeze_aspp_bn(self):
        """Freeze running stats of the four ASPP branches."""
        for branch in (self.aspp1, self.aspp2, self.aspp3, self.aspp4):
            for m in branch.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()

    def learnable_parameters(self):
        """Group parameters by sub-network for per-group learning rates.

        Returns (backbone_bn, backbone, aspp, projection, decoder, other).
        """
        layer_features_BN = []
        layer_features = []
        layer_aspp = []
        layer_projection = []
        layer_decoder = []
        layer_other = []
        for name, para in self.named_parameters():
            if "xception" in name:
                if ("bn" in name
                        or "downsample.1.weight" in name
                        or "downsample.1.bias" in name):
                    layer_features_BN.append(para)
                else:
                    layer_features.append(para)
            elif "aspp" in name:
                layer_aspp.append(para)
            elif "projection" in name:
                layer_projection.append(para)
            elif "decode" in name:
                layer_decoder.append(para)
            elif "global" not in name:
                layer_other.append(para)
        return (layer_features_BN, layer_features, layer_aspp,
                layer_projection, layer_decoder, layer_other)

    def get_backbone_para(self):
        """Split parameters into (backbone, everything else)."""
        layer_features = []
        other_features = []
        for name, para in self.named_parameters():
            if "xception" in name:
                layer_features.append(para)
            else:
                other_features.append(para)
        return layer_features, other_features

    def train_fixbn(self, mode=True, freeze_bn=True, freeze_bn_affine=False):
        """Switch to training mode but optionally keep backbone BN frozen.

        Returns:
            Module: self (via nn.Module.train semantics).
        """
        super(DeepLabv3_plus, self).train(mode)
        if freeze_bn:
            print("Freezing Mean/Var of BatchNorm2D.")
            if freeze_bn_affine:
                print("Freezing Weight/Bias of BatchNorm2D.")
        if freeze_bn:
            for m in self.xception_features.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    if freeze_bn_affine:
                        m.weight.requires_grad = False
                        m.bias.requires_grad = False

    def __init_weight(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def load_state_dict_new(self, state_dict):
        """Best-effort checkpoint load: strips 'module.' prefixes, skips
        unexpected keys and shape mismatches, and reports missing keys."""
        own_state = self.state_dict()
        new_state_dict = OrderedDict()
        for name, param in state_dict.items():
            name = name.replace("module.", "")
            # Value is a placeholder; new_state_dict only tracks seen keys.
            new_state_dict[name] = 0
            if name not in own_state:
                if "num_batch" in name:
                    continue
                print('unexpected key "{}" in state_dict'.format(name))
                continue
            if isinstance(param, Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            try:
                own_state[name].copy_(param)
            except Exception:  # FIX: was a bare except; keep best-effort skip
                print(
                    "While copying the parameter named {}, whose dimensions in the model are"
                    " {} and whose dimensions in the checkpoint are {}, ...".format(
                        name, own_state[name].size(), param.size()
                    )
                )
                continue

        missing = set(own_state.keys()) - set(new_state_dict.keys())
        if len(missing) > 0:
            print('missing keys in state_dict: "{}"'.format(missing))


def get_1x_lr_params(model):
    """
    This generator returns all the parameters of the net except for
    the last classification layer. Note that for each batchnorm layer,
    requires_grad is set to False in deeplab_resnet.py, therefore this function does not return
    any batchnorm parameter
    """
    b = [model.xception_features]
    for i in range(len(b)):
        for k in b[i].parameters():
            if k.requires_grad:
                yield k


def get_10x_lr_params(model):
    """
    This generator returns all the parameters for the last layer of the net,
    which does the classification of pixel into classes

    NOTE(review): model.conv1 / model.conv2 / model.last_conv do not exist on
    DeepLabv3_plus — this function raises AttributeError if iterated against
    it. Preserved as-is; fix the attribute list before use.
    """
    b = [
        model.aspp1,
        model.aspp2,
        model.aspp3,
        model.aspp4,
        model.conv1,
        model.conv2,
        model.last_conv,
    ]
    for j in range(len(b)):
        for k in b[j].parameters():
            if k.requires_grad:
                yield k


if __name__ == "__main__":
    model = DeepLabv3_plus(
        nInputChannels=3, n_classes=21, os=16, pretrained=False, _print=True
    )
    model.eval()
    image = torch.randn(1, 3, 512, 512) * 255
    with torch.no_grad():
        output = model.forward(image)
    print(output.size())
class deeplab_xception_transfer_basemodel(deeplab_xception.DeepLabv3_plus):
    """DeepLabv3+ extended with a target-domain graph-reasoning head.

    After the standard DeepLabv3+ decoder, the feature map is projected onto
    ``n_classes`` graph nodes, refined by three graph convolutions (driven by
    the target adjacency), projected back to a feature map, and added to a
    skip-connected 1x1-conv copy of the decoder output before the final
    semantic classifier.
    """

    def __init__(
        self,
        nInputChannels=3,
        n_classes=7,
        os=16,
        input_channels=256,
        hidden_layers=128,
        out_channels=256,
    ):
        super(deeplab_xception_transfer_basemodel, self).__init__(
            nInputChannels=nInputChannels,
            n_classes=n_classes,
            os=os,
        )
        # Target-domain graph head: feature map -> nodes -> 3x GCN -> feature map.
        self.target_featuremap_2_graph = gcn.Featuremaps_to_Graph(
            input_channels=input_channels, hidden_layers=hidden_layers, nodes=n_classes
        )
        self.target_graph_conv1 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        self.target_graph_conv2 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        self.target_graph_conv3 = gcn.GraphConvolution(hidden_layers, hidden_layers)

        self.target_graph_2_fea = gcn.Graph_to_Featuremaps(
            input_channels=input_channels,
            output_channels=out_channels,
            hidden_layers=hidden_layers,
            nodes=n_classes,
        )
        # Skip path: 1x1 conv + ReLU on the decoder output before graph fusion.
        self.target_skip_conv = nn.Sequential(
            *[nn.Conv2d(input_channels, input_channels, kernel_size=1), nn.ReLU(True)]
        )

    def load_source_model(self, state_dict):
        """Leniently load a source-domain checkpoint.

        Generic ``graph`` layer names in the checkpoint are renamed to this
        model's ``source_graph*`` modules; unexpected keys and per-tensor
        shape mismatches are reported and skipped, and missing keys are
        reported at the end.
        """
        own_state = self.state_dict()
        # Records every (renamed) checkpoint key, for the missing-key report.
        new_state_dict = OrderedDict()
        for name, param in state_dict.items():
            name = name.replace("module.", "")  # strip DataParallel prefix
            if (
                "graph" in name
                and "source" not in name
                and "target" not in name
                and "fc_graph" not in name
                and "transpose_graph" not in name
            ):
                # Generic graph weights in the checkpoint belong to the
                # source head of this transfer model.
                if "featuremap_2_graph" in name:
                    name = name.replace(
                        "featuremap_2_graph", "source_featuremap_2_graph"
                    )
                else:
                    name = name.replace("graph", "source_graph")
            new_state_dict[name] = 0
            if name not in own_state:
                if "num_batch" in name:
                    continue  # BatchNorm counters may legitimately be absent
                print('unexpected key "{}" in state_dict'.format(name))
                continue
            if isinstance(param, Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            try:
                own_state[name].copy_(param)
            except:
                # Shape mismatch: report and skip instead of aborting the load.
                print(
                    "While copying the parameter named {}, whose dimensions in the model are"
                    " {} and whose dimensions in the checkpoint are {}, ...".format(
                        name, own_state[name].size(), param.size()
                    )
                )
                continue
            # NOTE(review): redundant second copy of the same tensor; harmless.
            own_state[name].copy_(param)

        missing = set(own_state.keys()) - set(new_state_dict.keys())
        if len(missing) > 0:
            print('missing keys in state_dict: "{}"'.format(missing))

    def get_target_parameter(self):
        """Return ``(target_and_semantic_params, all_other_params)`` for
        per-group learning rates."""
        l = []
        other = []
        for name, k in self.named_parameters():
            if "target" in name or "semantic" in name:
                l.append(k)
            else:
                other.append(k)
        return l, other

    def get_semantic_parameter(self):
        """Return only the parameters of the final semantic classifier."""
        l = []
        for name, k in self.named_parameters():
            if "semantic" in name:
                l.append(k)
        return l

    def get_source_parameter(self):
        """Return only the parameters of the source-domain graph head."""
        l = []
        for name, k in self.named_parameters():
            if "source" in name:
                l.append(k)
        return l

    def forward(self, input, adj1_target=None, adj2_source=None, adj3_transfer=None):
        # Backbone + ASPP pyramid (adj2_source/adj3_transfer unused here; kept
        # for signature compatibility with the transfer subclasses).
        x, low_level_features = self.xception_features(input)
        x1 = self.aspp1(x)
        x2 = self.aspp2(x)
        x3 = self.aspp3(x)
        x4 = self.aspp4(x)
        x5 = self.global_avg_pool(x)
        x5 = F.upsample(x5, size=x4.size()[2:], mode="bilinear", align_corners=True)

        x = torch.cat((x1, x2, x3, x4, x5), dim=1)

        x = self.concat_projection_conv1(x)
        x = self.concat_projection_bn1(x)
        x = self.relu(x)
        x = F.upsample(
            x, size=low_level_features.size()[2:], mode="bilinear", align_corners=True
        )

        # Decoder: fuse with the projected low-level features.
        low_level_features = self.feature_projection_conv1(low_level_features)
        low_level_features = self.feature_projection_bn1(low_level_features)
        low_level_features = self.relu(low_level_features)
        x = torch.cat((x, low_level_features), dim=1)
        x = self.decoder(x)

        # Graph reasoning on the target domain: project to nodes, refine with
        # three GCN layers, project back, and fuse via the skip conv.
        graph = self.target_featuremap_2_graph(x)
        graph = self.target_graph_conv1.forward(graph, adj=adj1_target, relu=True)
        graph = self.target_graph_conv2.forward(graph, adj=adj1_target, relu=True)
        graph = self.target_graph_conv3.forward(graph, adj=adj1_target, relu=True)
        graph = self.target_graph_2_fea.forward(graph, x)
        x = self.target_skip_conv(x)
        x = x + graph

        # Per-pixel classification at input resolution.
        x = self.semantic(x)
        x = F.upsample(x, size=input.size()[2:], mode="bilinear", align_corners=True)

        return x
class deeplab_xception_transfer_basemodel_savememory(deeplab_xception.DeepLabv3_plus):
    """Memory-saving variant of ``deeplab_xception_transfer_basemodel``.

    Identical pipeline, but the graph-to-featuremap projection uses
    ``gcn.Graph_to_Featuremaps_savemem``, which avoids materialising the
    concatenated (pixel, node) tensor.
    """

    def __init__(
        self,
        nInputChannels=3,
        n_classes=7,
        os=16,
        input_channels=256,
        hidden_layers=128,
        out_channels=256,
    ):
        super(deeplab_xception_transfer_basemodel_savememory, self).__init__(
            nInputChannels=nInputChannels,
            n_classes=n_classes,
            os=os,
        )
        # Target-domain graph head (memory-saving back-projection).
        self.target_featuremap_2_graph = gcn.Featuremaps_to_Graph(
            input_channels=input_channels, hidden_layers=hidden_layers, nodes=n_classes
        )
        self.target_graph_conv1 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        self.target_graph_conv2 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        self.target_graph_conv3 = gcn.GraphConvolution(hidden_layers, hidden_layers)

        self.target_graph_2_fea = gcn.Graph_to_Featuremaps_savemem(
            input_channels=input_channels,
            output_channels=out_channels,
            hidden_layers=hidden_layers,
            nodes=n_classes,
        )
        self.target_skip_conv = nn.Sequential(
            *[nn.Conv2d(input_channels, input_channels, kernel_size=1), nn.ReLU(True)]
        )

    def load_source_model(self, state_dict):
        """Leniently load a source-domain checkpoint (see base-model docs:
        renames generic ``graph`` keys to ``source_graph*``, reports and skips
        unexpected keys and shape mismatches, reports missing keys)."""
        own_state = self.state_dict()
        new_state_dict = OrderedDict()  # checkpoint keys seen, for missing-key report
        for name, param in state_dict.items():
            name = name.replace("module.", "")  # strip DataParallel prefix
            if (
                "graph" in name
                and "source" not in name
                and "target" not in name
                and "fc_graph" not in name
                and "transpose_graph" not in name
            ):
                if "featuremap_2_graph" in name:
                    name = name.replace(
                        "featuremap_2_graph", "source_featuremap_2_graph"
                    )
                else:
                    name = name.replace("graph", "source_graph")
            new_state_dict[name] = 0
            if name not in own_state:
                if "num_batch" in name:
                    continue  # BatchNorm counters may legitimately be absent
                print('unexpected key "{}" in state_dict'.format(name))
                continue
            if isinstance(param, Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            try:
                own_state[name].copy_(param)
            except:
                # Shape mismatch: report and skip instead of aborting the load.
                print(
                    "While copying the parameter named {}, whose dimensions in the model are"
                    " {} and whose dimensions in the checkpoint are {}, ...".format(
                        name, own_state[name].size(), param.size()
                    )
                )
                continue
            # NOTE(review): redundant second copy of the same tensor; harmless.
            own_state[name].copy_(param)

        missing = set(own_state.keys()) - set(new_state_dict.keys())
        if len(missing) > 0:
            print('missing keys in state_dict: "{}"'.format(missing))

    def get_target_parameter(self):
        """Return ``(target_and_semantic_params, all_other_params)``."""
        l = []
        other = []
        for name, k in self.named_parameters():
            if "target" in name or "semantic" in name:
                l.append(k)
            else:
                other.append(k)
        return l, other

    def get_semantic_parameter(self):
        """Return only the parameters of the final semantic classifier."""
        l = []
        for name, k in self.named_parameters():
            if "semantic" in name:
                l.append(k)
        return l

    def get_source_parameter(self):
        """Return only the parameters of the source-domain graph head."""
        l = []
        for name, k in self.named_parameters():
            if "source" in name:
                l.append(k)
        return l

    def forward(self, input, adj1_target=None, adj2_source=None, adj3_transfer=None):
        # Backbone + ASPP pyramid.
        x, low_level_features = self.xception_features(input)
        x1 = self.aspp1(x)
        x2 = self.aspp2(x)
        x3 = self.aspp3(x)
        x4 = self.aspp4(x)
        x5 = self.global_avg_pool(x)
        x5 = F.upsample(x5, size=x4.size()[2:], mode="bilinear", align_corners=True)

        x = torch.cat((x1, x2, x3, x4, x5), dim=1)

        x = self.concat_projection_conv1(x)
        x = self.concat_projection_bn1(x)
        x = self.relu(x)
        x = F.upsample(
            x, size=low_level_features.size()[2:], mode="bilinear", align_corners=True
        )

        # Decoder: fuse with the projected low-level features.
        low_level_features = self.feature_projection_conv1(low_level_features)
        low_level_features = self.feature_projection_bn1(low_level_features)
        low_level_features = self.relu(low_level_features)
        x = torch.cat((x, low_level_features), dim=1)
        x = self.decoder(x)

        # Target-domain graph reasoning (see base model).
        graph = self.target_featuremap_2_graph(x)
        graph = self.target_graph_conv1.forward(graph, adj=adj1_target, relu=True)
        graph = self.target_graph_conv2.forward(graph, adj=adj1_target, relu=True)
        graph = self.target_graph_conv3.forward(graph, adj=adj1_target, relu=True)
        graph = self.target_graph_2_fea.forward(graph, x)
        x = self.target_skip_conv(x)
        x = x + graph

        # Per-pixel classification at input resolution.
        x = self.semantic(x)
        x = F.upsample(x, size=input.size()[2:], mode="bilinear", align_corners=True)

        return x
class deeplab_xception_transfer_projection(deeplab_xception_transfer_basemodel):
    """Transfer model: target graph head plus a source graph head whose node
    features are projected onto the target graph at every GCN stage.

    Two projections are combined at each stage: a feature-similarity
    projection (``similarity_trans``) and a fixed/learned transfer-adjacency
    projection (``transpose_graph``); both are concatenated with the target
    nodes and fused by ``fc_graph``.
    """

    def __init__(
        self,
        nInputChannels=3,
        n_classes=7,
        os=16,
        input_channels=256,
        hidden_layers=128,
        out_channels=256,
        transfer_graph=None,
        source_classes=20,
    ):
        super(deeplab_xception_transfer_projection, self).__init__(
            nInputChannels=nInputChannels,
            n_classes=n_classes,
            os=os,
            input_channels=input_channels,
            hidden_layers=hidden_layers,
            out_channels=out_channels,
        )
        # Source-domain graph head (source_classes nodes).
        self.source_featuremap_2_graph = gcn.Featuremaps_to_Graph(
            input_channels=input_channels,
            hidden_layers=hidden_layers,
            nodes=source_classes,
        )
        self.source_graph_conv1 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        self.source_graph_conv2 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        self.source_graph_conv3 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        # Maps source nodes -> target nodes through the transfer adjacency.
        self.transpose_graph = gcn.Graph_trans(
            in_features=hidden_layers,
            out_features=hidden_layers,
            adj=transfer_graph,
            begin_nodes=source_classes,
            end_nodes=n_classes,
        )
        # Fuses (target || similarity-projected source || adjacency-projected source).
        self.fc_graph = gcn.GraphConvolution(hidden_layers * 3, hidden_layers)

    def forward(self, input, adj1_target=None, adj2_source=None, adj3_transfer=None):
        # Backbone + ASPP pyramid.
        x, low_level_features = self.xception_features(input)
        x1 = self.aspp1(x)
        x2 = self.aspp2(x)
        x3 = self.aspp3(x)
        x4 = self.aspp4(x)
        x5 = self.global_avg_pool(x)
        x5 = F.upsample(x5, size=x4.size()[2:], mode="bilinear", align_corners=True)

        x = torch.cat((x1, x2, x3, x4, x5), dim=1)

        x = self.concat_projection_conv1(x)
        x = self.concat_projection_bn1(x)
        x = self.relu(x)
        x = F.upsample(
            x, size=low_level_features.size()[2:], mode="bilinear", align_corners=True
        )

        # Decoder: fuse with the projected low-level features.
        low_level_features = self.feature_projection_conv1(low_level_features)
        low_level_features = self.feature_projection_bn1(low_level_features)
        low_level_features = self.relu(low_level_features)
        x = torch.cat((x, low_level_features), dim=1)
        x = self.decoder(x)

        # Source graph: project features to source nodes, refine with GCNs.
        source_graph = self.source_featuremap_2_graph(x)
        source_graph1 = self.source_graph_conv1.forward(
            source_graph, adj=adj2_source, relu=True
        )
        source_graph2 = self.source_graph_conv2.forward(
            source_graph1, adj=adj2_source, relu=True
        )
        # NOTE(review): reuses source_graph_conv2 here; source_graph_conv3 is
        # never applied. Likely a typo, but trained checkpoints depend on it.
        source_graph3 = self.source_graph_conv2.forward(
            source_graph2, adj=adj2_source, relu=True
        )

        # Adjacency-based projection of each source stage onto target nodes.
        source_2_target_graph1_v5 = self.transpose_graph.forward(
            source_graph1, adj=adj3_transfer, relu=True
        )
        source_2_target_graph2_v5 = self.transpose_graph.forward(
            source_graph2, adj=adj3_transfer, relu=True
        )
        source_2_target_graph3_v5 = self.transpose_graph.forward(
            source_graph3, adj=adj3_transfer, relu=True
        )

        # Target graph nodes from the decoder features.
        graph = self.target_featuremap_2_graph(x)

        # Stage 1: similarity projection, concat both projections, fuse, GCN.
        source_2_target_graph1 = self.similarity_trans(source_graph1, graph)
        graph = torch.cat(
            (
                graph,
                source_2_target_graph1.squeeze(0),
                source_2_target_graph1_v5.squeeze(0),
            ),
            dim=-1,
        )
        graph = self.fc_graph.forward(graph, relu=True)
        graph = self.target_graph_conv1.forward(graph, adj=adj1_target, relu=True)

        # Stage 2.
        source_2_target_graph2 = self.similarity_trans(source_graph2, graph)
        graph = torch.cat(
            (graph, source_2_target_graph2, source_2_target_graph2_v5), dim=-1
        )
        graph = self.fc_graph.forward(graph, relu=True)
        graph = self.target_graph_conv2.forward(graph, adj=adj1_target, relu=True)

        # Stage 3.
        source_2_target_graph3 = self.similarity_trans(source_graph3, graph)
        graph = torch.cat(
            (graph, source_2_target_graph3, source_2_target_graph3_v5), dim=-1
        )
        graph = self.fc_graph.forward(graph, relu=True)
        graph = self.target_graph_conv3.forward(graph, adj=adj1_target, relu=True)

        # Back to feature map, fuse via the skip conv, classify.
        graph = self.target_graph_2_fea.forward(graph, x)
        x = self.target_skip_conv(x)
        x = x + graph

        x = self.semantic(x)
        x = F.upsample(x, size=input.size()[2:], mode="bilinear", align_corners=True)

        return x

    def similarity_trans(self, source, target):
        """Attention-style projection: softmax(cos_sim(target, source)) @ source."""
        sim = torch.matmul(
            F.normalize(target, p=2, dim=-1),
            F.normalize(source, p=2, dim=-1).transpose(-1, -2),
        )
        sim = F.softmax(sim, dim=-1)
        return torch.matmul(sim, source)

    def load_source_model(self, state_dict):
        """Leniently load a source-domain checkpoint; same as the base model
        but keeps any ``fc_*`` keys un-renamed (this class owns ``fc_graph``)."""
        own_state = self.state_dict()
        new_state_dict = OrderedDict()  # checkpoint keys seen, for missing-key report
        for name, param in state_dict.items():
            name = name.replace("module.", "")  # strip DataParallel prefix

            if (
                "graph" in name
                and "source" not in name
                and "target" not in name
                and "fc_" not in name
                and "transpose_graph" not in name
            ):
                if "featuremap_2_graph" in name:
                    name = name.replace(
                        "featuremap_2_graph", "source_featuremap_2_graph"
                    )
                else:
                    name = name.replace("graph", "source_graph")
            new_state_dict[name] = 0
            if name not in own_state:
                if "num_batch" in name:
                    continue  # BatchNorm counters may legitimately be absent
                print('unexpected key "{}" in state_dict'.format(name))
                continue
            if isinstance(param, Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            try:
                own_state[name].copy_(param)
            except:
                # Shape mismatch: report and skip instead of aborting the load.
                print(
                    "While copying the parameter named {}, whose dimensions in the model are"
                    " {} and whose dimensions in the checkpoint are {}, ...".format(
                        name, own_state[name].size(), param.size()
                    )
                )
                continue
            # NOTE(review): redundant second copy of the same tensor; harmless.
            own_state[name].copy_(param)

        missing = set(own_state.keys()) - set(new_state_dict.keys())
        if len(missing) > 0:
            print('missing keys in state_dict: "{}"'.format(missing))
class deeplab_xception_transfer_projection_savemem(
    deeplab_xception_transfer_basemodel_savememory
):
    """Memory-saving variant of ``deeplab_xception_transfer_projection``.

    Same source->target projection pipeline; inherits the
    ``Graph_to_Featuremaps_savemem`` back-projection from the savememory base.
    """

    def __init__(
        self,
        nInputChannels=3,
        n_classes=7,
        os=16,
        input_channels=256,
        hidden_layers=128,
        out_channels=256,
        transfer_graph=None,
        source_classes=20,
    ):
        super(deeplab_xception_transfer_projection_savemem, self).__init__(
            nInputChannels=nInputChannels,
            n_classes=n_classes,
            os=os,
            input_channels=input_channels,
            hidden_layers=hidden_layers,
            out_channels=out_channels,
        )
        # Source-domain graph head (source_classes nodes).
        self.source_featuremap_2_graph = gcn.Featuremaps_to_Graph(
            input_channels=input_channels,
            hidden_layers=hidden_layers,
            nodes=source_classes,
        )
        self.source_graph_conv1 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        self.source_graph_conv2 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        self.source_graph_conv3 = gcn.GraphConvolution(hidden_layers, hidden_layers)
        # Maps source nodes -> target nodes through the transfer adjacency.
        self.transpose_graph = gcn.Graph_trans(
            in_features=hidden_layers,
            out_features=hidden_layers,
            adj=transfer_graph,
            begin_nodes=source_classes,
            end_nodes=n_classes,
        )
        # Fuses (target || similarity-projected source || adjacency-projected source).
        self.fc_graph = gcn.GraphConvolution(hidden_layers * 3, hidden_layers)

    def forward(self, input, adj1_target=None, adj2_source=None, adj3_transfer=None):
        # Backbone + ASPP pyramid.
        x, low_level_features = self.xception_features(input)
        x1 = self.aspp1(x)
        x2 = self.aspp2(x)
        x3 = self.aspp3(x)
        x4 = self.aspp4(x)
        x5 = self.global_avg_pool(x)
        x5 = F.upsample(x5, size=x4.size()[2:], mode="bilinear", align_corners=True)

        x = torch.cat((x1, x2, x3, x4, x5), dim=1)

        x = self.concat_projection_conv1(x)
        x = self.concat_projection_bn1(x)
        x = self.relu(x)
        x = F.upsample(
            x, size=low_level_features.size()[2:], mode="bilinear", align_corners=True
        )

        # Decoder: fuse with the projected low-level features.
        low_level_features = self.feature_projection_conv1(low_level_features)
        low_level_features = self.feature_projection_bn1(low_level_features)
        low_level_features = self.relu(low_level_features)
        x = torch.cat((x, low_level_features), dim=1)
        x = self.decoder(x)

        # Source graph: project features to source nodes, refine with GCNs.
        source_graph = self.source_featuremap_2_graph(x)
        source_graph1 = self.source_graph_conv1.forward(
            source_graph, adj=adj2_source, relu=True
        )
        source_graph2 = self.source_graph_conv2.forward(
            source_graph1, adj=adj2_source, relu=True
        )
        # NOTE(review): reuses source_graph_conv2; source_graph_conv3 is never
        # applied. Likely a typo, but trained checkpoints depend on it.
        source_graph3 = self.source_graph_conv2.forward(
            source_graph2, adj=adj2_source, relu=True
        )

        # Adjacency-based projection of each source stage onto target nodes.
        source_2_target_graph1_v5 = self.transpose_graph.forward(
            source_graph1, adj=adj3_transfer, relu=True
        )
        source_2_target_graph2_v5 = self.transpose_graph.forward(
            source_graph2, adj=adj3_transfer, relu=True
        )
        source_2_target_graph3_v5 = self.transpose_graph.forward(
            source_graph3, adj=adj3_transfer, relu=True
        )

        # Target graph nodes from the decoder features.
        graph = self.target_featuremap_2_graph(x)

        # Stage 1: similarity projection, concat both projections, fuse, GCN.
        source_2_target_graph1 = self.similarity_trans(source_graph1, graph)
        graph = torch.cat(
            (
                graph,
                source_2_target_graph1.squeeze(0),
                source_2_target_graph1_v5.squeeze(0),
            ),
            dim=-1,
        )
        graph = self.fc_graph.forward(graph, relu=True)
        graph = self.target_graph_conv1.forward(graph, adj=adj1_target, relu=True)

        # Stage 2.
        source_2_target_graph2 = self.similarity_trans(source_graph2, graph)
        graph = torch.cat(
            (graph, source_2_target_graph2, source_2_target_graph2_v5), dim=-1
        )
        graph = self.fc_graph.forward(graph, relu=True)
        graph = self.target_graph_conv2.forward(graph, adj=adj1_target, relu=True)

        # Stage 3.
        source_2_target_graph3 = self.similarity_trans(source_graph3, graph)
        graph = torch.cat(
            (graph, source_2_target_graph3, source_2_target_graph3_v5), dim=-1
        )
        graph = self.fc_graph.forward(graph, relu=True)
        graph = self.target_graph_conv3.forward(graph, adj=adj1_target, relu=True)

        # Back to feature map, fuse via the skip conv, classify.
        graph = self.target_graph_2_fea.forward(graph, x)
        x = self.target_skip_conv(x)
        x = x + graph

        x = self.semantic(x)
        x = F.upsample(x, size=input.size()[2:], mode="bilinear", align_corners=True)

        return x

    def similarity_trans(self, source, target):
        """Attention-style projection: softmax(cos_sim(target, source)) @ source."""
        sim = torch.matmul(
            F.normalize(target, p=2, dim=-1),
            F.normalize(source, p=2, dim=-1).transpose(-1, -2),
        )
        sim = F.softmax(sim, dim=-1)
        return torch.matmul(sim, source)

    def load_source_model(self, state_dict):
        """Leniently load a source-domain checkpoint; same as the base model
        but keeps any ``fc_*`` keys un-renamed (this class owns ``fc_graph``)."""
        own_state = self.state_dict()
        new_state_dict = OrderedDict()  # checkpoint keys seen, for missing-key report
        for name, param in state_dict.items():
            name = name.replace("module.", "")  # strip DataParallel prefix

            if (
                "graph" in name
                and "source" not in name
                and "target" not in name
                and "fc_" not in name
                and "transpose_graph" not in name
            ):
                if "featuremap_2_graph" in name:
                    name = name.replace(
                        "featuremap_2_graph", "source_featuremap_2_graph"
                    )
                else:
                    name = name.replace("graph", "source_graph")
            new_state_dict[name] = 0
            if name not in own_state:
                if "num_batch" in name:
                    continue  # BatchNorm counters may legitimately be absent
                print('unexpected key "{}" in state_dict'.format(name))
                continue
            if isinstance(param, Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            try:
                own_state[name].copy_(param)
            except:
                # Shape mismatch: report and skip instead of aborting the load.
                print(
                    "While copying the parameter named {}, whose dimensions in the model are"
                    " {} and whose dimensions in the checkpoint are {}, ...".format(
                        name, own_state[name].size(), param.size()
                    )
                )
                continue
            # NOTE(review): redundant second copy of the same tensor; harmless.
            own_state[name].copy_(param)

        missing = set(own_state.keys()) - set(new_state_dict.keys())
        if len(missing) > 0:
            print('missing keys in state_dict: "{}"'.format(missing))
class GraphConvolution(nn.Module):
    """One graph-convolution layer: ``out = adj @ (input @ W) (+ bias)``.

    With ``adj=None`` the layer degenerates to a plain linear projection.
    Matching the original behaviour, the optional ReLU is only applied in the
    bias-free configuration; when a bias is present, ``relu`` is ignored.
    """

    def __init__(self, in_features, out_features, bias=False):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the weight (a bias, if any, is left untouched)."""
        torch.nn.init.xavier_uniform_(self.weight)

    def forward(self, input, adj=None, relu=False):
        projected = torch.matmul(input, self.weight)
        aggregated = projected if adj is None else torch.matmul(adj, projected)
        if self.bias is not None:
            return aggregated + self.bias
        return F.relu(aggregated) if relu else aggregated

    def __repr__(self):
        return "{} ({} -> {})".format(
            self.__class__.__name__, self.in_features, self.out_features
        )
class Featuremaps_to_Graph(nn.Module):
    """Pool a feature map into a fixed number of graph-node embeddings.

    Every pixel is softly assigned to each node (softmax over node scores)
    and node features are the assignment-weighted sum of per-pixel
    embeddings, passed through a ReLU.
    """

    def __init__(self, input_channels, hidden_layers, nodes=7):
        super(Featuremaps_to_Graph, self).__init__()
        self.pre_fea = Parameter(torch.FloatTensor(input_channels, nodes))
        self.weight = Parameter(torch.FloatTensor(input_channels, hidden_layers))
        self.reset_parameters()

    def forward(self, input):
        n, c, h, w = input.size()
        pixels = input.view(n, c, h * w).transpose(1, 2)  # n x hw x c
        # Soft pixel->node assignment and per-pixel hidden embeddings.
        assignment = F.softmax(torch.matmul(pixels, self.pre_fea), dim=-1)  # n x hw x nodes
        embeddings = torch.matmul(pixels, self.weight)  # n x hw x hidden
        # Weighted pooling of pixel embeddings into node features.
        return F.relu(torch.matmul(assignment.transpose(1, 2), embeddings))  # n x nodes x hidden

    def reset_parameters(self):
        """Xavier-initialise both projection matrices."""
        for tensor in self.parameters():
            torch.nn.init.xavier_uniform_(tensor)
class Graph_to_Featuremaps(nn.Module):
    """Project graph-node features back onto the pixel grid.

    For every (pixel, node) pair a scalar score is computed from the
    concatenated pixel feature and node feature; scores are softmax-normalised
    over nodes and used to mix the linearly projected node features into a
    per-pixel feature map. Requires ``output_channels`` to equal the channel
    count of ``res_feature`` so the result can be reshaped onto the grid.
    """

    def __init__(self, input_channels, output_channels, hidden_layers, nodes=7):
        super(Graph_to_Featuremaps, self).__init__()
        self.node_fea = Parameter(torch.FloatTensor(input_channels + hidden_layers, 1))
        self.weight = Parameter(torch.FloatTensor(hidden_layers, output_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise both projection matrices."""
        for tensor in self.parameters():
            torch.nn.init.xavier_uniform_(tensor)

    def forward(self, input, res_feature):
        """
        :param input: 1 x batch x nodes x hidden_layer node features
                      (a 3-D batch x nodes x hidden tensor is also accepted)
        :param res_feature: batch x channels x h x w decoder feature map
        :return: batch x channels x h x w ReLU'd feature map
        """
        batchi, channeli, hi, wi = res_feature.size()
        if input.dim() == 3:
            input = input.unsqueeze(0)
        _, batch, nodes, hidden = input.size()
        assert batch == batchi

        # Broadcast node features over pixels and pixel features over nodes.
        node_per_pixel = input.transpose(0, 1).expand(batch, hi * wi, nodes, hidden)
        pixel_fea = res_feature.view(batch, channeli, hi * wi).transpose(1, 2)
        pixel_per_node = pixel_fea.unsqueeze(2).expand(batch, hi * wi, nodes, channeli)

        joint = torch.cat((pixel_per_node, node_per_pixel), dim=3)
        scores = torch.matmul(joint, self.node_fea)  # batch x hw x nodes x 1
        node_out = torch.matmul(input, self.weight)  # 1 x batch x nodes x out_channels
        scores = F.softmax(scores.view(batch, hi * wi, nodes), dim=-1)

        feature_out = torch.matmul(scores, node_out)
        feature_out = feature_out.transpose(2, 3).contiguous().view(res_feature.size())
        return F.relu(feature_out)
class Graph_to_Featuremaps_savemem(nn.Module):
    """Memory-saving twin of ``Graph_to_Featuremaps``.

    Computes the same (pixel, node) scores without materialising the
    concatenated tensor: the score weight is split into a pixel part and a
    node part, each applied separately, and the two partial scores are summed.
    Requires ``output_channels`` to equal the channel count of
    ``res_feature``.
    """

    def __init__(self, input_channels, output_channels, hidden_layers, nodes=7):
        super(Graph_to_Featuremaps_savemem, self).__init__()
        self.node_fea_for_res = Parameter(torch.FloatTensor(input_channels, 1))
        self.node_fea_for_hidden = Parameter(torch.FloatTensor(hidden_layers, 1))
        self.weight = Parameter(torch.FloatTensor(hidden_layers, output_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise all three projection matrices."""
        for tensor in self.parameters():
            torch.nn.init.xavier_uniform_(tensor)

    def forward(self, input, res_feature):
        """
        :param input: 1 x batch x nodes x hidden_layer node features
                      (a 3-D batch x nodes x hidden tensor is also accepted)
        :param res_feature: batch x channels x h x w decoder feature map
        :return: batch x channels x h x w ReLU'd feature map
        """
        batchi, channeli, hi, wi = res_feature.size()
        if input.dim() == 3:
            input = input.unsqueeze(0)
        _, batch, nodes, hidden = input.size()
        assert batch == batchi

        # Broadcast node features over pixels and pixel features over nodes.
        node_per_pixel = input.transpose(0, 1).expand(batch, hi * wi, nodes, hidden)
        pixel_fea = res_feature.view(batch, channeli, hi * wi).transpose(1, 2)
        pixel_per_node = pixel_fea.unsqueeze(2).expand(batch, hi * wi, nodes, channeli)

        # Split score: pixel part + node part (equivalent to scoring the concat).
        pixel_scores = torch.matmul(pixel_per_node, self.node_fea_for_res)
        node_scores = torch.matmul(node_per_pixel, self.node_fea_for_hidden)
        scores = pixel_scores + node_scores  # batch x hw x nodes x 1

        node_out = torch.matmul(input, self.weight)  # 1 x batch x nodes x out_channels
        scores = F.softmax(scores.view(batch, hi * wi, nodes), dim=-1)

        feature_out = torch.matmul(scores, node_out)
        feature_out = feature_out.transpose(2, 3).contiguous().view(res_feature.size())
        return F.relu(feature_out)
graph.normalize_adj_torch(F.relu(self.adj_mat)) + return adj + + def get_encode_adj(self): + return self.adj + + def norm_trans_adj(self, adj): # maybe can use softmax + adj = F.relu(adj) + r = F.softmax(adj, dim=-1) + # print(adj.size()) + # row_sum = adj.sum(-1).unsqueeze(-1) + # d_mat = row_sum.expand(adj.size()) + # r = torch.div(row_sum,d_mat) + # r[torch.isnan(r)] = 0 + + return r + + +if __name__ == "__main__": + + graph = torch.randn((7, 128)) + en = GraphConvolution(128, 128) + a = en.forward(graph) + print(a) + # a = en.forward(graph,pred) + # print(a.size()) diff --git a/insightface/reconstruction/ostec/external/graphonomy/FaceHairMask/graph.py b/insightface/reconstruction/ostec/external/graphonomy/FaceHairMask/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..06c04e502d13f9e77876e18f1c859262c2d79d5e --- /dev/null +++ b/insightface/reconstruction/ostec/external/graphonomy/FaceHairMask/graph.py @@ -0,0 +1,839 @@ +import torch +import numpy as np +import networkx as nx +import scipy.sparse as sp + +pascal_graph = { + 0: [0], + 1: [1, 2], + 2: [1, 2, 3, 5], + 3: [2, 3, 4], + 4: [3, 4], + 5: [2, 5, 6], + 6: [5, 6], +} + +cihp_graph = { + 0: [], + 1: [2, 13], + 2: [1, 13], + 3: [14, 15], + 4: [13], + 5: [6, 7, 9, 10, 11, 12, 14, 15], + 6: [5, 7, 10, 11, 14, 15, 16, 17], + 7: [5, 6, 9, 10, 11, 12, 14, 15], + 8: [16, 17, 18, 19], + 9: [5, 7, 10, 16, 17, 18, 19], + 10: [5, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17], + 11: [5, 6, 7, 10, 13], + 12: [5, 7, 10, 16, 17], + 13: [1, 2, 4, 10, 11], + 14: [3, 5, 6, 7, 10], + 15: [3, 5, 6, 7, 10], + 16: [6, 8, 9, 10, 12, 18], + 17: [6, 8, 9, 10, 12, 19], + 18: [8, 9, 16], + 19: [8, 9, 17], +} + +atr_graph = { + 0: [], + 1: [2, 11], + 2: [1, 11], + 3: [11], + 4: [5, 6, 7, 11, 14, 15, 17], + 5: [4, 6, 7, 8, 12, 13], + 6: [4, 5, 7, 8, 9, 10, 12, 13], + 7: [4, 11, 12, 13, 14, 15], + 8: [5, 6], + 9: [6, 12], + 10: [6, 13], + 11: [1, 2, 3, 4, 7, 14, 15, 17], + 12: [5, 6, 7, 9], + 13: [5, 6, 7, 10], + 14: 
[4, 7, 11, 16], + 15: [4, 7, 11, 16], + 16: [14, 15], + 17: [4, 11], +} + +cihp2pascal_adj = np.array( + [ + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], + ] +) + +cihp2pascal_nlp_adj = np.array( + [ + [ + 1.0, + 0.35333052, + 0.32727194, + 0.17418084, + 0.18757584, + 0.40608522, + 0.37503981, + 0.35448462, + 0.22598555, + 0.23893579, + 0.33064262, + 0.28923404, + 0.27986573, + 0.4211553, + 0.36915778, + 0.41377746, + 0.32485771, + 0.37248222, + 0.36865639, + 0.41500332, + ], + [ + 0.39615879, + 0.46201529, + 0.52321467, + 0.30826114, + 0.25669527, + 0.54747773, + 0.3670523, + 0.3901983, + 0.27519473, + 0.3433325, + 0.52728509, + 0.32771333, + 0.34819325, + 0.63882953, + 0.68042925, + 0.69368576, + 0.63395791, + 0.65344337, + 0.59538781, + 0.6071375, + ], + [ + 0.16373166, + 0.21663339, + 0.3053872, + 0.28377612, + 0.1372435, + 0.4448808, + 0.29479995, + 0.31092595, + 0.22703953, + 0.33983576, + 0.75778818, + 0.2619818, + 0.37069392, + 0.35184867, + 0.49877512, + 0.49979437, + 0.51853277, + 0.52517541, + 0.32517741, + 0.32377309, + ], + [ + 0.32687232, + 0.38482461, + 0.37693463, + 0.41610834, + 0.20415749, + 0.76749079, + 0.35139853, + 0.3787411, + 0.28411737, + 0.35155421, + 0.58792618, + 0.31141718, + 0.40585111, + 0.51189218, + 0.82042737, + 0.8342413, + 0.70732188, + 0.72752501, + 0.60327325, + 0.61431337, + ], + [ + 0.34069369, + 0.34817292, + 0.37525998, + 0.36497069, + 0.17841617, + 0.69746208, + 0.31731463, + 0.34628951, + 0.25167277, + 0.32072379, + 0.56711286, + 0.24894776, + 0.37000453, + 0.52600859, + 0.82483993, + 0.84966274, + 0.7033991, + 0.73449378, + 
0.56649608, + 0.58888791, + ], + [ + 0.28477487, + 0.35139564, + 0.42742352, + 0.41664321, + 0.20004676, + 0.78566833, + 0.42237487, + 0.41048549, + 0.37933812, + 0.46542516, + 0.62444759, + 0.3274493, + 0.49466009, + 0.49314658, + 0.71244233, + 0.71497003, + 0.8234787, + 0.83566589, + 0.62597135, + 0.62626812, + ], + [ + 0.3011378, + 0.31775977, + 0.42922647, + 0.36896257, + 0.17597556, + 0.72214655, + 0.39162804, + 0.38137872, + 0.34980296, + 0.43818419, + 0.60879174, + 0.26762545, + 0.46271161, + 0.51150476, + 0.72318109, + 0.73678399, + 0.82620388, + 0.84942166, + 0.5943811, + 0.60607602, + ], + ] +) + +pascal2atr_nlp_adj = np.array( + [ + [ + 1.0, + 0.35333052, + 0.32727194, + 0.18757584, + 0.40608522, + 0.27986573, + 0.23893579, + 0.27600672, + 0.30964391, + 0.36865639, + 0.41500332, + 0.4211553, + 0.32485771, + 0.37248222, + 0.36915778, + 0.41377746, + 0.32006291, + 0.28923404, + ], + [ + 0.39615879, + 0.46201529, + 0.52321467, + 0.25669527, + 0.54747773, + 0.34819325, + 0.3433325, + 0.26603942, + 0.45162929, + 0.59538781, + 0.6071375, + 0.63882953, + 0.63395791, + 0.65344337, + 0.68042925, + 0.69368576, + 0.44354613, + 0.32771333, + ], + [ + 0.16373166, + 0.21663339, + 0.3053872, + 0.1372435, + 0.4448808, + 0.37069392, + 0.33983576, + 0.26563416, + 0.35443504, + 0.32517741, + 0.32377309, + 0.35184867, + 0.51853277, + 0.52517541, + 0.49877512, + 0.49979437, + 0.21750868, + 0.2619818, + ], + [ + 0.32687232, + 0.38482461, + 0.37693463, + 0.20415749, + 0.76749079, + 0.40585111, + 0.35155421, + 0.28271333, + 0.52684576, + 0.60327325, + 0.61431337, + 0.51189218, + 0.70732188, + 0.72752501, + 0.82042737, + 0.8342413, + 0.40137029, + 0.31141718, + ], + [ + 0.34069369, + 0.34817292, + 0.37525998, + 0.17841617, + 0.69746208, + 0.37000453, + 0.32072379, + 0.27268885, + 0.47426719, + 0.56649608, + 0.58888791, + 0.52600859, + 0.7033991, + 0.73449378, + 0.82483993, + 0.84966274, + 0.37830796, + 0.24894776, + ], + [ + 0.28477487, + 0.35139564, + 0.42742352, + 0.20004676, 
+ 0.78566833, + 0.49466009, + 0.46542516, + 0.32662614, + 0.55780359, + 0.62597135, + 0.62626812, + 0.49314658, + 0.8234787, + 0.83566589, + 0.71244233, + 0.71497003, + 0.41223219, + 0.3274493, + ], + [ + 0.3011378, + 0.31775977, + 0.42922647, + 0.17597556, + 0.72214655, + 0.46271161, + 0.43818419, + 0.3192333, + 0.50979216, + 0.5943811, + 0.60607602, + 0.51150476, + 0.82620388, + 0.84942166, + 0.72318109, + 0.73678399, + 0.39259827, + 0.26762545, + ], + ] +) + +cihp2atr_nlp_adj = np.array( + [ + [ + 1.0, + 0.35333052, + 0.32727194, + 0.18757584, + 0.40608522, + 0.27986573, + 0.23893579, + 0.27600672, + 0.30964391, + 0.36865639, + 0.41500332, + 0.4211553, + 0.32485771, + 0.37248222, + 0.36915778, + 0.41377746, + 0.32006291, + 0.28923404, + ], + [ + 0.35333052, + 1.0, + 0.39206695, + 0.42143438, + 0.4736689, + 0.47139544, + 0.51999208, + 0.38354847, + 0.45628529, + 0.46514124, + 0.50083501, + 0.4310595, + 0.39371443, + 0.4319752, + 0.42938598, + 0.46384034, + 0.44833757, + 0.6153155, + ], + [ + 0.32727194, + 0.39206695, + 1.0, + 0.32836702, + 0.52603065, + 0.39543695, + 0.3622627, + 0.43575346, + 0.33866223, + 0.45202552, + 0.48421, + 0.53669903, + 0.47266611, + 0.50925436, + 0.42286557, + 0.45403656, + 0.37221304, + 0.40999322, + ], + [ + 0.17418084, + 0.46892601, + 0.25774838, + 0.31816231, + 0.39330317, + 0.34218382, + 0.48253904, + 0.22084125, + 0.41335728, + 0.52437572, + 0.5191713, + 0.33576117, + 0.44230914, + 0.44250678, + 0.44330833, + 0.43887264, + 0.50693611, + 0.39278795, + ], + [ + 0.18757584, + 0.42143438, + 0.32836702, + 1.0, + 0.35030067, + 0.30110947, + 0.41055555, + 0.34338879, + 0.34336307, + 0.37704433, + 0.38810141, + 0.34702081, + 0.24171562, + 0.25433078, + 0.24696241, + 0.2570884, + 0.4465962, + 0.45263213, + ], + [ + 0.40608522, + 0.4736689, + 0.52603065, + 0.35030067, + 1.0, + 0.54372584, + 0.58300258, + 0.56674191, + 0.555266, + 0.66599594, + 0.68567555, + 0.55716359, + 0.62997328, + 0.65638548, + 0.61219615, + 0.63183318, + 0.54464151, + 
0.44293752, + ], + [ + 0.37503981, + 0.50675565, + 0.4761106, + 0.37561813, + 0.60419403, + 0.77912403, + 0.64595517, + 0.85939662, + 0.46037144, + 0.52348817, + 0.55875094, + 0.37741886, + 0.455671, + 0.49434392, + 0.38479954, + 0.41804074, + 0.47285709, + 0.57236283, + ], + [ + 0.35448462, + 0.50576632, + 0.51030446, + 0.35841033, + 0.55106903, + 0.50257274, + 0.52591451, + 0.4283053, + 0.39991808, + 0.42327211, + 0.42853819, + 0.42071825, + 0.41240559, + 0.42259136, + 0.38125352, + 0.3868255, + 0.47604934, + 0.51811717, + ], + [ + 0.22598555, + 0.5053299, + 0.36301185, + 0.38002282, + 0.49700941, + 0.45625243, + 0.62876479, + 0.4112051, + 0.33944371, + 0.48322639, + 0.50318714, + 0.29207815, + 0.38801966, + 0.41119094, + 0.29199072, + 0.31021029, + 0.41594871, + 0.54961962, + ], + [ + 0.23893579, + 0.51999208, + 0.3622627, + 0.41055555, + 0.58300258, + 0.68874251, + 1.0, + 0.56977937, + 0.49918447, + 0.48484363, + 0.51615925, + 0.41222306, + 0.49535971, + 0.53134951, + 0.3807616, + 0.41050298, + 0.48675801, + 0.51112664, + ], + [ + 0.33064262, + 0.306412, + 0.60679935, + 0.25592294, + 0.58738706, + 0.40379627, + 0.39679161, + 0.33618385, + 0.39235148, + 0.45474013, + 0.4648476, + 0.59306762, + 0.58976007, + 0.60778661, + 0.55400397, + 0.56551297, + 0.3698029, + 0.33860535, + ], + [ + 0.28923404, + 0.6153155, + 0.40999322, + 0.45263213, + 0.44293752, + 0.60359359, + 0.51112664, + 0.46578181, + 0.45656936, + 0.38142307, + 0.38525582, + 0.33327223, + 0.35360175, + 0.36156453, + 0.3384992, + 0.34261229, + 0.49297863, + 1.0, + ], + [ + 0.27986573, + 0.47139544, + 0.39543695, + 0.30110947, + 0.54372584, + 1.0, + 0.68874251, + 0.67765588, + 0.48690078, + 0.44010641, + 0.44921156, + 0.32321099, + 0.48311542, + 0.4982002, + 0.39378102, + 0.40297733, + 0.45309735, + 0.60359359, + ], + [ + 0.4211553, + 0.4310595, + 0.53669903, + 0.34702081, + 0.55716359, + 0.32321099, + 0.41222306, + 0.25721705, + 0.36633509, + 0.5397475, + 0.56429928, + 1.0, + 0.55796926, + 0.58842844, + 
def normalize_adj(adj):
    """Symmetrically normalize an adjacency matrix: D^-1/2 A D^-1/2 (SciPy COO)."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0  # zero-degree nodes -> zero weight
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()


def preprocess_adj(adj):
    """Preprocessing of adjacency matrix for simple GCN model.

    Builds A + I from a dict-of-lists graph and symmetrically normalizes it;
    returns a dense matrix.
    """
    adj = nx.adjacency_matrix(
        nx.from_dict_of_lists(adj)
    )  # adjacency matrix of the graph (scipy sparse)
    adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
    return adj_normalized.todense()


def row_norm(inputs):
    """Normalize each row so that it sums to 1; returns a list of rows."""
    return [x / x.sum() for x in inputs]


def _sym_norm_torch(adj):
    """D^-1/2 A D^-1/2 for one 2-D torch adjacency; zero-degree rows map to zeros."""
    d_inv_sqrt = adj.sum(1).pow(-0.5)
    # BUG FIX: a zero row yields +inf (not NaN) from pow(-0.5); the original
    # only masked NaN, unlike the numpy normalize_adj above which guards isinf.
    d_inv_sqrt[torch.isinf(d_inv_sqrt)] = 0
    d_inv_sqrt[torch.isnan(d_inv_sqrt)] = 0
    d_mat_inv_sqrt = torch.diag(d_inv_sqrt)
    return torch.matmul(torch.matmul(d_mat_inv_sqrt, adj), d_mat_inv_sqrt)


def normalize_adj_torch(adj):
    """Torch counterpart of normalize_adj.

    Accepts either a 2-D adjacency or a 4-D (1, k, n, n) stack, normalizing
    each of the k matrices independently (batch dim assumed to be 1 —
    matches the original indexing of adj[0, i]).
    """
    if len(adj.size()) == 4:
        new_r = torch.zeros(adj.size()).type_as(adj)
        for i in range(adj.size(1)):
            new_r[0, i, ...] = _sym_norm_torch(adj[0, i])
        return new_r
    return _sym_norm_torch(adj)
def custom_decode_labels(mask, num_images=1, num_classes=20):
    """Split a batch of label maps into binary hair and face masks.

    Args:
        mask: (n, h, w) tensor of argmax class labels; class 2 is hair and
            class 13 is face in this palette.
        num_images: number of images expected in the batch.
        num_classes: unused; kept for signature compatibility.

    Returns:
        (hair_mask, face_mask): tensors shaped like ``mask``, valued 0/1.
    """
    n, h, w = mask.shape

    assert (
        n >= num_images
    ), "Batch size %d should be greater or equal than number of images to save %d." % (
        n,
        num_images,
    )

    ones = torch.ones_like(mask)
    zeros = torch.zeros_like(mask)
    hair_mask = torch.where(mask == 2, ones, zeros)
    face_mask = torch.where(mask == 13, ones, zeros)
    return hair_mask, face_mask


def overlay(frame, mask):
    """Alpha-blend the RGB mask onto the frame (opaque wherever non-black)."""
    mask_arr = np.array(mask)
    frame_arr = np.array(frame)

    # Build an alpha channel: any non-black mask pixel becomes fully opaque.
    gray = cv2.cvtColor(mask_arr, cv2.COLOR_RGB2GRAY)
    _, alpha = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY)
    channels = list(cv2.split(mask_arr)) + [alpha]
    rgba = cv2.merge(channels, 4)

    # overlay mask on frame
    return cv2.addWeighted(frame_arr, 0.4, rgba, 0.1, 0)


def flip(x, dim):
    """Reverse tensor ``x`` along dimension ``dim`` via advanced indexing."""
    reversed_idx = torch.arange(
        x.size(dim) - 1, -1, -1, dtype=torch.long, device=x.device
    )
    selectors = [slice(None)] * x.dim()
    selectors[dim] = reversed_idx
    return x[tuple(selectors)]
def decode_labels(mask, num_images=1, num_classes=20):
    """Decode batch of segmentation masks into RGB images.

    Args:
        mask: result of inference after taking argmax, shape (n, h, w).
        num_images: number of images to decode from the batch.
        num_classes: number of classes to predict (including background).

    Returns:
        A batch with num_images RGB images of the same size as the input.
    """
    n, h, w = mask.shape
    assert (
        n >= num_images
    ), "Batch size %d should be greater or equal than number of images to save %d." % (
        n,
        num_images,
    )
    outputs = np.zeros((num_images, h, w, 3), dtype=np.uint8)
    for i in range(num_images):
        # Paint pixel-by-pixel from the module-level palette; classes outside
        # [0, num_classes) are left black.
        img = Image.new("RGB", (len(mask[i, 0]), len(mask[i])))
        pixels = img.load()
        for j_, j in enumerate(mask[i, :, :]):
            for k_, k in enumerate(j):
                if k < num_classes:
                    pixels[k_, j_] = label_colours[k]
        outputs[i] = np.array(img)
    return outputs


def read_img(img_path):
    # Load an image from disk and force a 3-channel RGB representation.
    _img = Image.open(img_path).convert("RGB")  # return is RGB pic
    return _img


def img_transform(img, transform=None):
    # Wrap the image in the {'image', 'label'} sample dict expected by the
    # transform pipeline; the dummy label 0 is carried along unchanged.
    sample = {"image": img, "label": 0}

    sample = transform(sample)
    return sample


def inference(net, img=None, device=None):
    """Run multi-scale, flip-averaged Graphonomy parsing on one image.

    :param net: parsing network; called as
        ``net.forward(inputs, adj1, adj3, adj2)`` with the three graph
        adjacency tensors built below.
    :param img: torch tensor batch, clamped to [-1, 1] and rescaled to
        [0, 255] below — presumably shape (1, 3, H, W); TODO confirm with callers.
    :param device: torch device the inputs/adjacencies are moved to
        (None leaves them on CPU).
    :return: (summed multi-scale logits, binary hair mask, binary face mask).
    """
    # adj: pre-build the three adjacency tensors consumed by the network.
    # NOTE(review): the adjN_/adjN_test numbering is intentionally crossed
    # (adj1_ feeds adj3_test, adj3_ feeds adj1_test) — kept exactly as-is.
    adj2_ = torch.from_numpy(graph.cihp2pascal_nlp_adj).float()
    adj2_test = (
        adj2_.unsqueeze(0).unsqueeze(0).expand(1, 1, 7, 20).to(device).transpose(2, 3)
    )

    adj1_ = Variable(torch.from_numpy(graph.preprocess_adj(graph.pascal_graph)).float())
    adj3_test = adj1_.unsqueeze(0).unsqueeze(0).expand(1, 1, 7, 7).to(device)

    cihp_adj = graph.preprocess_adj(graph.cihp_graph)
    adj3_ = Variable(torch.from_numpy(cihp_adj).float())
    adj1_test = adj3_.unsqueeze(0).unsqueeze(0).expand(1, 1, 20, 20).to(device)

    # multi-scale: logits from each scale (plus its horizontal flip) are
    # upsampled back to the base resolution and summed.
    scale_list = [1, 0.5, 0.75, 1.25, 1.5, 1.75]
    #scale_list = [1, 0.5, 0.75, 1.25]
    # NOTE: this part of the code assumes img is PIL image in RGB color space
    # We provide torch tensor in range [-1, 1]
    # Bring range to [0, 255]
    img = torch.clamp(img, -1, 1)
    img = (img + 1.0) / 2.0
    img *= 255

    testloader_list = []
    testloader_flip_list = []
    for pv in scale_list:
        # Per-scale pipeline: resize -> normalize to [-1, 1] -> RGB->BGR tensor.
        composed_transforms_ts = transforms.Compose(
            [
                tr.Scale_only_img(pv),
                tr.Normalize_xception_tf_only_img(),
                tr.ToTensor_only_img(),
            ]
        )

        # Same pipeline with a horizontal flip inserted for flip averaging.
        composed_transforms_ts_flip = transforms.Compose(
            [
                tr.Scale_only_img(pv),
                tr.HorizontalFlip_only_img(),
                tr.Normalize_xception_tf_only_img(),
                tr.ToTensor_only_img(),
            ]
        )
        # NOTE: img [1, 3, 256, 256], (min, max) = (0, 255)

        # print("original:", img.shape, img.min(), img.max())
        testloader_list.append(img_transform(img, composed_transforms_ts))
        # print(img_transform(img, composed_transforms_ts))
        testloader_flip_list.append(img_transform(img, composed_transforms_ts_flip))
    # print(testloader_list)
    start_time = timeit.default_timer()
    # One testing epoch
    # net.eval()
    # 1 0.5 0.75 1.25 1.5 1.75 ; flip:

    # NOTE: testloader_list[0]['image'].shape = 3, 420, 620

    for iii, sample_batched in enumerate(zip(testloader_list, testloader_flip_list)):
        inputs, labels = sample_batched[0]["image"], sample_batched[0]["label"]
        inputs_f, _ = sample_batched[1]["image"], sample_batched[1]["label"]
        inputs = inputs.unsqueeze(0)
        inputs_f = inputs_f.unsqueeze(0)
        # Batch the plain and flipped variants together in one forward pass.
        inputs = torch.cat((inputs, inputs_f), dim=0)
        if iii == 0:
            # Base (scale=1) resolution — everything is upsampled back to this.
            _, _, h, w = inputs.size()
        # assert inputs.size() == inputs_f.size()

        # Forward pass of the mini-batch

        # TODO: check requires grad functionality
        # inputs = Variable(inputs, requires_grad=False)

        with torch.no_grad():
            if device is not None:
                inputs = inputs.to(device)
            # outputs = net.forward(inputs)
            outputs = net.forward(
                inputs, adj1_test.to(device), adj3_test.to(device), adj2_test.to(device)
            )
            # Average the plain output with the un-flipped (channel-swapped,
            # then spatially reversed) output of the flipped input.
            outputs = (outputs[0] + flip(flip_cihp(outputs[1]), dim=-1)) / 2
            outputs = outputs.unsqueeze(0)
            if iii > 0:
                # F.upsample is the deprecated alias of F.interpolate.
                outputs = F.upsample(
                    outputs, size=(h, w), mode="bilinear", align_corners=True
                )
                outputs_final = outputs_final + outputs
            else:
                outputs_final = outputs.clone()
    ################ plot pic
    predictions = torch.max(outputs_final, 1)[1]
    # results = predictions.cpu().numpy()
    # vis_res = decode_labels(results)
    # parsing_im = Image.fromarray(vis_res[0])
    # return parsing_im

    hair_mask, face_mask = custom_decode_labels(predictions)

    return outputs_final, hair_mask, face_mask

    # parsing_im.save(output_path+'/{}.png'.format(output_name))
    # cv2.imwrite(output_path+'/{}_gray.png'.format(output_name), results[0, :, :])

    # end_time = timeit.default_timer()
    # print('time used for the multi-scale image inference' + ' is :' + str(end_time - start_time))
class HorizontalFlip_only_img(object):
    """Mirror the sample image left-right; the label passes through untouched."""

    def __call__(self, sample):
        flipped = torch.flip(sample["image"], [-1])
        return {"image": flipped, "label": sample["label"]}
networks\n", + "from flags import FLAGS" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def tf_heatmap_to_lms(heatmap):\n", + " hs = tf.argmax(tf.reduce_max(heatmap, 2), 1)\n", + " ws = tf.argmax(tf.reduce_max(heatmap, 1), 1)\n", + " lms = tf.transpose(tf.to_float(tf.stack([hs, ws])), perm=[1, 2, 0])\n", + " return lms" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /media/Projects/_Active/Unsupervised_UV/Face_Detection_Alignment-master/models.py:91: The name tf.GraphKeys is deprecated. Please use tf.compat.v1.GraphKeys instead.\n", + "\n", + "WARNING:tensorflow:From /home/baris/anaconda3/envs/ganfit_stylegan2/lib/python3.6/site-packages/tensorflow_core/contrib/layers/python/layers/layers.py:1057: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Please use `layer.__call__` method instead.\n", + "WARNING:tensorflow:From :4: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use `tf.cast` instead.\n", + "WARNING:tensorflow:From :11: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.\n", + "Instructions for updating:\n", + "Please use tf.global_variables instead.\n", + "INFO:tensorflow:Restoring parameters from ../models/alignment/3D84/model.ckpt-277538\n" + ] + } + ], + "source": [ + "model_path = '../models/alignment/3D84/model.ckpt-277538'\n", + "template = mio.import_pickle('./image/template.pkl.gz')\n", + "n_landmarks = 84\n", + "FLAGS.n_landmarks = 84\n", + "with tf.Graph().as_default() as g:\n", + " images_input = tf.placeholder(tf.float32, shape=(None, None, None, 3), name='input_images')\n", + " net_model = 
networks.DNFaceMultiView('')\n", + " with tf.variable_scope('net'):\n", + " lms_heatmap_prediction,states = net_model._build_network(images_input, datas=None, is_training=False, n_channels=n_landmarks)\n", + " pts_predictions = tf_heatmap_to_lms(lms_heatmap_prediction)\n", + " variables_to_restore = tf.all_variables()\n", + " saver = tf.train.Saver(variables_to_restore)\n", + "\n", + "config = tf.ConfigProto()\n", + "config.gpu_options.allow_growth = True\n", + "sess = tf.Session(config=config,graph=g)\n", + "\n", + "saver.restore(sess, model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQYAAAD8CAYAAACVSwr3AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOy9yY5lWXam963dnOa2Zubu5h59RDIzi2SBZJFCSQVBAKGBAM0EaFTSA9RID1BvUA+gkWqgsZ6AggYaSygWSqxMMTMjson0SO+tve1pdrM02MfMI5FkIQNgQEHAFxBA+LVrp917Nf//r2Wiqryzd/bO3tnXzfz/fQHv7J29s++evXMM7+ydvbPfsXeO4Z29s3f2O/bOMbyzd/bOfsfeOYZ39s7e2e/YO8fwzt7ZO/sd+9Ycg4j8tyLyuYj8QkT+9bd1nnf2zt7ZP7zJt6FjEBELfAH8N8Az4K+B/0FVf/IPfrJ39s7e2T+4fVsZw38O/EJVf6WqI/C/Af/dt3Sud/bO3tk/sLlv6bgfAL/52r+fAf/F3/fl9XqlTx4/BAT4OzIYVRC5/6cg6N33dPq16f9FhJTC9AODsRYRg9x/UblLklSno6giUk7z9rP7k6OqoKDy9loEpmtS5OvH1VzOLIacAjknrHVYYxAyiJBTQlXJQA6hXHPOxBgRETKCpggYVDMxKdaAs5asghi5v++siuZynSmlcq/GkHMmKxgjOGuw1qKaSUmn6ytPpG4bxr4v3xVT7ilnYs44ZwEhpYixlpzy9IxSeWZGSDFR1566rgjjCCJUdY3zFVVVEVPCiMEYIeevv6zyv/K11y3TO377kSIib1+/3r2Pu8PI11/9b73br1tKsXxTKC9RpvdcXtj9Wvi63Z1OVd+eRX77bNwth7cr5f78b5er3B/sbo2JfO3zu7u4O+G0tkRk+u/ud6fviPz2Nf2O3T2Jt7tEEGIM/PJXX12q6qO/5xd/y74tx/B3XfVvPXkR+VfAvwJ4/Pgh/8v//G+ATMoJYx2iCVUBTYiA8xWaM4oS44hzFSIGzYmYElYMAsQwsjvsSKI48TTNnOViDjmDKCEqMSUECCmRYmQcewRLSkrKSsqBnCDmhKiSNJMz04IqL8Y5S1s5nLFYCyGk6doG0IF4PFD5ito4DJmqslw9+zXGOs
LQkUPi1cU11kLqA4rh2etbUgo0ywXeGlIWUhhYnz4ga6JuGna3O263B0CJ2eBchfMgooSU6bqR+bxFqwYdE84bhuFIUmExmzFrHYfjEVVBpOXs/ARfz/nVr79i2VSIdSxWS778zVectnPWD864utmj4UA4jLTrJZvbC+qmwllH7QxNbZnNWgwB5zzRzagbz5//xT/nZjfw5L1zDoc9msF6i5iGuq2xRjAi5DDiqxoDGIGY8r3zc95N77c8d0MmadmERmzZJCKMKaEhkICUMiFlUIgxMfY7RMvvpaxEjahmNA6IseQ0kqfjGOMQLU415YyZ1pUxnkxGxCJWEFHQBGJLMFBLyuO0JkE1Ipjih8TcbwolY6wnZ7CuBC20nAcVMhYjgvcVVVXjDGhWfOWxxuC8n4Jd2fjWCimX+zKTw/MCUU25RoSogvHCX/7lf//0993A35ZjeAZ89LV/fwi8+PoXVPXfAv8W4J/88HuqKJpTiSAplGgYM8ZkNCsRQTSRVckpMYSeyjtSToSYCQpWMilGNA6EJGSjiDgkR8QUzxRC4tgP6PTgcoqMITIMPSmNxc/mjCJkDFEVKWESY+x9VHbWMHQObw3OCd56lEA83jDue8YEjx629IcdNg1so6FenLDf33L74hUBxcSOoQenBmlmfPjhA8R6bq+ucSmyevSIXXfg0aMlNzd7coKTh2fMTk8JGXabLcP+gEalXS6xYcCZjDhBJLJ+dMLN9QVVXRNjyUiOXcRVnqFLjN0O+/iE/rBl3jQM/Y66XbDd76mMJcfMmxdv6ENXnCEBtS3LuSco1JXQeouxlsP+SONh6A4Ye2DcOP7Pv/rfOT9/QOp+wKPHj0iSOez21O5I276H944YFec8OWesMYDinJDVIhqntwDWGIZxREVJ089iGkgIIWTGGO8dvqbIGBMpZ/oxoumAoMQQyApZSrZgjICGaQPdbQiDCOVYYog5Y4wjpIgxFmMEo3eR32DFkBDEGJypyw+sgFpiTECeEhUDYjCSySlhrUFzxlgh5kxCSSljRIpzkYymQDACJIaxAmPwVU3jLVlBrMEZizMl7hkLYMhGpgy5HBMx5CF9ow38bTmGvwZ+ICKfAc+Bfwn8j3/fl3POxKEnxpFxTAiJmKe0I0VyDiDCmHTy3pk0ZgwZsSVdHsaEcZa+H8hhIOeMGMVWDc5UaI6EpMRUtnyMAzkl8pQNeO8ZxxFna5w3aC6pslUlZyUmiAo5RUQ8KQeMFaxGGmdJXU87rxkPO0w8YJszdrdvCF3Hy19/xWHXIZXDMLB6cIKViG/mmKQkcaxWK3AZF5TF7BHDmIipp2oslzevOHn4HmN3pK0t+12HswGtE0+ePGF/6NCkVK4mRMU5gyZhf9jQNDVt3bDZ7sBknFhaC+tHa9Ynp7x88YKbqy22nuFcg8bM9c0V++NIaBJZI1VTsWgMY3TYFAg5EcbMdjfSzWp2Vze4GtoKmvmMtlI0K2drYf/6K/7jV1+yPm0xUvPpH/8JoV5jb7fIwjOGQGNr1Bi2YaCuPJWrUB1ImhkGLVE/JnKMQMkoYs7kqaQJISHWEMYRRRHJ5Fw2pWKofYkK1lo0R8aQS8kmYOUuLRfEmvtU1zuLorip/KpFS4Q2pdSzzpdg4R1uKtuyCsaUY6qC8xGNgoqWDMdYQAmiWCnHigreWLJO0T8pzijEjkEBLEpG9QDGIp2AsVPpI1SuwqDIdG7j6nJPYjGmZBbeezR/M5LhW3EMqhpF5H8C/g/AAv+rqv7tf+L77A97VBMpBMYEmiMgaAoIkZzf1ksqmZx0SvUyTgw5JYyDGCIWQTRDNuQQGFWJORJzLqUAuTh1MVTOYcQQk9I0M+qqQozQ1jWI0I0Bg5AUQkwcuwMhZLyhlB4h0Hd7jCj97ZblcsWsmaFJ+cmPfs7N7Y71ssJViq2Eul7R73u8tRzjyMnJEs093eGWOILmsjSzsYwRUjzSNHNe/erXdGFAqRn6EQz0PeTn16iAxd
JYxc/nMI4QBaktimBbYbWckXNCkxJCJIw3bDY3rNYn1O2C5y/fIAb6LjH2R3IWdoc963XDvG1AI5VVcswMxwCd4mY180VL3bZAxIaSqV1sRo7bN1xcOo7HjgerhrHbo0Ac9lCfcPrep/zhP/1DDt2ItEq9WONzZBgGVB0xRwRlGCMxJjSOjDEjCHkqC2Ia8VZKZE4GzbFEYhG8dyWqZsG5glOQCq7T1ALYqZwqGaEYS1LFCmXza0n/s2ZqbxhTKqtPHJW1OOdKbWwsYvUthkApNwGMZJIDwSAWci5Yls/lWIpSiaIxTdiEYlyccBeDaHFY1ma6IeDEoiSSJopPTBxTwIiUPaARwoimcs/O1xgEGwIq+Rvt4W8rY0BV/wr4q9/nuykl9ofdvWPIUICymBAUY94Ch8aUVC8LIFrKASLWW6wxKAZNJdoLSsygOZBTSR2tAysWayvEaAHGrEO1AHJVVeMrQ0rljL5qyDmj4iAnclYqGxm7A6k/EvoDSuD8bAGDEq4v+PXVluOYaRvQdcU49Bhf09Y1MSeaZs5s3gAdy1WDMzOePX2NrxvE9PQxMWtX5DTSD7A7HtnvB7oAajMmKIe+w4rFuLIBNB7ZRiXdHshAZS3GCbOmoesGZhW4tiIOigq0dcuw2xBnI8aOnK7WbI89IQdSyjhr8bZgAHEMNL44xiEklg/OsUQqJwxDz2q15riPzE7Pubm6wKKcref0x4HzszOQRDaOMEb2+0AdNxxffs7Pugs++N4fsx+PjCFhXE0KPUhxDCkkxhCIKRasJyWSFgC0pPW5lBwGrHcIdkKyMqqlNMmAcw4yOAspC6igGIx5i10bmQKPlHJhApRICmIsFaWktc7hncfacmxLwTNV4W7rvQVKuQe9VYpjKCXIlLkKpWw1CUEQUwoPyblgLTETNaMpM5u1aCzYhZMJQ5VASHnKaCMqCnnAGE/IQhw6nDNYSgn8TexbcwzfxHJOpLgnZ0gxoyKQY6mffLlEZz3WmQkgKgi75lzqMXJZODGXxEsKAJUnRxMV6tpTVXUBmgrMhbUGVHHGcodP51zS1zHGguCrMgyRkBLkEk3T0OF9hcbM40WLa1Zw2HD54g125lk/WNG/viRbRxgCxtYYEYy1zNuW2cxjRBlGw+7qhuubA1XbcOwPiPNst0d2h0uuton9MXCzPVDVDSKOpAcW7YykDhXDuq2xzjAGS+VrLm825DGDMZAhDAlyx3pxynDoyaPQnjaICfh2RrfviVpRN5ZZXbPdbtCoiFe8LZFPjENTxHmHayoqO9JUFcOxx2dDOB5LVpUj7WLBqS9YQds2eMn0IRD6kTAGuiGAZJoq091c8dVPf8RivebJpz/EiWG+XnE4HBEVchpJMZHzSFYpgJ13+KrBqJLFkxMYmzHWoilN5UCaKCRBvMEai3iDKDjrUKSwQ1ruMWsJEEqp740UEGEiCUha2CHVjLMV1nms8yVoiTDh0qScgQIC5imDEAqgbW358A4ArTRRPspIpmBZUvCErLYA7ikjGsvaTJngCnhZHJYgGrE2o1mIqaeAH6V8AshiKKxS/m365/ew74RjEAFnDGIgTXWRqgFjqbzHGDNRaaakXMZgNBGTkicqMcVMTFPJEVNBpnMkp0RSRUmEENGcyGoQhKRpSiNNAb6E+3MoQowZYxSjtjxYhdpWyMJDisQ0klLEdY43F9fUC4fmjlE9zcxxdZuwrsZXFmehnbXk0BHHAevmkCCoo20a1CibTUJjx+uLW246CFrq0M1xxHRhAruE4dizWjTMKmE8HhlCZH8cyBiWqyXN2pWI7+DQ9UQ1XN7cslguISYuX79htV5RNzPGEDAedptr1utzFrMZgw9kFazJQJpq/J5lK0gWfGq43XWk1NNUnsYYKquEYcPMNRgVSIFhM9AhDGEkp0xVVZws5wTNjGoYup64f81uv2e2OmN1olxdfMXpg4cY8Xjv6IZw/35qV+O9RcUS+x5NUqK7swA4XzaUiA
cMRsA6hzOCswLGYVwBCI0tayprwZxKyV4wACMTj1p4YGISco5ACURV5bGumhwJ6MSk3VGeMtGWhRa+K3/unI1MmYPe09cpZfw9bZpA7T0wHmNxssaVsgNswQs0E3NVwFHNJK3IOZKxhRXJiSxKyolMIsbvSCnxTewuhS+1YiTdASWa6dOIMQ4rmRgSKkLOefKMQs5peogZIZNjQtWS1WCdIeWCUGfAmYI7GBSVksoZYwsIlBTvLGBL6SKWyoK3MCaHdUJMA3WIVM7RHyPHeCT3iV1WcuoZQkXVnvL8q9fc7EY+++gJ+65jsfTYagbjiK8yLiubN9f0xnA4HGicZXvoeXmx5aYLoMIYikOztqKyBlVhVnvGvqcSjwxHtkfFqNCFCLY4g9vrDeIsi/mcEALewnI9JzlHFzJN65lXa7ohEGJH7Sti6BFr6IYDde0Z+w1IRRyF1hnIPUYTxwPk4Im+YzQ9TVWBjhy3A0OEuvJsjm/wTcU4RAKelBLWCr5p6WLP2NSsFjVNU/Hq5TUn6yXrheVnP/przj/4hAdnp4S+J2mkDwPtYk2MI0G14EU5ItZhfYXNIGaK8GLw1mCNgDGoGNwUTJwTKl8yrMwdKGcwGsiayFM9r1OK4EzRDcQUp02aiSmhYibGQEtwQkAzlTeMeURMPTkTg6GUDHfQpjOCYqdsYqK/8UDG2XIfmktWApAxpNBNdG3EmqIFIQsxl9In50xKeSqZEzFPmXMupbiKIcTCyA1j/4325HfCMajCoQuQ9d4bvhWvZBKBMQKiE51THrihpIZGDGIUtYUDts5PNdqIMzOGEIhZ8LYCsTgnGDv5aM2FUFLFeYdmQ+MLSZbSVM+FgcNhy2q54kf/4cfMZy3jMHLcHbEzw3y15JdfDhy6Dd5e8vDRGQ8rxziOSBppqiUpRIbjhjEZYtzj2hXbzYFnrzdEPMduYOgjTV3RjaX8qazgPFhxtLWlqiviomaIEWxF6yx9ijyqK653I2oEkw2NUY7dkeW8wVvHYdeV+5PMTU4Y46nbmvXSsNlvqMj42jN3M15fbBhGpVnMCGnD9fWIqxqquiYdNzgv5GCZibDZdcQ4cHbSUtc1s/mM9YMzZvOWeVtz3N0UILDKzL2QTcvxMFLPZvT9wPn7H1C1M8aQePL+KWHY8/r5Jc++/IKqqnny/keMOVEvljxYnzOOR5Kp8M7hXNlUla/JaCk1rUzZReHxUyoUuDMBgy2wAUW0hYaSaqvgJZNJGGNQLbRkLrAdGDBoCShGMAjOeCyAZJBIyIJRwWhxYN5AUFPWqpasC7HEnLDiQMPEUAAqGBMKlmEdogZVxWYwrkI1IepwNhMB4wz13c5QpaJkIpmihxBKOXwnFhtCKHjEYv6N9uR3wjGIMdRVXaL+BC5ZWyBeQTDWkDNkCihzV2o4a0hKoQ1xRSwkrmQEmvHeEWLCG4NNkaquERHqui0PPUFdG6zx9ONQziWCmIwxlsPhSB57vLfMlyueP/2y1JR9z/FwJIURjZ7nX72g6wc+/vAxmhJ9Utq64ez0FE1zdrc3NF4xTYVNyu7W8vRnL0ji6IfEze6IE8MQh0mnobSiLJdzDseedt4SQ+DYH6hmLVXl8N7THQaut0duZGTVeGaV4/yTE87OP6Q7bEhjz367pzOOi5sNMSSyCPO5JR47DkPgyWnDOEZu3lzRNj1nT95j8/yKOBQRVN8Fap3qdpSoFTkWBmg2UzIVs9mcJI59P1DFxG6/58F6Sep6fAXOLsgo1cJycj5HFca6IqshhYF6ueL581d89OGH3Fy9wjuHdY7D9paHqxUzb1FGfFMzr2elvhdH5QVrDClTFIE5F6pSLaKFEizkQnmnIRTdgIgBY5CsQCaqYsWhpML5q5ba35RjIiWrzZrBeBAtdOikTNQpS0hkrFFi1kmQNWUNUwLspGAViLtnLnKOxaFPAq6EvAVQKYK+rIGExYqCyaB2Asu5z3YtkCjZtHcFgwsJKucKUP8Ne6
K+G45BhKaqJsBGMKqocXhXWAaZHpoVKLs/FvTWuPKZFPVdHzLeOVJK9H1HDoXREKmxzpFypvIVIpbae2xrioMxsPItSQ3OQlQhjj3eWEKODCHy9IvPSRnaWcN4u6WxhsHCsQ9s+8QnHz3h9maLqRpcZTk5XdGPew7HPYtlTUqWsR95dbHj6asdOUd2XU9KMIwJZcRaQyXKbFETYyCFwKz1bLcHkiqzxRznDMcRbl5e0FQVFsfp3PLpe2es5p6hP3Lz+isurrZsdkeG45HdmFieLJitZhgM3Rg5dB3zpuHYWJq2Zn7qOB4O/ObpL6n9CftDz2o1p20DzjV4E+lDAbyMZkLMSK5xlWG763HO08dMrDwPHp7SdXserFyJvE44OV0yKnQjpHEgZS0CKwNhGDh9sCCkwO1upDI1vu5BE9evXrLb7nn00cc0izOMceSs5NThbEvOEes8iil6lyL/KzFVTKnZSzWPqwq+kLWIlzKT7FgVNE6Bx06qhrLZEZ1q9lgcBVqUuUz7Vw3ORLIoOU0KRylaCmHCPLRs8ZQFe9edpIWO905AzCTjtxjV4oI1g6apVBaQjDF+KrMnsFQzIeu93NyIoMbesyPOyr00On8XdAzf1AqgU03ZgUOMBZFSIkweuegPykMwlUMxhVYzghpPQqm9wxghpIx3hj7ZIv6QiaaSPDESRaTiTIUFjJRoaClS6GOXyGMHKbA/7Hn6i8+ZtQ3tzHP7+oKu22Mqy7HvickxJmUcAs1iga1qVnPH8bhjvqw47OEnP30NGK5uNjjv2O+PxEm8spo3NLbgIlXdsJ7XiGQ0QQiB693AfLGg6zuGYWC369kflNnc8GhV89lHj1lUQhTL8XDki5895ZghqHC6nlHZxOPZErdqcdaTw8CJNhz6Coyni5Hj5khdVZw9eFhUeX6OvdkxjHD24Iz5bMaL589w1kNOSOW43R5p25pqKH0cs0qYL1oqn9B4ZDlLHI4JkYqLq2s+/9Ur+sMOXEvSoj1YLVq8FTxg8m+YLxpsPcM6g69abjcHFnVDczLj1dNf0K7P+fizHzIaS93McJVh6AM69oj1hXYko3cbhaIOvFOrGgsx6tRPEwvADfdrzJkiUtI8KSjvG2jSJDISHFOpKwXcTDkTtZzZ3Kv+FSaNhN4Jj6ZMFCBmsFMGkrMr+gspojqV8jM1EPOdMKrwqiV5KdlMLlphVKRcziSR1gng1Kk8vm+xMH9fb8Xfbd8RxwBVVaMK3k/CESkYwN3DzhNrLPe3LVgzeXqJGFeXTEMTlROi1PiYCImJqik8MTlhraXyJSMpAE7R4B8OI95bnMl0/Y4Xz1/y+sULzk5nuBy5ff2M7WYEzYwx8ea6xyk8eniKWkO7mLFczWA4Ilb5xefP2B56EpDFMz9ZEmLi5KzFmsIY5BSpas+sqdEMcRjwlef6do+K0MxmHLuERsuxH4gqzGv4F3/2h5wtHdv9nhdvbnl9vWeMkWbesPYeX1kenp1itQcy/S4zHPdgLIexQ6RoENp2ifEFrD0cOyorbG6fY6slEgfquiXEvkTanFFVtvuErebsjwPLec1q3lLZTE5HsvMIgctrYRgPbHfXBC2ofEoJ0R3OFrr45nakqnyRvqeIfXNgUe1ZnC2pFjtCUl68jDyKI1W9IMobnv7iwOOPfojaE0KssL4ijSNeDNlk7iTIIoaYCnNVVyUxjyFMG97eMwq5qKBQMWgu2UPpbSj0dNa7I+aJ7ZDJMRRK8q0SIk9ZS1lu1hQxlTVFPAdaWLckRbOgI2KULI7KTara/BZXyzkVQZMUKtMYQ1awxmI1oabQrymXay9h8y0rIqJkvXMGd9f4+9t3wzEgOF9NgIlONBITn1weuStMOSEljC2oswWMcUWXngLOGoJSKMZU0GkoQqmQM9YZgmaGFKmzEmLEGymNT8YiRIb9EVLPf/h3/zcPzx9yftKwv7pgexzJKbA5HDl2cLU78E8/Pcc3NV4CxmSuX76i0jWZwO
4wcNONqGSurjs2t+Bby7bvyUlovOW9hyuatmI9n7M7Huj7PdlVdDGSXY2I5bAfWS4a6sWc77fCo8cnbLuRr778JT/pR9ZnJyxbw0cfnGIqj0HZ70ZuLw88O16QUPaHAeMcx66Il6qps9LUDuMHrt7cYlxLjglfKX/0/Y857HY8PD/n9vYWEcd8teTVqx3WUxxdPzJvClt0eX2Nrxu6IZFTT1UZDsdSO5MjMSu7/R6JGedrcg60dc1yXrFeNrQny9IfslqwjZFhiFz85pLjbss/+6PPuNz2DCmwnlV8/NFncNiSFuc8+fAPSBJZL08IYw+2YogRZxXRVHospjUkWae9YSDrpCo0GElolvvmLW/NVOuXjYeWUsCILY1KlqmkkIluvKM8zSSL1imTAJFE1oTmUkpYDKTCIGWqco1pJJpqKn+0ZMLquOvSVREMqQQv6wAliy0ObaKzCxUq93RpUinQ/HQD+ral8/e274RjQEpmoFPNB4qzpYHF2UyOMnW2CZU3hFz6VEICO3VgUmAaRAqj4UzhuCtvOPYRNz1AK0AqtVtly6YZx7E0pow9Y7fnR3/zNzw8f4gLR9785hW2belzoqmE683AsD3wn/35H2GdkmJHXRtyajh/v2Y2E16+6jhsBsYhc7uDMFYEPZBDxouhaSzzecPprGG2nBXkejeiUrM79BhrC87RDyzmNSdzz6OTFus8L168QhGqyvGDBy12PsNWglXL4Zj4yc9flg7DEElG6EJiXjekISJiaNuCy4gox+OR1y+PGO+RFOhDJGXPm6s9beM57m6wxtAdDqS6pWkLzz/2PTll/HwBGolqefr0NcZY6rpmtz9ijcN5oe9H0NL6XVU1Ve2wrkazchgT1796TgiBDx+f0VQdZhhp1is++ugJIZ3x5Ysraisslg37quX5y2ekdM4Jhq9+esXjT/4JOwxtO4M4lBQ9pakUpQiZMiXKa0KklKHcqQ4nYVHW0tgUc8JZwU6ZglDoxRATzlqiRmz2E8NRZMh3cV7IOCmZgkwgeZFB21IqpFhwD6SInqRIrFMqGhWZnBgIYmXK0iYGzghOuG8WlFw6PVXzhDFMTdZCYTbImCL6KY7qG27J74ZjmISjdzepKqScivfMlNZTnVK/FMha6jlnLDEXT+utpYta+iTEoQS8MWxjyUhiDGAgiqGqy++M40jlDN42HLfXbK8u+NlPfkZVV+jhhtc3O4a4pzGem+sdqg6y4Z//l3+Gk0wYImIb8hg4e+SIxvLTL54Ru44hjVxtM30/0g8RP3WCemNZzGecPzzlyYMVry5f0+1uwTZcb3ua1tEHRbNwejJj4SyPHpxwe+hwucd7j3OW1cqxOl1jNPPrFxt2h46ryyObYSib0FkkwbLxkCN1UxGHsQBWIZJSYgzC4ThiXSKLwYnF1YkXr69Rzbx3/oDHZzP6Y6BuTkrPAoFZ5YmhlF/HIdHHzGq1YBwD+67HWosRZbPrixrVWqwtoOEwRDSVSDfGiM+CZsfLix0Xtx2fvn+KpMDVxTVPPnyf+SfnbLc93dDhLl4xnJ+x22+oaliePOL6xZecf/wDrnc7Ts+fkHPG+bKGcoyIswUnmFSApf4vmSR3KTcZayDnkpUqBWsgp/v1aawtWaq4t6l6nuZu3KksS62K6DTPQiauYIrwYqZrEMVKnpSVMj2bdH99yqSRmPCFnO8U1oVaTaoTZV8CYUglUBYWZWJSipACpiPm+I9QEg1axDxGClqr5SUYipy05AAleRMDXgOZeoIQBe8NIcXy0qSkgMZWpJTwzhJSugeBnFDELCmSVJg1hjevXtPtd/zkb/6Gk/OHrBYLdteXDDFw6IVnF29wznFxeclnn3zAfrujroVaPMZ7DI7aj/ziFxuubjqqSnj5ckc/WkIItG1FGAO1N7R1zenZirOTOa8vXuD+YaoAACAASURBVLHd3oKx7HcDWDh2keWixkRYLxc8ftCyudlTu8xq1pC0wsnA2cMznj+74O
q6p8twcbvHAsu2QTRjFGwtHIYAQEwDfQpIKENh+iAYD/NFhd5JZo2SNIIavHEM24Gdt2Brxv6AUYjZYQ08OC/t2VmV/aG/X5TLtiFnGMPI6byaqlshpMQ4ZUzWGDRAu5gVB5Yjr15fozqw3R4RlPcfrznrjyzmC07WFVZmXF5tqPuBq2cvqVwmjQNVe8qzz/8j7ekDxFiMheVijvEeVUNlfaGVJ0qxdEkUNWVCULUIlhQjRoqUXKdygEnMZFIip1xAQIpoKt31QtxJHKXMRZhcBFamVF5KkDN3HZET3qWmiNZU05S1FGyjlCK5jAWY+IUi1y44hJtmkCSF2ltiBJPNPbAJYGyRTN8zEqqYSR36+9p3xDGUNCzn8jCLKq2AKUXUVhBcJ0W4YtSiMaO+eNUQyqCLpIoTKZpxEawrCjlyIsRCXWUtopQxCvO64vbymquXr/jlFz/h0ZPHrFdztleX7Ldb+kPi2Cmzpubl5SVnjx/gnWM2WxCGLdVJTYhCvVzz9JdfcnF1pHGZr15u6YZSG1beMvYd8/msTFIyUNfK9eYaTYF9D8MwEEKimrVlqItmPvlgRbtc8vTpMx49eoCXnpNVRSTh61N+9vlX7LoCnm2GMnugsYbagqsa+mPH4TCwWM5YzCtubvY4FbCWpEJVKb5ypc3du3seXUSmGROWQz/gjobGVTiUZrFiGI5kFYYxotmg2dI0DSEGqqrCu4LGW6noE8ydw3mHr2s2h0NhmABva7w3LBdzXl9d8dn3P+Ty4obhMDLGkWeXB7aHyL/404/Z32x4cP4ej88rXrx8g6TAahtJxw2n5w5vG2xomTcVWZXbzY6HZ6eEnIlhAAwpp6JoNHdpv05ofYnmRdJc2IiiCCoNeGbqtJyqkqlBKpcGLBFiLOWv4Q7DkIKPTYAkKhhRhEiezgkJpfTLTDVJadJTMw0oYpI8T92iFOzCGUfUPEnj7STgMhP7MfVtiJJSaRC7K3Hs1Nb9Tey74xjuBR5V4R5yLi/CwD1vaxSJrshBbeaOmjUTN2zR0lp9RxNNE3nu+i9EM8YLKY3MnGNze0u/73j+5Ze8/+EnWAJeI23l+PJmw3bXl4EvwbA6WTLuI48+WYIYrD0hxMDydIWzA2+2iTj2HEZFk4AKiYQzluViRs7KfDljPfelYzFlbrvIcT+izlOvZtisnK8XfPbZB3z14hXPnv+KkwdrGm/w1mJbS+giv/7qiovNSIwJYytmTlisZ0gsTUTbm1vcvOaD99ZA5jBGAoAziPWkOBBjZrfvqBuHy3JPx2GgEo8Rw2LV4H3NOI4onlnqsURUPf0QSJoZY2AcAjkLYy4tvwaQymCyYK0FjTjrWc4q8lhEUv1wZOgT+9sDRgVbZwTDe0/OGWJXlKXHI//+x7/i/OGa+PwZH3z4Pp/94Ie8efUbXl9c0bYNmZHzDz6i2295/usvePj+x1TOcHV9jRHLYr2mrmdlgM+U1hesrsR2zamwAZOqFi1MhGCwdlIRmiKPuBc8WSZmYdI9lNeNYaIUp+Po5CusLayaVSnTVDIl5Zd8n53kO3pzUlcaAUkGkdJiLtNMB5E74d9ErUppOrQIzpd5DpaJMbkDMPU/NQru77bviGOYJvYYi9WMEBDnJomnFK9oIEcpnndSrhnVggTn4pVzyhhbFn9hNTKVtxhpcJI49AOkgOTA7rDjzcs3bG/2nJ61nJ54nv7iOT97eYmQiG6BmVn66y37PfzFn3zMctmQjePh6QrNgfG458XTF2z3O754doUq1NaSjWU+c1zedDx+tGC9aFjMHG3TstnsePrimt2+w/mKZjZDSCwMfPLpI7pdx89/+SWrRcUP/uwz+mPP+qzizauRX//4gtvNFqqK5bwiRyl4CJkxKOqgIqLrhj5YXr/piCSWi4puVMY+kCTcU72ztjToqCpDgsW8Khy6RIz3tLXgXKKtPTHA/thRtzV1ZamyFAlvDISoWG+I40A3Rpzz3OwOrBYztrpjGD
rsbZERr9qa5XrJJ++X0YNN3TKEgeNxZGYr9ocdlfd0XcC7mhQzu/2Ro7Psfv2Mjz/oqcg8+OBDnr94Qfcq0afn7Lc7/qv/+i+5+vJHJLfmw+//IcZAtz/iqoa6doTQo+KmCJsnUVCJrlOYJqmZgL08ZZdFwZjF4KB05Bp3jysYU0oBo4DJGClCOmPcFLjitOktWQyaiqbBSCyTqABFpuEuijOF0Yh3US/boqswBitFWp1FIZd3WGZ+CiqWYSxiB53YktJ5rDgx/1jBxzsV2XSj4jCpIOCCTmhyQVqVaXIWBmsKFSQ66eAnRiPGSaBiQEhYIoeuR3IipUQcjtxc7ri+vOXh2Ypqlri8eMWrlxuOhw5b1VgZuL7YINnwJz98RNs6Xr98zQ//+Ps4k9jvj/RxYL+95eL2WBqyspJEaWpDGCOPHp5ytm6pveVkteby8oJjSByPgdViCQhNJcx8w2cfnHC1O1LVng9WFat1y+G4Y962PH9x4PJ24Gqz4eHZGu9KKt8sFnS7LdZ5Vqs5OQy8enMEY9nsDiznc1pvGcOI9baMa1PF6Yi1nnFMrNYzvHc0taOqPHXt8SoM/YiowTnPmAVsRiWRxgStL5OitMynqJ3l2Hd4U5yLACerM263O4aQqdsFlQgiCakrYk785uVLlm3DYnbEGsuj0zWnf/ARtze3dH0moPz8V8+4vbohbQYWleOgR/q+Y7Wa8+Q9w8cf/wFvXj/n1esNJ2cn3L65oF2v8aZne/ma2eoUYx3bzTWnqyUmZ1IMheKmRPZCK+Zpc8q9KlKJ98Nxi4S6NCNZoxNeIaRcujbLuDiw6sq4NgEmZOyuh8FMAqQsJTsDxdqiNbjDKYwoqSjfiiI0m3uq0RkhqQCWO4BDNJW1T0LFTgKojFg3lTIyKX+YmLvf374zjmFK7NCpxTrnNL0YnUqF8oAMkJLBmky8a4Sf2mbNNBiziF3v0F1lCI62nXHc90gc6DZbtpe3nCxnBDIXX/ySV68Hdl2PbxyzByd88bdPmc9qnnx4zpMPz/nql7/ig48/YOYd+82G7TAQhwPXN0fqquF06ejGjHHC9dWRmJUPP5hTWWE2aznsjxy7yOaQWM7neGdoq4p16/nTP/yYZ7d7iDc8fDxj0VqO+57TdYNx8PTzA2N/5OHJDPJITp7louHQj7i6xnpPv98RxU6FsGcxB+uUKALek7pQnifgK0/rHSeLJQ8fNqzWc/bbPZU4Yk50x0SkjCQ7HHaMQ5GXVcaTHcTKM5vX7LYbcGBC4mTRYF3FsetRlJQivq6pjFA5h0zTs7Iqs9mMWms0ZrbbA5JLTT6OI65uaG1iXrf88LMn/G0Y2O8P3A6Bh+s5zhq2x1AGuvY9j977iIvXT7m92fH85WveM4a6nTNePscYYbE6wVrPdrNFBGbzu7zfTmKmssmMKGOYOh6y4rwpTAIF3wpZ8UamknfqoBQmZsDe+YFJJenKupX7WeaThOJuXmRR9BqmbOSuVMgylQVMOgydhsCUOZFiKpxRsjXkaZYj0/lTLN3nxhiCKjoxd3li80S+2Vb/jjiGaZAFmZzLXL4yRSdjrRLy1MbK27HnJR28qw8LWCRT7aVaphHnGAnR0FSJEIq45Hi756vfXAGRR+snhNhxdRk5jJHZcoataq5f3/LgbMGDkyUPztY8/fI3fPjxJ6zXNWM3kmWk7/Zcvz7QLlt+/dWOhw8XVCgX13tU4OSk5WRe01RlyvH17kh2NcZ0GHHMasvaC3/xzz7ly6ev+dkvnvLnf/IpTeMI3Y75osGkyI9/1pHiyPlpW3hvVzOf1bx8eYFxdsqAE12I5NSTczWNuTMkEWJQbjcjdWOorVB7i/eeWeuZtQ4JiauLLarKvo+QlUPOiFOcMVhRfO3BesJYmoxkHMmxDBA5OTmh6zt2twdCN+CMIjkx5ETb1uz3PY7S3NO0hqauGMaxjCqzpd3ZWcvF9S2LxZKlKlGgUuXBwzP+1Hhuu47Lyxtu3l
zSzObMa8P/8+Of8sPvfYYPF5w//oj95pIhJH7++Rd87w++R9M0XH71BfaTT1G/wvmGEMdyL0Sqaj5N+XpLXRoDORWwQFEwhZok5zI0yNoJE7AlcNkyHs5QQL8MYDzC2/ZpmYCGNA1+1Vy+r1KmN8skWzYoSQTUTBPMigMqOcddm0DBPMqgmWnIC+V7d31FWQSTCnZSUpepDTv/o8wYpHRW2knkYQpmECKI9UWoIaWWUyk/KylgmcBkJiGJdUIYI8Ykxl4ZhkDbNtMDThy3O/bbjrpuOT0/YbGsePHzL3h1fcXZo4cY63j96oqkyofvPcA74erikgePHrA+mZP7Huvh5rJje9NDBfvbzPl75wxhx/WmJ2lmPq9ZzluWrWcYA7tDZNMFjJRx6LW1PD5Z8r3vnfN//bv/lzEpp6drZpXixUK75vbmijeXI69uBt5/MONsPWOMSuWFza6/F8nUtSdpph8HsmlIGomqpFgGzeyPgbZ1tM5iDaznc7xV5kvLEDNDzIyhZF8hKEYTSRI+GzQo2VRkW9ShY0hQKVXdwvT3KrrDDs2WqmqZWaEPHaKWdBxJlVCJxZCYtQ7nSwmSSVjnmM9nzOriOI/7A8MwkkJfOiA3G5arE07WK7xTXGyZ+QfELHRjRtPI5WbDB08eYvORh4/f4+kXn/P+x0/YXr7Af/R9Fidr9rdXLM9mjHHL+uycnHSi8EDiiHGenMv8UHv3N0hEp3HzRVcjdpIxT8NbRBzeG8aUy0bOATF2AicLKOgmMj0z/WmDO9BQExhXHA9KlsJE3E1YKoHtDnmIbyUYxk2ffb2tesI5sEAs6sc8ZdPIhI+kiYL9RzioBQCZsAORqUFEpvbRqU9iKhWERMoTvUbxvhNdTM4J7wzjMDLETFV5nJn+dsSxY3P1BmMazs5POTtdsHnzlKvLLQ/OzzAYXry8Qozy4YeP8Qir0zNER97/4L1pNqTy8uUVhz6TvEOCMoSRT95/yL//8SW1LwtmuZzhJv375jASkrJetMQIKSc+PV/y8MGKH/30SwKO07MF5+crZo0lhiPjcWR7gKtOOVm2rGpPTAajA8c+c32zZTWfYX2hOscsxOwZj5FkMmItYQgkNbSVwYnjwarBGljNFOMyx0Pm9jAym9fErAzDUBrZjIMhcXN9hGwQOeAbT9M2aFT2wwErmXlTM/YBFUvVeOoqsT8cWc1auj4xGqXv9yybmpyVm+s9xT2XGZ/OlylatS3zLz755AP6EKmbOTebLZURXHfEGaFuZnz2B9/n4nrHb778/6h7cx7bsjQ971nTHs8UcWK8U45V1VU9FLtVpCRIkEAIkCs5kitDAP+CaMviXxA9OgIkh5AsgYJARzIEogeJza6uqszKvGPMcaY97zXIWDtutkGQlWyRqN5ORCTu2XnvibPX+tb3ve/zvmFuJEV5xt31e4a25+LsmDWB07MjtvsDifRw/R4l47ja93XE7NkebaJtfegakjRa/Z3zE+YvuiLsOE4AGJBCf5Q5Sxk3qhAUzkV1ZBABJjyceCI4/RVAixIychsFaERsHgZgmjLEqUEkWCtPlFAT9QxKKp5k10/hOELKaaWIqkbx0bcRqzspHP00GfHhiV8CXpvv9Tj+1iwMSiucj6YTLRzj06oZJsGIFFgv4uRCTh1kFxV2QUxVE4Ku7whBMy8FRgQe77e4sef63VuK2Zy8zOmqnl/8v/+Ch7tHjEnZPjzglOGzH35GV1W4dqBYZlxcHpMoSdf2FKmMdGJ6juaa8X3N4vyEVKV88+aKcbQs5impsjw7ShFS47qeF5fn7JoKNzp8PfI7L1f88et7ru63rBY5L56fslrliGHPZluz7xRvr3ckWcZ6rkhNQlCC+7t7hjEivorcxN7IfU0/BA61nbgSitEHEh3DZTSBeZmRZYo883TNyKH2aB3QRULhA34IFEaRqRQvFXU/MHiFzhKEh7YfUMHTdi1FUXCcL8hVgpGBi5fndIPD+QE3WFKZ0H
QNqQY9Ewg5p25qurFnfbT4SPY2iaLIS5COLNf0/UjdNtjR0Tc9sywhTRUq0ey7htSNHA47yiLnb/+d3yNLCz68fY/vWmazObePB968u+I//Q9+RjHCYXPD44c7Vkdz3EHS9C1nZ5dsr94wXxzjiiNUqpFPzA8VfQ5i8tlopabPZHTwxlOFAaEjQyhEdSRTngPT5/PJ/Rh39hGUmfoUHiUUwcdJmw+x0alDZC2Ead7uCSDlx3Ad7wLB28ipDAGl5dR+jxOMOEqNFVnU+MQG5VPDUYrA6FwUlP1NrBiiZmRq9Ez+cvVXfJUIgbUDcYCso6pURJeZkk9yaonEMnQd66MV1nvuNzvwlu3dDa6rOX91QVN3XN3cUbcNymhGB/lyxfnpHN/tCMHwxe/9ECUtiRTUmwNJmdEOHX3bYYqM6uHA6njBMI40LiEEw+VFSZHA5XHJbLZkf7jH6wykjxJgp/jsk4JkuaD75TWr5YLlMuMHn6+5/3BPXzm2e8u7TcWszEmNxEiJNimHQ4XWGpMm1F0HPtBZT+sk1nmKwmD7nlFOO1uARAbWqwWptig9sDuAlgZPR1AZN9cN83lBOQv0NiCcom1H+nZks6sxcmqgSR+PJaMjFZI2gDcDRZZgg0MnkMoMq3u61qGcRrkeOyjatmaw0cr+8LjHucBykSOVYbQ9j5uKoe/xKBQiUqUWJVc395TljKF/4Ec/+Yy66RhHi69iMNBssWZ+fAwm4fbda5IkQc4W/PrNG378w8+ZLT7j3bdf8e4vX3P2yXNsO7B/vGW5vqQ6PDA3BpMeEZUHkYtolIo5FWLSvDwlFAUXw3B0MgmbxOTDEh9FRQGFD08S51h5aCUYA4RgiPRq8fHhlPI7NkPwEimnCkNOjdHJUCWkQAhDIE41gh8R2kTSmYjiq8CUaTHVz1G+MDEcpETL2I8I8m+g8jF8/Po0V5/EGCGWWt67SGX2ASUdNsTeQhAxJCT4gLMDVd1gTEZdVVGDHhzVoaKranxwtNWeDx8ecK7hfrtjlqS8v93xxedfIP3IOMCzFyf8/Oe/4Gd/+BN811LOc2RqOOwOjGPsXxhjkDrj8XbP/tAQBCwLQ54oyjJlu9tjB8f8+Ii2G1Be8uXLApKUf/p//wVZlnN2WvLpi1Pev74GAqYwPN5KTpcl3ltOVnOcc7RdixCBbnCYxFGWOZkxvP32PYOTHM3n7HcVeV6gU/BWUaaGokgpkw6TGjaVZBg8TgzkeUY3Bs5OV2SZxvuRXd3x8FgzujEKsdI4KvZTaa1EZGCMg8eFFu8t/dAy2EiOPj0+pmsa2q4jMSlOpCjTI6xEeokbA1ZaiszQdy1V3dD0Y+QUKMNyntMPln3dMNr4d9zXLWma8vP/5yuO1nPKeY4QmsddHIH+7u//AeujmIplpGC32fC4ueLbN+/4/PNXfP7lD/jqF1+x2/WcPFuzuX+gXBwThMB1B0aZYN1IkuYI+cRRjHRxpfVkfooNPa2eOvsx/EUSLdTBxh0+uNiIJEQMfDwORyYl2BgRJ6bkK6KKUciJ6aCY+haxUxDNgkyq3wkqIyeytdRIL1GMIOR0XyY7dpyeKGISVQCYgnCmU8f3un4rFoYoFNETqOKpDGJ6g6JWfLCQaokLfrLQTuaT4GM6lLcgFHmWge9xQ09bV9xfv8f2HVmR8eaX39B6R1VXHB+VXL3dcnF5ifQtg4PF2QW7Q8O/90e/T11tEUIwzxX7XY2znuOTJbfXb9Ha8P7mkduDZeh7tDKUKZysSroRdocDKs2RbYsI8MXzJaPv+dM/+xWjdfzoxYoXz5c8vL9mPku437X0yjPLPMt5jjZzpBsmt6OiaS1ZpknSBGUM1aHBmByTCHa7A0erAimh63qyLKHIFfPUEoTjcSeo20CZa7JUoEzKcZoxDB2ja3n95sDo4wMupEYDITiGCXWnhY4OSgeD8KRaISxkJiPgOex3NHVFqiGIFC0Ebd1FOOkYMyKlCgil6EfHvrV4D3
mSIlNBXTVUVcdyOadIDR7ohh7NkxFI8/iwY2halsdLlkdLpJC8/fWvWK3PUIRJvOTRasZuV3Fzdc3JxSWvvviU169fcyF7svmKm/fvefnpp7TNDq0TkuKCcexJk5TRjyRKf8zMDD7ghf8Oy0YUzEV5clQIKBV3/id+48dKY+odCBm9GNF38V34sQwSxBR9EELsvEwVSmTDRKm0VgEviayIKYz3qahw4Wno8DQWFRPIfPKERPHC5O/ge0uiv+c68m/niuipp85smN4+JlVXzKvUWuGCxHlJEPHNFt6Ct/hxpK4HlvM8lmmj47Dbc/vuLUNT47yjKCSHpkV4x+poSb8fOTlZE/oahGC5nKGM4vMvXsZzXZqSJRJvR4rJjSmFZV7O6JqBICSzPGNZzlgfzTg9WuClpDscKBcrnl+e0ncDx4nFCcdffnODQ7NaLDk5XdI9bpnNMu7uNqhE8PhQkUjohx7XdwQ027qlaVpWiwylIc0y2nrkUA1M/StWszQmYWvFYrlgkRvyTDAEz2YfopY/eGZFipAm4tcZUcqzuW/pvZti0SyLWcKsyGKzCon0sVnatCODi14MMY409UC1O1AdGsQY6NqWQ22pm467uzt6O2KHNkqQJVPJDIfG0w3xfN4Pjt2uQemUJE142FZc3e/ohxGsm6jOAW3ia/vestnsqbYPNE3Drm6oDjuStKDb1rx6foHSkQB9dfPI48Mjwjo+ffGKq7evCeOeYjnn/vGecnbM9uGO4KMlvG4OJNpEjNpkooqjzLgTi4kozjQFE0LzxB39CEgJfoKpTP0vpl/QtCj4EO3SwQ0IYT82H6NTM6LieOKdyknTI1Scyk36k6jhcVGMhUD48aNYakIy/JXG5NOU40kJ+TdwYYjqMvFRxjk1HVAiau21JMpUpzIrngujC23sO+zYk2cmZhpaR1Pt2Tzc0LcNJk2Zr0rqxz3zImewnm7fYpKcru9BCI7WJyyPV3zy4hgZHIKO4CzjYHFhpKtbVkc5Y+e4ua/Z1iNCKPq+ppjpmHuRptiux5RHnByXNFXNUaZYnV/wl19fUTUWqQ3Pnp9S3e/Jipzbmx2r+YyH+xYrSsrFPAbmOBdDbH08g2IMoNhsW3rnMKlhlhiKifw0K3OWs4KjMud4lSDw7CuH1ilKG45WKUF50kSjUNzf77m+axm1Yn28YLFKydOE4KHaVeADqYwThN6DMIYyUyQmUBYZaZoQtKHtBxwe76CuG/qujZWC7aNkXQi0dCQTzDfPNAlmShC3eDRNa+mGiFBXSnK/rbnbtWz2LWPfobSiXJX01tI1A3fvHxGDoyhSPlxfsbu/JpvP8AhSleCsIy8Mm+2epqnQ0jFYzduvXqNCS56mXL19w+rklHpzh+sblDB0bYMLfvIhfKdeDCGmqQMfHZYheEZrp17XE5Q11glKTOIkEReXIJ+QrvEzG4jvi/QC5R1M0XFxFD+lrAUIRCm0JG6OT0nWaoLbIgTKZCiZoFARPsM0vg9hWgzin43q4b+hJqoni2j8flpppSL4mFOp1DTCFD6SdCXc3u1YrpZkaYEfHbgBPw68/uVf4KxjNs8wYmB3+5q373eU6zWHTUueJrTdgdP1Mc8/+4QsjY0h2/QkqaYbLEYadJaxu7vn8uUFN7cPaAXdMHL72LM4VsxKjbEDz14+4+rdDZ98+pyQlLx5f08yOHSx5J/80z9htlzx4pOXvHx2xlhtqfdb3r8fmS1nfPX2PUkyZ1ZKfGexvWTfNhTljKVWID3fXNcYFXd6naRII1lniiLX9BaCE8yLAiEdu2ogoDlaGrIkZkZ4NFIGvHQMAwSdUfcjbdPihhoAPQFulInTnD4o1gtDPQ6oMZAoTTsyMSgESknSNCUImOUZpm+j0Uso0szErM/B0R4G0lQhVUfXO8rzOU1VUzcNiTFoLfE2dtSDDywyQ2ctt9uW603Ly+OReZlRzAqCDwx9x2a/5fr6LfNVyc3NwL7aUmQ5q/MzxsrQ7G
qUHtk+PuLGnhfPn6PShG++eo/mDT/8yQ95vH7N8bMf8Hj3niIvmR2f4f2TJkEjeQKgxKi7mMTtgJEgTdTUTjJjQTTFMTV+n6Zp1gWkAy2eaoOokxhtbBILBMKJaaoW0QOCJ7p0nF54+xS0G48pT8E3QYoYoxhcXJCk+2gLsC6OOqO+x38M1vk+12/FwhCl4k+Zw/HM5EMEdgJRORbHxVOqlGVsuxgIGwLe2njGblqa/Q7bD2gpUKHh/vqGzV3Lxfkpm31DkigUntOTNaYoqXYH9KrEiIBUjqoZuP2w58XzI+azkno/w7Y1iVFU1cCbbzacvHrB3e0NYpzzR797xHa75eL8hDRNuNnsSIPl5emC//NPf858teSTz15wejwn2APCdQyDpVzkuLElLQq8g1QJmmb46NjMkoymtxzqmiAEeZpQty2u70hSzXIeuQdlrmNup5Bsdg1Kx8SmWElZhFAYo9lVDaDY1xX7aqTrI1ewMILwFAojFckkMVXpiFGaXCYkhaTreso8JSkS8kxPajvD6CxGBIp0hvORG2nrBi1zUAahI74skQleOMZxoMwMeXHM2I9YO5CUBi91TB1XgUVI6AbL0I/sWougRwvBbDknSQRN3SGl5rCpGDMfeyCj5eL5Cx5rQVJqgnPUdcfoHWU5JyDI8hRnU968e8uLywv6vuHo7JzDwz19vaNcric0vopdfjk1CkXMGEEIBAkqhI8J1rFkj5/cifcUm482TDMCPwXdRAjME5n6uy0wVilPHEqeIK88IQhiSz5WM9G7wfR/epJc+2lE+XS8USJCdKJCMuA+Vg+/+fVbsTBEeIafzjWB4KNwJggIJDHWW8ZsRMnIOAScy0A4ijyLY9J7RwAAIABJREFU5/L2QOg9th/i3FY4bFuzexgROmG7O7A/9Dy/OKbpJ7dcknBxecbQdeAtwwT08H5Ea8mbN3esljMetg8Mg2UMguJ4wbff/JrBHfEf/qQkCEnTeU5PNQ/bPYfHR16dLZmtS8rlgtPzU1YlaHEguJoP9zW5kgxtjcWgnGOxLJFEAdUsmyOliZ4F63nYjywyg0dQpinWOZaZIU1L2iGSmUbbYIPHJAIRJG3TIhUkk0BsX3fsqgHXWR7qgBstInjKwuDsQK4VJkx5j0GgU4G0gqF1ONuxs548TVjNMqQSJFJFQI2wpCpltxkxmSJPIjbO+4FgJaMMzBY5jDG2bl7CYAeG0dH3Prpk0zQSj7oO6z1OCJTUJJlklue4MeDsSNO2CBxpmqO0YtjvwKQ4O3DYe5Yrwfb2A6eXr9jtHhB+QFYDbdtyc3vN0fKIeTnn/fu3tAfPujQMImVtXrJYP+f2/a8Y7Mjq5Dn4AWQSrf/B47wFoxAhkp69kNO/M44MxwkdJ6buYZhERc7H3s/oXJR/C4Fw4TuVo3iS7wusZepfOEB+TFwTxF+LkGpqinq0jmSzmF41PT/ySWTlAI8PI17E0aaUKvI3v8f1W7EwIOApxy+uqxKFx7oYsiFVbE5610e1nZyRZYFERx6/61tsJ1D0XF29p5gV2KHh8boDrcmUJClWBPZUzYhHkKWaZ8/WCD8ghOBw2NINIyrRnJwdgTAUecowDoyj5eax5rDvGazCizkvjzXz1Yrb22sSrWiGQLA968JQzAr+4peveX55RrmYkZmaoTpw+7ZhfbkkN5J68DSbHdLMEP1AsliSDgVZWdB0FhcCm11LkqYYERAKsjzDWsvF6RGYFNVXSB0DXROdMfQVzsafBYGmsyTa0LSew25AocjcCKnCuwGEwjjHLDNIrbBuIMkTqsOAsIGgHJnWZNKTlQWplgglKMqERGrudjXeQZA+6vFd5EUo4QheozJNqjVJakhUihBQ1Xv2rkEXEmUFbTvgnURqTeoc3npssIy1wBpFIQJJbgjO0w0j2qQxfawo6fs2uhmVZLfdMy8T0t2M+WLB0FV4rxAK8llK1++xAb780U/4+l/8Od98/Zbf+dtfcH93z8llyf
HZ52w3d6RFQzmbgZA4/wRWEdN0ZYKyCBHLfEKMiJvYH0+BNRHRA0LFY4LWJvIlkQhFHG9GN1VUWrpJEwHTAy2QwqCkm8Czcf/3NvY+xmBBaGTwSBGBLv0YJjt3jOILaKQNCPkdnu77XL8dCwPAdLayIRCRKwGlTFwBfcC5hq4bMcmK1IC3Fh8CTbWnaz2LUvH+m2u6qiJPlzT7PUJDW1Wsnp3h/MDx+pi2H5gv5hwfzTDC0g0jj7sa4R3WNgQXuLx4xtWbay4uVtw9VtzeHRito64HdqMgeMUf/P7nfHj3hrH35POEtqrxnecPf/YTvnpzz/Zxz+999jl2bGm2Dbv7hnyeEEZHbQODdzQHw/nZjKbv8V2LyQvGcWSwI11ncXbkZL0kdD1aS9JMcnzyAuFGTKohpPRDSxgczRgX1zTRWKVRAbZ1xX4fo++M1njnSGcZ1g8EqdFYjs9WuKFltB43ONq6RWcixqGNnsXaUOY5Q++YL3OKLOdhs6ceHSZJGdyIUoLMGPI0Zln2AxwfHWHHnjRZMFst0ElBsCMqUWR5zjgO1E1Lqg29tbTtAZ8oyiKl3tcMLuC1pBmjs1BOAh6Ze5apZ3CSpEgRwWFxZFlKOw7sd1cYHZOyV+tjvM/ZP+xIswSjA2PXcvHsBe/f/Jqx2yGSGfv7a4rVCeVyxcPdNXCJUC3lbMlg4yQBUpyz0TiFmHgM0Wjl3RMdKp7DrA+TroEJQBwl4IOLdOrwUeIcY/Gkn5YfIRAiAeIkxonJ+8CUl6GiABCnP/bjIAZCOz/EexqFcAoRLF7EJqYk9m++z/VbsjBMgoVpZQ7TqhvCJPbwAetyZrMyYsNkoLee+tDQNI7ctLz96mseriuSxHD1/pe8/uU9y5MVR+uSYl7w+LBB5pbziwXnZ0c0HdjgIclR7prj1ZK2zJkvcqqHLcfHc4Z25O52z7b2uKBJy5JX5YLLsxVv316Tz+Y432LyGTokfPmDc/7Zn/6Kx/2BFz/4Ebub94Qw0rYWlWrmi5xx9LixZ9vmXL5aYuhRRnOoIYSRq+st65Mls2XK5fkKoVIy4UjycmouBRgFN9stbrBYG2h7GG1FmWnGYaTaDygt6JqB0XpSrWM5rAJd0xECLIuE9fGSal8xDvB4aKmrmtPFnJ9+/ozN7SNKzcnmBfkyx6QJ223N5mZPYiSJ8NjWooOiXK1JkpS0TDFijfEKmRakZUoIHp0lYCVdUzMvLglSUzUVTbXD+gjVfXjY0A5tJD3PczAJXd0xSEtqUrpxwKiAGBxvN3uWZYZOYwaJ7wfGusYPc6rdnsOh5my9JlGKIi/oUwNKczgccMFztFzxgz/8GTfvvubk+SusHdg83HJy8YLTi5cInaBcz/buHXm5QCU5NkgUEfOmEBhl6IY2uoInrgJ/JXgmPE0ORLSbj1NIVhQkhal/IAhSROKXnJS+IQqoPDJ6VYgguCeQjJQmqjGRHwcNAoVUEV8nXPQT9RNwd2JH8/0E0b81C0OIllQguLjziYnLYK1DS0mRgwsSpQSHfY2RgrFrWC9S3n/7gYerLXpKbn68qhFacXZcMLrAzc0Vy+MTTs+PyYxGCI9JNASoN/dok9G0EYvmx5GuEwjl+PrrK5xKaIaB5WJJtW94tZ6jg2foK9IixUtJZiTPViU3Nx9oqoqLF88ps4RdNdDUFUmaUCQSqRWp0WzediwWkBiFwcTdEUdTDZyerwl+wA0OK4GxZXF+hjGx8bQ/7BmH+KtWyrDbHwCJTuO5f6ihyGNfAWVQoafrR8bRI7wlNZp5anh2sWa0A1cPD2itOSklf/SjH8YPxGi4fPUlMovJXkonHPY1YohS31JHipErEpIk5ejkjDzLPyLQCpmT5ApSydD2WKnQmSTPDfuHLaMfyHDIrIwYHdPhxhHdSvrRQFcTxp71es6bt7cE15KXBYdDSxj3pFnK46HjVM/orSPRmjTPqLYHFquCtoNdVcVeRaIQNqosuxDTvd69fs3Zi8
9o6or97XuWl58y1hvG4YyursiKGeiUJC2p9ztW62QyI8Vxug+B4CxKxSNC1CaJjwpEKQTOT5km0xMpZVRJjk58lPGLWCJHjgie4KccjAn+ghdP7sAIlXnSOQhJcD6qHYktiyADimgXsBNPQomYwgVMTcvf/PotWRiYzltPrPzp7GVjvt/t/T1nJ8dIAdW+Z1EYdlXLOFTs7u/ZPlYcasvZWUlT77CD45NPXnBoOj55eUJVdyyPlvT1gfn6mBCiEWu33SKtpUg1+0PHel1yf39Ay0DVRn359nFHWaRsNnu0SZGMvPv6W2YnJzhrmc9npEIxKMU3bz8wO1pSGs9QPTCfGcYafN+Rr5YIL/Gh4+TimMELVOi5eTjEqYKW5LMEiWMYI93aDpazyzOUVDRNw+g8bTswWEdTjXRNg8kCn3/5ktevP1DXHp1JtrsKbRKMD1y9r5BBMlumQEKZKz775AXXHz5wc3vHq5dnLJcLms0OPzjMLOfs/Axdlng/sn+o2d0/Mo6eMjUczeYoIfFBQarRieH8bB2nC0mKUhqsJUsN1o/IJJ1CWwPOS/xiTtMNZGWBHUacg7pTBD8glaftFI3wuL6hqw9cHi/48LhDDpZ5WbJrKkw/0I+C6/s9z8/W1HVFsZQgDId9RzEqHv1IMjWsT0/P2d5ekxQzFsslV7sNvq8YXcL+0DP4b7l49SN2mx3l8pg0S/AektkRKp3TtxVFWkS4i4hlvmeiRksZ6dJP4Edi09H7ybItJn+FiDH3WoanIKsJjBu5F1ooXPCTrNlFgCuK4PzHfAyFJriJYzbFLcb+A8gQFw45aYJkcPFIQ5gmK/8OFwYhxLfAgTjgtSGEnwkhjoH/CfgU+Bb4r0MIm3/ljcKEyggT+n2SpSoREGEgS6PUWAtLmmqqpkULj/Q9+21Fsz8wW87Y7fbsN1uOTs/QieZidUzTjZxfXsRor8wAkmH0IDxde6C5HzAmMJsVWBKq6gYjA4NK6PqRo5MZdw8HeitYzDTt4QBKEYDm0HJ5foobLb/65S+YL+boPMX7hkP1SOYDSIV0I0oX2K7FySRi7EPDbu8IKp5Jd1XN0WpOc6hIjSaflSxnM/Aj3jmGbqAdBrrBUdctzgrSRUmhO15//Y7RC7ycUoh0xt3jAeGhzFKOFjnVoebi+TPyMufN11+js4QvP7kkNTCfFayP52TzJSafsRKw2RzYbxrqpkPiWBUZy9UpJBlKCozRSC1RRqHxyESjdUAqDyJizdMisjDwAo9FWIeSgXmagBF0MuDGaNLqJ19Gnhp8KHDKQNuQl4aTceTDY8P6OCFLMg5Nx2oGvdU8HirmeU7XDizmKbtdR9P39HtLlsYpiQPm61Nu7zaYpMdkObvDnk++/CH/4s/+lHNtaDc3LNevUNrw8HDD+fkLurFD6hRF+ZEoFqcNDm00k3IZ6+LRV4kpmYqAinYTngKUIidlGkc+4emlQOIZfVwkovBXxmbsJHVmCtJ98g/F1kXEy4zBxe98iC5L4SdNU5jI1pNJYhp5fp/r/4+K4e+GEO7/ys9/H/g/Qgj/QAjx96ef/7t/5R2mmUwIEJzHKDXFhUFdOxKTkBnJ4dAyL1NUYthstvT1yGH7SNdbZikc2oqqtpw9MxOMN7A6WiGx1Psds/mKpneYRNG0DfSW4EYOfSCbB9rtPZkKmCKjO/SEYBmt4bD3zBcFF+dHXH/zFYuLU7q24+hoibQdWVHgRkvIFZIBhpYwWPoRskVJUaxoqg7nPJvDPS9ffsL2cYtFgYTtriMrC6p9Q17OWa9KFCNZnuD6gappSMqSfdsy9g6hNGkiyDPF7tFjUo2XcVLebBv6wVMmEbGfzWZ4N3B8smK5KvBjz6dffIYUjqN5hshyTtZrjPcM/UB9qPj2ehuDZWXCcpaSJ0uS+ZK0mE1HmrgzKZ0AkXsxjiMyOJRPUCow2I5xEEgl6JqevndoFSjnC8a+Z+yHmA9iHdK2aCU5yg1V61BZyugTgh
vxwNFqQdOO7Pd7Tk5PeLA9eZJgtKMZbaxGUKgsZRkCwzhAgM3jnlmqcUOF0inz+ZzD7p6kWCC0IzGSy08/Y3tzg3cPmOKI19/8ki++/F3GrsINoBcFItMcHm9ZrE5xbkTq9KOeIFYJHjGxIEN4ioqZ1MnwUbAkhJqYkCLmWkz+HhkiPeqpEvlIsRYxC+Upci7gPuLxhBTooAjBTbmbUWcR81MiJAYm96dz3/1lfsPr34Yk+r8A/tH0/T8C/st/3QtCAO9iA0eqGOYhCGw2NXmWRaGJc5RlRt+P3N1vcN092gRuru7Jy4Sr17+g2lXMl3Nm8zyeAWWgzBWphjSbR8WeSfBCY7uRetuAUuSzhN3mgK0bgvPYwXH3/o5iVvLhdoPSGefH0xk6LfCdQwbFy2fHKCXQiSAtSrwfSLUAKzlfrxAa5FRWe6HpnSdJMw6bLVUbqOuKcfAxVNb16CSJPMRUc1RmKOdj1FvW8/D4SNsFBmvJk4iib6uReVFijGdWFjzeVmx3j9iuQnnBYlaggcXxMT/7w5+SS4kQhjzLKEpFuTris5fPcEPP7m7D3bfv2d3sWc0SZnlEyB2fnnJyfkm5WEb5dZ5TZgVlnlEYKFKDEJIsyymSPDoEbYcMjlwl5DplPltyerzk5Pw8hteGgVQpDIpESUpjKIs5i3LJqpgxLwpyI1kslrRVR5onfPJ8zbzM2O+2XF5c0FQty+WMVWHou4Y0TxnqBpNl5EWJCpKxG+mGQHXoKMoZz1+9YrE4QqsEGTTV9oGj4yNGIUjnR9xcfUMuNYfHO+p9jSlyunqLFIpyMaPa3mNUMpmfgI8P75NoSU67fIygF5MC8gmWEqHGoFQMoVGTZFmGCe4u5USCUnHsKcPHkakUkzyaCUEXFF4JvHyK3IvgGzUFBseTQwxtjgE5/257DAH4JyISNf+HEMI/BM5DCFfxgQ9XQoizf9kLhRB/D/h7AGdn6+gdVxIpLMPo8A6WRws2uwOzNEMKz/1jTaIs3fYD3mX8xZ/9Mev1CZvrLbmZs3p5SZoqQt/z/IvPUSpQFIb37zYkZkSnJdbC/vGB3dUb9q1BJ5JnJ2uu3n1gEAaUZ2Y0L754yS9+dYX2M/7gd86xUnJ7fU+az8hmSz59dkKRKT40I9/+/CtMPuPsuKStKgiBXTPilWK2TOhHR9/W+KBIkpRDPRCEI0hBuSzIg+Hh7oHPPz/BhzFi2GVOVVdUdcf2IY5m56s5s8WKx4c7QtdSzAu6tmW/ddj+HiUcRs24fPUpeeqRUlFMsWqWkc9+8IpZUSCcZ//uA5urLTf7bnIEGtaXn5AmAhOKGNmlQOk4GsvyEp2kaCGRWk55CvHArYz8zh341HH3CseAUpOs3QeCV6Qrxdgvcc7hyiHCW6wlP1T040hZVLRNhcsyDoeKoVxw2DaoRPGDL8758M07moc7Vs9e8frDB2aF4uRoTnuoOV6lDENHJhWyEOQU3N5smBUFNx+umS8rnr18wW6zZ7d7xHZbBDGKr+8agsgY+gapznnz+ls+TwzGpFTbB2Sakc7mPG4eWa5PYrkvInU88kcn45/1k006ouglT8jCmHQN0dodQa9iimCcGpvE43M/WnSiES7ERiN8fH+lDHg7EJSecltHvErA2UnkFCXQUghGosZCCIkf7fd6sP+6C8N/FEL4MD38/7sQ4i9/0xdOi8g/BPjRDz8PgoC148f4b61BOEeR5ORlwvX1ljzTdFXN2FhOjjuU1Gwe7umaltNnl6SpoEgVZEdxNVYpD7cPODtQDw6pD/Qj2G6D8wWoaLga7QxHQjovEGGg2Y08bPcIp1kfZ5TrE978+huSJM6gi9ywyDV1EFy/e4dHkppA3+4JwlB3LWmiefXJc4ZxRAwek83p6wETAvXo6IeOTAuGpufb97ccH+fgepyPFt6hGzjsW4RKcXIA4ZAaDrsdtlfkM02727GrWrQyBB1zN05fPseImkLPqHYj+niBkI48dci+ZnO/od
53mDJnfjQDK/FesDpbMTYH/KhJj4poMfYOtEYnCiNA67goCKHRk01eKgVKw2ARJn4gcZ4g/SQE8tPCYwlhjA08bdHK4zoX49REwGiBtQGtDWVSYq3FZ5AZS9sL7NCTq5xXn33C+3c3zJclJ3bNu3fXpEnCydERaIcKlsQIeitRiUJ3I2WRkiQZfdPS7Dcs1idIrbj/8JocQ5bNkVLy+uuvkK5HmwTlA7/+5Z+zOn3J+vwZtu9wSuLxVLt7dDJDBD95EeKJIu7OAiGeQmoFYdITxPfruxyPECYeg9RIHYeSLsBgR7ROEO5Jhv2EnYPJIsQ4GQiFAqk1IgQsURr91KgMT1XN1OCMSdm/+fXXWhhCCB+mr7dCiH8M/B3gRghxOVULl8Dtb3CjKSAmhof2w0CZZ9R1TWIy6qZluchpm4r+cE+WG27efyCRgcdNxfxohZSe2WLO480952cpoxSIsWdfQzkrY/J1WjJ0W5rdyKEZCSJgjGFzd4930OwrZsfn3G/vqTqHViVf/vhLHq9vCMGTz5YsZhmr+Yx+aPnmzTV1O6LSAqMgT1Nev3ng9GLBw9UmJlWfXPDNN+8JBBarGZvNjq6tUSqKdx6vdpSF4eJszXZ3wJgCHwJ1d5h4DD1oydn5Mw67A/0Q0KlktAP7uiabFXHO343kWc4MB1Yydo71yYxyoVHC0HxoGKQnaM3x5Sl2HKPKVCccH5Xs7h+ZJUtmR0uEjvp9kyYEGWE4aVYQ9z8XnZAyEJSMuDEvIJk0BXi0VgihsNYxjh4lfVTuK2B08WfrSLRk6MY4ovM9WsQx9SgiTSlLChJdMU8TuhBomxavFBfPTjhUO56/eoHWmvdv30Z8PYI8mRGcQ2vomorzkyN+9et3vHpxyfpoRbPboJXAhASVHXHY3fNsNmdEsVgdk+iU/fUVn/zoB1y/e03+/DOaww1JPmPs4r+NIBiHPXaMoiYfnpLPJqmzFEBMoo79gBgW85Rf4f1Tz0GgpMTap75EfMi970HYuMhOVYWUceFx3k12rCiHHkNs9sbHKI5HpYqAWS2iilhMvYfvc/0bLwxCiBKQIYTD9P1/Dvz3wP8K/DfAP5i+/i//uns9NWr6YUSrhCLP8QEcGqGgO/Qsy4yH3SNtFVgsHNdXd/RtoFwuWa5PuTwredzeMQ4WnUikUozWo+TI2PVkRcn97RWJFmyrEU/Ajj1FkdGOAhF6lAi0+w0mNUgfSc/3Nzc8Xl3x7PMX3D9suVgnZKpj0wR2uz2ojHIRY+p2uypaj23BYBNMNqfrLYvFHIuiG2ravqMfHGkKj9ueVGcsz5dU+wPKW7r+gNAJVTfQDwPGaNYnR1TVgd4NLI+XtPUe23mWqxXBdwgROD07xVuPHkdMmlCWBiMcvu+QNtDalmI242g1i0j4rGSwnllpaA8j5yefYhKB0pO7L3ikVhFgkqRoDW50qOAILiFg4hnYOhKT4dwQLUTe4whIG6nFRimC7VAqlrQxvTngVIoKPSGNnoSimDGOlt6OaK0Re4cNjlmeYBSEw4gf4Pg053a7BxSJ9GSJpCyX3B8cp6ucJC8xErpqT2o0TdvSNjVGR2+M0gblPd4G5utjhPDc3z/ghCDJMvJ5ye7hket3bzg6ecZ+c0UxX2AJaC0YR8U4SkxqsG6cdvbYP/fq6cMcsXhP6kQfDN71E2My4Ca9kpSCITwh3SLNSggi21NqfHBTI1EhpzBbMUXYfZQnyCjAdi7+2aijeDJuiejGFAL/9ILf8PrrVAznwD+e5qMa+B9DCP+bEOKfAf+zEOK/Bd4A/9VvcjNrLXmWEFyAIDFaUqSaoR8wSlE3Fa49YLTj9vqKpu2wKJbzOeuToyiZPljm85Qky9jv94QgCfT03YAP0LYDt7ua0XuGoYdgmR8tuP1wS5kbFvOc7WHAJAn0cLbOqaoGn6VU+y1hdCzzhG3rubt/JE8zhBOslyV+dDw8VsyWGWOnSTILUtM7SXAjaWG4v9thx5jRuN
8PBB94/vwUb3uGAKmCdhwYqoagNPP5jK4dadsahyCVgv3jlrPzI9pmh3eO0WYIpQljT4YkKT3lPGPY75Da0DWOkBjWz5+zPj5iPLT0zYAwjqOzI8aN5Wg9Q5koOhYCsjSNu5UMyDGq/IKVJAakyBBCE4JFKIVUBUI4tNJxwhDifZTSWGs/YtBUiLDfQMSsa2FxLsa+BRlFO6OMMOBskSFtj9KS0XZ0SjEMls4PSJmwnuXsmpHN3S2nl6/Q2vD222u63PAQ9pwfr9BpSa40TdOQFxl/9s+/4u/+J8e4wfPQ1Jy//Bzb7mnbBhUcs/mKh8OeX/3qmh//+CdI4dnvHrDWYgyURfFx9IjwjGMfqyJvJzs0hNET1c1T4FGIo02lHN7byc4dcMFNvoa4SEJACIWgm3Z2H0t/qSbdApHePTUeNQIhewTRu/KUmi2URAuDUZPvSKno5XhasL7H9W+8MIQQfg389F/y3x+A/+z73s/oCCNNTAzzeLjfsJzlHA4HZllGvbmj3rcgKu4/3JLnM6q24Wi1IEslN28PGC3QRvFw/wAIZoVk2zRkxYy+H2jrlq5rCUJikoRU5aTZgqOyReeGoQ9kmeHqrub84hIrAg/Xt6xfvmB/cxcdnmQ8PHygrXpsUJyeH9O3B+p6z35TMV+e8fCw4dnlYoKhgnIdtrE87lqaasARONQdL19ecjhsKGcFbWNJi4TgBamBpCxpmpEkNYyjpTAZh3rg/HKNkAFBBMQYD0FrUuFJM02wivpui5TweGgwRYpKFOvVinZToUxKfjzD6JHqoeLy9DSe+2OtSpJkmMygorMN7wUmLZDeAhKVGFQIuGmKFHdFh/Dxgx5kBPcKHEaoyTof6cpxJyM2wqRCyhSUJThLWWaYQVC1Iz4EsvUxurJ4O3JvW7Ikodp31NXAYrlkEBWqB4VD6ITTZ6c0ux1pvqBxnoVW7CtPXuZsrvcEKfjzn3/FT3//93Ehwbqavqo5Wp+yvb9DEEizkuFQU+82VFXFfLWkXK5pqhqd7NF5gdIGrQXBBaQMZMZ813gMkT8ahUVi2vGfUtUmPuMU1uztk5fhyaAV3a1ikkMrZ8F+xza1o4zVgfC4SfikdES+2SA+ejOCGNAq0tO9JxrljPmbGTgjiCm9SZIQnCVJFFoq9vuGk/UxfTdy9/49Uo28/uotISig5Uc//pLgLTdv35GXBc4FVqsZo9PMS81XP/81ZVkiHGwfanaPN1ibkOYmlpIvz9k9xo5zbQVhtNR1izYlp/MZf/J//TF/9B//Lf78n39NcCP//s9+wtffvuVxU4NKOb08QYpoDLr5asvsaMXDY0ueavLZAjcGsAcOo2dzc+D2foNWUfb7/NU5AocbA9VDw9npjKttHXdeJL5vyVJD08VftCgNzz5bRqFTPWKdZFYUqJVhXzUMo6W6qVjOc7KyYHW8jgj4YaQ9dHTbHfPFnO5QsyhOSbVg/TJBydhJTxODEAGdpJG4reJDroJAYpFao7XBWxd3pyAwKsGFEfkk3hECGGMz1MeGmU5MXBQQOGux3iITQ7AjSDDCYAGhois0TWO8XdcPuHnC6JassxKdpRR5zsNug8stWpVUbh+nM0UKJsFaz9v3NyzmGcWLU8rVnHq7o5gXyK5juz/wx3/6J/zBT3/K9c2Wo7mgGQeSPOXu/paj5THbIDn0UQ3b7e/i8UdJ0iSLwiWT0RPdoIQxNmMnTkIg8knxkbsQpixWEYgVg4kIN+GJgcAT+s2OHomcLEMCFTwEEZ2WQUCInuNgPU5OYqlg6fo4zrR2QEs5pWfzURdqj/XqAAAgAElEQVTkPwqsBKMdv9cz+VuxMEzFFN75WIKOA5KASkuaw56mqkk03NzeRxS79iyO1iAsTbXFes9hf+Di8oy7mz3nz9bs9wcSI+g6ix1HrLMMg0Qi6IeRPEuYL+fcdzuqqmOxPmZT1yRJiTGKD9++58WPf8DDvqMdR/7W7/0QKRT1vmXs4dkPznBtTd
sdcDiyckbQBlzL4vgC6w1jt8NZz3azp21tdNLplFme4rqGqvfkieF4mbOtWoIQjOH/o+49eu3Ktiy9b9ntjrmGnmGeyXyVVSVVChAEqKX/rYbUqY4KlUJBVcqnZyOCEbTXHrvdcmrMTT71pBAEKPICEQ2CjOA95JprrjnH+EYSN+M4MOVA1VhWTYMhcjocmM4j1li6zlDKzP7hzOluEEFU67h8+hxtNFZBOIyYXGhdom1XeLvh8uUrpnjGbdaUOeB8i/YZq9SSbyD5BTHOoKykKC0hJznMoByf+4SS5WAU/TeDUFGeFKJM3I1IgVHpy1Tdamm0szJfVqFYQ46z+GW07Oqr2pOSpmo9lIS+vqTkxEVsGcaI23i0cqgE1y+es44ZnSOn44k8z4SosDqwWndoVTj6juP5TIiR/eGR9eqai8tr+ps7kSQbzzgODMPA/s2Z33x7Td1uiOMR470c7CVDcs4Jsxh7ylwWs590cBIzIfoBzZKYpuWGVynypSdQi1aBTCkFt3gixLptYAGvWKWWODy9DBIL2hq08ZScROdQqsVnlMlkkVOnQFqyJ0ALlfpnfP0iCgN8RmAbVJpIOdOfjqzWkd1h5rJVvDsN9PueUjJV3bFeOQ4PR1ZdxXQeuXp2xePdgaZpSSHz8OE9qVh8Y9jtd5Dh1I9sLhvSPKBdw/3HHxnOZ7rNdpH5KvIUWV1u+OHdR/7xH77ln/7TH9msOl5uO85T5uHQs91uhJZci9jl3dsdxnoswmpw3lM1LVP/wO3DQMyG02nHqms4nSKbtSVOI42zdLXjNEcOw0zdWBpnIcykmLnYdGQ8m85xOE7sTz11Y9HOC5Z+KBwOI76CtmlomxprLSZEhvOJrBTXL5+gh8LF6iV+ZYg50TQthEDbdGivIYJSBqMg5ITW4H1FJINdZg+lgBaoCkbENyIvFXUdiAitlCLQEl0oWYtlWmdKjiIuixmVCpVV8hd3oZgWVZbhnEUZz5wjOoggrehI7k90bU2eBkzxRGMZpxlS5mo+k4IM5v7+777lr//5L7z7eMevvr76kuPpzUzthVXw9qef+OYrw82nTFc3HHdHjKuZ48jzV1/xw/dv2D0OdK3g8F1TEWLAxolSLHPKOAohpEXGXMjMoCVlKufPsudCTEli8BKEVPAWQkxoZShK4ZzGKoWag8wMltWnlb0jKgeU0ZgkEN2iNNYgBqukcFoLyMVA1p8dmIpizRJaE0kF/L9IUEv5W6pwlr+LKFuxfzjQeMu7H/7I2B8Zx0DV1LQrR3860dRbhuNIu96QhoGSzcIaHIlR0W1X9EeZYPfDicp5Upyp246XL15xPJzpVh7yxP3NB3KybLdbpnni+puX/Pkvbzjc3/Hbf/cPTGHmPMP28prL60vSOOJ85nwOHM4j61WDQ2YXzjsBasxCJLq7v6Hr1tzcD9S1ZQ4RrQytltv49n7Het2RY2YOMyln1hdrlPFs25pxmjkcjzhv8N5jrOJ0V1Am0208zmq6pqaUwvwY8L5gXaFqG9LxyHbzFfXaknMkp4hWVjYN3ixdp1swZsJtUHoRK8GXvbhOcekcFhSclXSkojOqJNIi69XKys+JYRHbRARPJEKnkg3WFukCPqc+54SipeRECQEywuKoLJwHbOWooyf0E6v1CjVMxLqmpIjShk83t/zuV6/ZH3uU82xfPOV8vOU0zly0HmcyZtWALTzue3zVMMeZ958e+OrVFVWzou+PlNhz/eI1Vf2v+MMf/sCT0lJVPauLjjieiJXDNB1d3YhhyRrmGDBlSeuOhZy1gGEXeb9KiZKgZLUMi2XQXrIml8RkJJ6uJIHhFiXCKGvkc9QKrC6LelFT1dXiJxIvhMwtEsbJXEhpWYEaoxZdgyHl+V+m7VqKbuZzoOjxNKJLj6saVJ4I55H94z3OWrZXl7QuY+stGUdz3XK6u+fYBy6uN1RtJbkSqUAZUdowzSPjMLC+uCTME7WzjFOkH3qc9yLcyXq5HS7QKv
H0cs0//fv/wItnV7y8rOhD4nQ6c/lkKy107bm/v+X9hxuMqSnGkWKiWwnirO0cH2/OnMYjRmlOvRjyU06UXHDWUK0bfvpwx3rbcbntOOwHisqsNh1GO9rWEeLIdBwxytPWXgag84R2iawUThdqp1BZhn22trjOQYZ1dcXF5hrtZToNivVmu6D3kV/vLAZNGGdAZgiS0WgW5d2ScuScvI0XuS5oSgwUXdDGC/FY/qoKoFRBjgHtHQnQRS+Dt4zKipDAOUOKEa0NKUi2RaZg8JACShdaXzGNZ4zz1C3EGVxIaOd5urni4+5AGB/ZXW3ZNBXvPt3y9OUTyBOn3YHn69dM4UTb1cRU2G5WpDjSDwMvX33F4+7Ei9evGN8eGOcEqcc3Lbaq2e8PfP16y+3djufdFuc8aEVGimdBURknGwilMCmToyKVhCJSkqKtPXOQ8B6TZLDuKkcIGW8dBUWYJjIKkzNzihhtKSmTSoKcSbYQk5gMxx7K8tRgScMuWZKyrdYYLbJoZy3WGswCccn8Sxw+KkXOCkNmHCfOx0cqW6Pjnk9v/kJ/PGC15+rFM0roGc+F9YXj8mrDT9+/hWRoVg3TNLJZKd58/466tfz01/foasvN3SfGQRHTnqdPrvDNCmM1myfXfHzzV5JeoUrFb7/9mpt37/j45i3ZWy5WLf/Df/9v2J1H3v74hlwkaeriasOn23uGY09lN/LGTmdsvebqskKrzMPdHXM4UyhMEfp+ZHO5BQq1LtStZxhHvnr9lIebPYfSUzcOZyyr9YqcC/MwYOoKax3NSnGaC48PgqWPMVF7AaD4tqPEQLuu2NYd62rLxetrvKuYponae6q2QxvJYC5pkdwi1sNQEsoaecrpSrYRMWKsTLpNYdmhO8HGKdm7S5EopHmUWqGWW8xYSsgob8WzsiSKaTRFG4rV2CXo1WgBrWjn5NBlJ14DbQhhoLvcoJ2VNOzHPUZHaieUqqareWosHz9+4P7hwK+/eUFTPeXtuwcur9aEs+X3373j5dMLSj7TrGvG/sDlZsPusGN3r+mahjd//Y6nT0UN+fHDO1arK756/ZJhnJmZ6E9Hut0D1q/RdeHrb1+AazG2IpeCd/K5hGkiLIYlXWbhg0TZE07ThDWSG1KygjKjjCaGSJhncowUZK+YUpCuwMimSH5NXoaJRYKZECZDKZGMwmpFzEtRzoUyReI008dESrM8P37G1y+iMFA+p/gW4jyBVigNDzefmKfIlDLd5gKnMg+nExcXa/pzzxxn+sORqmlAeapqzWE/SL7hesOuN9R5x6pz9Asa/umr19x9fE/uNHEKUGoa72m31xzPJ46PN7z+3W/4P/7wF/7xH/8tc0ic50IpHl/XrFvDcNyTQ+TYB7TO+G4DaZD3stZYH3l4d8+cha0wTwnfNBQylda8fPWC/V62IZ8+7vGVJoQeZzsu1muMMYQ4U5Tl9HDgV6+fcft4YHc4UznLNM+oJJ3D5cUaYk/TVBilePHqNVSQYiK7vGxlMsYZcpZBlzb+C0NAWIPqiyCnSMwSRSmslqEgSfiGxlv5s1LSUZSUKHmZA7BEqoGIdZwjz8tQebEPE9NiNRZtbymAdmQjHaNAYZDfV05oJ6xHZz22Sth6zxgz2lpymGkdHPqZTbcmh8R+P4DJrFcNjw83XKw3HPYR6yqMjlg0T548Zb87YLQj50AsNauu4dyPlOnI8XCgKMv64jlK13z68MCL6w6KJc0Dtmk4Hg7YZmZz+VpmAFIVqaqWMvdQBMAaYsRaD6XQNC25ZNZVTSpAmknFUNcFXeKXYhtiwBjDHOTZNwVoV4GCYpojpSRZ0WrB0LOkhoGiXgyfKSbJ0iwFU4SE5v5FzhjEW4bWME0ztiTSdGLqj6Q0MY8T18/WPNzdUnmH9VKpHz/d4itPCgFvYDofcHWL9pbb2x7rFYfDGYPmatux3W4Iw5EQxNJ9f/tI1XWczhPbq8DuYc8UE7/9V99yPJ646gqPp5Ef3r
zDuRbvDCUpYlLc3xwxDnJSxKmn6xqMdwuwNhKCtG6pSHCLURlSxJqGMJ7RxnM89LSNtNqbrqPynroyDHNAkRmOJ5zS3D3uub3f01QVaYrEomlrxXrV4ivFXCxeV1z619Srmhgmai87b+e9hJ8UCVFNMeOMAuNFrUdCnrDi6ycL/vwzzF8m2kreyIIVks6jFLIMCJActSQDRFVI8yyCHadQWd7NJZclOGehGbGIe4qSKTuGQlq6lMKcFMY6QgkondFlpKo8OXcM5z268lSVp6kzJUzcHXo2lxvW3tFer5jGI3M4s95eUNeex4c9SkV8U+OcFV6ErQhRodYV/cONeCSMZx7OtE8KU4am65YYOgEP11VNDGeSKqI2VIKAzzEypwWipgXK6myGnEhF4LuSiSmrSGU8KgaMWmLxkGiEqnGkFHHaY6yhTlHgPUlRF43ViRBnclG0n+ttjMyLmIwiaV/WKHKKxJgoxlDZn3fUfxFJVNJoRowqNA60NvT7d6jYM80R6xy7+zu8hTnBPA5sVg5dDDkqtldXHA49vm2ZxyPrzYZ37z+irEHZilQq4iz5AI+Pe9r1CutWFN9SNRVX1yt2uzseP77j2fNn/PVP3/MP315ScmR32FHVLf15h9WKcRq4v90RY49zjrxMgU1KdE5WTqfDkRwD85SYxoGmqdDGoY3j6uklpzkzTZk5J7LKGOuYxsx2vWJKmao2pJSYZshG0U+Z1jdM5wBZ6Nht7Wm9HCzTG56unvHk2wtiCKxXMiBr6gZDIn3JI9ConBaQiDABtLFgKkqxMikvwg34rMpTVsJjrLUYA3rxCshgS/BlaPlxpZWg6VURGa6GrEWEo5SRIrQMMFNM5Aw5zSgkSNZ4J65NDM4KzcoYg6stba3pfItVhvWzZ6g5oig03jKMka6RjcnxfCLMgc12wzgVxjAzTQGU5zwlpilgVeLJVUeYC85aHm/v+fXf/47KNdRNR0wT+/0DtUq0Xcv+OKFIzP2R/eMtIcqQdO73oB0pDKQsOR0AJUlCdkxqARB9Dr3Nn6eS0qktLszPyHmJY1xkzsvw0GjRj1TOYY0mY6mso7Ie5xzGCNau6zratsXXLZvNhvV6Q7Pa0K5W1HWNq5ufdSZ/EYVB8vsUc5jppwi6EIcTOUaJAjOaFBNVc8Hm4inWrXj4dKSqN9iqpjKCuDofHun7kfN5wroakkIXqOsKV3uMbQSUYS0fP9wtN2jAGgXDiG8abFVx3h/ofOHQz8xT5HTYs7nYYE3iHIRkbYvH5ozXVvIhjaGqKuZxIofCOAVylJSm2nlWtaftai42NWEOHPoRZwrGKobjRFXVzDFT1w05Rg53PU1rqLzjtAt/k7YaRW2FAdC5DT52/P2vv+Hlb7+mblu6rsGbCmMqTCmgLHVTobQhpglnZNg4DRMkSfkSLmBBWSO0Ja3Q3osIx3i09+QFTUaWP6uUkjj7lCQxL/YhrK0oQYxC1rhFKCWUb50iGTEWZeR7NynKwI5CKZZU5DBppUHVEmcXAbdCY3C2Yp4nrLZohKC86TrmMYGueNyP8lRJirZypKDozz113TJOiXEaUVoxHc+8/3RDfzozzxM/vHnLSKTvz0xz4e5eoD26JNYXa+7vdxRVUcKEKoqmrrn79I4URvH1ZMlzKCUvGLsgBSDnJT2KJYauyPeY5dmVciCTSSVjTVngLZ+pT1Gw8YXFaSkhM4Ktn/FWUbT5YtnOpYj6VHAvMg9aCnNMP08T/YsoDGX59zzPbC62nG7fEYdASHIjNU7TtR37/Z5u1VL7ipAs/Tiw2Wx59+FhQbudgMSPf/kB8Is5JcuhGEdUCdRNS1UrtBXTyvH+QH8+cndzz/PnTwkYbm/vSMVzOPYcdyect8QYSMOR4bznfDhhK492Fe2q5WJl6TYbuRVjZBwnsSZbUa2FICaX603NME70Y8CYgjaO0z7inaZpLLHAOM6Ec8Y5iyJxniacFYFQ5SqurzdsVoYtK9bVltffPG
P98ikpBkxReN9IarWvUdpjXY1KEppidIVZb7BNS1M3FMDZgvOSop2zQrtOlJbWoZxH64IyQm36XABSlNs6l4Ky8lTRxaCUl0yEupaBcpHAVYPGmJqsLQaZJ3lXgbJk18htqmVwppDPbFpMQVpbKudxvsK6iqqSLk8rK3v8FMhTwhuLdg0lOT7cnSgUuu6C9XrL4+5EioFhgr6XUccwJ5wX9aHvLjkderaVGPi2myucVXy6u2dTO9EUaM/YH4hT4HDcsT8PXF29YB4HSipf8Ow5TeKAVIt4S+Xl9i/ENC2Rd8uchoJzFrLktBZt0Vpw71rJc6CQCEnWlTEljNForagbRyjydEnL/9sY6dq0tijjvng4rPM/N7rylzFjKMjOnwSnuzeM9zf0pxP15kLyHI5n1hdrfvfr3/H2hzeUnJn6gc3FJT+8+YkXL5/x6cMNcR7pXj5BVSu0c/RTolld0DQNrlo4/3PP+ejYPT4QZ8W3v3rOd7//K69+9TWnuXD74Q3ffPWSn+4nHu7OFOO52l6hc4+yijIeePH8mikUxnHERNg+vSbEE/0xLAnKiWlKrC6v2O12XD+9pNIFmw39sUc5w3icwSpc3fDtty9Q1nLaPXA+j9QK6sbwuBMGwEUr6PjOeprZsK5W/Pa//a+WWxPadoXzjpIKyliM9YSSsW0ncBAlRcgaL59BKWhf43ImTj14TcryNpauxMmgkiKI/Thh8BSzCJ2KtP7kIIImZb+YeAxQjF7UjJMkOpUZlSNuoQoZrclhQqOJeQZfyyaDBLEnKS9MRY2YiNIsK7pKY7VhU7XM00RbKpSreBI9/ftPrDvD7x8+8fT5a05DpjaRpsk87grvP33i8nLDNAYed4rtVUvpB358f8PXrwttZXnz5h1VvSKkkVAs42HHPI9cXr9kmiO11cSxx/sa/B3l8pI0w3S+wbcbjJPOzOZMWbJAVSlYAzFn+R6KrKy1kg1DzkVUiUUtyVbIQVfSJYuWQdSg1kDOAQiE2RBLwZkljKaULytp2VwgXg2rCCHQT8PPOpO/iI7hs13XWcP58YZpGNDOYkxhHHvarsV6R5hO5JionEGpwml/Tw5nbj58wtlCt9pg61ZWYxaG/sw8jwzTwNOXL6iaijFE2kZBTkIXCDNV63l4OHI47PG2cH3REFNhDKBcxeXFFuNa9vszRVeQIynL7KPpOnQpDKeelCdMVVE3FZWvOJ7OWKuI85mCx3jH+7dvGY4DRiuSyrx8usU4w7Ef6McJu0S/3+16YojUgHcKmxWddmy3FS9//Xd45/BVRdO1OK0k4s4K9MN4jc2BfIo4a3B1i69qAZQuAvQSIznJbaO1QRsr/+hl2IjQmEoSN2hebieKDB/LAilRy6BSGbM4AyXcpOSMMm4pSgvxydYopxeFJKR5xroG57xIhfMMpkXlRSylFYaA0nL7GQu1Fbbl5xvNKthcb7nYNuRpYlXVjOcDj7tHQsp4q3FI5OD+NInoK2ceHo4Y3y7GMS+iomaN1uKKHKZAs94yno6sVjXrzQVFGVKCECd2j4/M48g0B3TVst/do8qSpJULOUlKdUqRaRaFZsHIQdfS7pfP9mtlEWStdEnDPArk5kuPJhsfmcYVJNkiSVEofBmOGmvlzwFhaGhV2N3fcD4+8ri7/1ln8hdRGEoBZ5XcmKeDwDwpHB5vKLlQ14XKGR5v77i4WJOL5cW33+CcxVUNWRUe7x+JcSLPg6xqYsC7zDiOKMDaRH/ckzK8f/uBw33mxdMNYU6yzqkc4/GRuvJYVXi4u0E7Q1NVVHXDdrsmZk278thmjVOKNASunlxiK4dxhqZZ07Ye23achkBJPa13VNrhveLPf/or9fYCZwyhJJ4+u8B6y7kfGIaJaR6JceI8ReY+4ay0jWGKrF3hervl69/8GzZPLvAaKmexzglKrCw5hhHSnMmuwtYSOV9CQKUAqVCSrLpKCpAS1gkqHVhSlgpxiX5DSWoyxqOsQiVY/ICCD/
sidhJ7oVJ6KQpJ/P85y5ZjGcpl0QV/iVBT1Wo5bBlFkrkQGlNViwwbjKlQJYohq2rQrsL7Bu08ORZJXBof6NqK3cMjzhkOxz2N07x7/4Hj6cRXr16hJk2aJobpjDaGOSjWjcUYzYe7B47jQE6JFBOqGLrKsd+f2Tx9xk/ff0cIgfMwM4VEjIW2u2DsB7TNxBDpugtub97L560cEoWgKEow8CGFZWuQvnQEn59alEQik4oipERdtcRUsAvAddlZiGeChaiuhdCkS0HlSMmBmGasTpyGM3MUDklXN6gc2d3/X3nN//dfv4jCoIA0nnHxga4yVI1MWMPkWW8vuLvb47wnl4pxCkzTDGFk6iPn/ZnWWrbbS548f0UKka6umceRVbfCVzWlBPYPD9zvDhz6HmyNX1VUq47+dOR8GDjud3SrLa9ePGWKsidZX6x4+fo50zxKgInXzOPE2I9kPBdPrzHaEudADIY4JaqqQhuPUrDyFRaD9zV39zuK1mRliKlwsa5xJhNiYg6Fc39GYUhFcdhNOC+3rCnwdNXx7bNf8c0//AP1qoJS8M0KqxRWOxEW6Yx1mrAcaoUmLedWL6vIYgqKmTKcifsDKWbQEsCTs+zIlTIY7RYxlFkcg3J762UuUkomJbnxUsqEGJYQ1iSZBsZj7bKaNHYhG7MYhJApvJEUplTkwOSsRUJOxFiNsU44A2EWN60WZLrMGhztpqHeVHIoioeoSPPA+XzCKsO+Fwfsh5sH+jlyddWRAgxDIBbZdmgdsMrw3Y+3OF1wWnIgYwz0Q0CXwNu3H/CN6DS69ZrD8QxhJs8jx9v3qJxom4aqclxeXBLiQEkzxhTIk9C0jWwWykJV0vpvkXE5y+ZEF4spGUdGE2U+tqRdlxKlg1Of1/r6y0VQShH0m5YNxfk00fqGeO5JOfH9H/+Z/+0//kcOH378WWfyF1IYEv1xh0oDx90ZYxy3n3ZcPnnO48M91jhyWJ4QKK6uWx5ujpymRLNecXt7ImX4+O4tp9MJ7RoyojevqppuvaE/Hagrw7qrOJ0nmq4iK8Oxn1hfbVDao1E0bcVhOKNzwVeO7dNL7u7uUGS6rqYoh69bCpIH0U8jrmvRxqCrijhF4jDIX4aqQVuP8Z6H+wfqpiKHEe8N1+sW52u0VrLeTAVS5DxEysI7sFqxaipeP/+ai9cvZYiVIqVEAZws70lrRByTA9RdjasqlLVYKxuDMAeMsaQxkKJFtSvMegNWhGRKaYyR6Hi1qBTJiRhnSpyl7U2LU6+IeEYZBVbw97ZyLKFsYBxoS0mBEiI5zKiS0EqsxSkl6VZKWVyDipwmiBMpjhhjxH0ZZbKOqdHKYRdhFSiUtbSNp61rmvaS2jfYqnDqA85p2QKEQNtUDMESpgnnLRfrmpILD/uecewZQmG78nTWc7/vCblQdODJ9RpvPcYYNo0jTz0xjoQUMZXYyOc5kNNM17QYrZnCjKlqxmFGl0zImoQFxEtSUAJeUWKwAtF8aKWxWngJSmkKhhI1zLL+NAoZRBeFztI1zPPnZPi8SLNFLZbmhPOe4/GA0vDTd3/gvHvgqtMQ+p91Jn8RhSHHSOMKcRrFDTZPFCUJ1AAXmzUPd3uqtmJzsUZTOJ7OHO5vGE5ntpctl0+vadcbyWtUDmsMtzdnUoSxn9iuLpiHwOEws6oMF5cXpJRwRgQ9xlW0bcM0jwyHgX4eUAqODx847O+5e/+O99/9GePsEvk2s1pVTOPIX//5e4xraBqPNpp+jhSCrDOdYXc4cvXkQuSyKfBk2+Cc7LV3t/eM44RdXHlhEMty5Q1VbdnUVzx59YJmJfHvRhWqZSftvMOoQs5K+AjOQIzSxqdIjhM5BdHLpySFokTiNKG8bG1iCPJyzWHZkomSEQXGagG9aovKCeZRLNWy3ZQnRSnkIKlMxntijpQ0L1QhEfgQIyUCZck/yGIpVqbIz80JncOCUy/keSamCaU0cylEFSlKS5ey7PiN0bSbNZVT6E
pROU8eFdtmRZkTpnjmccL7mpDBEVl1HoXGlIjRmr6fGFPiYrvhp59uAM1pPzDGmXYt9vFSMuchcNVZUsx03YbdMDJmee+nMH6ZFzjr2W43pMVUxWeSNkVQeboARhyqy3YmFfGPsMjDC5KQnXURUVUpTPOXtx4oGd6KKVUvmZcFbw3nYeB8PtK1Nf/07/8nzjc3xHHHcXfHx7dvftaZ/EVsJbRW3L77jvF0YMaT+562rbm/e+Dy6oqMYnO1xRjP8TiwO0ygjjx59oppHvjqm+d8eH+DcZrDw4niWqztaLaFtqnxFTzueh7PEVfXzHkmzQf2+wmlPIfDiRfPn7CqMm9/fE9/Tvzu3/1bTBp598M7bAnsjhNaP+H6yUtyCEzbDa51/PjDT5hKALG+qVDxEbJi6jN+pennnuPxLMNCrVCqwtmRm7sz5z5ijWa76kho9o+B1kumxMsnl1w9e8Jmu0WbTIkzOENdSSfgikJnha4rnJgbmOczKbc4HUGlL5Jj4wsxZbRVeIlOJg57ivLYqqGkIL6HnMgqkmaFtaJb0AVSClhryDSoJDv5HANGaUgZFT4ToYMYiRaEvHEC3hHRVKGojMVRMNL5xCzSZKVISoA9KiWUNuQiyHNnNSUbYs5YLUSoXArGrekwpPUIFGJXuLoemHPg6eUFb+931MozDGd+6PccVprXTy/43a+e8+cfPzFOE66pWdWWkGZev3jCn797w9//6lcMxxmlAiF7nKjPFusAACAASURBVE+ch8Ddp1v86oqht4QYWQ9ncnbcf3jL5vopq8sXzPMsn1MI8pkZYBk4OpOZowwZtZbhuVIZZ6EUgzGZnORpoEiSS7ms2kuRglgW6Cwqy/wpJ6YghX9/6rEq8Zf//B843r3n+voZ3/3h9yRtuL0/MISfJ4n+ZXQMKeOdJ/YDBc04Z/a7PXXTgFLs9wfq2nM6nSQjUQ2kueGb3/yKVAxhjhwfd/SHE+dRMNwxjhRr2Fy23N3c00891hq8E3VeTjOlBM6nEykqWheZwkRJ0l5bpTieZ6w29MdALhVJG1IxjMOAUYYwBaz2rNZryZU8H5mjIUY5hFXlCSEKQzKLpdxVnn4onE8igHFWk9H044Qpmq1zPLnYcnF9Rd3UaO0EJOI9tkSMaXBao7UX1eIXYY3InpVR0qrHSJwjJUTmsyQ9lRCIIVPmEZXFGh2HPWWOYncOGSaJtU9xlmGmkRUh2i6gEb0AWZaOQJnF/JSW8YGkLStYisLfthwpFtJyKGQXqYmfm+K8TOC1QXzKstIrxVKUQyuF95UoNwukOFGwWN/h1pfU9Zp1W7GpHM47nDGkEKStj7DvM7vDkXE6s141VL7i8XHP+dRTUsCbSN3UFAVTzByOA4qJMUClEsMQuL66JhnPs5ff8unTLblYxpg4TwNxHrFWmI5usU9Pk3RspWTmJDoNEMQdLFkUySx0K1GbasTdasySVLU8OyjgjHRLetmOpFhw2uCN4dwPvP3L75n7R9I88uf/9X9hvXI8Pp45nmZy/v8/iepnf+WcmPszwzmT54mqXdH3gYJhtWq5uHrC4+Mjw3Aml8Q8RGI2vPn+ey4uWw77M3XjMSqjrBB1YpjJ8Uie94Q0MwfhAzSVxVnDOAROx5Gv/u63rNuK7fqCccxIaphhGAdO5zP39/coK3bgFGe8Cpi6YbNxnA87ZjRN1xLDKHp1Z0gpEKaRKUVynKmNyLFRhnXjOO8SlIxdAg6HSWC1V13Fy+srXr56xcXlFav1hm5dYayTDURVYYhobUkKFkzoEkEmK0fR3mswBmcK2iqsMcThTJ5lRZuVpiytv6kkSEaCUqQQpDBBks1CXG4/iuzmSwYwuIXZUBaFnawnF5GSdkuba6BESoGMkZlEmhcFoFqeIp+lxAJQ/byWs6Z8MVblnMhaNiZFLwnTVgu4w1pQmWZlMSUyzwFnBJybUGhbUdc1xlbs95GQDbX36JwoITH0gcaJuKhuG+aS6DYXrFYdIW
bIoxQzrbi7/0SeDvjGL8YlOO0+UZKlhCC8CaVwlUdnjXMVKLE+awXG/a1I5iLGKxYOpjWLUc2aJU2KxX4NTpslhQpxx7Kk4i5MhlPf8+ZP/8zjhx853t+gtSWoirc/PdIP5QsP4ud8/SIKg0JyKU3Tokph/3CL0oX1pqMfB5rNGuIC1kyBu4cjhYkyR66eXtOPE21T8Xi3o+k2hOlE262ofcup1/hKLNpGWaaYcc7zmVb88acfqGs4j4F+yswl8OrlM8I84CuDb1oxYlnLat0wziOrJqFU4nw4crldY51hDjO+toQ4M83ikEvTIFaCLErBdefJKpOL3ARt7dHak5Pssp9uL3j6/CXb1Yqm8jTeooumchathNaTkWm+9VLASGJRJktaYUwyvMwhoZQImoo2aCd5nizveL1MvSmZFGeMkVaVLJFpWmtUnuU9XETvXxKSqqoUqagvs4lCQRkjsw1jSSFAlNwKbTWu9uLLUGBcBVoSKigyNLWfD4MS+a8CyBDHGcqE9U4QZ7L0XyLa1KIlKVSVp65bVpsLKufw3nKxXmG0ZZgTOUXGKTFry9u3H9l0FZ23VHVNnBKxWCxI8lUYGM4P+Npxue6w1DgrSWg5TNgcufn0js3lNY8PNzx99pLGC/NDOiaz4PcBbYkLNBcUIcmIVgaKIlW2y+eelnVvRoAsJatFfi6zDKUypSTJsEiJnANGF6YwUpRmeniHyQPTmPj+z2+Zk6HbXlNVNc41+J/pu/5FFIZCoZ8n6toKmGQY0drCAp54ct0yThPDMHDuZ5EXP3vKFGdIMzrNHPZHhqw5ng5UyhFDAjTZKFJInB5HvLOorPDWksJAZuR4GHDesTudmYYTqsDHT+84H+457ndY78hKM48D3hnG84kcA+f9I+TCZlXhLZJFYQDjRZKyRIsZIwlLjdc8v5A4+wwCFw2RcY6QCxfO8vqbr1lfbWi6RgwyStE2ragZjUUrjfMOaw0FS1aGYoQQXNIsJCELpEnksQoZQqYkRGdtsLaSz3bRIyiW+UCM5CjqRYW4LImKGIromlJBFbdEreVFeKMEXc9SiIrgzvWSIUEKpCiOWRnHKcmdmBeQznLj5TBhQKS8ehFHFYM1lpIdKmtBryu1EJAVRom02FiFsgbvtEBpTU3rK5p2hUPTeks/J1JJlBSJAzzeHbG1E7es1szTiFHgjSPOI6DYn2faVYNSiZjh4TAQw0zdeOqmpV6tmTMM/cynd3/k8eEWkgwJVcmEIkVaK1lNKrWkt5OYk4jLjCrEmLFaoXVeKFqZGPOXYquUkcbCSIchgjPQFMZpIiQ4PHzEKMUP33/guB9om4b5dODth08M48wUI/lnjhN/EYUhL/Sfw3CiHwKurmnbjqfPrqjqlsebW7rNhvE8MO0PPHv5nN3jgSfPr+kPPUp5illhvePx8RGtPZebGnLg+PBInCzWCQLNUHBWVoEpGOIcWa8983igqz3eG549eyZCkpJ586c/cTxPOOd58vQF2iisSgz7I+1mQyoRbSJhGgTvdTqgFTROYYzs4n3lWLcV4tUKNK0nqsJpStii2FrHi8stm+2Kyou82FiH8x12CTlVWguxmMU0oxK6iNU2xrz4DQqkWZKhiiaGQM6S1aDLEp5eMirHL8NGluGgXcRUMYhlOqcimH2rFj5CAaPI5C/qwJIjYZrkqaHUMhjM4rkwdnliiMQ3lyidQJwxi81YaZEKK1NJtzANqKREsGUdaIfVyGeMkrWlluGrNZqmdmjAq4itPM9ev6K1Bq09tTOsNxu8dwxjwGhHiJoTik+PjwJaXZ4IOWaqyjCe9rz/1DOcA7mfmMZRFIQ54FXCakd/2jMdD9R1LSi+FMnzxGZzIXi4EIRf6oWipClYZ764cAuIzkOLUlTKs8EqTYkKlbNsgT9/RkZjrWGaMylpnJbMFYBhmnnYP3L34Qf++L//XnQPNjHnSHVxxbNXX3GxXXF5eSHzip/x9YsoDPM04VUFueF8Pt
M2NS++/Yb9fsc8nnn/4y0//fmvgGVKMtRKc2bT1ZyP/bK312hV0agObWXfrnLEG4OyicfTmfX1NVXbMPQTu92JkiWQ9f72yOPNyP39wGq1wWIZRginidMkVuBqc0koUFUNqTiyX1MytI3ndOiZhok8nZmmAVLBORjHglaWummYc+L7t3vqYhj7CT0nnjdb/uHlS/67/+a/5l//67+nqS1dt2JzcUHTNQJH1QbnNN47KUqmQCpfZOSlFFRO2EUcZYxHVRVFJ0xVY5pqWWktZjWzWK1VXlSfyxMiiWtSlULJk5i8KoVymkKkGL4QnMtizy4FwYmVReKb44I+k+DVaAzKQFW3GCddStF+Ac/K7t05v5DiPMp4zFKgjFao2pK8xxjH5x6lKIc2LVq3xAmc6bC+IU0zm03Fb373G1ZNTbNquFq1mKLxy+3atZZVozDe8t2He8J0xCA5EMfdga+fPaOpW+YY6NqGDx8e0Ara1ZZhCHy8+cjYj/iSePh0x/WLV/T9Eb99yU9/+k/M85lzf6SkGaekA3JVzTichWOqljTszwKvpQNKOYmPwhaKsaA01hSs1mgiIciI1phCVoUpBE7DmdNPf+G//M//I4/vf6RpazSWw/1APxZiCHz/l7+yPx55vL1FpfFnnclfRGEAUYP1x0e6rgXtaSpFSplxyrQXHVdPnkNMbDcdYUrEOBFiZAyFfjhSikx/607jnCGEGVTk8fEB16xo1xuU1Uz9kXmcsRrmMHO5fcIYLP08sLnc0m46+vOOmGeOcyTmgvKOOJ85Hx5QqhBLJMeReZyIIXE+zYIDz0mQaFqwaSkkYpFb4HgeSRGM88QJLuuGi3XDy69e03WVZCxUNb4SEZdGyWFSCx04lwW0ar6w2o01i37AolTEYkA7dFm4CVmERGVxQ2oy+UsrqkXkVaSIls/DL60pSZHGRJkTaQpQRB9RQqSEJOu4EFBZugMQCpQ2FqXl96SMRse0DNtEiamN3PQs0BC9OAxR0n1gFCVLanNKEZIwOowuaCUcSdSXX4j2ldiWM3ir0AbWneXqyYbKypCy8o62q1G5cJ6TRCEOMx5N0zpClP/+mDNjnGgcnKeRUxj5+uVzDg8PAn7VmpQMYz8wjj0pJYG4hEQZj+iiIM+i68iKMA+LxiRgKJSSCDGi1DK4zWlRLMrgUSzV8gRLCxNTkVELbdoaeVanIAni+4/veP/+O15+9ZyfvvtJpP4f7yhGoL/740zTNBwejlS157Ow+v/p1y+iMGitOR4PlBxwVcOrb14T5pnpNLDZbthsNqgyM54fOe53xJhp64rdYRBOQlMzjAPDfMZ7hzUJVRJtbVGuJitFd7Xi408/oU1E6YBxCoqiaDidDzSrDd26ZRyPxBIYx5FhjljviXEU8dA84BvL4XTg7n6H94b9/sRxv8dYOdBGWbzXeF+jjeJq02CL4N2UNgTgsvU8vb7i61//mtWqxjpLXXmcFXmzdhatsuQQfg590fJ2LTlJQGr5bPWV/EjlK0G6SyonOUyUEtElyo8t+vyFSi43tjJUbYOtBHKqjf7SXWD0MkRTEDNq4TBQ+BKvBmopNFo2DVk6BWuVINNLRueEQTImChGlEiUWtBHGQF48G+LSXDYNqmCXrU2OSWAvpOW2FQdi0cI+sHUjP08bFJrKGprKsmpaaqtxzlFXjhAjMSTmqMgoDmeJmKtdoZ8UIUraeoyJYZRnBM5R9Ir7+wcSMJwPzKFw2O04HR+4v71HpcA4TuSiuPvwI856hqkX/oG8vkQwVsC7hayd5fuWoiypXDlFsWQrQcyDIhdDRqOWrq4ksVA/3t6we7jn9uaR7//4Z+YQGA4j11dXoBPKGKZJtm7ffPs1XVMvG7CfcSb/vzzg/2+/8qL3Hvsz7aqjqj1hGNFKM57vsS7zuNvLpqFpOT7esn3+glTgNPaEIlBMg15SlWUXfpwaVu0a7yzn/Q5yYjiLIAY0c4TLdUETaSq5WawqTGMUfkAQJ1
9XexnGl0zo9xhVUzCEXBjGgRhHUIUxSTs9zzPzfKZ2hsoWhuW/U0pmPo08f/aMr371LZebNdZZfN1Q+Vo8+krevBi3FAGDKoJUx2i8laFiKXKjaF1QFlm/zQFIaERqa4wnjhPqS1qyrCDTnMjzTBmnxSwlyklyEdaClbWaFAuD1YsEV1oNyU1QMkjUSxq26PoVWhuKgjgFtBbOZE5ZIAg5I3F48ueuliwJrWT2IZxCSSXLuhbBk9KLPqMSYZdRGLMUdSSYWJ40FSVBTIlV1XKx6VitOrq6JaRE21aSfzkl5hAX2XdhHkdUTNgiPINpODMPA94Lvfnq6RXGNdRdJ6KrJOvXdVMxHg9QLHW3lUYuZ4rONL6hqWtRNCJIPb3oMxSLtHwZrudiSCF/IWKVHAVylzIsa0mloa4MMQVSToRx4o//5Z9JsRADVK6mUJinkfOoOTyeKCHx/OkVD7sDxjrWm9XPOpO/iMIQQ2R/PFKUZrXqGPuJ/f5A1dVQNLdvPjEnDXpDwbJ+8gyjCzFHaWWnidq0rLsV3htSHDjNhmGYmeaJceoZzge2K8eyKyAkzZOVRSPPAOc0KU3EECmxcHNzx3FMxJTkNqdwOhxJofBw97ioCmGchL+njeyalXE0lQz7rJe27jwmVNGMw0xtHd9+8w1d24ol2jmclahyNMszAkzmiz4Aikz7M2DkfZ7mQQqgtuSomXFQVctTw1CKEIpNVX+ROn+2+GpjhSAUA3EKlBgx1qK1wrXCFEgsOYvGLP4HvbTwUjBkr1jQzkCSqL1CgaJQxaCs7OD5P6l7s1jLsvu877emPZ3hjjV1N4tNNrvZItmkZJJipDCiolCWIMOWrUCOncEOElhGHCB+TZCHPBnIW14cxTFiOTJsSZEBJ5ZlWIEGa7QmihQjkiJ7HqrqVt17zz3jntaYh3WqODVNtSMo7f1S3efue3HOPnutvdb//32/b5+xCXnyj3slZUoCKctcOEbmFZKQufuBAZlnMylzLSKnPktCEMSQKHSup0hVgDEY81BmXeKSwhQFp6fHyDxboBW5biEVURmEEpyvBogwLSRKq/2WITKbTlltWtp+h9gHt/iYmE6npBQYhhElwQ4tffDIqiYUE3xMrC7Ocv0niX0BV4EwhMCjIrHUGXxDzEVcJQWKfL4UCh/i/gGwt1wLsN6RRIAgeP4Ln2c6P6DvcmBz2w3EYGnbkeAsKI0uNW3Xcu14gg+e8i2O9LeFJDoCNubMA1MW2G6Zpct9j9GKV+4umE0mYBRXFys++rEP8frdC3RKjH0LIlBMpvTrJdEFiuqIKCPVROPMlDt3z1ChJFEixcDFgy277cBHP/YtnD1Y4ofE/NaUfnlFQiOMYTvmFUEj4eToCDcMFM2UB4uOl1674B3vuEEIsN1sCS4wbTR2bPH7PXzfB5IcGH0eLMFGnn3iNu956t1MD2ZUhUJrkCqi908GLeUjCk+SCYlEG41UJULF/TSeDUulmeQ2Y8hciJBZavuwmESQe5yYUEhTEmVCSZ0vdvR5qR8NKcsL85ZAyJzY7D2mqMmpaT63yJRBhATJ7VOlFMHnbApibnkS8nt0bkSXBUnp/eomIVVJcBatciBvemQDChmxlyRJZDBsSGHPQxRZPfnQvhx8XlWlMucvAD6B8B5L3hZGIdGlxiApbeI9z7yX+7ue9WaRJ8goqOuSew+uqJDI44ooFXFwpKComxm7IedEuhTZrK6YzWekmHjx5Q11ISi8YNJ27IaAC5bpfMa8OaA+bFicvcb08IRpM8mu17RHtYn8+TKvIm87SblVme+Y8Ih6FQQQcm0ipoRPiSQFfbflS7//Gc5e+xyCRL8bWC47tCnox0w0K3XB4UFNPa3YrjeIGLh9+5irs/O3NCbfFisGgKo0zA+PMMogkkSmiDIli8sNVZNJvXfv3OXp576Few8WKJGx2boxGFPhxxGdDKfTCevdyOS4wmlDt90QXQKd2K5XxBDY7lqa+ZSuzwh4XU
jKImK9xwVLuxspqwLrcoGobVvGYUCqxNmDC0KC6dTgnWPsHE3T4L2HkCiUxgdB8FAYw+ATQztw/eCQ208+lYGdWu7rBwZJSdq7EfEu77kBlQI5aSOHzSqtUboEIkFIgjZEVeBTgriPSgMgtymlyH8/PYx/kyVCF3kLIApCEsiqRpsa9quHMGZkuZIm1zTygjtnISb2kFKJQBJtQoisLZDa7NFhYb+90HtLcKY+JwE+5T59ihKEfuSslEKTZAGYLNcWWfshyNueJCVC6ayCEIqk9h0QKYlSYHSJUDrDarRB+AEh9Z5eFCkLSWEkZT0hij1JKQSuHcxQVZGFVM7l1U2UVAomjSEBvfWP7PPzw1OOpqAjhBDo+jF3iGRF8oEu5FwINT3l/msvoqUk+C53IBJAzn54uEXIEX/pkQkql4b3qzoy6MWFHCAcg8OOPZf37nF67SYxwuXZhnEMnB7P8SEyWMu0qbhx65Bpk8OPpKpJKrHZbHiw/LewKyGVoCwr+n7LanmfJGCz2VKVitjnvMrt2KOqhulshioK3NDibEIZQ9/3yBHKfb7BZJJvvN1mw2a7Q2iwrqcqFNFnnX89rRjHjqFrmR+VyOjYbXtczBLTtu9RSlHVFS54RjdAGLlatlw7nhKcZ7sbv8zUC0CSBGuxdqRsMnRVa8OtkxPe+c4nmU1qtFa5v60kENAiIRPEEFBopDSIJIgUpH3oqdSGJHTWDpCXtkLJLJKRewSYUqiYC1hSZy1Ech6lMhchSY13PkNetcx2cL3nYiKR2oBiD0vJLTVUFiUl9kq+XJXMXEeTVzZC5O2B0gZERVIaH/N2Te6x6mlPk5a6yMYg8vkpBHzIkujMgdT7Aus+YFdKZFE9ApoKpZDIfWp0fk3pLFRSUme8mdT58xmd49yIaKGyZF4JhmFk5zxowXrX0TlBiFAWeTC4GKhMibVQF4bBefqu5fVX7iJEyegT42CxfQdJoFUmZyXgYrHk5OY7GNoNfbvD2ZAnYLmvveydlErnidaHLP78ykJyivlFpTK30yiTO0V9SwyO+/de4eK8wwbF1WrHar2l6wZunR6itcb2luWmhZCYVArnR57/0hnevsUx+ccysv8/HlJI6lLl/aCLKAllM2F9tSQgaMpsePmW555lvV5iYsy+d5kVcsqUtNbmhOaipneOi+UGpQLOB0YbKMuaoi7ZtTvKsmQ+kQQCMVrqwrBdtyhZUCtJUWiCz5mBhQa8w2iFlHHfJhTsup6u36ELnYnJSuQEIZFQ3lMbyTh4Cqm5cXrC0XwChH3rbq82FIq4T0hWUucCXszeA/mwHaky2CWJLC4ShAwODZ44OKJXBGnwPpFUvY8o0yRREIXCDdv9vj+gyzpDcbUmqr35KXlkUQMZ2x5CNkZlx3TGxyul8iSlJEKSVZjSILQiRL8X6qQvm6qUxqcAlHnCMhqpy5z8rCTODZnvkPJONkeuqWxTlio7OEOeGIILe46B2NeHZPZ1CJVDXZxDyOwnkTEnSrPH35uiwCiZE866DqM1UUA3ZAOdFNAOA7pU9BFMmUN0YhiRIbdmnfVcXC0pdCZKaa3w3jE6i3Me7xOr1ZJut0YlxWpxnyeeepaEpJ4d5qJqzNoQ4ojUOWRW6bwqUyS0UjlyI2UGRXZY5gnT7VWrV8sFq4sHPHjtFYbRMtiR2WyOKkpu3rxOP4wcHUyzzmFwLJZbRpdYryJSVoS3GDjzNpkYBEolrl0/IQnBcnlFaRQXDx6QvCFJw/XHHkeQqMuSbrsijgNl0VBoT1knhE7oqsI7zzBEgstxYWVZM6srIKv8bDKUpcQYyfJqlSElCNYbjzK5gpwZOglTKJpS4PoRrSWjdQilKUuNtQ479Cij8mCPlmA90eeOQLCeYfQczWqOj+YILCn0SMIjMo+RIFQijDsEiZRU3nunvKdOD1mLIhGdI0WRK9Fjls2G5POSmYTSWbodfSQ6hTYl2kgSBcLUeASmNJgqJ2IbbfDWEp3P2y
ClSVqhTJ6UpFaImFHoSSpMXaNNmaXO+y6Ekjrbf0sNRiOLTH6qJ3N0NcURkKpEqhohFcoIQoCyyrBeIfNKJT9BXWZ6jgMx5CV3cg5dZpKzEgohC9J+8oAcqSeVIiSJj4LClCjdUCKpipLkHWVdczybcHwwIwaPDXmg9qMnesd2GNkNlkLFLCwSgs1m4PTkOq/fWSPJpCSlJEVd8TCfI0aJGz3DONAuFxBC/hvR4UZNN7QMo80EaZFrKkJogtOokENytEg4lwneSMkQEjFlFHy+vrlYulqcc3l2DzuuOL+6YlaX1GWZTXvasLi8ZH44o+8tu6sd487SlAYlLduupx/a/SruLYzJb3aCEOLHhBDnQojPfcVrx0KInxdCvLD/9+grfvbfCSFeFEJ8SQjxfX+kN6Ek4xhw/Y7jk+sM257lxQOqak51OEMoyfs+8CxGS8bdmtXqin4MoME7y9gOVEVFUU9wCQ4ONd5H7Aht27FpW0xRMIb8xJtOGrp2II0enR565CNa5iV3ZzMktdrHtisJYRhYriN1WVObbJTxo6cqC+oyI8DZ9/NJMPSJaVVxcnRIoRNKhr2sNrsi0/7LT7bPAqJIHixKk/bVeLF/enib9/5+dGRwsMUkiSr35CjU3i6uoCiRJvf/AxJV1QglqOqGmB52OiCJiCoM6BJZZA9EipEQH7YpDaosEbrCx5jx+SnnRpRVA6ZAyPTlLqSpiVKD0jnCXhukNrgUiG7M8mtpspJSSIj7LYqQe+1NBsRI06BMlcU+pIyf22suBGpv/hLEFDLyLWZYjDElSZQ5e1NqjFIYVVBrw42b19BCUxtJpSTb3UhvI6YsKRS5DpQKkuuJySBSwMZIaSTHR0dMmwmTeso49GiVsNYxDpZt1xJsx9X5grKq6bYDPhbsug1nL32RQjikKRm6DZL8nUohv5w8FdlDbjOEplEmU7aFABRKgO1aLu69wvFhzasvvU4c9yJqkbdku23L4ckRq9WaxcUKGwLH1+aUhaTrh+zKrQyDe2t7iT/KiuF/B77/a177b4FfTCk9Dfzi/v8RQrwP+EvA+/e/86NCiG+qrBAIZlPDdjfw+ksvUFY1Yw+mbHDesVwsCMOA7Vpee+MBu1Cg6swdSCm3mqoywbBF+JHloiPGyGbTo1RBUTVcO5zRbZcYJWiahm5UjEFweDxhdbVGaI0TmgeLHVc7i5SKqipZbzpG7zm/2nF/sWY2qxi8ZLEZkUWZOwdSMQ4DRgQMAW1KegvveuIW1w5nzOuaiTYU5TTH2MsM2RAh846rqkKZGUVRoJVGoggx4cYR2/c5gt5ZpM+ekqKeo6qaFBKqqIjGICu1r/4bopKE4JEYpNYEPxC8JzqHHwb85hK32RCHESWyqo9gkSlTpWLI50ldIpXC1DNMM8NUDdQNLiWKakIxO0bXc3Q9QWqFKUsciXo2h+gopg1aZLiIHS3BjplL4UcQIhdUpUB49lgz/cjzoczeOBYjwTm87fBDR3Iut3NNAXikMo98Gro0mZWpJEkZysmMoiy5cXjME6ePUZgpdVVRKo2PkqvNiAyS2Dm6TYvTE6aTHFJ7+eAet2/fYnFxSQwDr77+OoXJq5PJNN+X3a7dsyYT0UdCWeISDKPjyW/5Ni7uPSC6nrqeEKMjYoBETGEvIFN7f4oghox711qTYkIJyfJqsux24wAAIABJREFUw/biHt1qxaf/1W+zWVrCGHCjRRcCO4yM3ciw27K6HAgRHr95SmEE1o9sOlh3I5vWkvYGrz/q8U0nhpTSrwJXX/PyDwI/vv/vHwf+/Fe8/lMppTGl9ArwIvDt3/RNCIGzGWM+DAMnJ6foosJay2q54Mmn34PUkm2XFWaLxRlNPaEsC4K3uFHRto4oJFIlgshhoP0wEBKcHDT4ccSO4RGkNHiLEgltMinYjQEdO0aXe/t7CwsxeLp+YPSBcbQUhWZwA34c0FpgbZ+FOkJmI5KQBJ+QWtMUJUVVk6RBSpNBJUnlHj
e5cpxiRKomZ1bscWnCW4TP+oLkRqQyBJn5ilVRYH3OpNDFnoQU2dcYss03ZX01osxL/hTIgxGBNhVqcoyazjHTKSJ4fL/F+bykFdi9mClb3GOE5MY8YbgRby2mKMFUBCGz3sC5rGwUOZkreY/tBux2hTQG129zazZkI5XUOusy9qXNkCxq323IkXXkJGfyBOf3ysjsNswcCJlykVVphdYPYTFZJyCQe9hrhrsooZnVFVJlC74pC0aX6F1g6yKBiPMJ6yyt00SgmUx54/4FVVVSac2kmdP1jrIQGUzsPD7mOtB6u2VyNMXtqdveW2bXnmC1XuDHHoHEe5t3YDKj8h4+0ABCjPttUca/a63yii5YFuf3WV6ecXG+ZmwHxjEitWR5OeBtojQm53wamM0nuVYhPVdrx2Y3stv1uav0FieGf1Mdw42U0hlASulMCHF9//rjwG99xXl39q/9a49Eou8GxqFD6przywe5Ci4UN2/e4mBes+4GkpFsuzXRjnlQFZoYBMJkGIYxgtWVJzqBKUqMzfxBKQXRD/gQ0MKz61vGoWM2PwCfk4RMiHgriF4g9kk/RjiSFnRd2Md/5cr4drtDC0ldT1DC0dQFpCFPCFKxvmqZ1RMm04ayrPc3P1lWLbL01xMw0ZJoiEmi9gRggkWVJWEY0M4SUsLajt9oj/hHlze49IZT4/krt3Z84iQSRMiqzJgNUQKR7dNSEkbPL12V/PgbMy6s5FqZ+C9u93zioENIQT+sMVKhlcSHmL0QKFIMJCOIYW8ZJuH7EVUqpPNEp1BFVkmGmFFlUkqSzUXIMLaIOGTnZsxV+DRYQpIIHYiiJKlA9BahSkTSIHTG15UFIUV+ZTXlH5zNuXSKE+35T2+t+a6DLt/kRH511fATD45YOMWJ8fzwyTkfNQ+QKtcuonP5sRczG7NoDAqFlgpVlATXgtCsB8e8qamA3brFGEndVKzbLdENvOff+VY+93ufJqmCoRuYH80Z1iuausFuW/weSvPiH3yRk3e8B0PEE7ha3GN+cgM7BKoJJLnXmuzToZTKhV4p5KPCow9Zgv4wDvD8zotsl+cs7i+xQ1aKzg5L1lcdo7WYQnH92pzFKns3bt+csd6sWKwG+iFSForB7qX0b1ES/cctcHqz0uebVj2EED8C/AjA0bRiPmtYW4czgm63wZQNpjA004r1doENgthdMrRbrp/cxMgiS51bhfee6D2Qyc3OWpLwFKVhUhmaqsgFRKmY1oZ+dHib06eriaFdZanpGBVdiIQEcw0pOJx1eJvwezOQ77fsNi2mrLNlOEm87TBa5D0v+ek+rUrqpiLTdhL6Ua86G2Li6JHFDFVUoHPvX6ZMSYguEZLOMubg+fX1nL+7eByb8gLvwhn+9p0DUEu+czKQVI5bHweL3isfRQr88qrgb7/WMO5/73wU/E8vNaR3Wj553RJT9kNEDyIKUAYfMh8i2R1OjznEROa2ZhhDxr1pR3SZxGwQjGObe/TjkPMNUswkKSJjb9FSE6VEE+k2G3Q9za5Co/J2kJwPoQuND55fXk35X+4ePnrfl97wo3eOsaPlu09afm095389O3l0PS6d4e/dv0k/2/EdTQYIK2XwfkAVuUBaFiWFqdh5Sz8O+BiZlIahd4xB5Bb3YGmqAuv0PmtD8Lk/+AJGlXgl6B00LuG9xqhcr7HtBoWnXS2R6iW8u8m7nnqGaVlSHl2jMnmvX9dl1ickn7UbKe5JTYLgMxlLxUjc15b60dKu1tx7/TUuL1YczBuG1hGtxzpPWZUcHxZc7VrcYLn9xCm7dsu29wxWUNSGdnBokbJnpPyT4TE8EELc2g/wW8BDWdUd4B1fcd4TwL03+wMppb+bUvpISukjVaF5cHGfddfhfEvd1CSRMHXBbN7QbVpSgqY03Do85WQ+zfg0GxFBosuScqLZbjYIKTNRKUS0yvbdpjRomejGgDElo/M5Yt05ij1aa0yJKAWDc7TDiCbRdSOd9bSjw0WYze
dsdz12DDSVodu21KVBuITvE7UpWV21GC05mE6zO45IiCF3D1J2OSafctVfwW/2R/zNe8/xH732If7aq8/wa7tjQszhr17k5uZPrR57NAgeHmOU/P3Xp2y3W7p24OJ8QRSeTbdl026woefH70weDa4v/57g79+Z4sZc2LTOYocW7ztCyL6PnA+RsvLZjfiuZ9yt0YUhRZeFRwTcuCXZFh0ltmsJdrff/mQwiosZGit0ymnQ3iJTgYy5tSn2Ia8uWIahxdqe0Vn+4f2Dr3vfNkl+6vKE8+UFP/Hg6Ouuh0XxT9t30ruBYdjxW9ua//7qw/zI2Uf5W+4TvGhuU1dFhrPsmZTbwWZatI0sO0dTG0iB2axkO/RoKbi6XHN863reXtQli1WHMQrrLUkkSm2YTmfM51NUCpRKQHSstmtInn4YkSJiux2eh9unzBTNhjSIPneWkoDgAy4IiLBab1mtLYdHB1xdrDCFJApLVLljVBaKfgc2FTkcefTYEcbR0XU9w5i7E6VRlOZPZmL4GeCv7v/7rwL/9Cte/0tCiFII8S7gaeB3vtkfE0Kw2YxMmgoCjLanbmom05Jx2FA3BWVh2CwGiqLO2u+iph1HVK2I1mWTmihpasVu59FFifci9+B9nmEz/z9gnaAfHCdHU0brCdYTouD02gwfBIOXoDS9dQxjJMZ9QdB6QtKECCHlbYOSMDiPt2O2BUvFwWTO4elxLrTtY89D9ITI3iOQP/NvDKf82PpdLEIJCC5Dwf98+Rg/90Cz2a44v7zi7M4bXDrzptftKhScvXHOvTcecLFYcPfuGathx9ndO7zy6htc+jdfPp47zW63ZrdZP5I4h7AnG5fZkIPam5mSQBaJspngdlsECkLEDbmgmSiQlclFTHLWZQoDPNxm6OwBIAaEKhBl1j5YHxj7lrZd45xleXnBSy+8xCvPv8Sl/8af9+XX77MIb/7zZSiJ45bf6k/4B7tnuNpf12Wq+Wf6w1xc+wBD31MXBTaCtw6UziHDCERpIESaOkvLvYvImLg8v4cQeR/f7+Es1gUqkWtVIcF8dpix+D5ycX5BKfeOUwKb7QZdzYjREGUBe0VGCJa+71A6a2aQCR8DOMv26gHr5Qo7Jq4WG+anB+y6LVfr7EZ97PFjNruAVBlWM2zXXC5H2s4iiyyK8tbTVBWm0I9Srf6oxzedRoQQPwl8N3AqhLgD/A/A/wj8tBDivwReB34YIKX0eSHETwNfADzwX6e0R+P+a47gPZNmwm63pTY108MTbt68zmp1hWkq0JK+37HsPd52ED2HWhNtiy4nFGr/Re33vUll9v4QPJPZnHGwCJW4fjJjvQt0O8uNazOIPncjCs24GzlfrBl8frqNo6V34GwGZDjnmBzUvHGx5fR4ju0GDqc126sr5o8fU6mC7cJhk+T2rRNmTUaXV7LZY8sKkDlFKsrcnvwnu3dh01cPXpsU/3h7m3ddfoEQso/gsOlZ0XzddZuFjrMH9+nHlsnhLDsdXWC13eIjTB5/jtbMv+735rHjhS/+ISIFjubH1E1FPZthCoPtl6iiIEQJ3iJVZgN4Hwg+IuWIbmrqcoIly4tpN6QYcYMnypyXgIByUjP2LT7loN/WOpQx/MrqgJ+4OOHSG47lyCfa3+Op/lX60SKkYHraslNf7wacxZbQdUzVN/r5jtdeep2fvfYJ3NfsqR2al29/Lyef/UUmzYRqF9ilLB4KIbBuHc4OHE4V9fwwM0S3AeU63OXAjQPNwaxhs8uKVwHoxrDdbCkrzf0H9zi+cROXFN3916k++D6kqRnGlkk9paxLNutLjD4k7mPri6KkMGTaldHEbkfsd7TbBYu7z7NdXiBlj5SOq0tPDIZSKR57cs7zL1xRqOxBf+Y9p/zBFy+wweGjQAtBO3im0ylSS67fOmZ19cccaptS+ssppVspJZNSeiKl9PdSSouU0n+QUnp6/+/VV5z/t1JKT6WU3ptS+hd/lDeRSDgXICl0UVBPau
rDIybTObvNlvms5upinSEoAorCZE+BkNSFIEfeg485rOXZD32A3jpKkxWVQ9cSraNsqn3QqebatXl+yhORo8pyaicxSGZKEaMiROitQ2lFU5cYo5lUOTCkKosc4qIySFQrzZASk7KgqWqUUZnuuwdvZBhrxqUlFxlHz1Us3/R6bNSEk9MjTudHHE0O+Qv16xTiq+dXQ+CHDh7w9Puf4fY7n6TdDiy2I3reUE1rJk3Dx8c/QKevRnrp6PjOzW/TW0dAEEXkV7oj/sbrT/NDzz/DX3/lPfzSZU30DlQk4kgiEYNDxIGyKJD7QB1CyOcJQfIJ9m7QlHK3xY0tyTtwPZHcnvzly4a/c/8Glz7DQ65ixT+rPsan4uP4GFnuej54/uuo6L76fSfPd9nPYozkO7pPob/m54bA94o/xJQNW/nmFuONaNBCUhlBU2tcDFgXMYVh8Pm7HMdEIeH0+k1GH/FCZvJ0U7FabmnqCqM1wXvGEFFYtFIcHB+w3axoJgXN0SkuFth+gymqLH+2I0XRAJnapNVex6kkPu7jD3ZjBgaPPZvzu7jg6Hcd1kLwDh8T6/Waq6ueusqt4cduzbl/vgCRMzVTilifiFIymxY4b7la7bD+mz6fv+p4WygfQVAWmrKsaWZTjCloV5cILCIlnA+0vceOnpASpjBZ5KEEXdcxnRUE62jqCfPTU155/g+RQnLQlEwmE5wdWK22SCkYupayNiglMeUEEIyjRyCyuCcEykozWIcPHr/PMSgMDGNPVZdIIygrwziOmMIQE1gXiMlzcjilbkqcDyhpQOTJLpJ5iFIlOtvzxtldZqF906txIi3To0OObj/O0e3H+O4Ty994bMk1k2sUp8bzX71jyydvwuHBIY/ffifvfvpp3n37MezVmiJ5Jkbxrdzh+93vMU8dpMTEbfje3a/znH2Jzg5sNkv+5WrCP+zfwyJ+eTvzdy5u8S8XNc550CXKGIQQFJNptnGbCrdrCX7IXIEAQhtQ+w5EAmIghEQKHlD4YSCMHT95de3r6gNeaH738CP0fW79fkid8T2732Dqt5ASs7Dj+8ff4UPiPtO64ZMnA//54aucqBx+fKotf+3aXb7/HYr3vO9bOFLjm17XA9Fx+7HrMI4URZHvB+fRWtO7sK8AaHQBd167y62TKUrGHGW4W1I3M7TMhjaXxD7tWxG95fzuG3vk/ogsKzabJf1g0UbTdWsSkmY2w7kBkXIittqHyHTbDsjkMQLcffkFNusVq+WG0YK1ARcF23bk6PSYoQuMwXMwqfE+Mg4apNgrSTX96CiNYrSe6zdPkMHmovJbON4WtmshBT4GqiJHxE1Eg7WW5cUlxzdu8vrrr+YbDoEpS4ah5fT4Gi6M2DayuNpRNIf0w8huXDG0HlMWXLt+g243koRkGD2Vd4ze0biAH+bUxuDdSFErfMppxoiQ1Wc5FojaFJRSMqtrXHQkl6gazdD2ed8WApO6Yrf1gKYqSrTUDENHNZnm/fqeYSCUZHSes7MF7bbj4+JT/ML04zjx5a+hEIH/7OQB00lDKmpESlSq4ZOl4N+/dYEAPAlBQeKA4Bw6eN5RP0Hf7phLSdttCaPHE/k2/YDn4gPc0BGsQwyW0Y8YZdC65uf1c7ivuQ1skvzE6iafOHqBMI5Zpykj0RnGYYcoDYIAIRHjgCoUoe1IIaKbAkYHUuQAoTRCiAQnSQouv0F9oFUzTFFmiEvwvLd9nvcNL2MkTCcTitmESXNMVedu1bumiR8wdzK6LqUMoInZ5fqXTy/4384fw/Ll7YSKjn8vfBYX8ucpiqxEtDbuUfQJF2MOo99dMZ0dcLW8oqmrzPAsJiyulhRliUyOwbqca5n2UBUhSX6k3W7QpaMsCoLYP0zKIsu5gbqZIIPHOnBOUBaG+cERSuS4u7PXXmVsW8bOM3aJAI/qHVmJecDyYoESius35jy4f0k/evrBYoxh21rm0xopyGSrCFrX9Ns3fwh9o+NtMTFkiEfCJ4
uh4OBgyvZixfHxKaXxWKfohg7nOpQwSDnh2s3rXF1c0boWP3qMBB8cgxXEmNBa4oNncXaGUpH1pkNWmhigmh4gSYzDSBSasgFh0z7MRRJFFp3YANXePzBYxzhGispgR0v0gdIIDqY1hZb0FqaTgkmjCNEhRSIg+Szv4F+ED7AKDUe253vc73Nr9wLTesq38zqH4vf5Od7HkppT7fiPT875rvkmS5cFGZKKAxRCaAQJ6SxWZMONKTShczmfwSjm168zFzfwQ491I4OPOSzlwYie5ELhuEk5nEbCWnx97QJg4Q1RKAoCyXaEKBj3oioTJd47qiYLpJzL/AQpE3F08JDjQiRYj7cJabK+4kTa/erkq4956pjVino6JwWPQWCKism0QUhFUSqkzqlcxhgCEpEkIqXcqpY5KKcqa77raAfc4f9Y3GQRCmZ+y4cufoXm6v/hCycf4OXn/iauPsbsLvG//hOM55/KAxzFGCOLq45n3nGDX7p3xmEzwe6ftr2DXJcu0HJg8BEfAiEo6CwSTfSR5mjC+YMzTh97AuEjTW2IApzz1FXD8vI+CU1pCkbrmEwLVsslIoyslhdcXl6y3OwIKREf4ufdwO3bj9PUhq4qUMHTbbfsxvygsD7lmD8pUEIyqQzdYBHBcbHqUfKPufj4J3FkNZtBIzi+diNLPWMkDSPDIDi8dsorbzxAiIKxzxFeNibadscwOna9pz7IlKFxbBFKc3p6SnI21y7I8edV7zBeYjC4EWSUuJBwoyWkHCFX1yXbrkcAzjqauqE02WrsfaAQCu8skz2OviolaRxBFRzPahIC73oEhs9wk3/Chx89kZep4f+SH+X7Dns+ps6ZHMz5mLjku2efYVpqzHSGFpqkDD5lkEcmQ+vsYoxxn/JUgR8gJlzIzsmh3yFTNi2l6NFasd1mFLyKgXoyJ8WAGwJCGwqVrekH9KzfpLB5rBzBj9iY8DuPLGvKRiCNxluLkJKh3WGqCmF0JjPZbo+wS3vyZMpKTCn31XfNnyte4h8N7/2qp7kh8Mn4+ZwQRUJJaGbzbNKqG4xS6LLAaI0QEi01YbSIKLKJS+agHyUN3nuSgE8cdfy7ky8QXeDVl1/m1fEOXzh8H394888QVJHvu9k16k/+dYZfUxSv/iaDS/Rdx66bIeSSw7piaEcm0xIhBU1dMQyOMeaA3H6wQI0qFYUpiAh0VeOsI6WANjVFU2GHNXHwTBrB0HdUzTHR90CiKgx+7FFSsDhfcPngDNsP7LZdhtb4jAq8fv04J4PHgEZQTyeMwbHtHEJGBhepRUAXhhAt3UiG6JoCkXYZ3fcWjrfHxABM55O8lBSey7MlKSZ0YVitWs5WPe3O88TtYx7cPeP9738v7XpLP0Z8knuZboMqPNW8op5GTo+Pubz3Kr0PLHqJrCdsOyjnNSCoa1hsBtq+xxQ1u+2IDYLB9ow2MnioqpLTmYFo0fKQo2sN3WZJUzco4ZFEqqbmlRfPuPHku6nrCboosrGnNPzf8tu+bpkepOE3p9/Jnz3+bWpToffGKWU0xJSfSKnKRpsUCdsNrRZom5BFRTUz7DZXzOYN1gbG9YrBDqAm9H3H4bHC9Vvun6157PYT2DASIphJybDb4JhxeHCMjLlT8IPpRX7Svx/3FQO1IPDD9cuMvSWliKkKdF2glUKMFtd7zLRCVhqbPEbkoJiiqPAh14VUikDAKYntdgife/Qf5g2CGvmZ8F5W1Bymjj8tvsAH9X3U9Wt4Fyiqmhhj5hEUZh+I61ks1ly7cZNu7HKor8oT0nZzhfCC6bQhhJg1LjEHDiHg+GSOEI/x0+KTjyaFR/eeqYgf+YsUZ7/Lou14bDZn07UkN/ChZ27wyp0Vl9uO3o4YXWCUoR8svbVcPzogAuv1Nqs/rWW3uOD0uY9i7YBJKdvfZU1d5BWgBIT0+AjWegQDm+WKpoRXPv873HnxJS4vrxicxwbw48Dp6Qn1pKFuSlLoGX1P28Mbd5YUJYxO0l
RZTFcV+YEwn3iaZsrl5Qo75rbsWzneFhODVJKyMuAHfB+oqoogch/9YrWhqibMDyZZlyBygMr8oGG3G9h0Pf2Y2YRtn9s1N27eot3tcCEwjpGx6zg+Pma1vGR2eMzh3GB9n5flUnJ8fMCD1X1cDLicpJIHU1lQG0mkQEmPlgUqGSojURhOmrx83nrNk1JgymxCSinbyN+sxQiwpEILiWSPPUtZbehVwcOMKLmPnpdVxbhb0XaWugl4HNgRa7N+w3vH2cUVs7ni8GiOGzZst2uCgNG2jENLQCOSRwSo5jXYLQlJEjXfqs6JwD8PT7NMFcdi4IfqV/jO6S6nIBlBWebCZBgsWlcUU5URdKpCpeyRyJDTgJKKkLJqVKmHbbmCIY7omMEtH/Iv84z7UobDaJMdiFZny7oxBJEYxxZioCybTM+SJboy9O05X/rc8zz25LM8+c4J9+68wXQ+Y91usz1ckbsoSWaWpcqmqKIs2YY371aI2SkpBgabuReu65BFyfnVBdePTxlDoLUDRmfgbZR5lbLedRxcmxK0pO96iqqimR/gbKAxDRKNHXuGYeDg4IgYLHIPtpHKABE7jJAsb7z4Iu1yxdA73OgQURDcwHR+gHeeuioZhh58T5SJ1bpDKEVuBmm8d2glcS5l1adzhH2qlZSSwbk3/ezf6HhbTAwgsOOAJGIwjLZnudoSUubx79YrpofHXF5eorXi4vKSyfQGmkCpC8pC46xDKk0IjmeffYpP/+ZvYnR2bAaRaCrJEjg8mJCwuBiRStIUBWZS01lLxqvvXX/OUxiJkAmtSpQp6Nsd1aRGI9AyMp1O6DeemBRVVeVsiT3dKFjPYd2xYvJ1n/ZYjkihiEEQQ/4CpQKhRmIzwSD2gwwInt3iEpcEpp4gbIf1DrdoYXLAtut4+cUvcnytZdhNUaHn05/5PI+9+ylMKVhtlqSomE5LutWG0+MZUk+R2uBTBo3+KXHGh/WDnICUAoU2jKNGmxIlDaMdUUIhlUSnRCSLdETI4bJaKiw2R7srg0yC5DvcaCm1ZrvbkIJn8FnsI+sa2UxJSLTWRAzW7yDuIwcLzWa5ZL3pefftxwnB89rL99i+9+P8fHof22vfzWE/8B+endF88bMczE94xzufQTDS9YGqziI4UiZnS60QwAEd6zf5PtL2kqKskHJkYwPzeYmMAWcjiQ2zUnO5ddniHvtc0xCStu8YxwyDiQRCjNy78wbv+9jHkVGxG0bmzYSyMIBA6iIneEeRi4M64kNid36fOy++zNBtGffsBOcD02lNDIHT69domhrrBoZ+JCTFpuuZT2seXO3QGry1NIdT1quWo4MZhamoy5JN3yKTwvz/7JX4NztyXABNPWW0HcE5bt064UsvvEaUgsXVmqdu3GZ45SXeeftxDuYz1ssdIhoknslkgo2JoiqIMbFeXdHu1kybEkRm7R1fP2ZxuSbEAFogQ2ToeppyxnKxoUiKIcS8PHU5+u1oWmaQ6jjiokdqhZbgkdTJYR20y56jqspI8xgQukLqHFv/Z8zz/LT94Nct03+ofjmzCpzfpzvnQppWBX3b4cqMrzc68yP6tuX+2X3GwTE/OuDXlxU/F97Pmop5us1s9QanX/o012/MaNstX3rjjCK9iyvzMcbigNpteO71X+Wp7V2K8hmuXzc5OzLJnLeZCsauIySHMDVGemRZkGQkhBGtC5LKqLmgcsR8lHmlUDUzxuUCrU3mCFiL9yM+REDh7IAqSmzvGAaHVxVF2RCdy8naKASJMSV+Z3vA704+wiAOKCZrzBf/Mc/d/xzD0HJn8iyL+K2k/VZglWr+/uIJ/lR4kvfdf43J0YpxeZeT64/hx5GoFEpECCHXIbTiE91n+OfFxwjyKzojfsT+q5+i7SyVkfiQMm7OZ2StNpoxhOwGlRobs5xakBhczoUIQWIKRYiW5XrLYrni+OA6ybeQGobBklBI4fAhh8uEEDFGsd10XLtxg5efN+y2IzyMuE8R4Q
NHB3M84GPIFnXT4McBkxTb1qGLEkFiMqvoOouQOqtPxyHLv0VOVNfy38KJIaacNwCRumy4WLa0uwvGQSG0YrMLbFYLvE/EANPZAevlJTYGnHP4MOZi2DjivGO1WOK9J6QGmTIYtNYlUmaGfzd6mjKbnnyI2MFn2rBQiBDxDiotuHEyxybBtsutybI0WGuZVor64JDLiyUqKeZH07y/TpHa5Iixsqj4iLqDFPCz9tm8TJcjf6F8mY+YS7zPkBmZNKqsQRckpTKMZddhzIhTidF6dmPLC6++zmdfvMO968/xwlN/nrgfIBsxYfOBH2bdWy7u/AHdOBLe80kW7/+LoPM5fXHA7177Pu5d7Li2GtBhQXM4Q5YlUhd4m0jK0O5alHVMq1OiT4jYo8qCFAJCJYQpSSJlgrU2BBEzPLas8eOADC4TjgWIkB2p1juGrmMYBrphIJYN0XbEzGjG2ZFI5Gdf9Tz/5HeR9u/ZVofYb/8rfPZTP4m+/zts/+wPwtfUB6Iq+PTxx3n83h1eunsX4xPV1COFoygVXgj8GDk8KDBa8RxvMFrJL4T3EppjZLfA/fZPk177bUQlkVriqVCqxrcbktEMu46mmkDMuoW2t9llGyNNzUCKAAAgAElEQVTaGCxgAGMMw+gYuw0P7t3jdH4MRLz3HBxeQytISaNlIiWP2wNmRPJ8/rOfxfYb2nZL14/E6B6F1VjruH7jhM1qQ1FoSIFda6nnE14/u6SeVDgXmE6m7LodWsKsrmi7ESkjpqjwoce+RYLT22JikEKiYo5UX6x2uAT3H2zwLhGl5mBes11umTQTikJRNILFywuMMcSQmM4aBmsppGD0npe++CWUUSy2PVEnSiVZLBeQYNo0RNeTXKCKmSXYdQNKKFwMtD5Ry8BT776FMIbYWuwI01lNKSxVVVBXBYO1oEoODhuO5odU04akzJ6LaEBoqrLkO+KCD6dfpDCCopkSlUZ5CFEhzQSZAjYmVHJcXS3QsuT8zku8/MbrfPrlM8ZU8NrlltE6ktboD/03iK8ZIOiS/oN/jmu/8hk+8qd/gF+YfA/Irz4n6YKL9/4An//8j3KoEk/evMW10yPEjceIZZGLhXFKZy13zq44PZlRlopAgCKg6YlFsfd+gEn78NUo+MWLkh+7c8yFU5waz39ycp/vaEa27RVD77hcrnB9Syk1QyfpSo0pOs4Xl7zwyhlB1bz4wR95NCk8OkyJ+9Yf5PHVp9hOTt703gnNEf/nL/8GQkhu3zzl8Zcm3D495OTogNOTawgB7f0tpipQUvBhdY/l7/4Mz985p6k1Z1ct29GyiZrTecmm71kNGdaa+o6jeU1VS04PJ5xvB4IQ2DFQGEFVFODZY95zm1BQ8NJnPs2z734aU5ZMqgJZZsGbJEfWGaPReIbRUZiSaAfOz1d0rYOQjYEqReaTElUUXD54QGkE7cZhg6C3lnv3tpiizEQoEbl3foUShrrWDONAbWBSS+6ed+yGgUq9ucr2Gx1vi4lBPHKdOXywrFYrIFBO53Rdy3zWsLpacftd72JyOMFajy4aUhyz/98HDk5nvHz/kqYsWC93HJyegLMMLnByNCM4x8npESrlCcSmiE0C6T2rncXHiBKCUsL0eIbSBcEL2t5mE0ryxCiZNZoQIqMLHNQNTVVTN/8vdW8aq+t61vf9rnt6hvd91157OoNtjo3BJo4DHGHwBDWBQAwJQQVSSlPRUFWpKjWfK/Vb+ylSv+RD07SqKkQbKUoCSRAUQotUJqUGAhSKkWcfn7PPtIc1vsPzPPfYD9d7jI/PNsRSWx3f0taS1l5rvdPz3MN1/f+/f1C4hlEfvQuKBK+1IsbSrVaayJQXTYM6lhiphWYNy7RAqlw/esT9V1/i/HrHx195yL1tZTvtaFSGVWA/Lcj6zmPfwzLepus8t556C4f98NifWcIJT964hbWOlx49QpzlRj8SnCMtQvOigSciLCnjvZKnKcdcuLJgO/VktBqheX71Ve
HvP79hqdoOe5g8//D+W4m3E+9tj0gpMe0mjBnI9gQksmzPeGW35eG2sDrdcL3fU4abj33OuT9lfbLBLZfk/o0/Yw8XLF//IfwHfoJ76zu8NF3w/Gf+FR+t1zw4f0gnwsk4HJWunnm34+lba55/9SFLSvR9hzkklpjwfsPl9prt3nDnxoqHDxc21bKfI6uhZ4yFqzlqGldrBKPhOsVo8KySpCIv379PbJVgK5fbPXeHExqWlPeAZYmJuDROTzbc+9zzPHj5ObZXV7TWSLUhx1V/t91z9+lTBu+4uNwiDupS2UXlkhoLpSaWKWGcwwj0QeXaMnZKAa87PAow/mrGm0ISXWrBe3W7xbhQqgVn2e623Nis6QeHdcK46nnnu9/FYTfRD2tW/chmtULEMe13SIP1yYbahJOTFalAjJHBG6YlsloHpBZqnNkvkUzEuMK0ZEqrlFbZjB5a1WryMnN1eYULhloOR7inFoxMgXHVszlZYY7CKG9V0y/t6Op8jQhtGsYZUlPmgvaXjbIZqcQ88+jVl/jk5z7DH997lV9ZnuHjH/6v2f3E/4D8+H9D9xf+Hbq+x4aAmS8f+x7a+YLsR1qaOPWPp/V08yVjZ+hs4fSJt3J1mHj4wgs8ePAKD6/PuJwm5qq6ylygVE+lp8ig4bbJYK3BlKIY91L56XvjFyeF10Zshn928VbyErXqng3Br+lXA1OeuV4m5jTw5J1bmltZHfYrvC6ZLtkuldPP/CJSXs8ttC1zcv5p/F/+T2FzV1mQ4y1eee/f5BdfGvjUpz/P5eWW1dizOd1gpNF1HTdOTgBLZ5S7aI0oECdnailKf24Ga4Wrw8LVFGk1c9J5VsEhxpFyodXCVBRCPMdITJnge2pJDL1nv53Z9CtayeqerEa5FbmxWQ88ePkFyiExzRXvLK1l7SZUpUjfuLHBWz2OAZSUOOxnalNty2roqFlra16MRihWIQwD47jCuA6Mx3uvQUNfxXhT7BiMGAyJnApn58fItK4n2IUlzQybDcPQkWJmc7JRaW9rmKlSQ8e46Xnl0SUuBE7WAWM8h/2By+0eby3jOHB+fsnJxhPPr1iytoRyqTjfK2ylJvWsWxDrWa1X3H90jfEdnW+UZKhFmOaMsxqauuoswVuCM1qxtg6/CkCl5RnpTr6If6cVGkISjytg0d2EWEvoVOv++8894IUb38L87N8Cd9z6rW+TP/CTvOXeU0wf+1+59cKv8/Bdf0M9Cq+Nkll/6n/Hm0pwgfcMiY+dKQTktWFr5tn7/yeXF6/y6G0f4PdvfZitWbGpe95//dt8/f5z0ITVODLcvEVOmjlZcceoT6eT3TKDdUjTjOuH8fFFrbMSOJTM/rAg1lOHgPjKXDKfHr+FP3rbd3BwG/p0zbPzJ7h977d48I1/7XWvq5WE/YN/SZwPjI/+iPTg3Vw99X4QZRp8Q/o8Lzz13j99r14bruPqvT/M5lP3uHnzBt3YMe8nQtAb6fbJhi4MVIR1Xzm/VqrWYU5a4LOe7W6LcY6cM0uEu6cjqWX6Brt5x2tH9tTUTWoQYi6IGFbDCE5BPWEYFQWfM91RS+CcY3t5xby/Zn91n931FfM8U45/dL3usN6xTBPDKnK4Xuj6niUK1lVqAe/AKguXFGeGcYCqCe6rsWOaF8Z1TxeEeRacf7wU/Svek1/VT/9/NMSgeoDtDsRSbUfLldA5Ls4vcM4ybm5ipPLwlZe42i2IVBbnuZ4nxFiutzucU+x4rjO73cw0zQzDwMmq15xIpzmJqTWK8wxDYJq1LTh2jhsnK3IRXjn9Vn7jvf8Ff/T9/4DPfd/f46XTZ5WGfFTbOem5deuUdTfSF4szvb6OVrGtaTGveVqeVAFnX0tr0WLTUhpzgnkfWaaJaX9QGew+Mz/7o2+40JsNvPz0dxGsZZkfT/uVsjAvmS+0J/n9C9UdfMlf4L3z5/nA5sBLd76d37r9PWztGkTY2jW/ceMv8/zqm9j0A71GQWFswR7DbU
uNNCpLWqhpVmWjNBpwNzzetXfLLMRjjsS4WXH31grXOV5ZfxO/e/cjHPwJiDCHG/zeyfupt7/+sfgvb9VwdHjifVw/8W16sSgZk8/5dzK7zWMfPw832Zze5MnbGw5xoUhlmWZ2lxP5EFl3hhgjOSX1FLTKnDR67zAVDnOmucDlfiYWzTZ1ptF5wYh8MQCoJE3j9n2nIUJVg4274Ln/4CHb3YEi2vo1PmCCxzdBWiTP05Eu3ihZ06e6vuPGyUhNsx5zmjIynXeKrEPIKSuMKCViLnTBk5ak2pRj9kgqhdtPPkFDbeWHw//Ltuv/P4ZgSKWyNKHvLHGaSblhm3Dn1m1yblCV4bdME7dvn1JyRGxltyycX20RhOAd24tzjPNMsUCuOG/ITaipsNqssL1TrUFBvQDB4aVxshqopvHKrW/j8n3/MUt3E0TI422ee89P8urd9+EGh4SeYBrSHE1GZBzBaCCrVChV9GxuGnmeWa6viYc9pmaolZSTQjRKYkoL5w/OePXV+zy4PnBz7GB167HvUQonPLMa2L/r+1+/WwCwjt1f/BuMwfE78Wli/fJbTPi8f5q3vvvt/Mnbvvv17TogG8fv3fogTzz9FE8881ZMmZivz8n5GCtnACuqQKyKIeMoYvqpp7d08vp9aieVHz99mUYj9L3uQqxw0jt+s/vmNz4+hke33kv7stcl1pOf/TFyhYt3/sAXW5WvjSIOfTJvHH6+5OzsnIdXO2yL7M8v2F3twFVu3LxB7wKpNEW4twqtknLBGUuqGu1DSYz9DU2fqoWucwzG8pYnb0ErLLkRc2WeDyxVGIYRay3OCDHBrVunbE5uHC3WjUIiLzNXV49Y9ld403N29oDD0khJZewn614xgVVdxNN+ovOWknbM8545Foy3dEGhQ33XUWtlTglrBStCLaqAfHT/IaY1Yl6+0tv0Fceb4ijRWuPi4pLzRwe2c+aJp5/klZfu47tTbt485dOf/Tx3bp3y9FN3ePmVh6RZteXzkjjZDOwnhXhaKey3B2zwXFwcKMZwMnZcXe1JS2Z3fkkrjcN2T+80/muOjXBzw72HV+x2C7sf/3Hal63Y1Xbce/eP8tbrT7BZjYzeYTuD9egNg9DSrIlOpmkw9JG9UEsmzwu5gtRGnA/s/JolRuZYdUUxgSfv3OV779zmxbzV1fTLxlj2PPP2t/Opr1iku8mN9V3+JLvHkjd3bs2nfv93uX77dzz29y9ax+m6MWxWtJZIDZxT7Jij0apAqJRssTYfic3wkc01vK3wM6+ealfCJf79Gy/z/vCQe1aY54IdGn3vmWN6LGBFx+OLY7k/5dvf/W7+1WMKj/prBlsz5UsmFVsT77z3q9RWOVycwfg2bp3epLWCD4F/PT/JJz7yg0x+gzlcIB/7x6w/+9tHPqc51gocj85n+t5ANlwXj5QDSy0qPR9HdtMB7yzbXWHsC8UbNkNgXg5cXB9419ufQWjKbxDDw3svMnaBzXrF/f0lL3zhd3nx+XuUNDGMgbtP3GR7dcHZ2Z60NG7dDhgyqRTGvsNMnile0nvD9X6m6wI5zko+Ew/tS1LUXODy7JJ9rJyOJ+A8vHzxFd77N443xcSACBdbDefwubA7HOiGniUeSGWFlcbJZsP5xR5nG9fzzGqzYj5s1WwjhSGoxNjUisFzOFxyslmz6dWuLdawTLNmFBRD8Wr1Nv3AcnnFfk4Yb2hfoS0W+1vkXHGm4a2CTY3T6DFTCuId5hjvXpYFOgWGVqpmW4ojdCtGN9BKJJZKywuxGJYl4SQTa+Gv5o/zS+39r7Niu5b5SP0Ed29t2DCxfYzUeih73v6Od/EHacscHkNtahN3n3iSTd3rMeLLxi2zYLuBmAuVQi5RY+lE9R26aYtgjvmTpRzjLS3fdXLJd5+ck0sil0KKQs0GYwNuZbBOA4ORxk2zcNH6N14CioR9w/fXbeL26R1W9cDevlG1uGkHvptP8lu8hysGTuqBD03/hrd093FPP8
HohGXe0UyH84HfiU/y8+Y7SMfzfl3dxn/332Gpjfzp32K2hs4apBp8MPTjyHZ74KX757z19oqTsWNZEpuVZ15EQ2cr1FrIMbM5fRuHeebR/Rf5xme+Dt/1lKbdti70THFWMZ83HHaZaUpYEdY3VpyfP6JllTW74Kil0A0DoTbmeOBqP3FYGp0TDk0hsrnq8eaw6EQeUyKWyOZkIMaIFGU3rLvwhvfuzxpviomhtsY0CyVnbt69y4MHj6jNMY6OB6++yt27d7hz9w7b7RUxZ6w3pJ1i3aZYSamyvrHmsL1iP00kM7JguemFPsBhmel8T981dttEFEvX64d/fXbBPCVKqYg1sHsEmyfe8By7+YLgLKY2zSvwRn0RGYr3SkjOgt/0rO/epJRM12kgqx9vamZCykgpGOmJOTIvwmAM4XSDcYbr3ZbvqVcM8x/zK1lFUTfNwg+Fz/It6wvcE99AnL/Av5i/6XVqStcyH5g/zls2lg8ePs5vufe/bgX1LfN9fILOW/5K/Ti/ZL79dROPp/AjJy9gQscSEyVlrO81sNVozoJvCVqncXE+YL5IeFarb8sRKQJFvpihYE2jzJlJOvygAbA/tHqBf7L7htc9f98y7zMv8/v1LW94Xj8QnuPd73iaj8bP84v1Pa8zpXkKP7R6gQ+tZ743/zbT1RUxJ2qotKfv0DmLt45gMiEEsIFf2X3rG4xt4nvCB/8D5Au/TS6ZVTeQakEMbPcHsBapjd2cabZgnWe729EHQ8wQBJa4EPoeG3qcsZy/8iKlFEQyGI9NBywVauVw2PPCZz/Fw/sPWZbEsBrpvWG/KFZQ6dGVq93EenBghZoy14ei5qsl40KgC4bt9cx+yUiJpASrsdfowVbwpuc6X3IyOIbxjZPxnzXeFBNDjIlcVUW3200gHkdmNZ5y+eiM1XrDNG2ZDzviopJUH0YOi65uqVaNfreCGA0mWfY74sphfUfZXuOcZZkm5iWzch5bYZ5nWuGL2KtaGuH/+lnSd/6d151nTYk88/lfYjMOiBGCs1jRLAhpBdsKzQX8aiRPB3bLHt+PBBcIw4majIzQmqHWSIqRUgqb1Zp+7HHG0CjkdOAwJT5gn+c7wyv041odp31PqneRJnw/kfX1PX5++1bOkuOWS/z19Yt882lk3d/iyd3E0/nT/Ep6Jxe145SZHwzP8Wx7lZICz8aX8fwhv8p7uaTnpsz8YPc5vt1fkXNPP/bsLvfKVmgGL0YtwuYYXvPazkE8UhOtOcSCVEuZF3LLtJSgc6zGkbgkSimUlnCm59lwn7aCX5rezkUN3DILH5VP8eH1FX8hH/iF6R2c145bNvIjJy/zgXEmxp6/IntO5i/wC7uv47wGbpqFH149z7PyEmWqX5TV912P7zu8M5QUISU63+HE0Axc8HiNh2xuU14LzrFCZzX8KDXLqrdc7ReWFBmeusPJ6MlxoQuWew92CJY+dNRSuTy/AGdoy0TwjhwnmjjuP7zCHRmZh+sLcozs50RKlWHdcXa+pVQtiDqvnE1XK8F7mlQOtXJ5vSBeiNUQSKRouHG6ZokF3wWkFJwIEhwiGllYS6XvB6b5a9JE1ViWBavYVExr1CrERTsOMS4cDjtySbRa1eQzqZttNyV1lNbE9mqHOMvFxTnGGMZekWSH/czdO6ccDhO1avExL4WyFKpoEUfQMJlbj/6Q+fd+hu1f+lHKeBs/nfHMc7/E120/QTPQuwEvxwh2IwgGasG0wGF3zWZ9g249EjqP7VfUtGDF05owdB3FG8y4whiL1IxzuvOoOeN9YO08cZ6ZDpH99Q7nIxtrGboRsRoP/9FN4a+al5n314h1lBSZ94G+61n1jo9cP+Qj5dUj5KOSq5Bn7XOb0PH+cMkH7W8DlUQlhJElW9ZjIC97HA1rlTdhjpOBtUa9EEYZAaZVmtGJmCr82tUJ//OjuzzKnts28hMnL/Gsmxn6wP4wcdjvNUFbLN9qH/LtNx7igTgtmAxtSXwg3O
PDt86wnSr6Ws0YEYxTl+aHxzM+ND5UJL0IKUZyLeQGxggnd+5ono87TtrJaZLVEokIXjy3ZOa8vXFyaLszMIaUMtNuR3djTW5ODU1NmaSrccX5buZmb1kNHVfbHbRGKpUpZroucH1xxvrkBuvTJ9VnsRzY7q8YOkvcRw67A/vrR7z0whdY5kiriTRFtUbj6DvPPC/a4h4sKTeERGkdhi3DEDjf79msetI8IdJpQpqF4B21Nm7dWnO43NMd2aNLzuTy1WVXvkkmBsEaR+8d0zxTxTGsHCk1nnjqKQ77iWk/IaIxblYqyVouL6/BCF3XE3rLHCeevPs0n3npCzjv8NagBqVGnAsna8NhqwnVxsGShCSZ2jzGJGiVzdhz+PRv8k2Pfgeq0HmnHYdVjx8GxGsUXbCCbWoMMNZTa6PrB9UzlEoraLGrd2CUGVFjwvcj/dAdq/tO208xUmpjtVkjxrCkFeubllQKKSauLi6gXTGstOpdOo/xHm8NaZqQ1tTBVxNxiRgBCQHbRBmDcYFqMEeqsPOe2gq5agyccZYQAnE5kPdbrO3wQdH4Jc26TY0V73U72uRYwbeWivAbu1P+wf0n/zQgpnT8j5dv56dOMt82vkKJkZIq1ReqaOepZfWnDEOnmPWYYV6QIpiWKSVjjaE00dVPRJOnW4ZWWVKmWcE5Qy8Gg2XeXiLF4tc9zVqc83gLJXSaIF4K/27/af7R9N7XHyfyQvzYP6YeH6tiNN8hJ2IGd0xAT7kQiobieDF47/C+kgqUooi3tEScdUyHHf3QYyRjneP87BG9d9x95zv5tV/+N2wvt7SmGZ1L3JNygZKotlGrZbVeYcUwzQvD4FmWA2KUE2mMYYmaUiZzxGCoLXPMOFSNT4OU9bpqYlXK/lWMN8fEUDUcJtdEjEWTkMIGSiJGcMHy1NM3eOG5h4ybke1+ollPTBnvHZvRU9LMPFem3Q4yPH37hNsnA3Ha0g8BR+HRZaTQCIOnLBOxRqIYpjmyHizDEHh4fkmwhv1caNXiA8p9dE4/ZPGId9ghYFqFIhgsOHd8MyvVGoxpWKDME94FXIzqwZ8PtM4dJw4hZ8B09MEhYolxoh4KlAN2mXEiVCOa0lRVXZeTwxlPrBUDpDlhTdaIOr/C+pFle4kQNDl6GOik6YrYCjllrBhC1zFuNiqUiQtpyVQXtFePQUpCYkJcp/SgFHF0NNPRWtZUpRb4Xx7eeUxAjOWfb5/hA7cecnJ6i2l3SU2J9bpX5Op6TamFkjO2VkK4hV97CollVn5kTplDTogpquJDKcg2jFgRUjYqT6YS+p7Tm3ex3tNaxVlDLJXO9CxLwdiCeMt3DzvMxSf4ud03cNkGZH9G+KN/Tn3x95hT0gCY0pDaGLxmYs4pU4HdnJhiYj9b7mxGeu846TK7pEbAwzTh+xtcXZ7zxFNPEJdI13Wc379PzXCxv+LTH/897n3hBY2Qc1AalOaROmFsICfLyc1T8nLgxmbFtEwYBxfXe5oRrqfIzdEzx4U+BK63GeMKpnpubtaINey2B/rQ4YeO/SGxvdhx8+Rr0CvR0POhVItrVQEpneUwJQyJMXRsL7b040CZs354BvXZC+S0cHa+o1rL9lBIpWK9x1lFdRnj8J1hH7U/bQwsTXDGsc+ZzhqcFfVOYEhLYhZPcI2YoGsgQ8VUwZSmFJ7SEGvUaWiBWsjW6ypaElIstQAygPW4YQ05EYbAsl8QBN8N9F4UfZ8hz4qlGzcbjLmpgpWSMfMecYFlvobs2V8fSFkddqUJwRl8N+AkMMWJljJd2FCyTpwpZw2sNZ4yRVpRGE5YDfTBkeaZeNiRllk7Dd2J6i6kgXV67EEoBaIphADiHFIi1giP8uMvo7PaUYvKe/thzXTQdG3nGtIizgToHcu8QEvavSmGk80JKS74MDCWhXbkPrScyGmmFA2sGYLDmwZ+xDbB2kA7SpUbagGvVghdp4KklCjTnvf3hfekX+fv/aOfZ5oXMGqE6o
/O2FqVIWmttkKr6OdNE+YlcufGmqvrA3uBOycrpss9S1MBW2uWmBaaCDfu3OW5T/8JXdczdAOTJD5+7wWWaSEtC+Pacri8JpVGF0a22xnfd8R5jzFCyYneh2M+qCorvbFYa+iCIxdD54VxPVCXhrWWIRiuriNZPF70PUEiwX8NTgwiolg0gZJnfAhMS+XkZCBNkcNUKQ5cF9hn7bFvOo+b4WT0OC+cnV3TrwZ2c2K1GqglY+jZbRdOb2jQS84eZyutWRyOpQmGzNgFQmeZc2F5+4eYvvXHuFrdwR7OufvJf8ndwydprUK1WGmaxNwKgiWlhDeB1ix1KVSTcEOPcZ7gjYpbjOC8oRp1g/Y+gDGINdS00LKnpoQfBlUTWU1rrrVqJLqp1LiwXitpaL0+VVK0FeI0Kweyav7l4B0xJj2jZ2GeF6xVrJpDicvGGYJXRiatYlpBKvTOIRWsNCrQW8siHjHti/mS1gUqx+31kqgyc8clHuY3tsNuMVNKQ6TQxGKdoeRECIFWFNfmjKcLnlYt0zJjnSXXggt6Q1gRWhcQKg2LdyfU3FgFreBr0bnDtoQzonbwajHW4o0i/k0F4w3SOXIZyEvESeNkvaHSmOaZcRxADNv9hEETxByGVButCc67oyBKWDIsy8JqDKSS6IZAmiLWeUW6Wccy7Vh2V3ot7/d0Xcf9l1/i6tED6rJw82TgsL/CGEuOGWMK4hVFl0qB1EhjIeSZ4rvjcXUPNEptynLY7+idYz0OXMUtKSZN/RJLzYWyzEgtDEP/lWQiX3G8KSaG1mB/mBg7x37ec/dGR50jKTq2UyY4DW2R0jjskp69bFG3X+hZ4kLJlbELbHc7NmOHkwqmJ8eGNI0WD12mJeU+JOOARM5gg2OZJ6Zv+Aj79/3UFyXJZXWb+9/6k2w+9c941/UfkqNGkHur7pt6DNZNxWCNtuOstQSDSqNbUzwbTYVB3lNTpOWGOEON2nz2PoBBMVxHtyiu0ATSYYcJA9YWLJVcIjFnUtKgF+c6WqvUqlCVHNUpqroAcE4QMWxOlDBlvNUdgNWvh90Vy2HCia5EUq3qMhpU0+FE3X4FUa6heN09LAt1qVSp/O1br/LfPnwry5ekagWp/M3+nt7YryVe01GmPdFlgg+0OFOoGOtpXjFvLRdyybRjWzg3cK3qxGQMJWUEKFnPzL7raDnjfE+LkWqEbhxJMSLG4q0hzjO1WGywfGx5kp+9fCtntcN/z4cxf/Bz+Bd+hxi1oBmcVVhKq/Rea0cgGGlMsdCscLWb6DAY61jyjA0rBgIpFkptlCaUGFnmhBFLPwxcPDrj4csvstvuabWCFFqphHGkMKvQzgvzPLFZr0ml4pzjsN+Ti2VJqsqkKsuhVIsYRxUh5qQ4f0kYgroqaRhTuX1rzTJNlPzVFR/fHJJoEQyFcWUZBw3M8E642m6P5iQ47PR8bW1jaYnzqwtEHNTExcWOJoZp0ZuF1hj6QVePUgmdx1kUY24a6dhCm7MGiRqnK8r1X/qxN/oUXMcL7/zrmlMtVdmUVjRIxhp814FU/MrT9T3ed+qVpVkAACAASURBVOqTQI5kXv3aSqblinGBWgp5v5BKo4qhlqStQefViCWCFEj7GesDdZopU2TZbYnTRFsiBvX/17RQpi1pWSgp0azgg6df9QQDvXcabBrUGu6MxRpFyqV5T4sz3hlcZ3F+wPhewTE+IFIR1/QrhtaMZlwmDXqxVrBUPtKf8XdvPs9dGxEad13kP7v7Ct85ntNSRjGaDWcdArRZ24jW6oXuqEiuBGfxXU/Xj0jL+tkb5WjWFI9ZmkfuYszYCjYbPBZqwfsVDqHsJ8XOlUQTh3MWyYXfeLTmp8/fwVntASENN1k+8B8xvfV9eO+oNWOcwQocDpFUsyo+W8NYC6J+hlIKru+JMWPE0+dGd6xtpFKJubAZO07uPElpSuByznB1cY41luAckqGVTC1gjGWeF03wwlBKwYtmThjvyaWyn7Wl32
rDWEMtOvk7Z6m5slpvaA1izuTWgKJeodUKYws+fA0SnESUyOwFXUEdLBktJOVERhFnNc+kGBlWK5iF1QgxLlzvFoxTiXSMldE3xrFHRBiHDoNW4GPNmOCo1XCIC4lC5wb2aQLjqePjVY9Lf5NYKsF6pFnsEcXWxIARTHBY57G+Q4zBdrq1d+KOEA+rDIMWsc1RxWN7Sxd0Wy6gZhpxiBFi0ur2OHQ0EZrVIBRrQMrR7eiUHVhaBBfo3JGuFCPNHNmVwelkhMUY1es70yhpwTTRHUoTxc6HAWM0ILXkGeu1RdiqwXRemQz1aAYrR2cohbpUxBu+q7/mu1afwHWBUiulwD4GXM2kolVyjEHEEKeCsZlu0MiAVo7HgFZopWC8w21OaTFhTQEcLig6vpmsrd1jJ0LFZoqdMw0qHtv10BIpLTgDzVpKrfzs/utfh60HwHWUb/v34MXfwRpDCJ4lRqRW7YA5IaeCc/rZ1lSIRWtXnTV0N26wzIkmeo0Zo5AXH3qmqzNurEdePLtPcI6LV1+iVQjBIiRs1zNt9+wjjINjWQpSGyk1+tWggcBSOUxFUYB1xh13bmle6L0i+g77hXe94wZfePEBp+s1jUw/DLS8cHl5QXCO9fhG1eifNd4UO4ZaK7YTXj27RqhQG1IERwURxs3Io0Om63taM8xLIWdw3Q0enU/sdwudc+TScFK5MRgGlzl/+DKu92Rb2c7KXLjazcw5oWxuw/pkoFbY7hbM4eyxz6+Pl9QkCPYY3GExPlBroRmNGadEDA3n1J4MgZwTKTUoMyZn2txIc8Y7q2fvdsSJYzC2A+GoGHSEcaQSaMXh3UDnBkJY0Q8DXedwaF6ELeBSwcYFl7WXPnYdQcBhCNZg00TLRXmM1xNl32hF8xnGsKLzI7YAJdHShG0NKw5jOqw3UHTFFFN1q5xm8rJlmmdiTZgu4AdtcbaWkVyPHgGP60d816mfpGS69QndetCjSkxHwrvFi8ECwQmhga2Vfgisho7VOOKdxxrohp71esPYrxj6AWs93gecc5iho1uvsMd6hLGdhvRUwRr/FbNCWd1mOxdydZAjnW30Q0cVr5ZqY5jniHPKD52XhA0jxjp2ccYEKGlhFQJ9UDrTy5/7Y+J8YLu74s7dJ9kvke12h7SCs1Vxa0sFP9IFR8kO40aGYYV3npQz1ndcTYHLQ6Y01TQcUmFeMqenI43GEDypNp67d4FPHXVphGYhasv8+uqa05MTrh88/tr+SuNNMTE4ZwkCFo8To3APUykYXNexLJneGfZzQ7xns+qZ9jMi+Rgrp5CVaZrZHZSSlHMjLZlbd05IMTHNWZVlU8FUoWAYNx15mnFGW5H+9/8p5NfnHpoSedfzv4z3Ht/1Chn1BqzQQkdKi6YutY5alVFQqmCNrqwt7qlLZpkSEjzeeZap4Z2l1kTNBlrTbW9WfYA9dknEafqSCUFDXapBqtN+ftWVI1h/7NcbOnu0W7eKEUvoOgRHaaqKk+pw4woTnFqEUZxeFvlT9yT1CBBFaxelYFpRMnJWXwBiyFnwfYeVrDsYimLGakUsSuRqej4Xp6pDY7TeETqru5mWoRacoNv0XGhVA3QNjRKzhhFVPb113aAFKWugaWFSRGhUrBFsMxgxuK7HW4sRMJRjp6Fx2zw+05L9GTFXljjhnaLxh64jGGU2Nho5N0yrR8pzYew9MSZlVhSDKRCso7ONVsF3I7TK5eWOORae/+xnsEat0yUnJYS3jBHNv7Reg5pjSrTj9Yyx7HYT3geWmPHO07kjbKjBskSkFk7GgJOKH6GawrjpiTVhxeKksru6ZrX66iTRb4qJwRjDFCOhcwwnJ9x9y9NMcyLOC6ELesFQkRKJS+bV8wuSCFe7hTQf6UkhEKuw3qwYhlETgRYIodcVyXqWFAm9o1iPDQaTK6Uput77Dv+Z36H/2E9j9o+gNfx0xrs+83M8c/0nWGsYvEcEaGpy8TSMDRhxuKHHdcew3QapQSsLpn
pKFnw/aNGsWWzXqHmhFBXoSFOHqTUGK0a308bpp2NF22XGI1ZAtKtixGBth7iA80E1+n2Hs7olF1QMJmLpxvHYJbFamzh6+41oO9Q7beEaMYjXjEVjBRsE44QmHmmNVjNSC4ji6yhF7dc58msXI3/7uW/kh5/7Fv6TF76JX9+d6O6hVb1ZQ8BYj2nHoii6itecaS1jUOEOVXMQXlupa9UMBmM91Io1+jd88LTW6I5U71Ya3lpt8RpDrZWeSquVkhZaLfz48HkCr+dHmJIIf/wLRwyfw3vLrVXPGADnyTlhjWLTdGJT4tOcEs0I05TISeMAMIYi4J0j0rEsMyEEQvBcnb9K3we6zoI1LHEmxgURQ04JrDCOmjVRcWw2G0pR+GxMRbtiBmLJGOtZDjNd6BHrCE6vk76zeKc8iLZkaIbOBvbXO+qXW/X/nPGmqDG0Y3HHmJ6aGy+9/Ig5F4auI8bIlCJD52lSMLYR9wu1WGKqOGNp1lCaIRiPc+Cd59H9S1wwzIeF1WrDknfUAhKsfnBLIVYInX6Y01KxwVA+9685vffb3Bw8J6uRTd9TO8EDzgjUithezVNylAT7gPWOahzNQBa9OcvcoDP4XttzxjWqKTjFPSI1UYxDWiO3SmePknAvSG16M5oMTbS33qBhcVZj/ZSuLRg5FjsB5wN6LhDA0kRvTls06MRZ0Z3NEZHvvWLcTNCkbxFL8VV3YRUwlkajJU3ablmLgM3qDWtd4De2G/6763d8UeT0sAT++/O3kk8LH+wvjp2yP9XuO6euRBF0osmG3JJShkRfe10SGEO1QMvUqhkYDdWiNLFqegOkGkiReFRxlgYiiXKM7DNJi7vfOW6p9dP87PwOzlrPjTbxlnv/B3/08V+l7wdy1kT1J09vEDohne9xLqiLsRVurVfkGqm1YcWQayN4TRGzIixlIRdDvzZKyJ4jPgQ+99nPsL98RKmwzHv6UEmxYY1j3Awcpszhesd63Wm7sUDvAhhLbkJcNAvVWkOMic3GsZ8NQ++oKdF8j6kHrOmQZlm2mY4Oa1To5W1H138NTgy1Nmpz5BTZx0wuSpsJveewvyZGx+2bHbvtxGqz4mq3gBEePDjHG+i8ZZoi3gj2mASVDxNPf92TUCr7+Vq9GN5oMalk5rggGPw4EELH9WGmGsELet51Qsv6wdcStfaGrmYlJaxzenE3gVYwRrfNGSEYwTRDFs2hoDVsZ8jNasGyVt0iI0iO1CZ03bG41sA2o208vZO17WnAHVV5tegkYETAu2NlW4t4UvU4QDPUpsaZ1qAZwdI0IJWKk4rxDuqIYDH5QHUegx4rNPxASUlNKsYUlrlijeZpGmuxUqip8Y+2z7xB+bg0wz+5fgsfHq+oOR6Zg0ezWCwImirurKVUIVijHaKmR5ImhtYKGA3G0ch4MNbRajl2K4TcKsFUWm5gs7ZHm8HakSkfcGLIriMvM7UWPtSd8cH+AZXMi2dbfvPqs/zfGHIuUAulWXLL+KohNUPouNrtsFZN4U6ETGN3mDHVUFshxgU7ro+5IobDkjh7+JAmjWk/cfboPnNMlJJ0Z2gNpUC/GY8BM3LMnBRq9cRSFOuSCnGOYA0nJ2t2Dy8BS8xwuhpItSLekpeJ9XpFzRVrA64X0nygZUc/dJQ8s8xfg2nXIuCkcZUmOtuxNMfJqqOkzHRlsANQhf2cOb86I9eMcR3T7orx9i1qK9iWMN6zWQWWZc9w4wab2zf5wvP38MGxnRZa1VRqZyuD92AEawspqVhm4z3rvmN65v18/j0/TB5uMcRL3vPS/8YzD/4YxNKOYSsZMOKAQqmR0gRHVE8CHsTgOoO0QmkOWTImCDYWWhHEQpNeadIYYixYDti+oxqvN7pv5CXiva5AJVXdSVhB8qLb5qaiK5FGteqXoB07FyVDjYgRvPU001DYUqBaj7FGLy4rGFaYlMAaUjVQo0bLl4qpRY8lNZNzw7pA6HpYEs5ZHtXHe/3PisfEqPDYCj
kYmveUWOmcx9uq23QszmregrUdIlBLptkOU4UmqCrD2OPxyGGMOhWDdTrFuEZNwFKodaFUg6tCa3vysY39sfgE//TwNs5qxy2Z+T7zSYq9hzVCPw7st9fkqsewVOVIfZ5Z9R2pFnZzgaIt2ZIqhUoIA7tdIoTEjfXI5SHy9ne8k+vrPbiBV159mRef/wKHiwuGYCjAPBus73AsXFxkpt2BJ566y4Oza8aVQn/nAg9fPMOEQMmZ3W5PMAbrCte7mT5YDrsDm80GsMSUORl6Ws30ncdVi/GewQjie7V8fxXjTVFjAFhSJkfIy6JtoGFgv59orpFLAhZSrSwlk5uj7w3W6NaqFCg0rGlIXDBG6Hp48MrLTIfEftILrhl33Ipqgcg6Qy2wpEZwniFYlnd8Bw+e/Q/J420QYepu8kdv/1Gev/PNVCuUCsZaSsuA0o3EWEqK1KIg25KrhsG2jNSKwahAqDVVZFpPTgZJMzUmbUPq3UBNhZwjOSbSfsaIqhJpmiikPUTdxtcCNUZa1puYY83Dek9t2jqz1mG9P3Z6vCY/WatxaT5gnSP0K4zvEBuoqSBpIS8LrTRy1hTwlI4gldawCDGq94DmuCOPL+rdNlGFYMdWpRHdgWCVslyipmH5oIg2iwrFrHXYsCH4gDHaCWjlaOc+HkGstYgJtAotaSermUIuC832R1bGxJz1sX/zcJP/aff1X9QwnLeBf9G+mc+ap+i7njBq65jamKbE0A2sg6XrBg0iQlvj1mnNwByBPSWjOLXaGMYOayyHaU+p8ODF57h49IDt+UM1caVMzsK0LJScmWOlFmGz6iFlnMCSE7lqvWk3zRRFybLEiD8a+jwOaQZvPb3Vwq63mrmpNvmi2RUlM6x6jCmM41cnif5zJwYR+WkReSAiH/+S7/1XIvKSiPzh8d9f+5L/+y9F5LMi8ikR+ei/zZNoDZzr6LxDxDGsB6b5wDxHjBNKTCypsR71DDbFhAsjw2pgv58xBoVwAq5zLNuoyUf7ic5aXDMU6XDWUDHUYvDjiKDuPSOwHh03T1acvedH3sgWtIFPft1HVVwiRsElVEytWGOhOqhaYW9VI85KbWQsRRwNozESaUGartLWVur+mjZNsMyQdSfQKljRzADbeUw4SngR/FEmbAS0+6CqyqOHVI8oNdFyQaxW/l03YE2HGVaEzZrQdQSjwixjLaELmJKhFMq8sEyZvFQonlwsYClNC4HiBsRYME29A0Vdez85foHuy4p6HYW/NTxHPpKe5HixtdbIi8YELDkr4u4wYcTjfFBVptEibz2qHw0VHzwueG2bUqlVDVWlHVWE6QBU5Bj6gxOWIvijI/fnlne8QcOQxfHqO7+PJUWuzi50cjbC1X7m4npP13ls00KsiJBzoRqHMxwj/pRZQSushkGBw9YipeCNMB327HbXxPmg2oMELRfSElmvO3KC0Af61chuujqi59RyXkUwzhGTwopzEkX3m4Z1TROrRIg1E5dI75zmm0gjpwrScCLMacaZpqCar2L82+wYfgb4gcd8/++31p49/vtlABH5i8BPAO89/s4/FJE/V3Kl2+SMdQbnVZK63e1YZjXDOKvxc5dXO2rT1lRtWoBzzWBa1QucSopgncVJR6uB9apXs1PRAg6xYpxnvVmzxAVntD++7juCd8Tu9LHPcQqnmKraA0FIyWoYa6zkUiglEaNQxaqJpzUqVm3DaKo2zZCzQUojHrTNGa+uSVfX1GUmN6+hNQ0twhkNt/WdcsJrzHpxuKPy7bUoODFatRYtIDYUuYaorLOZ45kdwR7bhi1mbEukKSFpoVxcsb1amFMh5co0T6Rl0pZlE6UQSdOsRKM3X6VSauZ7bhT+85OX/lT5aCN/99ZLfG9/jrVe0ylL0Vg3a4+kZ0fKlZgixhqs1xJlK00LbcEgTjSurQolAQjkrK+lVlVeGsG6AF2vAqdlocSF1AS8dnFyyZx9heNO6k5JMXOyPqXmQggdTRLb6wsVxN
nC4XCg1UJwjjhPrG9slBrtLa1qrKJx6vatwG4/I1Quzx8hcUfLkZQTxoC1QvD/D3Xv+qTZepb3/Z7jWus9dPcc9kl7CyEkJOtgoRMILIFQjMEOMQoYHKfKST6kki9JpSqf8hdQlUpVPjmpVIJNxeBCYCMXGJwyIphCEkhCkiECGYE2W6d90N4z0z3d72Gt9ZzufLjXjE6zbclxqjbry8z09HT3vO9az3M/931dvytgrFZhPjim8YB3HTEMGDH0XSDnGYMn50qujhCcxgF6i/eqwtxuBxBDt9jEG0YZGQjeWELoGKxl6FaEf9+2axH5oDHm27/Jr/de4BdFZAY+Z4x5Evge4CP/5m8CuQi0Qr894+pw5DAKTQrzNCm5d9ZwkFob1ns6b3hhKtAywSspyDo16iCVKSeGoaOkiTRB6AIlKYpss9lQ0gg4juMOQ6OLkWvrDX26ZHrA4rAuVzTrqKWqCYlCFa0UvGQaa0BDTlVG61WHwPLQVoM4r9ZoY1mdbKiXjanN2CLYBtEaigHjBGvdMvt3mKIWIiMoo6ACzqunwlrC4oLURqEuLMHrsQnrdXHpe6iiU5BWaeMl9ZhJh4m5VPYTSHS0KgRjNMG5WSoeIxVLpYkSoo1thNjRasY7DWt59+klf+36kUZbjg+NOkL0MCfRiY4RKo5SCi1VjMyY6tjLSG/AxBN852m1YKqKxYpov8aUgpGMcQGwNAqIw3gPtWKNoyL4qCYsaw02OtKoQcU37LwcI772cuMF1hhCaIiFnDXdLIlwZiPGKs2pmIYzaDWYM+OU6LYrxeqXgvenbE7W3L68jbTK4bDnePc2++OBVCquNHLLGCOsNz3HsVCLUFKh6wZqLuA7nDME59hue56+dZvYBY7HI30XFl2I0A093sFul4jG0DkdyzobMdZxfdPhXSDNI5sQCHFNMOWbfIT1+v/SY/hvjTGfWo4a9xC+jwNf+qrPeXr52Ddcxpj/2hjzCWPMJ8ZUCD5inQZl7K8OlFxoNrAdIhHBdx2tqTy28w5pguRM1+msueuMjvGoECx13muJ6Sz7w5H97oq56At7PB40et5Czg0DnGw6rl9b84bn/iX26xOPauKttz+I805DW2teEO06+y4NXJ0xUpBWlYtoKrUaGqp2FlHisllENyVVxPT4fsCFgHOOXDNGEdPLjrGMEH0AF7Cxo1mn2gLLojbsgIC1Xk1QzqsK0Ooho0nFGkOZZnItyqE47JBpZLx75O75JcfUKN4vZ1ujqkzfU8wyDqQhOdGw+C7iHbRUMKWC8eAqzju1XDqDCQv5KWgSs+/Ur1AbdM4QjFByJmVhbjr2q0V3vTpN6gepjXKcqLVRhUV/0VNyxSA6wVkcg0JTZacxVBM0vq2ooE2Cx8jMT3Wf/wYNg5eC++Q/wVjD3ctL7aXgMEuTs45qQ4+hQ5a0aZbFWb+xHl9v3thipJBywjidEtHg1vPPczi/wIunix1ZhGY81hqmuRAHz2EcacYw57yctwxzTqxWnv1xp0KxReSVc9LogZy4ezkyDEGBtf1A33d00WNKYVgNBG/pYiB2axWHfYvMx3/XheF/A14FvBl4Dvifl48/yNz5wBpGRP4PEXm7iLy9D440z2y3Ky4v9hz3I+TCjdMNupAqnr0lIXpHP/TcuXOJt5au86yiR6rjdNPRknL7fFzrTjhWzm7cYHu2YTpqJzlES5knpeaIJYSe067noUce4vHbn+BNX/gVhvlCzVjzBd/9pX/O6+cn6QO0xSRTTSXVBE7PcwWDmA4XemK/VjWcbWp5lqqiEwM2jzSx5GypRoh9Rzw5A9eB97pjFdXqm1bxxmnjTTKYBjZgugEzrPFDRwsrUvDa8CSo4Gm1XRqFyhGQkqn7u6QXnuHquRe488JzPH9xyfOXVxwK7MYj9XhFTQkpjS46bJmJJEhHyrhj3k+LwaghYsGjxpySFXjaMvl4hS0jFNVROKPdd+ccvh/Yrk+oWVfJII5mHIfjRC6GXc
pMhyv2h0lTyLMsEumAkaoz/VGTsaXJQvOq1DwvjdZ7XM1MlUL1YGrB54mr8x3v6m/xTvdltEMkWBqvzZ8nPv0JjPGE0JGz5jvMc6U1x+V+wpoFaiuqwzDWk+bGzdMN0WnTdDV0dNFqs1Sg857QhLsXe+aiD/Q4jpRiiH3k8uqIj045EyJ0FsJqwz4l5lzAOa6OCREVUgXvaQh9F9nPleOY1SJ/rKycIDUxpowpjWsnA+s+EpzQd5azk56Hz3pOr70Ifv9Frn+ncaWIPH/v98aYnwF+ffnj08DLv+pTnwCe/Sa+3v0z8WF/oAp0q07luimxXQ1cHSZitJjoScvsF2vwwXF1mFj3vVYZpnCcCkYK3gSqONadIc1QSmXVe5gUAmJE58rBe0y/Yr7aY13kq9c3Y1R6TENFRLZirEPRJdroCb02qMQYBL0RLBbbZMkhdBjRMaVlCRdxgRA75Ti4DuegALHvMK1gQ1Cl5JyhDxjnmVqj73oNMG2NQsPawmB6TC0InjYfdRFdSl4j2qg77vaUqTCNR1o9kiRSqro9nUDwEdcFnHU4qrJfqyZ71dywJhKtUd0BQjAgYjFtxtUBEa2moo9IFn573PIPLx7hdg3cdJn/4uFLfvD6BNOMEw82ExGqFfI8UgfHOFc6WxlzIcSMOI8zHhN0CiA2EpCl0dzuux6lVsQF6nhAjMEtY1sjDSONbjXwwfE6v1sfpS17YcPwGf/t1EffQvmzD9PmpToykOaMCY1S9AhUpaIqeIMVReHbIDgf6KxOcrrNCm8cBqvuz5r0fXQDc5qwTYixY5qr2qZzxhhPbY1xnGl2MRGKUEshS6TvA5djVmqVhbOzLVfP3iE4lbQbU+hXW7aryLPP3OL0kTOibazWHWncsQp6v3V9/+9/KvGgyxjz2Ff98ceBexOLfwb8HWNMZ4x5JfCdwO9/E18PY4UyXy6iG230gFBKJSzij9PrZ5SSVSZdKwHIuTKltMyUHaZpnoO1TsvNUim1Mh4TJ6sesJyebljFQLCWYD1pmri8c8H5xRXPP/Y2/ujb38u4JFEd4xkfe+Jv8Nn167TSqOa+9r7hljcyqfrRL409q74DrMMapV/f654bHzHOKE4Nj/iA8SoB9t5j1HABScNla620ort0iD1Y7kt+bYVaVBWIj7CUyiUnzfc0UEXUKj03prGQ5yO5eXIqesSpDakN7x0BSwiOYJ0q+iQQfcBbbX55rze+FzC5YcWC62jS0WyH6XpMHPjt43X+3p3HuVUjguFWjfy9L9/gt8/VARr6gHUQgme92WKNpxyO1DRSy8RxPJAOd5lKY7o8J+12jPsj835HS1mPlGIREUwtNBFVzxpwKAuDXClNg2pDDLx//o5vmEpU65G3/xQxaHPUewW8iLE452lGNxMxynM0Bu0pedVVeO+JfaDvO5xpxL5jmmbmog+zoJ6XWhrzVGitARZnRUG9LEpVo54UmmEpGBiPI1YgJeVk1lq53CnIRu8r6FfDwvURzkJHLYKrHpkTK2MZ/IaWGyfrU4b16bf0jH8z48r3oc3D1xpjnjbG/JfA/2SM+SNjzKeA9wD/PYCIfBr4x8C/Bv4F8N+IyIPDDb/u2q4VXBqd1cbRsmlHb7CmkUri5OZ1nVcvmC2sg9aIzmGksT9OiFTmqeIMWBcJscdYjzeW1dkaWuWwP5LnhPolBe8s+90V+3HiUzffTbVfN660gd+/9lcWVaK+gbUKtlUwKupTDb0CU+w9yANaKjpjsMaCMdojsA4RqzeyXwQ8ov+eWhQmK00bdMZS7/EMTEWsBpG0XJdcTAXP1DmBNGpR+lCpiToeafORfJyZ5on9dMVsA6N4qovU2jQjNHZEr5Bbu1C6q0RMCAQs3bDCOE+uOrpsolORVgRjDXY90J1eI15/GLM+4R9efiMDchbLz71whquNWisuBjw6Xeh6PUrO00yeJigTOWfm/RViDCYlHFbBsVbd360UbCtIA1uWvA5xtNygFT2/Sk
OCwxjDnQeE3ACYzQ3Othuq6NSmlHqf+4CxSNNsyCbq5bDOsF13OGOYxgS1srl2wjCsMC5ynCdSzjTvCdGzXa1pEvR0VfVreq8TtK8oOS1OPCF2dLHDGsPxcOSQdOqjTndDqrphqjpU77/1sOLO7R3ONwyGuOoQEwndBu8jzgfCao3Ub+oxvH99M1OJ//QBH/4H/4bP/2ngp7+VH8Ia6KLhaqcPiZ7lIOdCHwIYx6br+PKzX2bOlTgsKdEGjBFsCDjjNIwGvcnWqzO8daSmUJbj2OhlIg6OkyFyuRsJwZMQhiA47zhcHTm4B4ek7t0WjKehuYvOaFFaq9EcgapdcI2897iWkdzIBry1OG8R4xd+3zIbxymG3oEpmToZpTohtNBhXQHrmceJfiWIrDBZff/OO2qe9WajUtMMxiM1U2tCcqHmTB5npnlmf3WX1veY0OF9YD4cSTnRdw5TMqUGYli8H8arkzA4iljI+vtaC9Y7mhEIATLED48d3QAAIABJREFUvseenBJPzui3W+bDXW7lB+83t2tQaXIplHTEuV6lzlSFmdAQE3DRUW0lRqHWAZwl1sK8WInFVmLfq4I0a7XgJFOsVXaFcD860FT1st8w8wMXB3M8p4qhVc0raU3ZV1VgHAvBCx6F5yqhWbFp0ffMc2KzHThcXZJ84/rqDL8oylde1ZnzdIDaqK0RY6CkwrDqaHmmjIkWI3Hdq1u2FRKWmAv7UZuMoMe5Iej33axXyshsldPthsPlXaxxhPUKb6rGUQWrlV3n9WgaFqLZt/JMfkuf/f/T5ZzFGCFVqDnTeVV4RR9w3uCAvvO0pNLjlCutqC7BW6XY5JyJfYc4h12OITklas4cxyO5CFEKvRH2+yPWGVabFbY1Ts+2OOOoaWTT9g/8GTd1p+JlUbqzsZEiHmdVlkxLGB2YUUrTBcM6pGjVYDCqoLNWsfOCuiYxiDSKUWeVLQmpBd8yelMUnAGRQp0SaZ6RkvUIUAqSZkgTNjdkGqGMGK2kSVX0WJEnmsw4lu9XsuZCtEwulSKVRoY6I2gxlmuhNo3ec8ESrahFuKlqtFpHspVqFEFWLbiuo9XKQ/HBM/MbLpPmA/N4RRkTOc/MKSmf0HdUq8yHUnXxa84xjxO0mTwdyKng7oX31IZUoVo0eVuWBXGBuVrvF6OVo6XM3+qe/IaphCkJ9wfv56GbW61Qpd3vEQTnKK2ghxN1PYoI0XsVELWKkUwtmWHTsbs6cOd8JHpHLQkbdDMITnAxUpsw58JxTuTcKGIwMSA1qVmszpRS1JLdIJeiUGNjdVEZOpoIuZRlQ/RLVCBYI/SrDhM8Bl3MjBNd1LzneDxgQ3jAO/Li10vDK2Etz9/e0Yrh7rHw6I2BWhve6Vuyv9yxmwu5CFl0fBa8BSnkCr2peBeYDhPeNoY+UHLSF/H0OvurAzVl7NqRjhU7OKQkhs2a/voNxMD29IySE2+5/UE+8vAPU8xXXkjfMt9393cxTl2MLoD3Vg1STTmEtSpBuA+C2IFqHa42xOr4TJqCZ3GeIhBFWQtNMjQotSLHBLlAP1DFQlYidojKM6x1D1Gj0Ns0Lco//RmlFpw0ZoQ0zZScSOOekjL7w4HN9jq+H3Sxag1xTtV5VztyhTAHEgYjB+pCrm7ZYLynSqPliTk5qlUVwb0si3lutC89iQmBg9fj09/drvlfz5/4muNEpPKTw+e5unPJOB0JncfmSjSGKVdi9DgGKoIpM3U+cGjCag27UQjWEFojdQNGLFe7C3z0RB911JuFQGWcM97BdHm1wG4NRjzfLc9BNPxy+g7O6TmTkcNHfx7z1O/x56VQBcKCUWutcZxmrGl0iyzbOG03W8OC+odtv+X0ZIWkTHCB86sLXPA8dnpCcx3r64/z8le8ij/77D/TH4VGqYZWZ0II1NIYeq95qi5gLQytkebK5dWozshoSKVxPM4KZZkm+i5ysvKMhxELdD5AOtBtNwydx5aKk0LwAznvKNUiL5
LA9WLXS2JhEEGbcEXu+/LneWS7WjMdJ1ZDRObMnBqd0zLcL/kaMehkopSG9Qbv9c3tg2MeGzVDbgYXHEEszSiWLHpFmz320EP42NN1HW2aeVn+PP3+Q/xu9zaO4YR1ueJ7736I1+w/A90KfNGd36hdWUVHQelRzlCI9FbPkqUJxkeSaUQbMSbRqgpuJPj7/EKsEDCUUu5rIYJxCo7tOto0okMYiymy5Eca8n6ZfXt1Y5RcmCcF1aTpiMwzaZyorWEWxgHG4EKkVLWj33nuFtYFDJm5FoZVoCRVV/pokblSREXg1WaqNE32FqG0SMkCPVhplMMOg+XtXPJfbSq/eHycOzVww878RHiKN+enuVWEWoUyFrwTivUwHdneuAZGF9kmBhM7TE6UJticcLFTs9k849H3rzWhVYWqVoSWK7ap8ctYpW1ba/md6ZR/kt7GHem4xsTfNZ/idenz/I9f/H1GEXwImCljrceKHm0FQ4yOXDU+QFkRRRkxLRFjR6szxp6oYKt5qoxIyZr/WQ2PidCsUY9PqpisfQCMU7GYiKpIEVxTFqkJloL2E6z36t6thqnMdL4jBM/ZSUeHpYjFGkcXLZ14aEKZGpuzFS0HrDj6ECmXI4f6rRGcXhILQ5OGFMMxTZqWRMM5j7HKUqxSSUlNUquhI+eKVcIoznpSUVXkOiqVaL0ayPPEWODhaLm8yBgxdN2KxhGHcOPaGXd3B/puoB/WDKuIDD1NZl6fP8fjz34cRPjzk9fxsYd+kN+6+aNs647vP36Ut5mnAYNfzuOCU16CWWNropoeH+1C6Kk0pZnQfITFhJVbxRqlF1kEaZ7qDN4vEu4mGOvVlKXYJKoXTEnqN2hVGQXStLR1Aj6Q0yV5TtRcaaWSW2F77ZpWRCVjwwrfBehXtJRYbzeM80xqldB1fKw9wq+713ORVlw/Hvmx+sd8j3mW4uN9hoJUi+SGUAndhAj0TiclrWSaMbzDfZnvPX0GGyJpPHI4jhymmeeff5b56opnH38HHz75Pi7tmtPhwI/JZ3h7fUbl3stRq1pHTjPRO6SPtJSwtlFspHcRQyWnETGW4KNa0HNbrPANaZUPjTf52fErE4kLBn6pvYG/SeJqTKoQlWU0GQGj4ixnmhrXmiNJxtNozdAFi3eOYKtKsRFC7Li4OihSvos457n50IbmPLdu3aGlmT56bBdZO6++iOopYhicp+aZnAurIZBLVX6l0yNzqo3WCqvo6WMgOkPNBttFhlAJwamwahvJudJ85bgbOb15gzKOrLo1NlSm9K11DV4aC0MTUqnMU6bmSrAWiQEMzHNhve6ILtFt1jx/Z0fnLUPn6Yyn5Kr+9SpU18gZjM0Y1xGCIydNsl53K3KrDMMaUxOtKSWp0vDB67zagsVpIpF3PHXyej5y44coy5Ri50/4wPY9hPHDvKU8he02uuPXAkaDTn23wmE17ToGjHFQZppRIAtNaFLxXcTmmVYVqAINFwzWRWpDd0q7ZGqi489WRef0CLnpKNIsuDRElPRTAWOZ04FpnticbRVJlzPBBs2d7CJJoNuu6TcnTLefp0yFP/RP8Kv+LfcTp89Z84/sW7nz9DmvGz/D2fVruku3gOsW3JwRnK3UvU53ahHwllIb3RCYxwNTLoy1khay0BdvvJnfuvGDFKvHtUu34RflzVgrvLU9S4gdNc16rEDL/Pl4pPmIixamgg1Nw3ii1zJdhDlp0IueMrUy+YXDy79hTJmN57fCG9QbUzJqRdPRpHV20asYRIQqWpZ744nOYKxliAFqIoRAY2J/MIToidnpBpMzZzcfYXV6Q0V0rWJaZV7GyH0/0FIhz9DHwLEkrEDfRy7v7AirFaVcULLQWmMz9HgpBGtxVlWo9JYqlZOTE9pl1Ua4wFyFNs3YyyvWq4GaM6ll/Mm3JnB6STQfpWkKkDGB7XaN8x3WCFf7iW7dU4vQrSLz8YgADaGLUcsyqYDW9LU23ZGNYZwrRizzeKTMDTOOKl
RyliyGW3evyKUxzYWaJ41KW+Aoreis/BNn77y/KNy7ign8dvfd1HuSXNPw7h5UpQdrEafBIZVAu9esFNES2aEjvpKRWqg1U8pMq5ogLQKNuhCxZ8qsVm4pWiEZo9WHNTrBKUVDV9T0tPQdUmJOmfVmYNic0Hmvja4QEOspqeKlUeeJzemK69/2Mvp1x2+uvutrYugBig186Il3U2rg9rO3uf3MBRfnt7m4c5s27/HRkpsjNzjmymGemebMPE8cx4nxODMeZ9Ixcf7sHU5WKz762FcWhXtXNp5fs28ghGUxdQ4fO1ortFwXabRWRwK0nMjNkrISqUspmOCJwbN40JBmuCMPFvZcMmCMSqstELrIPOnEBIQueC31l8XA2EbKBdsyfa8IuVxHWg242FFtoO8DeW4UK/Qn11mdXWd/OBI8uKAOze1mTUsTZW441CRH0WCbnGaMDThrOGb1u7TWVBtjDKYWqHDj+hpTC6vTDa1mulWPWEvfqUhNjI7TRcCdrNk89DC1vgjv8kWul0TFoMxHtY8OXc8xVYIRzJipxlJiZbc/MlbdGSIgUhcOn8dOie6ko+a8dH8r41SX5tjMmC2veORM4+npmXNmbgVPIrxwzo1rZ1gphDho2WYt+6s9e3/ywJ/3yq7BgJUZi8U4R2mW0DJNDLK8rLbOiNUYPSsVIS4MQ52X21oxpejDasESyYtl2liLXfoOOkM3WHE6EXCBInoDNyc6mRBDSYVUEsfDnth39JtTTahyFuMCVRRfh9XFwxhPN3R6JLv5CJdp88D/710z8NgTr6DUzCfty/jNzV/m0q7Y5Cve9cxH+GuPZkwrfFxexq/xai5KzzUm/qP6p7y5fImPzDf5QHwjV6/YcFr3XLoHo8zPZaBVNaJ5H6iixKkqghPt5vvFp5Cdx3tLlbY8UI3gtrRFWFKq9h9umIk7D0i33soRt1pR93tKBkvRiZHRhcBHdX8aEXKtBFRPY5yGuUQfSXWmFiEzYYxju+q5e77joccf5+yRJzjuD7TpgDFOvSHTyGazQXzAz5WxFmKwzE4fROMCUmb2x4R2qI3yQNE+2mrdY2uijDPROnwT1tuBVDJtTPTDiojH9oEYHaU28jQSY2TYPvi9fbHrJbEw1Fq5Os4cc6VfCa3OTDkh3uCdch2PY+bs2oZcRjYnG9K4I5qO45joer/gyiwJOL/SDm6SykSkDxOH6cAcA888f4vLavixH303H/6Nj/LCMfPG73iU9dnDGBo2rFh5z8WdO6zLjkP4xsXhpO2R1jCug2YoywSlihAkQ+mQELVR1xxGYCqN0BJioE0zTYJiXox6AZxYxGjZbGvCWBX/GAySKy04ms20qundzEctd1tgmho5J0qaGY8T6xvXcV2H7zrVGQCeQkuapKzNXouVDMay3p4w1MSNi8SdByDWr5uZ/mzFh8ctvxrfer+q2MdT/u/HfojV+FE6s+KX7evup0hfMPC++kY+nXv+aHj1/X9z6bfaNX7Add2Mit83GhHQasJ5v1CVLa4amm3UPGJ6j9AhuShz0YKUmVwS5V5WhjT+dnyKfzC/lvRVt3qQwmte+F2erMqv6GKgTTr7LzlxejIw7Sa66JlToY8Bb6APGoloCaTpNq5f4YLheNgRfU9tjUcfO2W97njs5a/kyU//P9x65nPU1nBdz8nJmqvLHbvDkWvbU2wpeKOgnjwmnrsDrTSmKav3Q4STdcc6GqKxeBG2Xcd6vabrAnHokTlx6gc42xJ7HavGYa0Rh2UiC1xe7TD+L2B2ZRPBSKW1Ss4zqy7QvKrNajVczYneO+apcrbqYIlnm6aR7XbNnEdKLmAijUwTx1wqsesoxwnvHMUauq7n2nXHW97+BHfOL1XSXBPOeoahZ3d+m01/jWYtzQpvufU7fOTRH/kaJaSXzA/On9SHElkoUAbNw6g0IrkVIg5MB1Y0S8ZqeUxtjKnh/UIiMg6xhqkUvFFKkmmVNGecN5hmMc5Ql2RrTKMmpT
rlBKkdSGXSh0eWEJcuLuPUpi7HDNUN1E7HbS1ljbarhiq6U4qJ/J3ts/zM5bd9zZk8UvmJ9dP0Yc0H2l/+2vh4oFjPbw1vwRjzwL/7g+41yrn86sssGKqv8qRECu+VP1Ulp6hmIvbxfjy8WM+8qEptF6hVR9XOGkqaKQViaIj32phuStt6S36K/0yO/FNezwUrTuXA91x9kqvPf4Jp3NF1npShFh0bEzy1qUu1VHVyWqNSdb9IkcfDXVwY1IpOx3bt2PiOVvZgKw+96jUMfUfJE5JmgrP0657DNGGcJpiBErhrTlAcuYpmd4aOey9/rRVnlFK26iObGOiiZxUc3clWgTBn11hvNozHA855DtORw91LaJVpTkiFeRwp7i8gDFYE6lRZh55o9VyeU8H6AAih94QpUnCsTzccL25znCreqmfBOY80YRoTxqMsASolq4ItVEPnIo+87AnqC8/x1CefZeMd1gkGqw+lA+M9UgulzDjnec38FO7Wv+CT19/N3p+wrXveNX6c15WnVGrbKgWlCnmvjUvnLIZKyxPFi8qjjVYNrSRlC6CLShNopuLFYowi1KiNLIsKr2rT0QHi3dI9L1hrmHOj1kyrWim0VmkpszrZKLrNBKRaakZzHZ3gsJRWML2qSUtJC4FZjWE/sL1E2p/zS4dv407ruGFm/tb6ad4e7hC844IHy4ovzerF39sHGm4B0QrhnJ7rZuIn/Gd5h7uNtEYraupqNKQmfV47T5lncs3YeUmr8tr89EYQa8kpYRuaDWkhpYSTe5FtKPQKg+9XfOnpp3HdWgE+C8hX6/ZGbZUKBOxytDEEpwzIaIVJItiGtR02eOqUeGG/48ZJYHPtUZ541ZvIpTJe3SJYox4YH5h2RzbrHm8M8zhy7eaWIjOp6X1aqyF0lmnU+6/VQt97Tvqe05Wni54YPeIsJSl+rha1e19NiTLvubvbc5wT54eZq8PE5W7Ue+PrF+d/y/XSWBgAsQFnNOU4emizZge0edTsx1YVcukU1OlE+YGpZWUiWsA0drMhuILxgXEudKGnZmUXnr/wPH46MsyJUnp6Ywi1cNZrmMvpQ4+yO3+OVpS4NE+aEfnVl2kFQ6VUQyEovKNk+m4FxqmMu+9wLWH8Rm+2qjedCJr6bBardm744KgtL3ATzbG0shCqrPYH8B2mNUpJypsshdwyUrJG3JdGk0KuhZPQK1/QaOOs2UZpM7oNFS3NWyXXiljBNh23lVbJpfKu9Y7vG/4EoWLu6QOMhsC8GOzkzIwgcJdvXCCsET33f9113c789Or31G8CSniynQbCLpVOzhnBEIY1Tcf5xCHQpgTWqzqzChIipukCLCmRm1aU1kc+nB/nF8yb7h9lrsyaD8S3cPrqP4PPfAgrjSza4DM0NZH5gJPKlGbW0VNKgU446SNTKupCRfM+xjHhBBwGfMfZ46/l2tkJLzz3NPN4IFgPtuDIzKXhxpmSBO8aIQTSYc+UKwHBeQ9Glsa5SutFVJId/Yp+1ROcZRU7xHeUUsjA4fY5f/LMbS72R+5cHReSlCUXjf7z1jLXBx/fXux6aUwlRGimkbOFWpjmTJMlZMPUhUoc2VzfMh8P2GwIXSAfZmKMdEG1AFr+KRzl7uFIyg1ThRidhtWmxuBPONveZLvuMQ1O3EDXr3HesR4C1qqeP9XGF87+Eh986K+zD6dgDDu/5QMn7+HT/jtU+loLpeRlpxGsFH0Ym9BCr2V7EzBNB2+tQZt1V6pqG27HnVqhrVP2ojjEaio1VnRkm4+0eVI779L1n9LMXCZqzgSrMfchRlJJGJT2DA0rDWNVVWeMhsR4r6Yz1YwEpU+LwnFLAxPU/+GGSOx7bYQ6y0+svvANsuJI5b3mSd47PP3Av/v+8MwDP/7j8all1NpwplGNpyBUJ0gdqTUrzDb0hK6jiMP5gK8NhyodnS0gmVpnDQauhdLUrm1EKCXzK/Lab5i0VOs5f/WPYIxQjaWVDMbhbSQ4y5ySaj
bQCdLQe5wVNbxZR54Lw2pNdFY3sWLwwTCXxurkJtIM45jo+gGhp+GZpkKbElMSfB9p1rLpPdXrZteFjsHrUc+atvwKq2jpYkcfLZ1xeNFKeD4eOKbM7nLPs0/f4tNffIEv3rpkKo20jLtdiBSBVBfs37dwvSQqBmOUOny67amt0AXousBmExlHyFV4+LEbTEelO4mzOGmcXtMubZNGxnJ3nyhVNfvedZRUmKzhoRsnvOq138E8Fp769KdYr1d0Irz95sDbv+etWA/zfk8+7kjjkeN+z93dxMdf9gPUrxurFRP4ndU7eNPV+wl9xBilShmreRZ+6ZfYNCOlqQ5ejFKYqqLkpWlugDFantKgpIlmAFStl0pBsLiSEVHtfEPU1SgVyZnSCtPhQDGO6zdvaCKVizSz+DFMIJVGm3f4fg1WzVekuuRMKM4No32QVivd4JlrVbOUGIINnAwdQuNHrjW682d43/kjnLfIdZv4yc0zvHM1Qmh0+y/w/suXcad13HSZ/2T9FN/rnuG16Yr3j69SbLud+I+7z/O93Qs0HIKDoqzEWhLDMFBar5uE0jUZp0lVpU3IcYVfB8r5OaMsHoYGps2U2GFbVVirM5Qyc/GAKgZA1tehZu1PiGWeVexkvMdMWReEqOncToSzjafMC4N0iOx2dxG7IqeKk8rpI4/z1nf/MMSoocz7HVIjE4XVYDiMgl8N5KwCppPOcX5xm1odU9ZeVW2F47GobsUa1sHx2I0Trq0Gtr2mlYvp2e32fPnywGefu+Bq1NT0vCR8rTeBKVcGG+mHSKqN8XjEWQ8cv+ln8iWxMIgI3luO08zpatAEodqU3mQBY7lz6w6rdUQQglfE+Co60jRhnOV4SEzN3E/Kpig0djP0WFP5K3/9B3jv3/xhvv+tf4N4VVivDS976DqSdljTM6OR8pe7PVe39zyzu+T4ouPKLWJEz240Vr1BffYsxGpoVvsKAMYqyVm8vuFpnrVUBMT3+GCRkjFNISh1npcAmaqiq9YoatTFWkObMiVn9oc9vousNidaLeVCN5wwl4x1UefwTRcnrGAdCv0ImjHZ0CNaaQXnlMxcW8X7hXVgjBKXjcGIwRrDD5zteWf3AqYVXTSN0ErDm8J/sLnNu7vnMGGDcR7mIzI73u2f5V1nz2s0oHFaLVVLNRbnA+NBCCtPFUMpiSaWkg8YZzEmUtMesToWlmYQ8bpw5lnR9y4Q1x1lVjl5mWey63Dec60eueAB49HDOTFYpHnmOWmUXa2ouVWwIvSh04g6H6mtEGPUnX8cCWFgfXLCras9N7Y9OJ08Pf7oTcKwZZxmkgh979nvjmz6wPnVrKPqccJvr5FyouWko1VTMV3H4bjD+sA0zsRVpA8KbLHLKHWcRv7wC7c43x9JTeXcWQpDDJxuO6RqeFNtjd3hSOwU+dZ3A5zf/aafyZfEUcIYw3GciSFQW0VwuCHgXSXljDeZXDO73UhrEBYrsIiA9ezHmTFpORy81Wg1I8Q+cna6RmzlA7/2L/lXn/pDhr5jI5XT9ZqztaOlmf1+5O6zT3P77iUXT1/w8S/d4ku3rhjy5QN/3nvjyiYNJ01BLWIwLiKi+HOxQY8E5l6+pd5sGpiiANUqVr37IrR5JFcFrhqg5UapOq5MDYzR8VVrCazj4vKS/mTLcHZGP3SI86zPrlOtgHfUNlNJQMXFiCww167r0QTLRpqPmtvgHJ13SnS2HidC9GYhXJWlf7Og2lqjWxiVgYatGSeGUMHkgLc9fR8VfiINEyKWiC0Rk61mNTbB9v1CvBLMukecB+tx3YBdYLNGOqo0/LChNQe5EYJOC1ZnZzjX05bufskzzXmMs5Sa9EhXMz8qnyLI14JQTUnwr/4puSnMJyUVjDVpDDFgvV8AN43gHDEE+hDUWdsq1ncY79gd9mxiwA1brt+8wfXTDdvt6dIbqNR0JAS1U9cmzOPENGWKaawHz3EaNV6sWmoV8nxUU5SzRO/Ydp6Vd9AMLS
d2l3tuX+yZmzAMPdF5cs54i046msXHgVp0IjHlqptLbcxzesCd/OLXS6ZiELFqiLGe4B1d77HB4WJmOB3o58p8yCrysbqDzdOoJbofqHmv4h8TsEvYKs3iKWQst774Of6H/+6nubMX5QHmyrybGI+ZsTScOJ45v80f3r7Ln75wm2urwGs//xv86at/4muOE14y7zp8jOZ01y3OoTA4PRY0VKbsTMM4oWAJ0hB7L3SmInnSN9uZJXwm4eOgh8qq5m1BKVS1aSMulYK3hoYh1YnV2QndqicED9bhfNRJzEINsq7DCRQSiOCMTl1Smu8PI0PX3XMMqfFInOZdtECdFaUOoswIoyEnzkITNRO1eSIYS7GGUhYAjamY6aAgmtb44PFhfun4cu60yA2b+KnV53hHuK3+it4x7xvOdhTJpJKRVphnwRGx0TMdL6nekosQfKCOGWOa2qxbY3CORsN3K0wTaB5nNkrLdpa31i+QBX7DfBd3ZWAjR+rv/yLnn/kgm7MtU0o6TjaOEO8JprKGyCDqrpSCd4a7h4R1jVQaLkITfSU324Fhe439NPHGh29yddgzHQ/keVZ7dDMcc1HqdYNHXvYyLi5vY42lpoK1jjmrhsWJIS9Al5N1xHttcIMepZo0emeZU+HspOcV1x7lcDhwd6fgoZSy/txdR4yeNs9aNX5dn+ffdr1EFgbtXmM0TAQDm82K0hqPvewhPvfkc0g1gCFEe9+Pn8UQBcWhO8tq0zEdRmy3wZmGC8LcCvtjxojj+fM9ty6OfK7CFy52fPlWz6PekcXwhSJ85vkLBcW0xlyF8IWP8l1Dz2ce+UGO4YRN3fHu48d49fwkzWpUmyWqt15EKTnGYY0i5k1NBBt1TNYaxoKh4lxcsiM0k5Jl5bfGKyvBKzmoGogWSi5YyZQspOnIxX7i7NGHdVGpmqsRfGM6TjTnyDT62DElFQi5RUHosRSBj0w3ed/Fo9yukZsu85PrL/KeayqYMiIaaoKKp8RaPnT3lPed3+B2cdz0hf/85gXv7M/xwZKnGWIAC02q/n8FpGQ+fLjJ3z+88r4u4k7r+Nn9d1IGeKe7Q2tQTeRj6Sa/Mn0b59JxzUz8h/ZJ3liexktVZkS2zHnE+Q3BB8RkpPZ0g7IrnHOaUuUDxmn6d5mONAuf5Nv4DfMm7jJwIkdef/dj/N6ffhjvHJd3boNF+R0LibotR1A1rTpOTrdYmZimA61GuqAVhLWBloXg4Pq1h3jla15LqxacY04Th6s93lr28xGMY0wjIUZig/niBYaTFWUujDlpmtZiTnPRcpgmnPP0Xa/GwS4SXMfJeuB62SLxkqvDzDQl2t0dEXjZ4Ck+Qs08trnO2emGvot86dkXuLzck8q3ho9/SSwMGPDeA4YQAtfOzpTQ5A237+xANGTW5EoXDfNxYnu65u7domnNqRJoLpANAAAgAElEQVQRdRPawGAVibZar5gLWKPux1TbMtMWzg8zH91rp98YuT/r7oPDusAQI8/cvuRtt/6AN6bPcnJ2jT9qD/PBa+/mn29/iG3d8VfTJ3mTPINxgdREqT0sI0ZpGNupYGmZQjhntTIyTS3YJiDLYhE7r3DV3lNzWo4hjTFVvFRsM4yHI3f3V2weepjgLONhR9+vEROoonj6khNx6BTQ0hquFPCeKgVvIh8db/L3L58gLbvd7Rr52atX4t2XeNf6EkHDahqaxvzB4yn/+60bX0myLoH/5fmb8IjlPScjH5wDP//CDW4Xzw2b+dubZ/mBeJcpZd53/EYDU8Lx/vEVfLc8TbOOT9SH+Ufzq77ifpSBf1xfz5SPvCPcpkhdXksLrnEcj8To6L3hOCasWHqzwnvN4CwCNanY5yP5Ef6J/e77qscru+bjZ9+P+84/R578CA89dI3nbl9QNUOPuDgVwRCDVrC7y3Oc8zy6CTQrTMWzGTyXhyPeBYyxxPWWzcl1JFvmudB5z/7uOVJH8jFTctUGKZDmkScee0gxdtIQ4y
no1MA4SysFEaFzgc6qWXCIfsn5aGz7DcPqlJRnatO+j/Oew90rYuwI3jN4y1wTXXS8Ybvmbs7cubvjz3//j/lmr5fEwmANbIdA7C0PP3zGsFqxOxyYp4TUxKq3ZBuZ55mSM8PJsOxQwpQykwg2WM7HrARl6+m9pSJcXY1qBxZHK3mBbTRW64HL3UFfWGtw9xyN3tPHiJBJOfHc+Z6TPvIlXs6Hb7zn/rFi50/4dfcuZPo93sbzeGspBUIMGCkL51HAe6yxOFs0V8I5TJtoNdGcpY/akPP5wGSvQ9M4eFsqpSwjxJ3mP/Snpzz0iley2gzsbl8QuxVF9Gwa8sQqDpgq1OzANsiQrBCk0XUD86HwS/vH7i8K966E4xfOH+EvPfcpvI3UcdZsjmr4uc1fZf46ccwslv/zuQ2Xz1/wc/I4aWlV3W6Rn7n7BIfdc7xp9xR3Hn/HV4sb718X0vPlzz7PnEZ++dveTgpf537E8ZvuDbzqzq/Smqpat9ue3cWe9ek1cNCdnNKKsNqs6GNQ5aUYeiNMdWbG8X/Z7/oaKTQov1O+68cxn/1dnr19SV30I6Dn8hi6JYMUNtdPMDWz8spHcHbg9MaKq90IzXPjkVO8Ec6f+zwlv5PN2SlfevJP+Nd//Ae46GhXI9uTNXcv91iE/ZhYdwMpKzeTJjraNuqa9Q56PHYIPLwJnG16TvolVjA6vKlE67kRHa46VpstY9JQIf/tjxJjj21KLkt1JrhArZk8ZaJ/OT//F21hMEDXeWJwOO+4vLxgPI6UlFh1gdh3pHGmGqHrO85ON9y+s1tIzJFm0mIsytjSljO2oZTMnDPRB2qtKv2lgNG/c0bBs9IWHp7ANBU6a/CDYx0CNVeujoWPbd/xwNHlb3dv4835/6XuzWIkS88zvef7l7NFRGZWZlXW0it7IdXNJilxaS7NVRQkWYI0lGTNCCMPxrZmBMM2BoZ94bFvjQEGNmDAV4YJjOCRZ0YSNeKiXSRljSSSorg3m2SLa5O9VXdX5Z4Rcc75N198p0pqVjXFBgZG61wVsjIqozIi/vMt7/u8v01ua5xRSYMSm6xGzVMRU0DEYCu1YFOKlulR71QpQ5QOkUSRTEpa9lmr4NuYM65raDY3qGaNZlsMA5IC2VhmbccwRIagYbiuc6zXKmYyUQhT6k0ehheMatunYbV3jKkanZ0MCjrZfwG14x4170+3MprnHxrBOH579moeOPo6W2nJobvRvLMZT8lWwFqObvL3oHf3/3XnH9LkngIMpmFjtuTHy5d5XXmcGHpc5WnqGusrcsrUToOGrLeYYtgPL0Atmu+Qp4i7YSxgCo3XkGRKJsdEs2jxJmowkFd60zBEQjmA1FDXjd6AZnO2z+zwxDe/yu6Fc2qCG69yx1338uQjB6z6CdxqPJIDXVdzsjxlVhXWY6KqhBgLxchkmYfaF9rGM/MNWKup4AWFyLYtc29xtoNg2Fi0pDBSzVoF4GbVrdTUiCRiAGctlfu7iHYDFotmsroGDg4PsOLY2V4Q1pH10BNCorYWiyoAx1WkXXTEMWKs4+S0h5hxja7QxKLQi5QJRFKGnHU+QVG7srUWJF9X5lkDkMA20yAN+mHFydKxuomZClRJV/KIlBrQQBJ1ySVCsfisLkhAvf7ZYtsFw3qPfhjpNmZ4owi1MpEBsnWEaDBhjSXjKod3DtdU6oS0FiHSR93jkwJiYBzX2GpGGoqSq1Y9+ehUU7syhNWKzTtez5G7cX23mZYsVz0SEr5roSRMKrRlZC03GnC6MrAvNz80DmxHcI4fvvIpPnj+HWTz1xWByZG3P/tJ+pNjEFiEE06qm/xuJ0FOb//6w31s53yA11FZyzvcHv1qpJm3lJgxvqEfl8SSGUIki1HJdbmJjuF0j4kFoxGCCM5ZxvWapqppKq/4NlQ2X3sLMU2B44Z1v4S4xhxG7nvla9i4cJEybRX2rjzH6mgPomO9PsRyDXDjcZWlrvRGlNAMSjGWLCMaHq
xDXedqKl9jrKFrG6ra4dNAbSu8OMQ6al+TajOJ6ALj0Z5aCIrBOofzXglgJWGrWvNPX8T1kjgYrDG0jaPEwjiskVJRVTJlQ4yMYyKMA5tbZ/FEDk97pNYBkY7Hg/Zbzqi4SArWGvJ6TUwFby15HPUQmGhI1hlCVPpvLIpLS6VgxdKvBvxiTiTTp0SxmTYcsa5uzLTcyCdINjrAygWhIClo+Kx1U2oziI7+yVKR44okkCb/fSnag7qiWRAx11RlgGpGTscUW+G9thVihTIOhJDpS+bR7nY+Uu7nqHSIKZQobKYV77rySR44fYrY95quZITaex56+s/58C3vJtq/ybSMvOvKZwh9wGZ0ZZgz4+kxsvMCUtqih8PK3ng4tHlQzD437SRIRIZhSQHe8tSf89Hbf5Rkv787WsDx+/mVvDP+KZubWzq49o4ci25himAT4A0PyGX+rNz1vGdRSiE9/jlSiMQp1s5QKCmAVT9Kt+jIKZGkYLJawEMI+oGWmkzm/O4Mbzu6zTk1I5vndnjmqcfpj6+yUe/y8le/lqe/+mWc86zDAUhi3tQ6++oWHB4ekqPDWMGImwKWZZpCCp3XtWllBCeCrVpsXWFMwXtDtmBKISxPGPqercUc7yqK94xhpIQe5yoIAWv0fflirpeGjsEIwxgRSYy94I1Q1yoqWYVCP0TqutGUoFwYYyEGDVuhmIld4GiM8iLdpGPQkWRFAZLJjMOIEf2elBRFHqY8yYJi3nMp6oIEQkpYFCt39+N/jM3h+U+8FO4K31GPQxFiBtDevGB1vYfgfK1zDINae3PC+xZfz8ghkq1TQIvtKG6OrSr9cKaA9R3trJ38G7p3d97hqo5Hm9v5ID/EkZmBaKAtIhy5Gb9z7m18dNzgidN9njq6yvpgnyH0vOzql3jHd/6I+XAEpbAZT/mJq3/OA6ffULpTimxsNNjGkI1hZW5u132hr4OCcqgMf7z7BpJ5/p0qG8cnbn0H8wsbcMbxCv8EL//i+/CnexP8829/A++Xhn7o8XWlOZ2uBWf1dc36YpqY+VK5yHcfTSKCvf21eqAYg1jDvNUhYm1V+uyNZmteS7/WAKOpqkiRbtayffYSp0crxtNDutmc/WcvM6wPyOsDdm+5k6sHV4ipJ8WB2uu8qZA5Ojkl50BIgjGFlBMxqQhtvtFRsGzWltYJG51TsRWCE1XUWgeNsRjDZEs3LDYXVLM5vmup65pZ2zHb2Gax2GRxZptusUFT3byFfKHrJVExZFDGo2SOD0/Y2GrpZh0nRyeMIeC9htoihRCFIU7ZCllwtRCXacJeGYz3VMYSM5wOaRJERazVabs1mQjXXb/WWVLM10k+1gp1ZRijmogNUELh0sHDLA/u4LHt1/217lyEL9U/wJ15j7fHvQnzXmO9J4umJCLgSGA9SMRmJQ5XjYMYSGHEVA5rhNor7twUw9hHTNdhC4Sxx1qtkKwxhHHAtZ6PjA8Q5eYvYXIVj7ziJ3hwFTh96hku+oZq3oJ4zoY9fnj9O5y79U6Wzz3FSk6Z724Tzgzs3Hkbswvb5OM1R8/us03P/k0IwzsysPeCh0bF7n0XObryAuAXWrqLZyljT72xoH3kT3hPd0zuT/jDH/ivWNY3b9uu/2w74iqP9R5ndYNSOcdpUi5jQMg5s38TQAsA8x1AsWmCwRkztZ2Jjc2G5XJkZ7MljJFZK3SN5bTX0F8jmuB9+emrbC7mnBwc8Hh4BDGJvCpsX7ifu+5/OY9/80tsdDOGAoKldjojSCWzXA5qDbcF4wRGqMQQQqapLF3tqeuaGPW5yQTzJQz4ZhtTzzBSqKzFmRaDw7hqiq3LClbOiULGGTXNiXlxuRIviYPBTVHuxycn+EWLdYbnntmn8hVxOWIbR8yJ7zyxz6ULO0REo7+SQAyMazC2YPyUx1hbTCmMfaSuPUMCKUrrNdYiaQKpolw/g5qVEDVAWePYaDdI6xUp6aDy2asHXH7NPTeYUaJ4/t/6dby5/A
dc5YlDoVDo6mmQGBPWdhgRpGpBsk7Wx54QVVVoptizaw7BnCPWVfi6xVQOPx7jGsvR/h7NxgJfO+a14Si8sN0ZYOlmnH/N6zjzihWbeWS22MBWNVe/8jDzuqLdabjlvndy/Omv0o+FemfO7BW3sbh4ibR/mfbxjn/4xBXeu771+uYBoLbwT15d8a8eiVyNN7YAux2cffBN7PzBzelJZ+vExoW72bADsW6pzu7yjW88wX27u7z+iY/z8bt+5Ab027WrksQ/OPMkC3+G+c4WxMywXrM+XTL0x+RsKFKo2ort5Zr9m3klTq9qQOyp3nH7YcAYS7vRkIdE62G9DmzOHNYJJ+s1xXQM6yVV3RDGQE6FfkrJ3jlzC/PNC3D7WRYbm5i65smvfY077n81j3z+M8yrmiEEnBWcOE6WPd4JQTzOCEZ6UlYRpCBsbm4wayoNLjJgrVemZF0j4hC0qjbica7CVRVijRrP8OoVyR5jPaVESooIL274+JJoJXLOrIdIWHsFXaKtRaQQjOCqiuVJwhRhuR5oTMKLIGUy3xSNQa+cRXLBmqzZB2IZwqixdkUDSiuvfT8CRYpGwaGMvJg0eNQYw1gSrq7x3tCPa4xken/z/L9jMyeMAyn0FBmVxVDAFo/3HaATfvUeODyatmylMA5TyEgUingMSZmXziGT284b8NZRVQ3EkRIT1tdsS/89f6+bLhPbDc5dukCsWooVinPsvOp1jEQyFesM9d3nsa2BOiPek1OAxRnqW3b5kQuB/3b3KXZbJSucnwn/y0/dyw/fWfFfvKqi4vl3Ij00HI2v+Ee3n1Cb57cGlWR+8c4jnbGUwvHePpXznOsabE78wOHX+fFnP87GeAKlMGNkxggUdkzPL20+xluqy9TdjAwEDGR1jIZsybEodKXAK3n6xtYkDlSPfJA0rjDGqpW/8izmDf0q0LYVuRSsBJpanbspVazXJxSYhoEebyw7Gx2vfPVr8O2M3fMX6dqO3TMLirX0p6c89dRjeMykk0mEYcBQ2Jg51mOiX4+kibwtArWvGIZEJYXKKLPSiCLsdOCsVagzaqIXYzFeXwFnDVVVqRJWdEWug2qHrzWq7sVcL4mKwRjDuh8ZKczrhr4PbG50XD1agSka7ZXXKhTOhdl8g/XpMWOfMcWqgSgGnDVYIsaigNCctY8rCt20CP2ohhkRq5N3kankn4JoRQVQs1lH4wpdGRU0Kpl6OGRobqTtbuTTKdcBxb+LBakoohg1U1R5mUqFuELo16pt8A02DWQc4ChpJJoOg6gL0jolUNPifGY2g5R6hVKXyM+23+JXVz9wg4gIwJbIQ9XTWFtzkg3OeiCRi6ozy9Y5EkIWizl/G349kovmX5ZiqZqGsJnIZ3resZP58UsFvzijUBw5IvYd77qjZ9jb5zf2z/LcaWK3E/7pD3W883ZL6QtvvZSpdoT/+1G4si7sVImfvXiVBxeRp0+1vB3WgcO9Pe42ni6rh+BVVx/hB/e/TnV+g507tqgrizUVYj3WW/KYMYsKYzSOLfuJuB0TRYTuzCZ/ud7mk9z5/AqvFNy3Ps7wlT9BMDooptDVlsPDEzLCat2ztWhpXMFIYtVnhtUaZz0Jq2lXWLbmNXXjKSlw6Y5bMe1ZunCK8Q0lR6RucVHR9jhDGSLW1VAiyzFNIjjox0S6Jn4aA4vOUztHLlotOG+VFSEadnRtFe98rd6brPkX07KNUqya5GRy8RoFJpv84oaPL4mDIeXCmNH5QOVYHgWCCYrEamcUAVc3+KA9U+UKqfGEpCk9cVRLcrwWaV8yqTiauiKEFSXFSUiiKDCFqmriMKiL0IhOrEMIDJIp44raeXLs6ayDYrnjmx/mm/c93ztBKQRxfC7fzoPmqtKnSsT0PbgKV1myOExOlDBimhrTLjDjgCMwGgs5KbOQiQZsCohXqnXKKrN2GbGe2K8pGOI48Bb/LKWFDwx3s59rDBr4uohLXv/UJ7nn/MhyfgmswZ0c012Y46
TQDwO+m5NyQZw6GctGBaNR0o8xVBvb7F3do92aM6xOYerbS9Y3tRjBOcO77oD3vH0GKSMpqyBLhGA8xdT86CvO8M5bDlmfnDCcHtLHgWH0LNcrILB/ZY+nnjvmjbddpLUWEwP4Gr9R0+10OCtg1DcjIqQYcbamqStN1CoZyb1mb5I5jRlLxfvH+29AzSFCuuU1yvZ0jopE7Q2na839bLxofqeoAteUgTEJQRqMNdTOUHlH7RxndzbZXMx42cvvw9cV+IrLTyzZOQf9umdjZ4PDPcgcYJzB+4oUI6k4amvJ1pDGEYujD4lZW5MLzNsZYiodVls3RfJ5pYsLeKdRh5q2Dk1tsVKuw32MCDFEfOt1oG6tUrTk7+DBEKOemvPG6X/YWIZhhTUWZyFMjIMhJTYNpGiQqHHzacy0nWMcM3Xl0Luimo5KUeCFGM30KyTAUHLUSTBak5Xp0BAgDgFnLc5kapOxriaNGWcKF698hp3tjs/tvpvBtFyrAdfS8iF5A758gTfGpxBjSMnhvIBV8c2YIyYPuFpl0skA4vQQMBUjltoYTI4Y36j9OSp8JCNYKzp5rzxDP4JYKPDW+lnePDtGxJNL5nTvlKP9q8QahmdGUl7hzm/zjYcf4czGg4xUjMXQWkNIiRgDhUgsDpGKjCEOgfXxVS7e94McPvqZiQUhWGMhBsQ7pWOPGefNdddoyYk0RorRANg+jtxyz8v5+qc+RoiBEANjyJBGlqdLijF87atPcs+FXXbaimQszXyBVBY3r7AkpRqhXo/iJw/DFPRSChBHUjKs80A/KPjFwAtzGLoziPU4Yzi7UfPcwRThljMUQ8HSegehZzQyzR8ahEBVzaiMUDtLV8Hy9IjTfuD8mR3a2YzV6TFp6KmNYTa/lav7X8EZpXqFMbMaEpsLT1ZGD95aUlTOZxgj9axiPmupKjOlUCWM0crIoiE4EpTuBAXndfhrnLJFTVJlb9PNEWdJTknjRSw3KSq/5/WSOBiYPrzGGw4OT3EimGx42V27PPfcHiEq+svhGMagd704kMVS1QYXLFIVWi+QDUOY8iLSMAmZdNWHaOl4DeAaS1JGoOgg0hihWMPQDwxBENPStkAMaCGXuHjweR459w6G7xpCBnH8fr6P15Sn6Vr9gFESaXWCuApbO0x0xJSpZjU5CL5qMTEoncnq0DFkoS5HGLfQ7Ymo/qAYQ10KxEY3NfUGKUUNgi0Z5zVw1p7bYPP8WZaHp5xefo54PEB/yC224+lPfF4zKV3H1x97klvvuYP73t7SdDX5yimPfPTjPPifvYdsatbrnqODLxPWkf7qgN8cwJ6QQqKcnIBAjj1Dv6ZaryhFwSolJUoZySURVwOf/cjvsx7WlJR4/Atfxmxu8qk//TiHV9e86uI5XmdqFrfMqduW9TgylsBiZ45tDd32FtYpLKUUIAR83enhkVWzkvpThpM14xiwM4/YhnEIbHPzwaMs1fuw0xae3jtlTNpSbLYqFjq3MIhVXczMVeRUqDpHjhWVCLdc2mJcrXnDu36Erd07aecz1jFRZ8PuhUtc/s63SFh2duZ8+XNXKGGtrk/jaFtLyJGcEv2QMZUnhkiD2rSNtVirbUGMSbkeIWH9CHmO8ZWSvXLBEBCjhrF+CDRdh1hNwRJrsbaiskXDbqSQh7+DWwkjohFe1lM3DRYISePbK+8ZQsQ6z2xuYRyQxrCYz4mnK2LOSNE7uzeeRCaENd7q6e+sRUzR/Mbps1wQxBVMnHBnJSpjsahvIiTVxJWSqOoO5wQTMk4EjOHULm76/ziiI8WRMakcekyGxovKbLNHnBqqJIGpPIxREeJtixEhxYxJhiyt9qmjajxiKhgsRRzWWurKU3yFi0p0YmIFWiuklIihZ7HZYvM5TvaPWS/XuB6anV2kDOQT4a6tS/jlyPrJJzgW+M7Hv8jOHbez3LuKVI1CZ1zFnz1V8Wvfvo2Db1ecrQd+8e6Rt52PZNEDaVhNQTeiyd/jMGKtoe
/X/NVH/4zH73s7f3h0gYPkmcUzvPqRP+fe0nLXTsdZwFlBcqRfrkmmYFqL8QVXN8pFiAmsZlq28w1tK65xHMgMvcYOpJLB6ev9F+N5euVT8TwdQxyoHv4Am1XCOo9gqJyh9oXGq61bjGVmWsY8IC5TS03oI7s7G5w9t8XydMmr3/Q2di68nNn2DkN/pJVm6oljYb51jr2nHsfMtjBVR3+8YojK7gjjyKxrGHPSgy5rHmkyabLXCxvzmc4JjMF6T+UtGP1dWApiLJIjMdeI0wGk80VzQ2Im50LjK4pklWGLkEui6m6e5fFC10viYADY3Nzi6HAfgyGnyJndGWF1TBw83hlOVoOCKSbzlJDpx6A2W5jK+kSc5M+6fSyIWJWkGoOTzDqBmai8ZXpYyTINeIBS9J8qkZJ1ZpEyeJ8wMRNjYhaPWd5kQ7HJkmwgrJe4qgbrSK6BmPAuEyMYL1Si/EeRQuUglqmVcQ6ZQC4lRbIxNKKwF6xRy3EymF6oPESpCSlMBi4hmgpjI54MOdKd2cR0DeWp51hdyTjrKblgu5GtyhATLB99lnFcMssgh8c8/qmH2bz1GUrV8IVyifcvX0aY6tCrg+X/fLQmDSPvOL+mFPjQI8f8h0cNVwfYqSM/t3vMg4sDYoz8VXcvv79/y/XHL6sNPn37j3LON9x+9FdYW/DG8tnubj5y7kGO/IKtvOQnypd5iz8kxYAtmbqZM4rBuIq2bcA58rgmx0E5GEaFZGLgz5e7/Hp85fPnC6VgxlO6L/wm5/Y/i8xrnjuYvBaVMKsFsmWrSowhshwDdTWAqDT8wu4mvnbUVphfupV77r2farHQzMy6w3pLGkfObHc88cQRzcacy089jq0ctvIkFNvm7LU5AxQKMRf17SDkIqrDEQei8nlnRYEvRXF+4hvdsPkW72uMKE7QmEoNgsYi3hFKuB5g46wjvDhGC/ASWVeWAoeHB8SkFtsUEvPGUdcLvBP6XqGvqWQ2NjtOlwPDWLCuVU+DsRgJCBkrlpjLdSZ/yUKImjaMaCx6JgMqVqHonVaLgWlvVCAXgymQYtY1Z9NiZy02CA/ufwJXnq+CdCXww+ELSM4YjIbK6stDEaug0qI06JIhpwjGYGxNbQ22MH2v8iDypAIMxejmQOIUcuOxzhL6Qowe8TWhD+SMAkut+euNhoy0XcXG2TNsX7rEeLKEaGiaGZWpmGHYrCsubJ7jtgt3cEYaNqi48ujXGQ5W/OHy1usf6mvXWAz/9rGWK1dP+PB3hA/mV3Fl0AHX1cHyK0+e4cPfjDz7lcf5k+6BGx4fjefPdh+krj11NeOz3cv40MV3cFRtgAiHds5vutfxWW7FZF3dlqR+GFdVlAnMkibI6TqCOL0r9kPi99K9Nx06EgfOXf0cdVWx7kfmjePsZkNbCWEVmPlIMAZSTdMOXLh0G51t2N5sicPA5uaCMSZ+8KEf5jvPXCb1J5Q4QHE4b8GpZTsXxVPEqDRxcQrD0ec3MsZp5mWEHCM5Raw1zNpGq4c8qgJ3GNSqbSpc5dQGnkHKBOQXsFVDEqshyGKmIN6IGI3eiylrPol1qst/EdffWjGIyG3ArwIXUJHie0sp/4eIbAO/AdwJfBv4+6WUg+kx/xPwSyg25p+VUv7oe/0MYwxVJYRR/QtVVzHEjCOwHnVQaGxSReMYsVhW/YhkoZ51LJdXaX1NToIzOiQbU8QYCDlgnVpcx6j9lneONAZElIrjjKLWpgBtQHC2AiOYkrEixNAz62pcdtxx+EV84/jE7I2c2gWLfMrbVp8hOc//5n+SIzq2ZM1Ph2/ykL0CxpKLw3ur7ITQK4LuGvFpHYmuxVvFvJW4gsk8lHMiJyHkhGtaTFItRBoDxWkClxirVVSOFOPBiYJWUmToB7rNlvVB5ityNx/dfBWH0rK1WPGTp1/kjekKzhRMSEQSCwySNyjfvszRPfXNbdNZlZ0fuLpNKN/lriyGP1zdzmv3vsTx7s0HgIdujmQhlp
GP7L6Z8N05ljg+NN7NQ/Uz+uEpBt96jESscar7n2TLIRcdHIrBmfEF1Y6528HkwNFpZIiCzQVsoMTE7rmOMRviKrO54bFWWK5PsR5q57jt9kscDSve8O6f4swtd3Hmdq02Y0y08xnL5RElG0Ia6WrH1aM1hAFrHF1rFb1mLNmqN2OIitnLYgkhT+CZjLMGKRlMwfmZ0qIri8+FlC3eGcQZigg4TxJdSwtec1HEYJLqOFznKSFpAnhOmovxIq7vp5WIwP9QSvmciCyAz4rIR4D/HPjjUsq/FJF/Dvxz4H8UkfuBXwBeCVwCPioiLy+lvCBbSo1DhZQHBMdi5nHG0q9GbNVgg+AkMqSAbytOw6hiJiu0XiipUHkHcYIlpdEAACAASURBVASrJ3FtPWubKP1INpYQy6RVmFKdjCHEjHdOV5VRcXA691Y8mykJkYxzQuUcdaUbCl85Hsjf4q7LDzNrOkrJfNG/gg/P30oUfZMf0vFr4T4whjfXV0hJ9RVVW5Fzj8lGU7FdS/Y9JQ5orK4FMaRcpnpD1ZlOBAmjtjlmklvnqKTpouu1NFVBKjE3JNPgRQNlvtzey283r7iueTh0M963+QbM6ed5c3wG6SrY60E8m5sVMSYFqd7EiXmmrCmPPc7eeOdNX88j03JGPDulZ09u/KBuxVPiMIJzL2i73s81IRW6RshGWY7KL4yYHCFDIU4RA4IlEYrnjPQc3ORwcP0B62iwYtiYN6QYkTzQbW1wfDLQr0ZKyVy81OGY4U1GHMzPbLNz60W2q3NsX7gNrOF0ecL22Vuom46Dq0/gfUfVWoblSj0JsUfEsLloeebkiK1Zw7N9IIvmdCi9y+CkkIyauGzlsQ4N5Q0gsxorDpMyURw1hZQCxXicdxgHdoIDkwNFOiWE5azyl5SwxpNTxDgL5cWtJf7WVqKUcrmU8rnpzyfAo8AtwN8D/vX0bf8aeM/0578H/HopZSilPAZ8A3jwe/0MAXKM2OIwpmKra3AI1ne6qwYQT20dzlm8cQwZZm1HayMXN1rmjcO4TM6ReecoeYAYyHhyYhI9qU+hTGWWEbjONLRWn4kUdWeWwLwCny1dVTFvGjbmM5panZLjasS6OSHoHfsvN950/VC4dgUcvzO8jJLWmjaULcM4EsYAORGHgIxLbGWwbUcxasU1rsZ6JSGHrJi4XDIpaBCPkVr5mOOgmg3jSSVRsgJnJWdyGJAU8bWn6zo+ZO65aRz877b3ExkZ+x6/mFFE8Is5851tfjZ/i+q7zvOqRH4xfZPtgzU7eX3T13On9DSbLe9ZP0r1XSBWnwPvvPwXFOehFLbS6U3/jW308OwHTQO3KSHGkWNEcmFcr+hXkTiOxGHUTE6Bn+BRPN+FMYsDm1/5oArWNjewlVNJchCOjnpmleWWW3a49bY5JjrazQW799zLrffcz2133cY3vvRVbn3lK5GuwTkt+0s8ol89w2KxgXVQUma2mLO39wRHe1e5896Xqc8hJA6OV1ROQ4xCP+jQsTAFIBvFxYlhaz6nLpmmMnhbqLsWYyolksdeJdPGENMwVVIgZdJymEguGrZzzZg1TvGLUiw5vDjm44uaMYjIncAPAX8JnC+lXAY9PIDd6dtuAZ74Gw97cvraC166A6jxvmNrY07GkoExD9fiBHFeSEQoQsxJzU8h4KtayzXUBwFgrWUsgPEwtQrOOP3P5gwl46zDWTcBO7RqydPg8VpJJ6iEMo8RUzxihMp3lHWm5EJdW7VbG8+Jufmd76C0jMmQc4TSI5Khqqcp/kRSToUyDKSUmO4ppKQQWGJPGtZKanCW7A2+KlRepcA5ahybOKOryKKpSCZnyGCzgkf20s0lsQe2Q6oO4/XugkC/PKUfEw/VS36pu8xZGREKOzLyX7rHeYM/ohf4ufzYDR/8qiR+lseIOfPg+DS/uHyY7bSEUjgTl/zssx/ndctv42eeZnvBfzJ+8YYPsifxU/7rGhUXrwX+WkQcVlQh2I+RZH
RslIrmKuQMWURNa9OMxgwn7D78b9h56hM0s5Y8jhwfr0njyObccenSDju7O6xODthezLGmcN8P/hAHV57DV/Dck5eZ7Z6nrtUynaaDGmMUwJstJFiv15weL9nc3GUcPc89c4Wd7fO0TU1d6XAR1LkbxlG1HgIx9jjrSGGydyeLFIu3QslgneDzqPqkrI7inCLkOAmbRkoshDAiYvCVTNsLgKxKzRimzJLv//q+txIiMgd+C/jvSinH8sLJNjf7ixtkVyLyy8AvA8xqh/WOuqpo2wpxmX4VqHxFQBiC3mGLKLhzDBryebRasrNZ4aylj5Gu8lxZDvQ54Y1lXUAkUvDXhzwxjdfty2MYoUBVedbrQddgU3CHSn81IHZIiR0rSM74psVaSwoDSIexOkWepxNOb5JDsYXGt5UCKRVMGKE4Si0INbhaDV0GvHEUAYxOqEtWByC20f01U/q1VAp4FdEDB6dv2mTwrdXyMziMSeQEYh1nXeBqvNF6u02PMRnjBTENEkd8O8d2C0w748du6/gROSXHyPLZqzSVMBzXZDvwRrcP5Rv8Fi9jr1TsMPAz6Zu8hefUgp4iry9P8trTb/FJOc/vbb+W37jwDv7o3Ov5sf6LPDQ74KF6SVUe5bfDPRzQcIae99Tf5I32CqmobB2KYuhtJkYh5aIAEimEFDTtyzj+oj/Hv2cK3r22mnYVzmbEGU5PeiRlbj03x1TgTUW/PGZcG2abm7SVYbZ9iav7h5w7t8t8vsF6f8l9b3wHrqopOehQ2lkoFZaKMa0RgY3NTcb+hKPlKfOtDcZ+pJRIch2+7Tg6uEqIuiaNxmBKmqhKhpI0V6UPGVsXpKopeCSDw1A1DS5GxsktkcMI3msMQBacK/q+uYa8rzx57DVPRAzFlOn3+P1f39fBICIePRT+bSnl/dOXnxWRi6WUyyJyEXhu+vqTwG1/4+G3Ak9/979ZSnkv8F6AsxttqSvHOkQ2XYuxhsXGBgeHIxubDftXr1LNawrw7JUjQil47+mXpzoA6mpWBwNjhMY7QkzkUBCTdBeeNI4tT9CWPmnqUS7ao6U0xZFpA48xKkQpJWkQC4aIoxWLn9WM66TOzjDi6pqUM29dfoKPbrz7ee2EL5EfCV+gLwNm1jAkhzDorKJYVKUpYLh+IFVkRv0FQVbsfAg91njVvWMpBkxVI2FNGQsjA7l4HWCNNcVZxKnvQ/X0mV/YvsJ7r1xk/BvDworEz7nHaBYVZbDUWchj1tJ83CeVkT+8fJbfWu1yNRrOVef4B/Isb0j7UCD0gQf9Zd62dcr6uWOKKGG5DL1yI3Mh9AOfnt3K+3beQjD6djt0c94/e5Cu+SpvqJ/jjfkKr7fPXIfnVMYqmn0K9UlGSyEpEPqeUgK5oAPkibI0pswfcKMMutia517+M9Tf+CTtrObCLRe58uQT1MHhFp4zZ8/z+GOXeehdr+Vzn/ws//U//e/52B/9Aed2t/nql7/Cqx56F5s75wCrknscpqieZOjXugGwjnF9wnK1JMXExmIbFmv+6uHHOcyOo/09zpzbZbjyrFZ4TDQxa7VqKJFFu2AcemTRqWRbBEmBNGpqeB8Eb3TvWDUNEgOSPblYjNXnJEWDkhk13i8DKQykcs1+9f1ff2srIVoa/Cvg0VLK//43/uq3gX88/fkfAx/6G1//BRGpReRlwL3Ap77Xz8hFBRq1ZOq6oh8Hmqpm7E842DsAQcEm1jKOgcZZnBRCgGydDt6SfvCvhXh2laOtYAgjxhuqylM7g3eCQyfAZlI8OqcrPmPQu/s0nKy6DmMt3gjWRJXjioZ9dLXVlVIMeJt5xeoxfvTkT1kkdQUu0gk/uf4Er4xPE0KYrLqJYj2lZK0gKJSwQoK67vw0GAWDE/XTK3XEqMYiJXIMmBIo2SPtJrnp8LWW2GLQyipFTbcqSds0a3hrd8gv71zmrNW24Kwd+SdbT/DOMyfUWwvcRkPVWYoV6tZSdTWfNuf5leNdrkRdRz
43Cv/X8Tk+Vnb1jtafYkJmZmrCwRVSP1DWS/rVktJnxvUpZhj4/TOvvX4oXLuCOD443qVDXh32UIrFiFONh0mQR0oqIE4ZFwUk9ZSUySEQhoHT1ZpxVEfr4U24EQCx3aFYYasWDp74Fu2iY7Z9htvufQVf/84+7VbLt771FM43fOOLn6aEFQd7B2ztnuPCpVsxouweZ7QSsQJj6CeiliPlkYODw4kzGsFk9k/WxCKY5bPMuxnDcg9ra0YNsWBIiePTJYjSqUtJpDBOFKlCTAHja5Ap/btrqetK22NR4dcwKKJQppzNUgquBEJSOpgVzU5NOepw/UVc30/F8BDwj4BHROQL09f+Z+BfAu8TkV8CHgd+Xl/c8mUReR/wFXSj8d98r42EPgaczSyTqtxsyewdrpXRvx4ZRktV16xPBS8VqayRFBBv6WNgjBknjpIDDsGXxMmQaWcddn9kGMPEX1BHm3NqorLZEqNWDcYUUlDlmHeabzGse3pn2Jx1GKmwxmFCxrYNUifiaoUkKOIwkrgvfJN7Dr+JUMhBmC8WPFLfzR+bBziKHWdSz3vSN3inW1Mk4JJWJOM64G0g1h4jXrmPWKyDGAZisERNpVVkXUoYbyjiaCvLMAzkUVdeSdRFmq2G3MYYiCOINbxt45SH5oe6EkPfjGUQjFSUmSX7QA3kZQQy7y+3M8qNROl/H2/hgSufw1aeT5QtfufgbvYvPsCZtOKnjx/m9cPjeNF1sJlVHNibq+4OSoNJqhUxroKi62RJIykp2s85i7FWrfE50wc1vZEDY4Gq8ZQ0khNsljVHcuOK1K332V407K+X2GTZ2TjD9oXzfOVLXyX2S+5/61t4+GOf4uX33YtPR5w+d5Vuw3PnDz1EN58DhXEcqOoOa3RV6owhJQ0xjmNm++w5jg6eZXV8QjefYfcHWmu5dPYMl4c149oSY8RWnn7ZT5oZQzMZrPr1QLW5RY6RZAPJKnchS0XlPDaN2kI6IeOI6wHMSN1UDFFZkUYKCQslEA36nshJxXb/sVuJUsrHuPncAODdL/CYfwH8i+/3SYhAiIUYCyk52sUOqytH9GPh8PiUjGMYewY8iZ6NblPt01Lx3JWRlU9c2p6xWhVCiUQsIWVsCDSV5XilEeR1rbjuygslGda9ipT01DWaxCRasu5aQ+uFplJijzih6hp8rVmTxglM2ZnGCs3M6fqwVEAmloEvpPN8xL72entxQMu/Ge7HnH6Nd26cEI3V4VrjyNajCzgocY11LYjDl5EwrjlNPcY3VN5RmYK4Gd4kStHUpjAG+hiIxpAl4iIkVAZe0oiThhzW5JBU95AS1jusrclYJWS1mTzrKGNArGf/mZvfgfdty+yuc3wynOXXyv2ME0XqwM34te0H8ec2eEutgTTW12zvDeyXG9mQZ+gZQ2IMI5WJ15kV2JYUI/16zWJrC9+oArEfBmxtGfuRZT/Sr06xzpGyEHLhTekL/JF5nSabX3tvxYFzj/4mx4cn+Nqwe+slnInEYaRI4e577ubK40/yYz/9bppq5JnH97n19ktceMUD7N55tw6hjSElcEYoWVfiZLDOM/Qrxn7JeJLY2drksUe/Qu1aTo+vcnR6hcP9PdZZsF2DjT39OlCMckZjKUgpjGOkqRvGaQZRxFAbiHFy4zrBi2WMhbryrJY94fQEW6n4iTBiHCRjKWWcvIFWZyJMa97yH7mV+P/jkinbIYZACSuG9Yr1OBBzIgaVPaciOCwpGo76lQp/YiSOAyUXojFUtQJhy/QLD0EZDc54PFNM27SRSGFiPZZCihnvPNYZrFW5qizm5FQoWc1QJfXKk7Rqpqol0vlM2+ggyTiHTNBWYzJNU/Gx7sGbrDAtH1i/TB2LYaWDIauOz5IT1hSK8dpbF1WIWed165Kjnvwi+NorjQqPFIOpWlztwQgl685bJqepc1qGkya1Z8pgMiFFMI4iCUPCScFgMHVNMYWzNtzs5WJHBkzteL+8/PqhcO0acX
wg3wMOXFNRNTU/P3+C6rsi0ioSPylfg5Im7oDqScwkeVdnbEFcrQa4nCH2lBAVhNtHnK90uCoCORG+9jG2Pv//4Nb7yl5Y73H24V+l+c5fkog4Z2k8VM6xPDlisXGWl917B694zatYrZY8+Y1naRdnOHPpVmbb26Q4qs1cnEYhSsFXLZI9Fkfs1yrNXy8RAifrEwYMYmG9znTdghgNMU2r6kG1Esaohd0YS87KHT1dL1kP66k6MvTRUlylMvicCLmQw5IUBsK4AhIxRp2n5UiIIzmH6+0xuZCTujBDCgqveRHXS8IrIQjL9Rpfe6SqyP0xJydHZKkoxmGMsFyuAENGswaxNTkNKoeuLGEYqGrHzBkODiOV0Tg2VwnVqGsvA3hv1WiUCzlpUOsYIrV3QCYm9cavxoFF65GiMffWOMhBP4QJpSyNCW9rVibSSwaJOuwTKNlzYl9YvFNS0PAZsUiadtJGDzXjLTlESCBVR+hXuhOPI8Zr+2N9mlyphVQSSKRUNZIURBNjxjmtgFSwlcgp62akZFI2iIcyrXJzzsRssF5AMiZlfmHjad57ePt3DSwzPz9/mqqp2Tu9OfNxr9TY2iOmIpbC2zeXSHyM9/W3sl9qdmTgp/23eF15Gm8qQpoKUimaCmWmvA8x+MqRk94kxiGTU6QU+Ey5yEezqjg3y4p35s/z1JN/wR3tczzzoX/G1tYGfd/jrDZm3unmZnXa068POR4jb37orWA9jz9+mcvf/hazeoPX3HKRZjGn7RakGKiampiFtm2pPPRDxluhH9eIFfqTExazGc4UVuue1s9wGM6eu8C3v/IwpURyHvG+Y8zC9vYGe1f3Nb08Kgs05URlhMb7SXmbEB9I/UB0BldXkCNV15BTIJek8umg9K/BeOoUKUYp10WMysZ1YEZdV5T8dzGijsIwnGKlw1lYDSMxZIwvpOzYObvN8f6zFBoWCw0ipRRMpRLpUjS8xVea2+hNmQaIQNbkqUBGbAUxEIpMZiVIJWGYch3GMPXwGakahhzJ2VAZSypu2l4YEI1jz24D3zTMu4G0OiZJJmbB2IpchI18wrG9cYV5RnrGkPlEf54P7t3JXq45a0d+YfMp3r64QsoNxlW6f85xagfUGBXKCbZuyJIwYoklTxAPNVtZcaoGdDp4ohiMK2hqhWCdkKLi8kkD4pRqbWRC05cpQasY3jnT2PRfP77EXvLs2MDf33yWtzXHkCvOmpGr+cbDYceMiPETNj+RSuHNs0Pe1O4R04izMPQjOTswgvOWTFLk/qT0HNJINtqymSKkrMK0LPDJ9Rk+JA9cX0seMeP3qjdxxz2XqZ79NBfP73B0eIw1mcOTka2uIucEqXBysqTbWDCrhcXFXb72xb/iy597mNvvuMib3v0WrCm0m2fwrlBiIgah7awSysdRqxpbcBPQJRfB+S0aHzC+AvsU2cLV/atkYNa2OgMq2h7s7x0yBK2QhxhAYOGs5orGTD8u2Ww71dCIIYWR5JRQTU5YazXdfFgRS8TkjBtH3HymprOq1iR2CkWKwoOHhJMX1xy8JA6GApr5ZwMnB4cMQ+LkZITakEqirYRD41iv13RVRb8aEaeW2ZQcYyyMopLYHHToVVeGsaCe/ZKJeVKJGSGHSFUJ66AnrzG6Ly9Z98lhDMRhgNbiRRijZVEFcoQiMy3HRbC1U9ZD1eJOr5LyNeJSwnnPW9ef4cOzt92wwvwJ/zX+Mu7y79b3XlcjXk017z24g1wSb50dYZzRZC1XQfEQMmMSKusxqMtS14MgTUd/cqI8BixFijpCp6zMlBPZOjWCWMF6HcSWUsiTVRdjmCR8akuP6jp9S33EQxeXZKPrM0T7bcmF/7R7il85veN5isqKxM9U3yaNEds5SoqoOUxDfW1Jyjiwjj5riVuMQwhY05KykGOvIJOm1XbOFkoeiTmSc+R3hpcTvquFieJ4+o4f5daTRziOma7xnK4HFm2FSYF23iFFyDFxdHWfarHgd9
/3u2xtLbj/lffwxnc8xPlz55C6AXFQDM7XtN2CQkBDxTW+zvD/Ufdmv5al53nf7/2mtdbe+wx1auqB3Ww22RxEUiQlyhIlKhAkR0iixIaTwEiAAAGcvyB3uc5/EcCIAyEOJFmABtsyJCeSI1umJMpWLJmiOmQ31eyJXdOZ9l7DN+biXVWM1U3DfSGguYC6KRxUnT2sb73D8/yeTG2Nw2Hi+OQWQiLWjBjHbndKbYm2TFQs3bDFXRy4WjLewCKWWBO9t5SoFazHsSyJaT6QckTkSGFDKBKw5kizG1IWvAWxhSxoIncpNAxxWXChI+eyhhep27QkFcyl92mx/EDMGFprtGbZbTfEaaFWIZZMSjMxLZjQsWTIDWJST/ocM0bW4NdUGNPEpldIS2wVIxWh4a3BOT2hvVcIRvDankjTfs5bYZwSrcLxdkCMYT9mvHVcHhaMSdQSabh1Cqv/vkEHed4Zerth6AasVRiMqYkftt/mP1t+j+OqK8zjsue/yH/A5+U1fm1+8d2Br83yS1fPY2rCNH2Ctpp0Ei1GfSBAXilIbk1GUnu4x1u/usBU/VcbekOKDl2NAakJqkDKGsRDUbdpq9imM4ySMkhFgsOGALViKRgXVtcqSI18uXvI39n+BTfNAjRumoW/c/waP7k71/e6lSdq0tba+t5YGna1ncsKFRGa9GQspSYaZX39jipCy5EpRmrRduh7rSUnd8y0nxiCYc6NnEQR/U10prSKihrC9dWeL/7wZ/jIC8/yM//5f8Lx2SmvvvIyTXSNXfJCKpUqqxfDyLr6azw8v+bialLkmtGVcl4StSyMcUKk0YUN1jliyuQmbLYdsQrjOOJMU5ajwP56RML2yXe0iiMtC5msrA2BKnb12hj1DjmLN8qbaE3BL7U2ctZDPa8rUSpaZaVVCfo+rg9ExWBEcGI5ORp46/XXsJujVUPeMc2Z5bAnp8jF+YGzsyOqM0gzdFaYMUxLJgiMY8Jg2DmYTMcyJWqtGn1XdahYc2W7GZCUsKboPVIq0tQnsaTCtt8gLeK6UwZhVblBzsC6KrQi2prUSo3Q725wtDniOI5cX98nLYlsLZ/Nr/KZq5dxITBnoeRINsecv0eCE2j6tHEdtWi2IkDLmj3RrKWVRLWW3Axlzhjr1W7eb5CaYcwamCteDzJpqrbTnaW+zlzw3pEKeKkrBkxZAVhLEEfoemJWYVWyBmsbZVm0ZQle3a8CP3W85z86+holLbjQk3LCGktZ2zXXBKEo9RitXoKzJKk62MVgrMJspFaaQE0NzFZBvTlyGEdKU57k1dUlJzJyKe9egQ75ih/8/A/wla/8G2wWrFS8EU7PTijLwvZ4C9byzJ2XePa5D3F8a8eDdx6QlonT3W0+/vkvYYwhT9cMR3fANqQlchEuLq/YDh1IoRu2WOexxtJSRIwgITAdDhzZns4EPvNDX+Cf/sZv8MYbb1KbEA8TS07YEBiniEjGiuPoZOB6OjB0PTFn7j14xM0XXiClRB0CLVuaXajZs7hKNY1dF6jSyPNCyz21N9i8KBm9VsQ5UozQLDXODNsdF+fn7++efH+38F/d1ff9mulnWRYtMWOcoSlafpwSmcrF1TWykoKa8MRdaCRwvS90wxGmcyoYsloNWFGqvvL21dP+uNRyTn3udUXMWxGsqYgL7M/PtQe2BfGWnCbSMukTpUKrWraZ4LGdo5LpOstme0YwPV50E2Ks0fbFOr0hU+K0je/5Ptw0CzktlBgpS0ZEib8iqpIr1kLLxDTRqPrEQFdbj6PWS9HgGuugSlORjBSgKvXSO3JrGrhVFC7qqRiSVhVWY+SwlmoMXT9gxQOe0AVaTfjeg3OUBpRJN3il0XWOZt1alTWMadQMqRhy1g1MSgWq5jYiUErFNt0A1GZp1mI8uE4zI2PMxLiQUiSlzE+Xf4v7Sx4NUyKfPv9dXn3lWxjXwFuCaTjv2BwdM2w6XPBPdvqXl5dcX1zxpZ/8aW7cfQYJO6wTNb
lVNa0ZY2gYnLVsNzrXWZak9mfTyDWTm6ZJpXlm1wUVZVnLkiOboxOwhsuxEjpPacI861O9lELMid0mcH49MsWVw5ATyzTh1wolN1HbfYNaMyVnlmmiGUczhhgjuWTlRqbMsiyas5ETc5qZ5okHb71FnN/f8PEDcTCUUpGSGDrNj9yPs85Ui/a+1XZKNG6wxELOC0UEQei8WeMrK1fjgnjVMNTa6JxFWlNTSZPVs+84zJEueJyouEZWV6XmTKrrsvcGN5wgrPAMb/QLkdXlqOvJxyaoqNSdspBrw4eObrelScMYhw+9QjuaPuHHVPiP6x+/yzwUKPzt47coEtDIOadVijGrg24VZ1F0+lyVjo3oYLKUSm4G1zusKWu4qQJnvV2ls6LyTucdLScKQloycY6AxzZHA1Jsa6CJUYR9geYaqUSsGILrcMbQbzuQgZQs3jtKM7SmLY8gtGopzazsWk1+akYQcZhVzmvVBrIeJBoY5H0gx6L5IkmjA64Pe3JMfHL6Bj95/s/YpCtoDT895CPf/AfYl3+Ht95+m1snZ3S2gA/Ypg+RznfcOD5mYy0np1ueeu4ZPvdjX8b2gf3hgO8GYspY3+HDjq5z1LJuflomxpFaDV3XUVrWyrEqWKg2o/F4WDa7Y2rNvPXmW2yOtvRh4NYN3UJp0pnVVrNlTk+2LHFhipHDkriaEkvKLLURcyXGRk6V1Ay5qnFvRUET80JqBrGNkiZyrhRjqNKY5gPzMpNzIZZKkYDvvg/RboK+3kcX12sOZKZimZL2VYfrC43Zsp40RXLR4dKD64WTYYCa2dfKQMEHSy5AKUy50XlPnhti85okUGmlsMyLKh7Vu/qEePPYZakAjIRpGWc3ynEohkOquJIIBEwTWom04PCm0lqDGpGwpQ6eLT3TnMml6MQ66xOmVuET0zf4mwH+z/AFLthw08z8l9vX+bH+AhHDUsG2BYfDiM4OxKjrs6Ig1oqSnaVmnY4Hi2C1/C+acC2P03Wqw5iqX3ZEjTjWYVZISmurfqAUlSBbo1mWWPKsa9z+5JSUFz0oohq06pRpzhKcZY6LshSb2uhL0yrCSIUqKqxKOhtKReGnirxvGK/aklbUD0Iw+K4jX53rPGKF/IpxNAx3r/+cz//ZbzJNM75Tmfk0RYa+pzKTMxwNHTEluqBr53maGY62nNx9mt2NY2ItXF5ccfP2hzT9Syyd7Si2MY0HMAFrCkutdH4DJdOqweJ1narNKbSsq1Vv6bcd5+fnNCwnp0ecHG85P7/EC5TcKGlh0yse/vRk4C9ev4c4R9/37OeJ20aYppGNjhDSRQAAIABJREFUgN1uKC1SstEVpeuwrZBLWz09mZIbOM+SIlIT4pQWXg4TjVl5D7WxlO/DGQPAbhMQmVGrqPbXUywM3uCDJ/jAkhaawLhktv2GJc60jU7fFatlOb8cMdZQmsEm3e3WtgAaAWYMZAxGFMluayU3oRRVhokRljlijMO0TGuOYHtaaVretqKnc13DbbpeNQZN069arciakNxsT/IRTAajJil9CDVyKnxOvskXeQ3nAsPmmCHs1FFpA96o1Pp392f88r3neVg8t2zkb5+8yU8M71BwTEmweWHYrV4JXfopk8Hr7lpR44Jb17DO9khTv36rBTENS6FWoYpiw1wTBYkah1Tohw6cSpJt1dtBjMXSdD+eM9JE3aEVLcMFKAlnPbWsFVlVSbcRQZogTvftVCHlRmmOXBbsGs1mKPzLeJff4ONclg3H7sCX87/mk/nbvHXvTeIyE4JO7b13NG9p1jNdX+ODILWwWYen4gKtZcYUub664nN/7Ud4/c23eeapF9SL0vSzSVaTr6+vrjm+cVOHvmuPFnOl7/2TTIe2Wr27YDFiaU00lqAUgvf8+de/pc5IzmnW4W1l6BylNE52A5dXE0PfsyyR+xdX9K7nfCo8ZTWkpqxcUMW9FVoz1GzIdsC2TKUiUtUkVQolC7IUXN9RZGaOEW8CxSnX4/
1cH4iDQWgcHw18641LUlZxjZSKN4ZliVxfXtFEsLZRaRymQi0HAjrlhULf74jzyBtvX/LCMwN5qVxnYSFjRRQyW1Vy6mslUxg6NV+lnHHG6LCvKDfSJlgWgzs75np/zTNHt+g2A5cPHiHLxNAF6rDDVEspVgnVzWDdmmVRK/5ogy8deT4wJU9yFRcTdRlZmnC1En/EVZZSaVSM7emrsht/f7zJ37v+yL+z0vy7jz5MPC78aP8OIkJqC+16wAePiAJBpyprurbKvBWKWzHOr/LzSuc8S0J9CXi8E0wxat+1lpZnRByu7yl5geyhJrWDF4vrLCZbatRU7FIz1hhlaFZ1+XnjKLEgZA2rUVkpzapc26Io9FKVyEHJWBMUxWcavzee8cv5Y09Wk1dmx28NP843Xn6T6298m5PjAanaArnOM86JwVmOtj2d7Qmh0JojG0HyzPHtp2gmcPbUh/iLb3yLFz75OYSMaTDnQtd7UlOa2PHZTS4ePeD05IRcE4gldG59nZZcCs45MI1UNS7xepl48Og+Z7tjjKmM4wV/8cprPHN2Rp737IaOi0PWIKVgWcbCOEbmktVObifuUXjx6ZvUXUfNiVYsxQlGNIFraQ2RPdIgHyriLaYuVO+gGEpMjOOMISg93BtM8Azd+0u7/kDMGJyzbEPUlWTJWG+V27haVA+xkLPGyXmrgNcihmGjCLJcLHPKmM4Sho7OHnF53khJV44igVKTKoFFQ2ZSqhxte0D/XWME73RV5awh1QXqQs7KK7i4vGCZRpzTJ16MMy1HlJGg5i/fO4yzNKOkKZrBGS3tvfV0weK9RWxQMZJ1xFTJOVHTojr6EklZe81f2j//7pUmll/ZPwdtXU8J5JoV+4WGrjlrqK2RVxJ2QZ94igFraAEvONOo1eG80blNKzizth4lY8SQl0JbIiTNyTRG9ac5V1orVNdW96rDeY/zFusMzjqkqURd0HAg/3jYGBWIW+IMZBxFlZjOY4LFed1e/MrhhffUK3zr+Z8F5xEDuRpiqrTm6YwCbPp+wHc63+iD5fbpLT79o1/EB+GlT/4AXjLPPPciJS8rTPi7oB5vDMGr3Hzoe1or9KFfZwN6VVZRGZlpzutswmN8ICbD0fEJ73znTTZ9h7WWVHSwuvEaPnTz5jGGzPnlTBFtLYO1iNPb8eqwJxe4vk7E3CglUqohVqjWUBtMkw7QrQ9aHYpDvKcFT80G8Sqbd9sdxkF8n8rHD8TBsNltuJgUolEwLLkyL4p0W7IOCFNWUrDI46TrgjFamjZpzHEiFcsSE/t5ots0irPMqdIH6IJTxJvRHj3mTFipR4q3UfsqTehM1eGjtZQiLKu5K6VG2O5wwZNSI6eo5bqoDLrWqjemEbU7i/6+Pji2J8eEYBj6gPeWruswFl2ZpsiSKilFYlpIeaHmxMP3UBUCPKydJlSVRI4Lua3DyFJoORKXhLRGLoVq9KADu+r+wRrdcHhpiBXqMiGuwz2xbS+0BjlmSl6oLWnfuiZNYYwq/5zDW4sLTm3YqAuyNtS8Y7RSwQrGqp265kQqC9ahSd9YKh21FsSKekWqek8efY/Xn4czHTx3W3Jr4BzjeE3XDTijhruYF3JOjIc9MV3x4N4DputLKo3T23fpdzucXQ/EnNawIRUG1ap2dd9tGKdZVaKtkFJcY/oqUlXR6QSG4ClN17ebzY5qGvtD4ZkXXuTWrafphiO810zTbeg4Puq5nAqbTUfoOhyoCaoUppR5eDVq6vpRYI4HUiqk3EilkEpUgKwxFAOZDKsbuOYZYw3DZsB2uoFxVofovvv3J6P/5esDcTCUXLjaR1osSG3U1pipiDUcpoVcmvrcRWgtY63apq8nlc06a2jGMx0mzveqihyGHkxhzhExMHR+BZesbYN5HISr5W3wDm89peiO39uOHDOmFS3DpLHME04ctXpMWYCsbIQccdbinVMsnNHSz675m9SCE8GFTr+AFYyz2OAxYilZSPPIHCeW+U
BcZnJJ3zPN+qZZ9BArMyXN1DSxjHvitNCqQVpDHeyGkgtlTVsSYxDFCWNtALvaiAUkq2QawFmPrGWytRbXbzBOB27iVPTUXIfvB1w30DCYYDHBYIMDaxGvPo1WUWt7VogLgKWQF6jZkdGVhIous35GzpJK5lTemynplwvmWLg+7Lm8nmi1Yr0lp8aN23d1k5VUiGWNYTg+I6XM0dEtbt25zY3TMzU/pYXWvK6CjaMl+0S0luKi2o2SdRDZGkZWUZEIuQrTmCklEuOCiKHzGkG/v9pz60PPcf/+BS/9wKcYNgPSElNKhN4x7y+Z5oVSI9dXe4wVvHfklWR+WDJFKmm6BtdxmCJ5nnVdCSRjSCyIKeSaoSbNbq1ND6jB0XWBEBRH2OJMWt77vfxe1wfiYMg5czhkNfQER2qqsW+AFcOyCmtaiRjjMVYR8U2E8XBAC+ZGER2w1FYZtp5gO4J1lFbwdvU6iKVh6ENgntPKgmg4DAJ01mIRSi006yjNcX3IpJJV+y8Fayv9diAlaNWojLcKslYwsobYsuTVw2BoOdKbyjAMHJ8dY20juIDdbAm9R6xRC2+KzNNIihM/5/6c8B4rzb/hvg5odfDV9CH+p0c/zv/w8Kf4Hx98nv/78ojcFIrbUDpUqXHNS1TgioiAsWtqstNoNFFVYUZISclIPnTq4GxV9/lhwHhPMz02eGS7Bb9RoIj1qrfg8YHXoAWMOLxRWlGrDUdVrYJpal3PCy1XKIr5r7Wy5EjKlZ8uf/wuvQJ54e43f43egxVN5S4lYRFoiYfnDzAUrOvZnRzhh45+s2GZLrDhhM3pKbujE673BzqnmR6dNxQ82+2w5oSa9YFRON6dotQsq1VTEUhNvTlOs1aNDRpBUDPGAeI4PT5hng7EZWE/z/gQWHLh9MaOZbW+LwkwAbGWlMqTVvAwLcSUqeKoRZ2yc4nMqZHTouye0JNbI3RelcM101LCWc2wqDRM31FyIseoq873cX1ADoZCaYZcMzEDrSE46tLobEdKikAXRFN56tpeJDVQjXOGltntjjAts6TCIVVsFkxuSG5sj3qcC+QqT2CvDTUMWWMU4S3QWw0bNWYlPJlCM4opt14zA2vNVL8heE/KCVCHY1kFKVISYDQxKTf8cIQRgSJ0zuMksTvaqbiqJMQJGI2vi7OKm+Yp8vn6bf4b/yfckAlonMnEf9t9jR/t7iFF+MP8Av/H8ukVly48qj3/6+WH+f3xdH2fFFW+5EpOOtysaUGoq/0XdYtaTy5Nb4ay6P7YGF2HWovtt9Ra1Ufg1DlpQ6+HwLYHo4SlZjRIR0pWy68V3PracKJuROXNIrmsOEe1q2MDtVb9vbHEac8PjP8vX776Pfz0UHUc+wc89ee/DN/8A7abLTlGbBNMM7RqOLtzhxvHp7SYYUkMN+/QDR2H/TWH/Tknd075zptvcH15weluwzBs8J1hnCc6b0l5WU1kQquJnDPG6gYix0SrmoBWRDUZuVaM6zDGcBhH9udXeOPx4pinkftvfQtnjGac2J6SK+PhwPnVgY0PT9qUSmPOmZQbLnRMKXNv3a55j1aVKVOy+kxKLZRlwvVexYA0Sm7Ybc+SFtI8gYE0H4g5EXMmxu9D2/WSMudXM/MM2IVxrlAVOFKdULOezPsxsRn0jSNpbmOtGeM6pMJ0mMD3PLqaecoZpG+kRfMgX//OQzrruX+dODtybLZepdRGmYGDVxXZ6dmGQOGpG0cclkX75hjpjwJlWUj5BGcteU7sbh6Rc8VU1qFdxVpLdUH5/lZ3yix7DI4ajvEWNv6UFgLDsLBbRualcLi+Zhkj45LIMWFsxLg9H+uu+J+PX9d71TmaVTXeYY78ev7EuxiHEcsvnj/Nl7p7KgO30JowN33PnDPrF1xDhG3V6sFbqz8fHZZARWi2kVxHFUd3+1Rbkazzi2UcaSljQ8B0AUqiJIv4jpYNpmWqEWqpdJuBlBSMYv1ARTc3xhuWMeKcUGviq/kpfn
3+KOcMHNeRH5p/h1d++3/jhdMNX//GW5ycbPGbgftX1zxz64TzKdNZ4fjkhNoiyzJRpkhrwoc++2mOho5Duub09k2Ob5zy0me+QMoz1gamcWF68EAzOMWoDNsGjPPEaVrt6kJNM3FZsFKQMCiQuKw+DxrGD5S64A185+E5t+/coTZDCJbPfOrTSA3cvHWH63fu8/zTx1w82mu7Z8EbS26o3iOvHhuj9vKvv/YdPvfhZ8m50PcByY08TizVsT06worD+h5rJ5oxNBrT+TnFObrNjnZ1STaWdj1yvsyk6fC+7skPxMHQGuSiJzNNdJBiVNBXasU7SxcC4zSyRP3gxQjdsGUZJ2SVN0tt5ByJUQ+PKqgr0AiDGIYu8OB6otTCtrOk4lhSxErTP60RnGVnOranp5SH9zCm4Ywlp4zB4Gj4ztHmPfOhEfojqgRqilgMlaA3nwikGe8DxXmwg4qHjEqSjdM5h5EBYyLWHPEwPiSIyrlrjEBhaZU5eLzzSh9fy3ChvGewCuhwMqYKLdGFNbJuTd2SogM/MUAqWO+RDNKCGp48BCzZe0zwSpZyQc1FKkWjLppXkE0lzRO1Fe3zscRF+3Faw3mPuMYyJYxxlEWZht7rIDSXrCvA1viD8Sl+IX/qyUF3Zbb87o2f5fQjb3Px8u/QBVUMjtOIM5Zhd0a7OChR2xtsC/TBUqOA93z0xZdILWPLnne+8xZf+OKXGMeRk9NjrHWkEjm5cUc5FDmxpESd9kw50gdH3wctwa3Q8kJLAm7Qct/ohuJxpoOqNA3jYWGe9xr00qDfHfPGN1+mE8ejvLDfHygUcqu4nJlzVhOecVQa1qDw4eAoOdP6DWa8BgJiCmEQYpp0gOwtSMZ1G5oRjFEIcIkzV4/OMc6TjOHq/jktLcS/Ckr0X/nVIJVGTJXOGwSlBfs+UMdI7xXkYhA03Flo7fHQyjHHpIM/Y3ANrsYR8Y2zo55d32FSYSyOKS30Xp19Mes83BkLVv9fsqV36nqbDiq3jSnju45WBe/CCm/WdB/THE4qza25EXXArD74JmD7LW3ZU4sBabjOU5aDEptQ/YZ3AWOcMgo2Pfv9vH7Z1AVIg3GKeJsJXUetGnVncNyQifP27mnzDSZymgihJ6WCKYKRRWnGSfDdgJSF5j1QFYbT1kzNKmSbyLExDDuc78liaCWT4gxZOYI0PbzEgmmGUiqxzGu2osF6xzLNqxpTs0FzKvjeU5vahoWyKjQr/6h87F3VT7WBq8/8V/T/9rfY9j05F7pND3mm1oQ3hu1mo5g6a0k5Ml7MPPvSC4wpce+NV/jwcx9iE2dOzp7i9PZtqFml16VijUqanbOaaL5x9DmTY1oHuEq8mpcE4kj1mr4fiC1ra2g0TazZVSviGzlWxCnjo6bI1aMHHJ/exErBuJ5WRxDDlNQ8aA20nOiso4pSxzbDwPnFBfO8MLhBc0PSuilxusGxYpW9IEKQhDnaYpJjmR0bEV7/9htUsTy6vGI6zOss7j/8+kDMGCpo8KsxpFLoO4u3OoiiqYmm1crQB8TYJ/DWlDT8w9iVOYBq9H0XKBnSXEhjJjenh0yDo01P5x6Hynrc6sCz1rDrgvINmrDpPSFYum6L7xyh09K7AKZkRAZoljkWWo2UpoDuUnVQqU6kjPEdth9w3uKsgPMYq8NI3+kT30ojeM/u9Abboy3GmFXTUShNycNLqsxLIqasRpya+Tn+9N1hLS3zs+3PWJZKjAWMoTQdRNZUdeUVR1Ja+86SVjApuODxwRGbjmLTtCh3EKVvtzizTlypa95DjZmaIjUvUNpKwG6kRV2Hj3MySkwQZ92SNLBV0fneGqRUzt+DCQmQ+xta/TVhsxlIywr3TROVQucNLnis80z7Cb/bcPb0C2w2Hc54bj33PK475ujkRJkOJbPEGe97vFH/izVVfQxSoSa8VU7H8dGO3Wbg1s279N2OZhzX48z19SX7/RV5Hil5Jo57ail0xrOsSVObfsscI7
duHHNxeUmuYEvGS6MzDnJhM3TqzUEhO12wijLUvpFvvfm2plDVSjFq83fGEjUbGWqhMw0xDueFbtvR7wZOTjzP3r1LaIZ3zi+49+iCLO/59n7P6wNRMbQG06LRWq2tT3GRdeLrIWesc+xnDSE1xpOzhs0227DOkmMi16pbgVaRWaihEUvDGH26WGuAghdDLGZNe7J03tBKXUGwwtHgaTmxCR6/C4zXlwzmFN81LfNNR2ehScTaXs1RfktNM74fVv28+v+bH5SqtFxSkhCcpYilmcJyOOjQrglWAl0Ac+MU6z3T1RXzEimt0orBU1lmvbGsEwgdP+zfoKXKb5gf5JwNp23kZ8qf8un2BtPB6nqr9oS+AwxVDDku2hBV6I2lxoJIRERL0xKbGp5qJc9XyAqLsV2PaUKOBcU+tDX5aj0wih6uOReQovbl3JQ/mTM2J1obqLmQV0ZEShVxQmlw2kYu3sNKLYcH3Dg5pmRdpnrvsKki8YA3HlnZn61GKJkxzty4dYOH99/Bb3te/vOv88xzL9FoOLHUlhCCKjDX6i8lJXMJlZomQr9VAEqJK6myqTuzyMpIaEgtpFK4ePQAZ2E3HDHOl9y8cRtjG9jM1dVDrOuYp/uaFLXpMRPkMpFyZLMZmMeM88JhSQy7gTHpjMl6zzfefIcXnr2torzWqKu3pGXVNRinaVa2cwRTyGj03pQN2xs3uGvgzjv3SLsdU3zv1ff3uj4gB0Mlr+tA71TC2Xs4LIlNEOaq7ACRptXBY57AFLEuILVhrCPlpJP0aqgO5lJZpNFi5jQE8qICn5Ntz8NHM8EbxpwYNrq376QwWMvx4Ng4XXO2ZY9v2lsjVr33ous3t0bTIx7bIrYL1BwJXY8hYmz3XeOWHzDLqLiwGvChJ9VJjUrOE+cDphU6gbbdEKxgx4Vpv9cnRq4Ya8ipPDEhpdr4PN/mR9obVNMoVnH61EyplqkVvprv8k+uP8mj1nPTRP7W8Bpf9udQKyVGxPaIqavBSj+Psqjq0QBSRxYbkRgp44htkZw9GKfwGtMUCNIqJSVaSZrgZUSdf7WB0bomm5m0zEhw2he3wjyOpAw/Pf0xv9r9Nar9rnRX8sL2//lFVVe0wnS4ZtjuECo+bDlbo9sO+wN3bp1w78HMzeee0nYuLTzz7DMIiTt3bpPzDPSUZNj0eqPlqqDh4DTDI2dNK2uprCFAhlQzwTmF4YiQrVqysdp2bjdPQUmkOHF8fILvgm4k3vkLfAlsBs37jK1CE6z1dP1Mvz3mweVICGbdPhhMU+DOEiOxZKY4sJ9numHzxA1rzEAtC6X0zDEjocelhVw8wUb6oafEDaYCdcOnPvoRXn7tG9T6fbiVEBGCD0zLwpgaeVooVbANjLf0QSfBKRecHyglkVNFrBBjwhrBGMfGBVorLDXpvjds2Q2e8+vC9VzxVZBqGMfIZgh4W+lN4GzjON4NjNPErvNsbaAbdhwOB6wx2K6Qlz3Gd7R2jGsLtQVkdxPJGYlR6dRisC1jYqXYgSYGqSq0McaQjUbaWVQ512+PKbUw7fcE40i1wwWHo9KGDac3KtM4c/7gPvNKR4YIyWOWA8YHsniSsWpdNplghYolx8If12f5h+azT2TFD2vHzx8+Sqkv8+XhnGlpWJNxXafZnl5BLlYcacmUWqnzQrCigaylsswLpdV1VoAONXNG2mMSl/IQq5OVsxhXspClBeEPxqf49f2LPGo9N5j46+VrPDe9xj//x3+XfOOrmC/9d7C7iRkfsvuTX6a9+i8o3lOx7GPk1lNbSi4M24GxFoiZ09Md55dX3H32Ni9+/KO4rifbwPmDd3j1lVf5rz/5GWgQlxEbdpjgwFRKtLS0IFKJaSbHjO+cVkK1seRZD/AU1Zpvqm548uq4qoXtVgVInevougFpjn7r8d0Jn/3hu/zOb/1THl4eODu5S3GNVhp5grce6RBRKExzYXe0YVkiLjjGudCycH0Y+c2v/Cl/4ye+CKZn65
UBIXiuLh6wO+qZZYfzHbsEuXlagX7oqTkxhGMkwo/c/CLxcMn/8kff/g++Jz8QB8Nj/FcXHBXRYY8RJDeqqRjvCbbSKDjJLLXinCNnDR8xdkV3WaGJrIMTzzROHO9OAM2NDE5DalNpDMHgTVtlo42aCrsw4J1ld+eU6fqa2gopZ46PjzGpUpIoTdp19F69B8YHmtEVnJVAzY2lGVwdoQtYpzLhVvWJLy3hu4FljoodS4uKpZZEMJ6UJpx3pLgQnGcYNtSzEzhk3HQgxkYqlU4MJaosfEYDR4SKGQaNR7fw2+GH3uU1iFh+df4IP8zbDMaQfaVFZVs0CtLAZN0QVQARYspPHKTiLKa2VZCj4Tyt1VUO7jDGEEshWNQDUgRjLSUn/mC5yd9P312xnrPhV8wX+IFHb/HoeqLe/13aK7/HnZsnxGUheOFQNXWqYeiC5+13HujvURJLimyHHiOGGhuf+viLhBs3KTUzXdzH7BzPf/TjTOPErt9gQ4+xMKeEMwYj2g5Yq14Z673axNe8Du/9CsuBUhfKou9HHzxiLYdlhnrEjV2HGLjeN5DEPCbi/prlcKDzlTs3b3Lnwx/m7de/xfXlTJrBiJLFDnMEUe3EyVHP/csDtqinZ7MdmOfIn736Op//xAsYDBtUE5FjZVrWFOxjz/U8cXJ8REsjeB3UWucRZ7ANuuG95eXf6/pADB/V8poppbHErDHesZCtI2VNAfK2sBt6rEAwkKI+iR6XmSJGn1zrKxIyrvM8OL8kWAex4CyEoEtFQSEwZ5vGLgR2G/1wd9se7x3ihC4obMTUTOgC2KoZirURU6GlhEjFb5TnV5JSeoNtOpwrhZLSk//LWItYB7URHFgpOCu0ZcE4h3MasCs0glPKsveOYXPE2Y0TtsdHDLsNIYgGluRELUnFOLVRmuorYkyUwnv27KAJUJKTchRFXafj4YppGhkPe8Y4sSyTugprVphoSUzzGge3vsmlaflNrVSUGVBaXYeTSrbOrRJzIscDv5pefNfmIYvjz259WZPCrJbVF5dXmNXUZGwg+A5WvH6suvO/9/CSICpsO39wn9wsF4eZMl9w77VXuHHjhP35Jcc3b7LkSnlsn25FRVjSiHnBGIjzAZpBqqM1SDFjiwoM/GqaK0XJX1aKzrKcYTds8L4yLQdyyczzCFUYl8KN23f42tf+hIvza4pseOrpm3gxjBNEJ4RgGaeFzlmGYDjaeq7HRYfmzZJpzHEBEV59+x7X04FcKlUccZ6xrqdUw5wT83zA2Uquuj73teJdr1b4Xmla/fa9owy+1/WBqBiMEQVZ1IyggzixhZxmogimCEPvscbQ7zr6uXI9L6QqDINjnCOlJULwGt+mOyDikgm+w2CwQW3VtlWO+h6pmSAW04SNC3jnICWOtoEyjziEznhEKs4ZgteZQ6HgzQp3qbMOskh0g8OWmdqUTITtSDHibFXxkw9QJioBI0IuBdsqrVp836m1uep2RtRMocrLnLBoWK87PaYbPXvrGS/OqSutSprQSlJZt1POI6ZwUg9cvke2xUkd2Y+TWq0XQ6bx1fIsvxq1xD+Tmb8VvsWX6n1S09gzaRW8J5XKH803+JX4Ao9ax5ks/E35Jl/0Dyio07flghiIUd2fJV1RSlXdxXtMx/NwRquNlAvHfUcuqzwSoZREQp6g46RVbp6dMPR6o1ILzXSK4HOGw/mBj3/6c7z6zT+l297i5PQU53tqszgxtGZxzpBzUv/LEpHm1KJutV0NYUCkscSIfzxrouqa2XjIouvWVVNijZDKQi2VEBxDqPhuy3hxTtxf8vwnXkKsJY6J47Mdy6OEdZbgHaloMtphjixLUrmzUfBLyppDmo3hG2/c53MvdhxvBupYwUBeJrouUGolxoyxSQNqZAXxWkfwQjUdLS/v6578QBwMIsJJbznaeN4+nzRHshT6wTMvGSeN/X5mGwLJVAKOPlhMakzjjBW3KvrWXlfqyoHUL8BCpTOCbY0UGx
tvqOKQVDChZ9h25BTZdI75ao+I4fmPPMv1w/v02+3/z/zTcK0QpNBcAIR5ga2Z8MMp1QVcE6hFSUN10tDTUvAkyjqDyM2qitBulSdZFkSEmBZEHIhoRoAPNNTUVHJSwvHQCCEgOTGPM9UZalO36bIkOrMCgQ389OEP+YdH/y6+3rXET1z9Pgd7IFidE/yx/TC/1L5b4j9qAz+/fJySEj8i7yiFGDAh8K/a0/z99BJptYM/aj3/e/sUOb/MF+SobRgJAAAgAElEQVQNvrrc4R/xCc6XnlMm/tP6NT493yPNkZOT9z6ozPhQ20PvaaBSY+uAorZoIEijCGyscHq6RZaJ3cZzdXXAWcfpzS3HQ8/m7Cleff01DJEXP/lDHJ+eYl1gWiKbvlcKdmtYF1jmPVYauWWmuIJUTWPOkc45nAtrwI3mmy6psOk8WFaICiyxIlbl5D5YjaCvlct7b3H37m3+5Dv3eOpDzzAfHmA6x8nQ8/DhfTadY78f2W2PqNIYx5mKeUI3D0G3Zt5a+k3PN177Nh+9e0rnHLvH30eplFJIS2OcEmJmHVLWNWDHVMSEFRDr3/W+//uuD0Qr0VpDnKXmwtMnHbeOAndOBgYHN4YOJ4LFssRKKxbjLUMf6DvLENTRZkUNV8YIUlaOI7pTz1l5iKVUuiAkCo7M0Dt8pwEi1KSrN+PY+p5Kw/uOo82GIJZWM9vtCaUG0lRpS6Ph6bxF6oouo+G8BrdYB2IcrDdtWnFzrVVsqxhrteSulUagFaNS5KT+BuvXntBaaskYa3HW0gfP0Adu3LrF8anKlKEok3IIlBhJKVJS5NPxW/zc9b/guOxXfP01P7f/57y0/zpXlw+42l9yfdjzj+vH31XiJxy/Vl4ip0jOK4h1nvi19JEnh8J3f9by6+nDfGW+wS+0z3COejcu2PAP+AK/n5/i0TjzhQf/Esl/Kd8gL/T/5hcxRnmKuWplMM0TS2q63zcN1w9U0b7/8uEF221HTknnLWv4y0Lg1nMfoQuO+99+GxccOepTWM1IGRFNlT4c9rqibQYnA05EaUvGYRvUXJGqRKtSVu2F85TWSDXphsxZbVEIxLnQpk5nTOPE6998jU98/mMY6agtErotu+Mth/PvIMYzxopxA/v9xNV+5HouhFVm/TgUqLOeYegoOXNIwpuPDkxJWzNvGrFUas6kuOjKtmTSslBrpa6qzFQKxlnEfh8GzvR94Om7J5R54dHlRI+hGUtKQiqV02HAeWFOMI0zcUw45xmMY3fkmPYjzXpYg2MOE9QVH4bTR2gsmTFadr0jtIrRUAbOusCQI0+dbsF3nN2+Sa4L8frA3Wef5eLyEYHIdriF23ZIKTA4jK3kkhRaGzpcucawoWJxIai6zg20HJGWaTWT0arGYDV9qYkackSgH3TgleMaoqoTfcRjnVP+wmp4Mgi9RIazE27cOCKLI6ZISgtXDw7EaUQMpJj4RPw6nxy/jvFBV6fqQadldQvuD+dc7N7bq3/OwP2LB3rI1kIqlfM7/Xu2A5ey4Tfqx0nuL80QjOM3w2dxv/33GKffYnn+T+h/8r+nbW8i+4e4f/UL2Df/UCuelChVJeilClhhcJ5KZYk6gMylkg3YUhinhVunO7VcLyNhd0JOE9P1Q37wx36Gp595mmmZKDkSPGpNbqKfgRiVnTtPLXH1PkCqMHQ6vEulKsZu3VZ7K9Q1+7RKJpfHaECDWM/mzHH/0QNOb93hlVe+yW/+k9/jQx/5JNvNlgf37ynZ2Q5szMKjdCCbxs2bJ7z21j3AMLfGZgiUAjEuDKufp6RIFxx/9PKrXEwzP/GxZynXmRB6zCqfrkWYpsiyLGyPT7C24athNxxT6qIGufdxfSAOhlIqS8w0KqFzBANF1kTgIpRU6UvDiXB0MmBLY1wy0zhTm2XjeqKANRpIY1dEeVtj6FLKOBpNEtlWnPWEsrA9OuXoZODWUY9xuuIx3iLJsDveKI05zuyOtpjeYb3D2IItI8
adErylWgMlkWuHNRVb8+pMVBhKlYZYr/LaEimtYajrPEEIBjKNmhd86DUPIiU9z3hMWFYKcSuZkioYQ9gck5dFE6VZMzlbTxeOmA57xa3PkVrX8NN5oVqnZa8AvieiztPjuufKHr3rczkq11xdX7HpB1KciVU4Ktdcu3fH7m3LNQf/7r8HSN0ph8OIt4bjd/41h5//CqcnR1weRqwx1MEhRiur1tDqEaBqzscQPE0islK/B2+VB7rpcEGYU+HpDz3L3WfucrHfc++1b/PCi5/len+BdYFcFsTtIEVabErPtobUJkzzGvqbRw7TwmbYkUvSWY8x1BoptWCaUa+JCCKNnCveNpoB740mZzlHKZHN9hbPPn3MP/u/HvL8D30CnOP+W69y++6L5CxMy9u4g2F3suXB+QpAro1ho3g6IzqcXCZ184oVxBpqgVffeJsf+9gzmgyeJpoEpBlyjPT9FrWsVmo2pNZwm6rxg/J9aLteUuWVB5FHi5Jt9gmu5kKqwlIqSSwTluI7llwZi9qgw9YTho59LuzHyMXlgVyEMdcnkWtiLM45Ot8zdJ7eeaxVP3td5afG9piS2XhLGvfUw8jueEtdDty59bRG0rtACAFTKrQt81RZ5owrBStrz2eU9FMpiBSaqDqQllbmosMHB1n5CF6UKmUriPGaOlU16JRaKElpwGIEaU1bilUwVFvDhE6HtW6FxHjP7mjD6c1Tzm7e5OzOXY7v3uH41i36oyOEii1JHYKm4KTgvOWnlq/i/xL3wNXEj1/9/v/X3pnEWHZe9/13vuHe+4aq6mp2c9RAUhGdyJYs07ISxIoQZJHE2ije2YvAiwDOwgESIFko8cbbBBmWAWzEgGEYMRIkQYQgi1i2AAmBYJtSZImyJkqixKFFssnuruG9e+83nCzO1zRNsjlIJKsaeAco1Ovb71Wduu/d757vDL8/2+0p146OuLFJvHD9Bj995bP4lzXLhJp4+Mb/ZVWOX/X91eOrjQMB2zExDAPbcdNGiBUvzprGsAvM0ehdczLF7zway8ELB6ue9XKwu72HMRW8Fu669zLr9QW2R9e5sH8J1dwIX5G+X1o1q19QSkbrZP6UJV0WvArB9wyhQ3MBF8itcqHqreXYuTaursxFcGGwu7Az369dP+Z0syHVxNVnnyLlwjTOXL7nApvjI4auZ8wTywtrfvjCDfYP1zz//DVrpR46tGmeVFW6IAxDBC2WpHaOaZvogoAPjEmZMpxuIZUOF4P5NSdrzU+FOSUSlZJmI4y/yWvyXEQMKReevzHx7I2RIAp54tLlS1BH1BW2uTKOM8EZ4Sd4TxoV3xXS0WTNQyqoRHJWcvF4Z4nHGCGKmPhLcHhVeu8hz/QHd9DHwGIRSROklIhDh/qBsjliEfeIg2O7UVaLgRAiKS5NGs/3iFPSnBmGHlXHnKGPhaAJE4lTnHP4WqzdtmlFzAR02jbB0g5pz6nTlkyTslfrkaABVh2tuiFWyvJOCALVeYrmplwVUYEQO2K3pEdNnGQx2B1y2ZGmQikmpipTppfET8/fwtXKHy3+OkduzX494eOnf8pP8gP00h2kyaIwHx0f4kn6a5/lTy78TY7dinU94eFrn+d9m28RneNzB3+H7P4i0SV5Yv7C79F5RUtFvCfn1O7a2boItZgWRU3Wcrww/kMuttBmHJ339MFRS2bMsIgm4uO859777+H4xswLz/+QmhP96gLLxdKoTGr9HbUqKo5hsbJyeJpxzpPIBLWO1iAG/5nzTBcjm8nEf6L3phsarWej6wJTLqj3lDmzmU9YrZbUJDYVmbfETgih59r1GwxOUT+gOeFR1ntLfvDUs8zVUUqiqjFOU0r0fcdy8JxsJmLscRXTxlArQc8Kj115jofedZm9QRjTFh8jJI8PGbQyT8JybwFpS4j7bft5O05XAnOjzySFzkeuXH3B+gKwsdqaMqtVhwDjPKEuMM2ZKJEppya8Ypg4qiNrIQZBS2HRe1ad43AIBB
9IaWYYOsgzaCYXJXYD6h1OHMN+z7Jfs502eDrIR3Qo4j0TjiLGcCjVZjgsERVsgrAAWggygwuotJHn0lRVxOjXYbVmGk8o80zsrD7tfY+qkZVKoy2pFivLKUipjR9gQBMXe8qcbGbEWf9F1YJKaHdeJe7vU7W0ysaKWgvbMb9Yz5+Pr5Gy8jP8gA8ePYaWSnWhJa5ooj+Cklkue7IqH/RP8OGT/8o4jTZToQW88ND4TUQLf7z/MU78HqtyzPi53yU+/gWcKNUZkbvvgvVD4Bj6jjm34auGn6u5EDorH+M6NpuRYW9p48ath8JYmh0HewPDek2VyObkiBtPfJs7HngY7SO5gtOE7xcmMahNU0OVELyNRndGOaLaIhX7BU5bTqMmOu9ACjF2qLOopuBwvlJLxWkmFVNUH+fCxUv38KUvfAYp1Ya78Nz7/gf48y99iSvPPsF8suHKc9ebRqYwI4BaS7SLLDvH0ekW1FS/fRfYbGe8E9RBnipff+o5Ll/Yww2RC4tMSkroTWIwxIEyZ9LpTN8tqEkRr5ZrexN2PhYGAUXZNIWoMSe890bABbbTRN/3nGxHDg72ScmysKlUjvMGJ86w8631uOuE6ANzKYQ2vXZpHbm0tyB6AV0wb09YdUJOUPJMv7di3hZcVfZix+rSReoNT3QQDt+FhI4QIj6atJsE0OTt4veCSEV8wDtFJBrjnwkJHTV0TZBWcHT4vMWVgGNN39sEYs0TI4KTYOGsOPJkDTjVB4I4are0WmSZER/RnKniUUzUpWhFvBGFxs0JPgRogNrYB5ueFMewrIAN4pS9A1SNwq3VmNe1VsPeeUUTpJroYkdRo2alkhk3x4SWmR/UkeqMquOvjt/hr83fY54zn/vyN/jGs9etdT0VEKVkwfUO5zu8WjXodK6m2iW2GI4p2fvWmJWxvffrxYopZ/Ziz96qZ1jtczKPPHjpPi7ffS+PfP4P2cyV9/1shy82Zi7BVK67IFChlNRuDgXnI6fbrYnJloxzDsYJFWeUbyJosDH7GHBUUq2Ukm2uwtRyLHJAKDheuHaNJ7/zXTrvGQbPgw89yLWrT7PdHiHFBHHnVG060lWG0CIRPForx9uJoetIcyZ6JY0TwUdWQ8dmmhmc48q1Y/73I1/nkz/7EErPftyQnEfLhDiPerumkioSBZ8NBvNm7HWXERF5t4h8VkS+LiJfE5F/2o7/hog8JSJfbl+feMlr/qWIPCYi3xSRv/d6v0MBxIHztO06pSqL5ZK5mB7COGeqC7xwdGr4t8Z89N5bS7WzaCMXC0E302RkXbVkE61GvIyORbC9Z5pmRAL7Fw7Rect6MbB35yVkb81mc0JwdsEQvcnHe0OzRQeLoafvrfGnVMW1PXQhUOvNxc5AsNQZ0RmHdXhmDWwzaMOGBacEyVCVedpYCbNUQj8YnSrYHXweZ5s4FWf9ESjBKeqElNt+fJ4oebaegJLxoW+LbE/sB3LT5yhaCV4YlgPL1Yr1esV6b8V6NbB3eMile+/m4OJF7rjnMnffdzd7B/t0/WCNS6dHuGIaE94Joe+J0YR3YnDWZCaO57ezXWyI0afwLBZGtao1m5J526N3PhC9kbDEOTyWf6gKtSTWg4X73ik3TmYWFw6ptfKe+98LNbE9uoF3sN5fELuBqQihX4DLRGfzHFFswjL4zvQ8qjUQKWoRgVq7eQgwz2ML8YXYKlghdtD0SYI3Erg6kzrUkok+cuP55yilcO36dVQd670Fzz3zPOKWnJxMjLlSSyY66Jxn2QSXSTPeFUskpkLN2iQFhcXQMxdFxCNBWfSRGycjn3n0e5ycbDkpHUVnvATG7ZY5F6Y0UeaJ1FrKaVIMb9TeSMSQgX+uql8SkT3giyLyB+3//oOq/tuXPllEPgD8EvCTwL3AZ0TkIdVbI2SkpUZCE0m11VsYxxHvoFRrLqltSCeGQE7tLgdNpBaGPlgrbakUxYZ/xDHOiQt7+7gmWjvsD5Q8sxnVKgm+4n
qHSiEG8J1HSqHOI6UGFn0gRutYHEKP9D2aM2X2uFBxLjRIrIWbDqjV4xRQJZZEloh6cGJddMGrtRaLheyiC/poC1xNk20pxBvIVgAJuCigs4nJloxgyangPTrcJETZW6qI1f7nySYFCShNmVk8Q4zW2zHPFiJLtHFuJ6RqCc0QPCnNxousUOdE2myMK+E7Om/6ndtxAvEWtWRLps5pZjMb2dt5R83G6gzewLm2oFnCTTHdSlGrRiw629K5qtRaGPrAIhpMtqrnrsOeu+6+zFOPfx/vAxIXIMo9l++k27/AwWpgUjtHLvYklUa/UmrN5GTj4t4ZL7LWarM5YhonpVacRqiZsSp97A3YkgoQic5a8asUutCxHScrD5fK8emJdUD2S8RtefqJK2QqaT7meDMyTQkfektGU0AbWs4pm9G2jFVNK8RR6Ya+aadUut6RUmSeN0DlmevHPHm85T4vLBkgdPRdhiKkOaB5y7BaUfoemd7cdOXrRgyqekVVv9QeHwNfB+57jZd8Evh9VZ1U9XvAY8BHX9cR71C1ZozYdYgTcrHtghf7QAXvLEObEqkYmKWoSaWqKjU3so43hqE4pV/0bEvl6rVT9tdLlgcXGOKCRb8gOs92HAkeXOzolytKGln6TJ1nvO9Y7y3xmNwZKFU8OM+sjjjEF8ExXkFKxrXmJXEg3lFqYM4mXebSiNNqJTq9KTDrEGe49aAzSy/0XSSrg3mL14zmylyMhJ3V9CtCnSBnNE/UXHApGTC1GgdCxKE5IZSmV1kIPqBq5OqcC1ISzkfT6oiePM9tgjWiIi1PgXUfamY6OUZKIjihD84Uo8Qb/alAzqannS3jx5gSUg33DzZ3kIuSq929XFN00gpIsGlMrXahqLbSZGTVeWbxjJsNB+sO1y24fMcFnDi2Sajzlm1SnnjyaQ4u389cCs6HxmBQvFOKuobHF7xmYrD3R9WeW4qCZlK1KlmpyUSEGwRIcWi1bsPaPmM3Fc+DtwgOzTz/3DMcXDzEq6H0EJhzIsRIzZO1sEsleGflaS2IKKejCQ1pg+D0sd0uq1JzwYviHEw5WfUGRwgdX/jadznZTmQcabpGLrBJlXHagBY2164zbTak/DYKzojI/cDPAH/cDv0TEfmKiPy2iBy2Y/cBT7zkZU/yKguJiPyqiDwiIo9UVZOO6wbjLTRlqIIpOU3ZMu7OByREQivNxS6wXg4MQzSAR4iUAr1X1mv7WSlnUIukclGbsVdlsVqg1eNroaTKYrHE10wdZ/KkhH5B8J6Ovik7YUitLtANPcPgUa/E3hqr8mwoMDVRQ6Kr4AM+At7alkHQLBjWSA0hVhNailUscORR0QzLTqgtuahK68OgbZ1Aa4As4ExXcxxnSirUtCVtJ9J2Y7g5tb4NEUxVSwKiShQMc9/0NqHi+x5ESNOWeZqYxpG5FObtljJn1Dt87Oi7oSlBeQvFsyXfvC9IFVIpiO8JMeJjj9BCb81MKbeOxWyNXBXmlHFiC5K1lFcWMdLHwHIIdmdHOLy4phs6fu5vfITtNlNkIM1brlx5mvH0GocXL1FqKyU6j0NMySkloDbITUBFKdlCFe8DWkzMRVUIDU3nQgRxxvlsWxtVq4qlYgI8qVTmqqizsrgTR3SwXO1TnSdvtqjvGMeMRFMdq9qqarkwzhOlwjSbLkloUUsfPavVyoa2QmTROzqZGef6opp49EZMn6ry5I1TpmmkhIFUJsZxQ6VjSoJ6o0Ll8jYtDCKyBv4b8M9U9Qj4j8D7gA8DV4B/d/Opr/LyV6Q+VPU3VfUjqvoR752pSIig4lt4aIrASQUfHC4ExpQZ54RWe1OgUZ6cNaPc/IA7Hxk3xrgLIRL7joyJpaQ0MW9OSSmxWNi4c61C3w+EfmXZfT/QxY7xdGJ20hpcKvPpKXGxtAYlH/DB41yH9x01iAnr5glXEwp4LTgHxQW89wZhE0wsBiVGG3d2ruI1MedgDVNYOBl9pT84hJJtdkITrmZAKC
GgwXIyNsdg5yzhcC7Qd51BbLzNdPhgC2cIznQlxLiTvovGS0zJuiuzYdO1VuMSzFvmacs4TbaIOSVVjHdU7QKTaABfre3v8Y6pZlIulJKNhiRiakythOucZ87WA1KFNmNQ6ILj7jsO2F8GfuL99xOa/N+6V7q+4+GP/hxTThydbLlwcWXQEjxVHG6xZhiMxqUK1RlEN7etpn3wbEjPNPKyAViC2MxE9a1vQIyVcPOTrLCdZovCWgMRaHv/DSdYq9G9kvTkGNicnnB4Yc3p8TF7e4ekObdIwKHFJm6HEIxxqkJwGLNUlC4EK8WGSB+gc4XaDdbzos5adLxV3PCeR594jqnmpktSSMVEnpMmtqen5KLM49tAcBKRiC0Kv6eq/71d2M+85P9/C/hf7Z9PAu9+ycvfBTz92r8AuqFjyhlLEiolF9zQ099kAzoxDb7ZQKRBGk0atW7Alk1X74lF8aGnlMJmThQRngaGJ69y74UV773vDg4O97h2/Yjjq0dstiMHalsPH5TsPV1wlOgJw4AvE0U8cbWyUWrniTGCJGpOzHlmNXRUZ1i3OVe8ZEIt4APBBQimOFVtuL8RgoRaHQFTLvZdb38b1sRUnJCmU8LQI2Ak43GiyoR0HV7EoitvQiWkiehabZxipVZnd3Mt9UXZOvERh414l6p458EKGLZwVBvpnlOhZChFkJrt72qJ0Jws9K41UVOhFigpk0qhhI4/+uJXKFhyOGuwBaDkBqRNRBQRE8PtoicCyUXuvvsCDz74XmN5iufGs0/jQk8Y4KMf+1vcc9/dHB2NvMBVnv7GNzm8+w60CFqF5cGaYdkRYgBnGpNznkGcMUTLjDhPrljFQmxDr1VAPQ5tAsCtU1Atf5VzaolAq351zoRgvIukNOFCBxgyAIHLl99DetcRX/nil3nye19j/+BOxuPrOBeYc6ILpu+ppTDOhUWwBKfWatsrEfbWe/zUhz7IF774KKdzYUq5iRjbNiwbOIvtnJmL8v0fHnHX3pLVnmM1KClb70kIjs1mS+DNtUS/kaqEAP8J+Lqq/vuXHL/nJU/7ReDR9vjTwC+JSC8iDwDvB/7k9X5PVaH3pjIlwQRCVRve3EERhbaFoDEhxXmqOkv4BU8cFvQx0g2d9Uc7b3dGJ4zZmHi+i1SUk+tH5JwZqzEmT4+u0wVpb7JQJLLY26d3MM0mEx/iYGGyCM556CIae3zsKeqRaiUv54MB3dRbK3TOuJpNgRrDoHFzz1czqZpIrIgJlabijXFYC32/sDt7sARoXK8Ji2WLIgpaTAYdBecDFY/rOvxgtCjDODuruTtnF5m3etZ4uiFPs/UUFOvqU+wCztXCXuMpWhRhlGrbYzsn7fx2JsPuA6VUa++VwNWNXZA3f7Yh3rxxF9q2qe96jFfrca2J6z3vvo9F37MIkTwnlntrrh0fs9q/yJ13XWI83RrINUTKNHK4f8hfuf9eDg/3rO7v5cVOSo+z6FNMSyN2A7nyoipTVVsgVCzx7YJDQk/S+iJYOGtt2pDKnExcKHRDyzHYSHq6uYgCd16+l+d/+H0OLuyRa+Zb336yNccNNg2Zs+lSqnExF0OP00pUZegCPjj2h56fuP+Q7BOX7zywz0uxvgmp1bphscgt+IATz+e/+0Oemye2yUBEZAMHp5xRHNK/uenKNxIx/DzwD4GvisiX27F/BfyyiHwY2yY8DvxjAFX9moj8F+DPsYrGr71WRQKs98D3ke00AjamPLRQXVEW/ZrT7RbxdkJUTVy0Qpsj8BSySYVTcGotxkSodaaKCa16b6XDPE/EsCTNG47GLeXoiOwUXQ847+l6R03GbtymjATP0EdC56nVoTUZLVoU6SIhi8FIXUAkI0B2mESdt+2RJfGEWhTF5u21zMZ1KIagF21bkFaDjq0/30dhKjNeblZgKn7Ro3Oi1tRozIaMK34AadLzzu4S1TkTv8H0KFLK+NhB9Ti8JW6rNRmBtoZkUC2kdv5y9Ehq2HzvkJ
TRDH0MSPZMUzHYbnU8ee0qJ9kI1671f9Rs+9zivEURtVJzbWPt1g9x6cIB64sXWS97g5FMiedubAnLJT/1oYdYLQeuZ9DWd3DXu+9ltepZrAfGzSnDuqc6bDCqGmrPArRKqgWvEGJPrsVuOt4jWhpkxhrp5mLU5eKcJW4tu0DVRuwSx1wz4ixZqs6ALioVTRN9L1w/OmKQkf39gTwVnnvmu8zbY6aqFBdY9x0lW94lF+vEXa8XbLdGrbr3vssczY7TJ57lB09dRSRQXX2x16JUm9Xo4sBcCkmV4ynz/x5/lr/9wB0suwPzvxRcl6k6MS4vv8ElwUxU32Tnw9tgIvIccApcPWtf3oBd4vbwE24fX28XP+H28fXV/Hyvqr6hFeJcLAwAIvKIqn7krP14Pbtd/ITbx9fbxU+4fXz9cf08F9OVO9vZzs6X7RaGne1sZ6+w87Qw/OZZO/AG7XbxE24fX28XP+H28fXH8vPc5Bh2trOdnR87TxHDzna2s3NiZ74wiMjfb+PZj4nIp87an5ebiDwuIl9to+WPtGMXReQPROTb7fvh6/2ct8Gv3xaRZ0Xk0Zccu6Vfb3YU/h3w9S0b238L/bwVYuBcndd3AoXwot7gWXwBHvgO8CDQAX8GfOAsfXoVHx8HLr3s2L8BPtUefwr412fg18eBh4FHX88v4APt3PbAA+2c+zP29TeAf/Eqzz0zX4F7gIfb4z3gW82fc3VeX8PPt+ycnnXE8FHgMVX9rqrOwO9jY9vn3T4J/E57/DvAP3inHVDVzwEvvOzwrfz6kUbh3yq7ha+3sjPzVW+NGDhX5/U1/LyVvWk/z3pheEMj2mdsCvwfEfmiiPxqO3aXql4Be5OAO8/Mu79st/LrvJ7nH3ls/+22lyEGzu15fStRCC+1s14Y3tCI9hnbz6vqw8AvAL8mIh8/a4d+BDuP5/nHGtt/O+1VEAO3fOqrHHvHfH2rUQgvtbNeGN78iPY7bKr6dPv+LPA/sBDsmZvTpe37s2fn4V+yW/l17s6zqj6jqkVtcuu3+IvQ9kx9fTXEAOfwvN4KhfBWndOzXhj+FHi/iDwgIh3Givz0Gfv0oonIqnEuEZEV8Hex8fJPA7/SnvYrwP88Gw9fYbfy60cahX877a0e23+LfHpVxADn7Ly+IyiEdyLb+zoZ1k9gWdXvAL9+1v68zLcHsU9/6wYAAACgSURBVGzunwFfu+kfcAfwh8C32/eLZ+Dbf8bCxYTdEf7Ra/kF/Ho7x98EfuEc+Pq7wFeBr7QP7j1n7SvwMSzE/grw5fb1ifN2Xl/Dz7fsnO46H3e2s529ws56K7Gzne3sHNpuYdjZznb2CtstDDvb2c5eYbuFYWc729krbLcw7GxnO3uF7RaGne1sZ6+w3cKws53t7BW2Wxh2trOdvcL+P/g6z13zgwqzAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "image_path = '00000-generate-images/seed6622.png'\n", + "image = mio.import_image(image_path)\n", + "image = image.resize([256,256])\n", + "input_pixels = image.pixels_with_channels_at_back()\n", + "pts_pred = sess.run(\n", + " pts_predictions,\n", + " feed_dict={images_input: np.expand_dims(input_pixels, axis=0)})\n", + "pt.imshow(input_pixels)\n", + "pt.scatter(pts_pred[0][:,1],pts_pred[0][:,0])" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/insightface/reconstruction/ostec/external/landmark_detector/README.md b/insightface/reconstruction/ostec/external/landmark_detector/README.md new file mode 100644 index 0000000000000000000000000000000000000000..52792b5af8b665904b40cf85ee5bf8114b766133 --- /dev/null +++ b/insightface/reconstruction/ostec/external/landmark_detector/README.md @@ -0,0 +1,16 @@ +# Face_Detection_Alignment +Face Detection and Alignment Tool +3D projection landmarks (84) and 2D multi-view landmarks(39/68) + +Environment: +Tensorflow 1.3, menpo, python 3.5 + +Train: +CUDA_VISIBLE_DEVICES="1" python train.py --train_dir=ckpt/3D84 --batch_size=8 --initial_learning_rate=0.0001 --dataset_dir=3D84/300W.tfrecords,3D84/afw.tfrecords,3D84/helen_testset.tfrecords,3D84/helen_trainset.tfrecords,3D84/lfpw_testset.tfrecords,3D84/lfpw_trainset.tfrecords,3D84/ibug.tfrecords,3D84/menpo_trainset.tfrecords --n_landmarks=84 + +Test: +3D model: 84 +2D model: frontal68/Union68/Union86(better) + +Pretrained Models: 
+https://drive.google.com/open?id=1DKTeRlJjyo_tD1EluDjYLhtKFPJ9vIVd diff --git a/insightface/reconstruction/ostec/external/landmark_detector/data_provider.py b/insightface/reconstruction/ostec/external/landmark_detector/data_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..53266694a77c1a1090588705e691cf2d83b76d2c --- /dev/null +++ b/insightface/reconstruction/ostec/external/landmark_detector/data_provider.py @@ -0,0 +1,236 @@ +import tensorflow as tf +import numpy as np + +from menpo.transform import Translation + +from external.landmark_detector.flags import FLAGS + +def augment_img(img, augmentation): + flip, rotate, rescale = np.array(augmentation).squeeze() + rimg = img.rescale(rescale) + rimg = rimg.rotate_ccw_about_centre(rotate) + crimg = rimg.warp_to_shape( + img.shape, + Translation(-np.array(img.shape) / 2 + np.array(rimg.shape) / 2) + ) + if flip > 0.5: + crimg = crimg.mirror() + + img = crimg + + return img + +def rotate_points_tensor(points, image, angle): + + s = tf.shape(image) + image_center = tf.to_float(s[:2]) / 2. + + # center coordinates since rotation center is supposed to be in the image center + points_centered = points - image_center + + rot_matrix = tf.dynamic_stitch([[0], [1], [2], [3]], [tf.cos(angle), -tf.sin(angle), tf.sin(angle), tf.cos(angle)]) + rot_matrix = tf.reshape(rot_matrix, shape=[2, 2]) + + points_centered_rot = tf.matmul(rot_matrix, tf.transpose(points_centered)) + + return tf.transpose(points_centered_rot) + image_center + +def rotate_image_tensor(image, angle): + s = tf.shape(image) + image_center = tf.to_float(s[:2]) / 2. 
+ + # Coordinates of new image + xs, ys = tf.meshgrid(tf.range(0.,tf.to_float(s[1])), tf.range(0., tf.to_float(s[0]))) + coords_new = tf.reshape(tf.stack([ys,xs], 2), [-1, 2]) + + # center coordinates since rotation center is supposed to be in the image center + coords_new_centered = tf.to_float(coords_new) - image_center + + # Perform backward transformation of the image coordinates + rot_mat_inv = tf.stack( + [tf.cos(angle), tf.sin(angle), -tf.sin(angle), tf.cos(angle)]) + rot_mat_inv = tf.reshape(rot_mat_inv, shape=[2, 2]) + coord_old_centered = tf.matmul( + rot_mat_inv, tf.transpose(coords_new_centered)) + coord_old = tf.to_int32(tf.round( + tf.transpose(coord_old_centered) + image_center)) + + + # Find nearest neighbor in old image + coord_old_y, coord_old_x = tf.unstack(coord_old, axis=1) + + + # Clip values to stay inside image coordinates + outside_y = tf.logical_or(tf.greater( + coord_old_y, s[0]-1), tf.less(coord_old_y, 0)) + outside_x = tf.logical_or(tf.greater( + coord_old_x, s[1]-1), tf.less(coord_old_x, 0)) + outside_ind = tf.logical_or(outside_y, outside_x) + + + inside_mask = tf.logical_not(outside_ind) + inside_mask = tf.tile(tf.reshape(inside_mask, s[:2])[...,None], tf.stack([1,1,s[2]])) + + coord_old_y = tf.clip_by_value(coord_old_y, 0, s[0]-1) + coord_old_x = tf.clip_by_value(coord_old_x, 0, s[1]-1) + coord_flat = coord_old_y * s[1] + coord_old_x + + im_flat = tf.reshape(image, tf.stack([-1, s[2]])) + rot_image = tf.gather(im_flat, coord_flat) + rot_image = tf.reshape(rot_image, s) + + + return tf.where(inside_mask, rot_image, tf.zeros_like(rot_image)) + +def lms_to_heatmap(lms, h, w, n_landmarks, marked_index, sigma=5): + xs, ys = tf.meshgrid(tf.range(0., tf.to_float(w)), + tf.range(0., tf.to_float(h))) + gaussian = (1. / (sigma * np.sqrt(2. 
* np.pi))) + marked_index = tf.to_int32(marked_index) + + def gaussian_fn(lms): + y, x, idx = tf.unstack(lms) + idx = tf.to_int32(idx) + + def run_true(): + return tf.exp(-0.5 * (tf.pow(ys - y, 2) + tf.pow(xs - x, 2)) * + tf.pow(1. / sigma, 2.)) * gaussian * 17. + + def run_false(): + return tf.zeros((h, w)) + + return tf.cond(tf.reduce_any(tf.equal(marked_index, idx)), run_true, run_false) + + img_hm = tf.stack(tf.map_fn(gaussian_fn, tf.concat( + [lms, tf.to_float(tf.range(0, n_landmarks))[..., None]], 1))) + + return img_hm + +class ProtobuffProvider(object): + def __init__(self, filename= FLAGS['dataset_dir'].value, batch_size=1, rescale=None, augmentation=False): + self.filename = filename + self.batch_size = batch_size + self.image_extension = 'jpg' + self.rescale = rescale + self.augmentation = augmentation + + def get(self): + images, *names = self._get_data_protobuff(self.filename) + tensors = [images] + + for name in names: + tensors.append(name) + + return tf.train.shuffle_batch( + tensors, self.batch_size, 256, 64, self.batch_size) + + def augmentation_type(self): + return tf.stack([tf.random_uniform([1]) - 1, + (tf.random_uniform([1]) * 30. - 15.) 
* np.pi / 180., + tf.random_uniform([1]) * 0.5 + 0.75]) + + def _image_from_feature(self, features): + image = tf.image.decode_jpeg(features['image'], channels=3) + image_height = tf.to_int32(features['height']) + image_width = tf.to_int32(features['width']) + image = tf.reshape(image, (image_height, image_width, 3)) + image = tf.to_float(image) + return image, image_height, image_width + + def _heatmap_from_feature(self, features): + n_landmarks = tf.to_int32(features['n_landmarks']) + gt_lms = tf.decode_raw(features['gt_pts'], tf.float32) + mask_index = tf.decode_raw(features['mask_index'], tf.float32) + gt_mask = tf.decode_raw(features['gt_mask'], tf.float32) + image_height = tf.to_int32(features['height']) + image_width = tf.to_int32(features['width']) + + gt_lms = tf.reshape(gt_lms, (n_landmarks, 2)) + gt_heatmap = lms_to_heatmap( + gt_lms, image_height, image_width, n_landmarks, mask_index) + gt_heatmap = tf.transpose(gt_heatmap, perm=[1,2,0]) + + return gt_heatmap, gt_lms, n_landmarks, mask_index, gt_mask + + + def _info_from_feature(self, features): + status = features['status'] + return status + + def _set_shape(self, image, gt_heatmap, gt_lms, mask_index, gt_mask): + image.set_shape([None, None, 3]) + gt_heatmap.set_shape([None, None, FLAGS['n_landmarks']]) + gt_lms.set_shape([FLAGS['n_landmarks'], 2]) + mask_index.set_shape([FLAGS['n_landmarks']]) + gt_mask.set_shape([FLAGS['n_landmarks']]) + + def _get_features(self, serialized_example): + features = tf.parse_single_example( + serialized_example, + features={ + # images + 'image': tf.FixedLenFeature([], tf.string), + 'height': tf.FixedLenFeature([], tf.int64), + 'width': tf.FixedLenFeature([], tf.int64), + # landmarks + 'n_landmarks': tf.FixedLenFeature([], tf.int64), + 'gt_pts': tf.FixedLenFeature([], tf.string), + 'gt_mask': tf.FixedLenFeature([], tf.string), + 'mask_index': tf.FixedLenFeature([], tf.string), + 'status': tf.FixedLenFeature([], tf.int64), + } + + ) + return features + + def 
_get_data_protobuff(self, filename): + filename = str(filename).split(',') + filename_queue = tf.train.string_input_producer(filename, + num_epochs=None) + reader = tf.TFRecordReader() + _, serialized_example = reader.read(filename_queue) + features = self._get_features(serialized_example) + + # image + image, image_height, image_width = self._image_from_feature(features) + + # landmarks + gt_heatmap, gt_lms, n_landmarks, mask_index, gt_mask = self._heatmap_from_feature(features) + + # infomations + status = self._info_from_feature(features) + + # augmentation + if self.augmentation: + do_flip, do_rotate, do_scale = tf.unstack(self.augmentation_type()) + + # rescale + image_height = tf.to_int32(tf.to_float(image_height) * do_scale[0]) + image_width = tf.to_int32(tf.to_float(image_width) * do_scale[0]) + + image = tf.image.resize_images(image, tf.stack([image_height, image_width])) + gt_heatmap = tf.image.resize_images(gt_heatmap, tf.stack([image_height, image_width])) + gt_lms = gt_lms*do_scale + + # rotate + image = rotate_image_tensor(image, do_rotate) + gt_heatmap = rotate_image_tensor(gt_heatmap, do_rotate) + gt_lms = rotate_points_tensor(gt_lms, image, do_rotate) + + # crop to 256 * 256 + target_h = tf.to_int32(256) + target_w = tf.to_int32(256) + offset_h = tf.to_int32((image_height - target_h) / 2) + offset_w = tf.to_int32((image_width - target_w) / 2) + + image = tf.image.crop_to_bounding_box( + image, offset_h, offset_w, target_h, target_w) + + gt_heatmap = tf.image.crop_to_bounding_box( + gt_heatmap, offset_h, offset_w, target_h, target_w) + + gt_lms = gt_lms - tf.to_float(tf.stack([offset_h, offset_w])) + + self._set_shape(image, gt_heatmap, gt_lms, mask_index, gt_mask) + + return image, gt_heatmap, gt_lms, mask_index, gt_mask diff --git a/insightface/reconstruction/ostec/external/landmark_detector/flags.py b/insightface/reconstruction/ostec/external/landmark_detector/flags.py new file mode 100644 index 
0000000000000000000000000000000000000000..d89fe8119de426b1285bb36e779cc67ae5c1fc56 --- /dev/null +++ b/insightface/reconstruction/ostec/external/landmark_detector/flags.py @@ -0,0 +1,32 @@ +import tensorflow as tf + +slim = tf.contrib.slim + +FLAGS = tf.app.flags.FLAGS +tf.app.flags.DEFINE_float('initial_learning_rate', 0.0001, '''Initial learning rate.''') +tf.app.flags.DEFINE_float('num_epochs_per_decay', 5.0, '''Epochs after which learning rate decays.''') +tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.97, '''Learning rate decay factor.''') +tf.app.flags.DEFINE_float('learning_rate_decay_step', 30000,'''Learning rate decay factor.''') + +tf.app.flags.DEFINE_integer('batch_size', 4, '''The batch size to use.''') +tf.app.flags.DEFINE_integer('eval_size', 4, '''The batch size to use.''') +tf.app.flags.DEFINE_integer('num_iterations', 2, '''The number of iterations to unfold the pose machine.''') +tf.app.flags.DEFINE_integer('num_preprocess_threads', 4,'''How many preprocess threads to use.''') +tf.app.flags.DEFINE_integer('n_landmarks', 84,'''number of landmarks.''') +tf.app.flags.DEFINE_integer('rescale', 256,'''Image scale.''') + +tf.app.flags.DEFINE_string('dataset_dir', './data', '''Directory where to load datas.''') +tf.app.flags.DEFINE_string('train_dir', 'ckpt/train', '''Directory where to write event logs and checkpoint.''') +tf.app.flags.DEFINE_string('eval_dir', '','''Directory where to write event logs and checkpoint.''') +tf.app.flags.DEFINE_string('graph_dir', 'model/weight.pkl','''If specified, restore this pretrained model.''') + +tf.app.flags.DEFINE_integer('max_steps', 1000000,'''Number of batches to run.''') +tf.app.flags.DEFINE_string('train_device', '/gpu:0','''Device to train with.''') + +tf.app.flags.DEFINE_integer('flip_pred', 0,'''db name.''') + +tf.app.flags.DEFINE_string('train_model', '', '''training model.''') +tf.app.flags.DEFINE_string('pretrained_model_checkpoint_path', '', '''Restore pretrained model.''') 
+tf.app.flags.DEFINE_string('testset_name', '', '''test set name.''') +tf.app.flags.DEFINE_string('model_name', '', '''test model name.''') +tf.app.flags.DEFINE_string('savemat_name', '', '''save_mat_name''') \ No newline at end of file diff --git a/insightface/reconstruction/ostec/external/landmark_detector/models.py b/insightface/reconstruction/ostec/external/landmark_detector/models.py new file mode 100644 index 0000000000000000000000000000000000000000..5d7c4c45ca5950cfd9a0d1d0f0bae9f70787e355 --- /dev/null +++ b/insightface/reconstruction/ostec/external/landmark_detector/models.py @@ -0,0 +1,1333 @@ +import tensorflow as tf +import numpy as np + +slim = tf.contrib.slim + +# custom layers + +def deconv_layer(net, up_scale, n_channel, method='transpose'): + nh = tf.shape(net)[-3] * up_scale + nw = tf.shape(net)[-2] * up_scale + + if method == 'transpose': + net = slim.conv2d_transpose(net, n_channel, (up_scale, up_scale), ( + up_scale, up_scale), activation_fn=None, padding='VALID') + elif method == 'transpose+conv': + net = slim.conv2d_transpose(net, n_channel, (up_scale, up_scale), ( + up_scale, up_scale), activation_fn=None, padding='VALID') + net = slim.conv2d(net, n_channel, (3, 3), (1, 1)) + elif method == 'transpose+conv+relu': + net = slim.conv2d_transpose(net, n_channel, (up_scale, up_scale), ( + up_scale, up_scale), padding='VALID') + net = slim.conv2d(net, n_channel, (3, 3), (1, 1)) + elif method == 'bilinear': + net = tf.image.resize_images(net, [nh, nw]) + else: + raise Exception('Unrecognised Deconvolution Method: %s' % method) + + return net + + +# arg scopes +def hourglass_arg_scope_torch(weight_decay=0.0001, + batch_norm_decay=0.997, + batch_norm_epsilon=1e-5, + batch_norm_scale=True): + """Defines the default ResNet arg scope. + Args: + is_training: Whether or not we are training the parameters in the batch + normalization layers of the model. + weight_decay: The weight decay to use for regularizing the model. 
+ batch_norm_decay: The moving average decay when estimating layer activation + statistics in batch normalization. + batch_norm_epsilon: Small constant to prevent division by zero when + normalizing activations by their variance in batch normalization. + batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the + activations in the batch normalization layer. + Returns: + An `arg_scope` to use for the resnet models. + """ + batch_norm_params = { + 'decay': batch_norm_decay, + 'epsilon': batch_norm_epsilon, + 'scale': batch_norm_scale, + 'updates_collections': tf.GraphKeys.UPDATE_OPS, + } + + with slim.arg_scope( + [slim.conv2d], + weights_regularizer=slim.l2_regularizer(weight_decay), + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=None, + normalizer_fn=None, + normalizer_params=None): + with slim.arg_scope([slim.batch_norm], **batch_norm_params): + with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc: + return arg_sc + + +def hourglass_arg_scope_tf(weight_decay=0.0001, + batch_norm_decay=0.997, + batch_norm_epsilon=1e-5, + batch_norm_scale=True): + """Defines the default ResNet arg scope. + Args: + is_training: Whether or not we are training the parameters in the batch + normalization layers of the model. + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: The moving average decay when estimating layer activation + statistics in batch normalization. + batch_norm_epsilon: Small constant to prevent division by zero when + normalizing activations by their variance in batch normalization. + batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the + activations in the batch normalization layer. + Returns: + An `arg_scope` to use for the resnet models. 
+ """ + batch_norm_params = { + 'decay': batch_norm_decay, + 'epsilon': batch_norm_epsilon, + 'scale': batch_norm_scale, + 'updates_collections': tf.GraphKeys.UPDATE_OPS, + } + + with slim.arg_scope( + [slim.conv2d], + weights_regularizer=slim.l2_regularizer(weight_decay), + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=tf.nn.relu, + normalizer_fn=slim.batch_norm, + normalizer_params=batch_norm_params): + with slim.arg_scope([slim.batch_norm], **batch_norm_params): + with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc: + return arg_sc + + +# bottleneck_inception_SE +def bottleneck_inception_SE_module( + inputs, + out_channel=256, + res=None, + scope='inception_block'): + + min_channel = out_channel // 8 + with tf.variable_scope(scope): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(inputs, min_channel * 3, + [1, 1], scope='Conv2d_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(inputs, min_channel * + 3 / 2, [1, 1], scope='Conv2d_1x1') + branch_1 = slim.conv2d( + branch_1, min_channel * 3, [3, 3], scope='Conv2d_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(inputs, min_channel // + 3, [1, 1], scope='Conv2d_1x1') + branch_2 = slim.conv2d( + branch_2, min_channel, [3, 3], scope='Conv2d_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(inputs, [3, 3], 1, scope='MaxPool_3x3') + branch_3 = slim.conv2d( + branch_3, min_channel, [1, 1], scope='Conv2d_1x1') + net = tf.concat( + axis=3, values=[branch_0, branch_1, branch_2, branch_3]) + + se_branch = tf.reduce_mean(net, axis=[1, 2]) + se_branch = slim.fully_connected(se_branch, out_channel // 16) + se_branch = slim.fully_connected( + se_branch, out_channel, activation_fn=tf.sigmoid) + + net = net * se_branch[:,None,None,:] + + if res: + inputs = slim.conv2d(inputs, res, (1, 1), + scope='bn_res'.format(scope)) + + net += inputs + + return net + + +# bottle neck modules +def bottleneck_inception_module( + 
inputs, + out_channel=256, + res=None, + scope='inception_block'): + + min_channel = out_channel // 8 + with tf.variable_scope(scope): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(inputs, min_channel * 3, + [1, 1], scope='Conv2d_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(inputs, min_channel * + 3 / 2, [1, 1], scope='Conv2d_1x1') + branch_1 = slim.conv2d( + branch_1, min_channel * 3, [3, 3], scope='Conv2d_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(inputs, min_channel // + 3, [1, 1], scope='Conv2d_1x1') + branch_2 = slim.conv2d( + branch_2, min_channel, [3, 3], scope='Conv2d_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(inputs, [3, 3], 1, scope='MaxPool_3x3') + branch_3 = slim.conv2d( + branch_3, min_channel, [1, 1], scope='Conv2d_1x1') + net = tf.concat( + axis=3, values=[branch_0, branch_1, branch_2, branch_3]) + + if res: + inputs = slim.conv2d(inputs, res, (1, 1), + scope='bn_res'.format(scope)) + + net += inputs + + return net + + +def bottleneck_module(inputs, out_channel=256, res=None, scope=''): + + with tf.variable_scope(scope): + net = slim.stack(inputs, slim.conv2d, [ + (out_channel // 2, [1, 1]), (out_channel // 2, [3, 3]), (out_channel, [1, 1])], scope='conv') + if res: + inputs = slim.conv2d(inputs, res, (1, 1), + scope='bn_res'.format(scope)) + net += inputs + + return net + + +# recursive hourglass definition +def hourglass_module(inputs, depth=0, deconv='bilinear', bottleneck='bottleneck'): + + bm_fn = globals()['%s_module' % bottleneck] + + with tf.variable_scope('depth_{}'.format(depth)): + # buttom up layers + net = slim.max_pool2d(inputs, [2, 2], scope='pool') + net = slim.stack(net, bm_fn, [ + (256, None), (256, None), (256, None)], scope='buttom_up') + + # connecting layers + if depth > 0: + net = hourglass_module(net, depth=depth - 1, deconv=deconv) + else: + net = bm_fn( + net, out_channel=512, res=512, scope='connecting') + + # top down layers + 
net = bm_fn(net, out_channel=512, + res=512, scope='top_down') + net = deconv_layer(net, 2, 512, method=deconv) + # residual layers + net += slim.stack(inputs, bm_fn, + [(256, None), (256, None), (512, 512)], scope='res') + + return net + + +def hourglass(inputs, + scale=1, + regression_channels=2, + classification_channels=22, + deconv='bilinear', + bottleneck='bottleneck'): + """Defines a lightweight resnet based model for dense estimation tasks. + Args: + inputs: A `Tensor` with dimensions [num_batches, height, width, depth]. + scale: A scalar which denotes the factor to subsample the current image. + output_channels: The number of output channels. E.g., for human pose + estimation this equals 13 channels. + Returns: + A `Tensor` of dimensions [num_batches, height, width, output_channels].""" + + out_shape = tf.shape(inputs)[1:3] + + if scale > 1: + inputs = tf.pad(inputs, ((0, 0), (1, 1), (1, 1), (0, 0))) + inputs = slim.layers.avg_pool2d( + inputs, (3, 3), (scale, scale), padding='VALID') + + output_channels = regression_channels + classification_channels + + with slim.arg_scope(hourglass_arg_scope_tf()): + # D1 + net = slim.conv2d(inputs, 64, (7, 7), 2, scope='conv1') + net = bottleneck_module(net, out_channel=128, + res=128, scope='bottleneck1') + net = slim.max_pool2d(net, [2, 2], scope='pool1') + + # D2 + net = slim.stack(net, bottleneck_module, [ + (128, None), (128, None), (256, 256)], scope='conv2') + + # hourglasses (D3,D4,D5) + with tf.variable_scope('hourglass'): + net = hourglass_module( + net, depth=4, deconv=deconv, bottleneck=bottleneck) + + # final layers (D6, D7) + net = slim.stack(net, slim.conv2d, [(512, [1, 1]), (256, [1, 1]), + (output_channels, [1, 1]) + ], scope='conv3') + + net = deconv_layer(net, 4, output_channels, method=deconv) + net = slim.conv2d(net, output_channels, 1, scope='conv_last') + + regression = slim.conv2d( + net, regression_channels, 1, activation_fn=None + ) if regression_channels else None + + logits = slim.conv2d( + 
net, classification_channels, 1, activation_fn=None + ) if classification_channels else None + + return regression, logits + + +def StackedHourglassTorch(inputs, out_channels=16, deconv='bilinear'): + net = inputs + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.Sequential'): + net = tf.pad(net, np.array([[0, 0], [3, 3], [3, 3], [0, 0]])) + net = slim.conv2d(net, 64, (7, 7), (2, 2), + activation_fn=None, padding='VALID') + net = slim.batch_norm(net) + net = slim.nn.relu(net) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0 = net + with tf.name_scope('nn.Sequential'): + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = slim.conv2d( + net0, 64, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0 = slim.conv2d( + net0, 64, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = slim.conv2d( + net0, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net1 = net + with tf.name_scope('nn.Sequential'): + net1 = tf.pad(net1, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net1 = slim.conv2d( + net1, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net = tf.add_n([net0, net1]) + net = tf.pad(net, np.array([[0, 0], [0, 0], [0, 0], [0, 0]])) + net = slim.max_pool2d(net, (2, 2), (2, 2)) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0 = net + with tf.name_scope('nn.Sequential'): + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = slim.conv2d( + net0, 64, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0 = slim.batch_norm(net0) + 
net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0 = slim.conv2d( + net0, 64, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = slim.conv2d( + net0, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net1 = net + net = tf.add_n([net0, net1]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0 = net + with tf.name_scope('nn.Sequential'): + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = slim.conv2d( + net0, 64, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0 = slim.conv2d( + net0, 64, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = slim.conv2d( + net0, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net1 = net + net = tf.add_n([net0, net1]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0 = net + with tf.name_scope('nn.Sequential'): + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = slim.conv2d( + net0, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0 = slim.conv2d( + net0, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0 = slim.batch_norm(net0) + net0 = slim.nn.relu(net0) + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = slim.conv2d( + net0, 256, (1, 1), 
(1, 1), activation_fn=None, padding='VALID') + net1 = net + with tf.name_scope('nn.Sequential'): + net1 = tf.pad(net1, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net1 = slim.conv2d( + net1, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net = tf.add_n([net0, net1]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0 = net + with tf.name_scope('nn.Sequential'): + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = slim.max_pool2d(net0, (2, 2), (2, 2)) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00 = net0 + with tf.name_scope('nn.Sequential'): + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = slim.conv2d( + net00, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00 = slim.conv2d( + net00, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = slim.conv2d( + net00, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net01 = net0 + net0 = tf.add_n([net00, net01]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00 = net0 + with tf.name_scope('nn.Sequential'): + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = slim.conv2d( + net00, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00 = slim.conv2d( + net00, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00 = slim.batch_norm(net00) 
+ net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = slim.conv2d( + net00, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net01 = net0 + net0 = tf.add_n([net00, net01]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00 = net0 + with tf.name_scope('nn.Sequential'): + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = slim.conv2d( + net00, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00 = slim.conv2d( + net00, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = slim.conv2d( + net00, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net01 = net0 + net0 = tf.add_n([net00, net01]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00 = net0 + with tf.name_scope('nn.Sequential'): + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = slim.max_pool2d( + net00, (2, 2), (2, 2)) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net000 = net00 + with tf.name_scope('nn.Sequential'): + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu(net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = slim.conv2d( + net000, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu(net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net000 = slim.conv2d( + net000, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net000 = slim.batch_norm( + net000) + 
net000 = slim.nn.relu(net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = slim.conv2d( + net000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net001 = net00 + net00 = tf.add_n([net000, net001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net000 = net00 + with tf.name_scope('nn.Sequential'): + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu(net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = slim.conv2d( + net000, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu(net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net000 = slim.conv2d( + net000, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu(net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = slim.conv2d( + net000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net001 = net00 + net00 = tf.add_n([net000, net001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net000 = net00 + with tf.name_scope('nn.Sequential'): + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu(net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = slim.conv2d( + net000, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu(net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net000 = slim.conv2d( + net000, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu(net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = slim.conv2d( + net000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net001 = net00 
+ net00 = tf.add_n([net000, net001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net000 = net00 + with tf.name_scope('nn.Sequential'): + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = slim.max_pool2d( + net000, (2, 2), (2, 2)) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0000 = net000 + with tf.name_scope('nn.Sequential'): + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0000 = slim.conv2d( + net0000, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0000 = slim.conv2d( + net0000, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0000 = slim.conv2d( + net0000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0001 = net000 + net000 = tf.add_n( + [net0000, net0001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0000 = net000 + with tf.name_scope('nn.Sequential'): + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0000 = slim.conv2d( + net0000, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0000 = slim.conv2d( + net0000, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], 
[0, 0], [0, 0]])) + net0000 = slim.conv2d( + net0000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0001 = net000 + net000 = tf.add_n( + [net0000, net0001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0000 = net000 + with tf.name_scope('nn.Sequential'): + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0000 = slim.conv2d( + net0000, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0000 = slim.conv2d( + net0000, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0000 = slim.conv2d( + net0000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0001 = net000 + net000 = tf.add_n( + [net0000, net0001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0000 = net000 + with tf.name_scope('nn.Sequential'): + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0000 = slim.max_pool2d( + net0000, (2, 2), (2, 2)) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00000 = net0000 + with tf.name_scope('nn.Sequential'): + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00000 = slim.conv2d( + net00000, 128, (3, 3), (1, 1), 
activation_fn=None, padding='VALID') + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00001 = net0000 + net0000 = tf.add_n( + [net00000, net00001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00000 = net0000 + with tf.name_scope('nn.Sequential'): + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00000 = slim.conv2d( + net00000, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00001 = net0000 + net0000 = tf.add_n( + [net00000, net00001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00000 = net0000 + with tf.name_scope('nn.Sequential'): + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00000 = slim.conv2d( + net00000, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00000 = 
slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00001 = net0000 + net0000 = tf.add_n( + [net00000, net00001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00000 = net0000 + with tf.name_scope('nn.Sequential'): + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00000 = slim.conv2d( + net00000, 256, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00001 = net0000 + with tf.name_scope('nn.Sequential'): + net00001 = tf.pad(net00001, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00001 = slim.conv2d( + net00001, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0000 = tf.add_n( + [net00000, net00001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00000 = net0000 + with tf.name_scope('nn.Sequential'): + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, 
np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00000 = slim.conv2d( + net00000, 256, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00000 = slim.batch_norm( + net00000) + net00000 = slim.nn.relu( + net00000) + net00000 = tf.pad(net00000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00000 = slim.conv2d( + net00000, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00001 = net0000 + net0000 = tf.add_n( + [net00000, net00001]) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0000 = deconv_layer( + net0000, 2, 512, method=deconv) + net0001 = net000 + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00010 = net0001 + with tf.name_scope('nn.Sequential'): + net00010 = slim.batch_norm( + net00010) + net00010 = slim.nn.relu( + net00010) + net00010 = tf.pad(net00010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00010 = slim.conv2d( + net00010, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00010 = slim.batch_norm( + net00010) + net00010 = slim.nn.relu( + net00010) + net00010 = tf.pad(net00010, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00010 = slim.conv2d( + net00010, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00010 = slim.batch_norm( + net00010) + net00010 = slim.nn.relu( + net00010) + net00010 = tf.pad(net00010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00010 = slim.conv2d( + net00010, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00011 = net0001 + net0001 = tf.add_n( + [net00010, net00011]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00010 = net0001 + with tf.name_scope('nn.Sequential'): + net00010 = slim.batch_norm( + net00010) + net00010 = slim.nn.relu( + net00010) + net00010 = tf.pad(net00010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00010 = slim.conv2d( + net00010, 128, (1, 1), (1, 1), 
activation_fn=None, padding='VALID') + net00010 = slim.batch_norm( + net00010) + net00010 = slim.nn.relu( + net00010) + net00010 = tf.pad(net00010, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00010 = slim.conv2d( + net00010, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00010 = slim.batch_norm( + net00010) + net00010 = slim.nn.relu( + net00010) + net00010 = tf.pad(net00010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00010 = slim.conv2d( + net00010, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00011 = net0001 + net0001 = tf.add_n( + [net00010, net00011]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00010 = net0001 + with tf.name_scope('nn.Sequential'): + net00010 = slim.batch_norm( + net00010) + net00010 = slim.nn.relu( + net00010) + net00010 = tf.pad(net00010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00010 = slim.conv2d( + net00010, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00010 = slim.batch_norm( + net00010) + net00010 = slim.nn.relu( + net00010) + net00010 = tf.pad(net00010, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00010 = slim.conv2d( + net00010, 256, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00010 = slim.batch_norm( + net00010) + net00010 = slim.nn.relu( + net00010) + net00010 = tf.pad(net00010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00010 = slim.conv2d( + net00010, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00011 = net0001 + with tf.name_scope('nn.Sequential'): + net00011 = tf.pad(net00011, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00011 = slim.conv2d( + net00011, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0001 = tf.add_n( + [net00010, net00011]) + net000 = tf.add_n( + [net0000, net0001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0000 = net000 + with tf.name_scope('nn.Sequential'): + net0000 = 
slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0000 = slim.conv2d( + net0000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0000 = slim.conv2d( + net0000, 256, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0000 = slim.batch_norm( + net0000) + net0000 = slim.nn.relu( + net0000) + net0000 = tf.pad(net0000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0000 = slim.conv2d( + net0000, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0001 = net000 + net000 = tf.add_n( + [net0000, net0001]) + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = deconv_layer( + net000, 2, 512, method=deconv) + net001 = net00 + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0010 = net001 + with tf.name_scope('nn.Sequential'): + net0010 = slim.batch_norm( + net0010) + net0010 = slim.nn.relu( + net0010) + net0010 = tf.pad(net0010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0010 = slim.conv2d( + net0010, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0010 = slim.batch_norm( + net0010) + net0010 = slim.nn.relu( + net0010) + net0010 = tf.pad(net0010, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0010 = slim.conv2d( + net0010, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0010 = slim.batch_norm( + net0010) + net0010 = slim.nn.relu( + net0010) + net0010 = tf.pad(net0010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0010 = slim.conv2d( + net0010, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0011 = net001 + net001 = tf.add_n( + [net0010, net0011]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + 
net0010 = net001 + with tf.name_scope('nn.Sequential'): + net0010 = slim.batch_norm( + net0010) + net0010 = slim.nn.relu( + net0010) + net0010 = tf.pad(net0010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0010 = slim.conv2d( + net0010, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0010 = slim.batch_norm( + net0010) + net0010 = slim.nn.relu( + net0010) + net0010 = tf.pad(net0010, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0010 = slim.conv2d( + net0010, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0010 = slim.batch_norm( + net0010) + net0010 = slim.nn.relu( + net0010) + net0010 = tf.pad(net0010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0010 = slim.conv2d( + net0010, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0011 = net001 + net001 = tf.add_n( + [net0010, net0011]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net0010 = net001 + with tf.name_scope('nn.Sequential'): + net0010 = slim.batch_norm( + net0010) + net0010 = slim.nn.relu( + net0010) + net0010 = tf.pad(net0010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0010 = slim.conv2d( + net0010, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0010 = slim.batch_norm( + net0010) + net0010 = slim.nn.relu( + net0010) + net0010 = tf.pad(net0010, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net0010 = slim.conv2d( + net0010, 256, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net0010 = slim.batch_norm( + net0010) + net0010 = slim.nn.relu( + net0010) + net0010 = tf.pad(net0010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0010 = slim.conv2d( + net0010, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net0011 = net001 + with tf.name_scope('nn.Sequential'): + net0011 = tf.pad(net0011, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0011 = slim.conv2d( + net0011, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net001 = tf.add_n( + [net0010, 
net0011]) + net00 = tf.add_n([net000, net001]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net000 = net00 + with tf.name_scope('nn.Sequential'): + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu( + net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = slim.conv2d( + net000, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu( + net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net000 = slim.conv2d( + net000, 256, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net000 = slim.batch_norm( + net000) + net000 = slim.nn.relu( + net000) + net000 = tf.pad(net000, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net000 = slim.conv2d( + net000, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net001 = net00 + net00 = tf.add_n([net000, net001]) + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = deconv_layer( + net00, 2, 512, method=deconv) + net01 = net0 + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net010 = net01 + with tf.name_scope('nn.Sequential'): + net010 = slim.batch_norm( + net010) + net010 = slim.nn.relu(net010) + net010 = tf.pad(net010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net010 = slim.conv2d( + net010, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net010 = slim.batch_norm( + net010) + net010 = slim.nn.relu(net010) + net010 = tf.pad(net010, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net010 = slim.conv2d( + net010, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net010 = slim.batch_norm( + net010) + net010 = slim.nn.relu(net010) + net010 = tf.pad(net010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net010 = slim.conv2d( + net010, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net011 = net01 + net01 = 
tf.add_n([net010, net011]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net010 = net01 + with tf.name_scope('nn.Sequential'): + net010 = slim.batch_norm( + net010) + net010 = slim.nn.relu(net010) + net010 = tf.pad(net010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net010 = slim.conv2d( + net010, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net010 = slim.batch_norm( + net010) + net010 = slim.nn.relu(net010) + net010 = tf.pad(net010, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net010 = slim.conv2d( + net010, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net010 = slim.batch_norm( + net010) + net010 = slim.nn.relu(net010) + net010 = tf.pad(net010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net010 = slim.conv2d( + net010, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net011 = net01 + net01 = tf.add_n([net010, net011]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net010 = net01 + with tf.name_scope('nn.Sequential'): + net010 = slim.batch_norm( + net010) + net010 = slim.nn.relu(net010) + net010 = tf.pad(net010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net010 = slim.conv2d( + net010, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net010 = slim.batch_norm( + net010) + net010 = slim.nn.relu(net010) + net010 = tf.pad(net010, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net010 = slim.conv2d( + net010, 256, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net010 = slim.batch_norm( + net010) + net010 = slim.nn.relu(net010) + net010 = tf.pad(net010, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net010 = slim.conv2d( + net010, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net011 = net01 + with tf.name_scope('nn.Sequential'): + net011 = tf.pad(net011, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net011 = slim.conv2d( + net011, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net01 = 
tf.add_n([net010, net011]) + net0 = tf.add_n([net00, net01]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net00 = net0 + with tf.name_scope('nn.Sequential'): + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = slim.conv2d( + net00, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net00 = slim.conv2d( + net00, 256, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net00 = slim.batch_norm(net00) + net00 = slim.nn.relu(net00) + net00 = tf.pad(net00, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net00 = slim.conv2d( + net00, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net01 = net0 + net0 = tf.add_n([net00, net01]) + net0 = tf.pad(net0, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net0 = deconv_layer(net0, 2, 512, method=deconv) + + net1 = net + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net10 = net1 + with tf.name_scope('nn.Sequential'): + net10 = slim.batch_norm(net10) + net10 = slim.nn.relu(net10) + net10 = tf.pad(net10, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net10 = slim.conv2d( + net10, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net10 = slim.batch_norm(net10) + net10 = slim.nn.relu(net10) + net10 = tf.pad(net10, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net10 = slim.conv2d( + net10, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net10 = slim.batch_norm(net10) + net10 = slim.nn.relu(net10) + net10 = tf.pad(net10, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net10 = slim.conv2d( + net10, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net11 = net1 + net1 = tf.add_n([net10, net11]) + with tf.name_scope('nn.Sequential'): + with 
tf.name_scope('nn.ConcatTable'): + net10 = net1 + with tf.name_scope('nn.Sequential'): + net10 = slim.batch_norm(net10) + net10 = slim.nn.relu(net10) + net10 = tf.pad(net10, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net10 = slim.conv2d( + net10, 128, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net10 = slim.batch_norm(net10) + net10 = slim.nn.relu(net10) + net10 = tf.pad(net10, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net10 = slim.conv2d( + net10, 128, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net10 = slim.batch_norm(net10) + net10 = slim.nn.relu(net10) + net10 = tf.pad(net10, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net10 = slim.conv2d( + net10, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net11 = net1 + net1 = tf.add_n([net10, net11]) + with tf.name_scope('nn.Sequential'): + with tf.name_scope('nn.ConcatTable'): + net10 = net1 + with tf.name_scope('nn.Sequential'): + net10 = slim.batch_norm(net10) + net10 = slim.nn.relu(net10) + net10 = tf.pad(net10, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net10 = slim.conv2d( + net10, 256, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net10 = slim.batch_norm(net10) + net10 = slim.nn.relu(net10) + net10 = tf.pad(net10, np.array( + [[0, 0], [1, 1], [1, 1], [0, 0]])) + net10 = slim.conv2d( + net10, 256, (3, 3), (1, 1), activation_fn=None, padding='VALID') + net10 = slim.batch_norm(net10) + net10 = slim.nn.relu(net10) + net10 = tf.pad(net10, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net10 = slim.conv2d( + net10, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net11 = net1 + with tf.name_scope('nn.Sequential'): + net11 = tf.pad(net11, np.array( + [[0, 0], [0, 0], [0, 0], [0, 0]])) + net11 = slim.conv2d( + net11, 512, (1, 1), (1, 1), activation_fn=None, padding='VALID') + net1 = tf.add_n([net10, net11]) + net = tf.add_n([net0, net1]) + net = tf.pad(net, np.array([[0, 0], [0, 0], [0, 0], [0, 0]])) + net = slim.conv2d(net, 512, (1, 1), (1, 
1), + activation_fn=None, padding='VALID') + net = slim.batch_norm(net) + net = slim.nn.relu(net) + net = tf.pad(net, np.array([[0, 0], [0, 0], [0, 0], [0, 0]])) + net = slim.conv2d(net, 256, (1, 1), (1, 1), + activation_fn=None, padding='VALID') + net = slim.batch_norm(net) + net = slim.nn.relu(net) + net = tf.pad(net, np.array([[0, 0], [0, 0], [0, 0], [0, 0]])) + net = slim.conv2d(net, out_channels, (1, 1), (1, 1), + activation_fn=None, padding='VALID') + net = tf.pad(net, np.array([[0, 0], [0, 0], [0, 0], [0, 0]])) + net = deconv_layer(net, 4, out_channels, method=deconv) + + return net diff --git a/insightface/reconstruction/ostec/external/landmark_detector/networks.py b/insightface/reconstruction/ostec/external/landmark_detector/networks.py new file mode 100644 index 0000000000000000000000000000000000000000..3c96fca886cfea0b71b6682f406406c64482a020 --- /dev/null +++ b/insightface/reconstruction/ostec/external/landmark_detector/networks.py @@ -0,0 +1,180 @@ +import tensorflow as tf +from external.landmark_detector import utils, models, data_provider + +from tensorflow.python.platform import tf_logging as logging + +slim = tf.contrib.slim + +from external.landmark_detector.flags import FLAGS + +# general framework +class DeepNetwork(object): + def __init__(self): + pass + + def _build_network(self, inputs, datas): + pass + + def _build_losses(self, predictions, states, images, datas): + pass + + def _build_summaries(self, predictions, states, images, datas): + tf.summary.image('images', images[:, :, :, :3], max_outputs=min(FLAGS['batch_size'], 3)) + + def _get_data(self): + provider = data_provider.ProtobuffProvider( + filename=FLAGS['dataset_dir'], + batch_size=FLAGS['batch_size'], + rescale=FLAGS['rescale'], + augmentation=FLAGS['eval_dir']=='', + ) + return provider.get() + + def _build_restore_fn(self, sess): + init_fn = None + + if FLAGS['pretrained_model_checkpoint_path']: + print('Loading whole model ...') + variables_to_restore = 
slim.get_model_variables() + init_fn = slim.assign_from_checkpoint_fn( + FLAGS['pretrained_model_checkpoint_path'], + variables_to_restore, + ignore_missing_vars=True) + return init_fn + + + def train(self): + g = tf.Graph() + logging.set_verbosity(10) + + with g.as_default(): + # Load datasets. + + images, *datas = self._get_data() + images /= 255. + + # Define model graph. + with tf.variable_scope('net'): + with slim.arg_scope([slim.batch_norm, slim.layers.dropout], + is_training=True): + + predictions, states = self._build_network(images, datas) + + # custom losses + self._build_losses(predictions, states, images, datas) + + # total losses + total_loss = slim.losses.get_total_loss() + tf.summary.scalar('losses/total loss', total_loss) + + # image summaries + self._build_summaries(predictions, states, images, datas) + + # learning rate decay + global_step = slim.get_or_create_global_step() + + learning_rate = tf.train.exponential_decay( + FLAGS['initial_learning_rate'], + global_step, + FLAGS['learning_rate_decay_step'] / FLAGS['batch_size'], + FLAGS['learning_rate_decay_factor'], + staircase=True) + + tf.summary.scalar('learning rate', learning_rate) + + optimizer = tf.train.AdamOptimizer(learning_rate) + + with tf.Session(graph=g) as sess: + init_fn = self._build_restore_fn(sess) + train_op = slim.learning.create_train_op( + total_loss, + optimizer, + summarize_gradients=True) + + logging.set_verbosity(1) + + slim.learning.train(train_op, + FLAGS['train_dir'], + save_summaries_secs=60, + init_fn=init_fn, + save_interval_secs=600) + +class DNFaceMultiView(DeepNetwork): + def __init__(self, n_lms=FLAGS['n_landmarks']): + super(DNFaceMultiView, self).__init__() + self.n_lms = n_lms + + + def _get_data(self): + provider = data_provider.ProtobuffProvider( + filename=FLAGS['dataset_dir'], + batch_size=FLAGS['batch_size'], + rescale=FLAGS['rescale'], + augmentation=FLAGS['eval_dir']=='', + ) + return provider.get() + + + def _build_network(self, inputs, datas=None, 
n_stacks=1, n_channels=FLAGS['n_landmarks'], is_training=True): + # gt_heatmap, gt_lms, mask_index, gt_mask = datas + + batch_size = tf.shape(inputs)[0] + height = tf.shape(inputs)[1] + width = tf.shape(inputs)[2] + + net = inputs + + # net = models.StackedHourglass(net, FLAGS.n_landmarks) + # states.append(net) + # net = tf.stop_gradient(net) + # net *= gt_mask[:,None,None,:] + # net = tf.concat([inputs,net], 3) + # net = models.StackedHourglass(net, FLAGS.n_landmarks) + # states.append(net) + + batch_size = tf.shape(inputs)[0] + height = tf.shape(inputs)[1] + width = tf.shape(inputs)[2] + channels = tf.shape(inputs)[3] + + states = [] + + with slim.arg_scope([slim.batch_norm, slim.layers.dropout], is_training=is_training): + with slim.arg_scope(models.hourglass_arg_scope_tf()): + net = None + # stacked hourglass + for i in range(n_stacks): + with tf.variable_scope('stack_%02d' % i): + if net is not None: + net = tf.concat((inputs, net), 3) + else: + net = inputs + + net, _ = models.hourglass( + net, + regression_channels=n_channels, + classification_channels=0, + deconv='transpose', + bottleneck='bottleneck_inception') + + states.append(net) + + prediction = net + return prediction, states + + def _build_losses(self, predictions, states, images, datas): + gt_heatmap, gt_lms, mask_index, gt_mask = datas + + weight_hm = utils.get_weight(gt_heatmap, tf.ones_like(gt_heatmap), ng_w=0.1, ps_w=1) * 500 + weight_hm *= gt_mask[:,None,None,:] + + l2norm = slim.losses.mean_squared_error(states[0], gt_heatmap, weights=weight_hm) + + tf.summary.scalar('losses/lms_pred', l2norm) + + def _build_summaries(self, predictions, states, images, datas): + super()._build_summaries(predictions, states, images, datas) + + gt_heatmap, gt_lms, mask_index, gt_mask = datas + + tf.summary.image('predictions/landmark-regression', tf.reduce_sum(predictions, -1)[...,None] * 255.0, max_outputs=min(FLAGS['batch_size'],3)) diff --git 
a/insightface/reconstruction/ostec/external/landmark_detector/utils.py b/insightface/reconstruction/ostec/external/landmark_detector/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..93a3e9c5b77691383aa31f84e696dc4f063ddc55 --- /dev/null +++ b/insightface/reconstruction/ostec/external/landmark_detector/utils.py @@ -0,0 +1,421 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +import menpo.io as mio +from menpo.image import Image +from menpo.shape import PointCloud +import cv2 + +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.contrib.framework.python.ops import variables +from tensorflow.python.training import optimizer as tf_optimizer +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import variables as tf_variables + +from menpo.transform import Translation, Scale +from menpo.shape import PointCloud + +slim = tf.contrib.slim + +def generate_heatmap(logits, num_classes): + """Generates a coloured heatmap from the keypoint logits. + + Args: + features: A `Tensor` of dimensions [num_batch, height, width, FLAGS.n_landmarks + 1]. 
+ """ + + keypoint_colours = np.array([plt.cm.spectral(x) for x in np.linspace(0, 1, num_classes + 1)])[ + ..., :3].astype(np.float32) + + prediction = tf.nn.softmax(logits) + heatmap = tf.matmul(tf.reshape(prediction, (-1, num_classes + 1)), keypoint_colours) + heatmap = tf.reshape(heatmap, (tf.shape(prediction)[0], + tf.shape(prediction)[1], + tf.shape(prediction)[2], 3)) + return heatmap + +def generate_landmarks(keypoints): + is_background = tf.equal(keypoints, 0) + ones = tf.to_float(tf.ones_like(is_background)) + zeros = tf.to_float(tf.zeros_like(is_background)) + + return tf.where(is_background, zeros, ones) * 255 + +def project_landmarks_to_shape_model(landmarks): + final = [] + + for lms in landmarks: + lms = PointCloud(lms) + similarity = AlignmentSimilarity(pca.global_transform.source, lms) + projected_target = similarity.pseudoinverse().apply(lms) + target = pca.model.reconstruct(projected_target) + target = similarity.apply(target) + final.append(target.points) + + return np.array(final).astype(np.float32) + +def rescale_image(image, stride_width=64): + # make sure smallest size is 600 pixels wide & dimensions are (k * stride_width) + 1 + height, width = image.shape + + # Taken from 'szross' + scale_up = 625. / min(height, width) + scale_cap = 961. / max(height, width) + scale_up = min(scale_up, scale_cap) + new_height = stride_width * round((height * scale_up) / stride_width) + 1 + new_width = stride_width * round((width * scale_up) / stride_width) + 1 + image, tr = image.resize([new_height, new_width], return_transform=True) + image.inverse_tr = tr + return image + +def frankotchellappa(dzdx, dzdy): + from numpy.fft import ifftshift, fft2, ifft2 + rows, cols = dzdx.shape + # The following sets up matrices specifying frequencies in the x and y + # directions corresponding to the Fourier transforms of the gradient + # data. They range from -0.5 cycles/pixel to + 0.5 cycles/pixel. 
+ # The scaling of this is irrelevant as long as it represents a full + # circle domain. This is functionally equivalent to any constant * pi + pi_over_2 = np.pi / 2.0 + row_grid = np.linspace(-pi_over_2, pi_over_2, rows) + col_grid = np.linspace(-pi_over_2, pi_over_2, cols) + wy, wx = np.meshgrid(row_grid, col_grid, indexing='ij') + + # Quadrant shift to put zero frequency at the appropriate edge + wx = ifftshift(wx) + wy = ifftshift(wy) + + # Fourier transforms of gradients + DZDX = fft2(dzdx) + DZDY = fft2(dzdy) + + # Integrate in the frequency domain by phase shifting by pi/2 and + # weighting the Fourier coefficients by their frequencies in x and y and + # then dividing by the squared frequency + denom = (wx ** 2 + wy ** 2) + Z = (-1j * wx * DZDX - 1j * wy * DZDY) / denom + Z = np.nan_to_num(Z) + return np.real(ifft2(Z)) + +def line(image, x0, y0, x1, y1, color): + steep = False + if x0 < 0 or x0 >= 400 or x1 < 0 or x1 >= 400 or y0 < 0 or y0 >= 400 or y1 < 0 or y1 >= 400: + return + + if abs(x0 - x1) < abs(y0 - y1): + x0, y0 = y0, x0 + x1, y1 = y1, x1 + steep = True + + if x0 > x1: + x0, x1 = x1, x0 + y0, y1 = y1, y0 + + for x in range(int(x0), int(x1) + 1): + t = (x - x0) / float(x1 - x0) + y = y0 * (1 - t) + y1 * t + if steep: + image[x, int(y)] = color + else: + image[int(y), x] = color + +def draw_landmarks(img, lms): + try: + img = img.copy() + + for i, part in enumerate(parts_68[1:]): + circular = [] + + if i in (4, 5, 6, 7): + circular = [part[0]] + + for p1, p2 in zip(part, list(part[1:]) + circular): + p1, p2 = lms[p1], lms[p2] + + line(img, p2[1], p2[0], p1[1], p1[0], 1) + except: + pass + return img + +def batch_draw_landmarks(imgs, lms): + return np.array([draw_landmarks(img, l) for img, l in zip(imgs, lms)]) + +def build_graph(inputs, tree, transpose=(2,3,1,0), layers=[]): + net = inputs + + if tree['name'] == 'nn.Sequential': + with tf.name_scope('nn.Sequential'): + for tr in tree['children']: + net = build_graph(net, tr, transpose, layers) + 
elif tree['name'] == 'nn.ConcatTable': + net_table = [] + with tf.name_scope('nn.ConcatTable'): + for tr in tree['children']: + net_table.append(build_graph(net, tr, transpose, layers)) + net = net_table + elif tree['name'] == 'nn.JoinTable': + net = tf.concat(3, net) + elif tree['name'] == 'nn.CAddTable': + net = tf.add_n(net) + elif tree['name'] == 'nn.SpatialConvolution': + out_channel = int(tree['nOutputPlane']) + kernal_shape = (int(tree['kH']),int(tree['kW'])) + stride_shape = (int(tree['dH']),int(tree['dW'])) + net = tf.pad( + net, [ + [0,0], + [int(tree['padH']),int(tree['padH'])], + [int(tree['padW']),int(tree['padW'])], + [0,0] + ]) + if 'weight' in tree.keys() and 'bias' in tree.keys(): + net = slim.conv2d(net, + out_channel, + kernal_shape, + stride_shape, + activation_fn=None, + padding='VALID', + weights_initializer=tf.constant_initializer(tree['weight'].transpose(*transpose)), + biases_initializer=tf.constant_initializer(tree['bias']) + ) + else: + net = slim.conv2d(net, + out_channel, + kernal_shape, + stride_shape, + activation_fn=None, + padding='VALID' + ) + + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.SpatialFullConvolution': + out_channel = int(tree['nOutputPlane']) + kernal_shape = (int(tree['kH']),int(tree['kW'])) + stride_shape = (int(tree['dH']),int(tree['dW'])) + net = tf.pad( + net, [ + [0,0], + [int(tree['padH']),int(tree['padH'])], + [int(tree['padW']),int(tree['padW'])], + [0,0] + ]) + if 'weight' in tree.keys() and 'bias' in tree.keys(): + net = slim.conv2d_transpose(net, + out_channel, + kernal_shape, + stride_shape, + activation_fn=None, + padding='VALID', + weights_initializer=tf.constant_initializer(tree['weight'].transpose(*transpose)), + biases_initializer=tf.constant_initializer(tree['bias']) + ) + else: + net = slim.conv2d_transpose(net, + out_channel, + kernal_shape, + stride_shape, + activation_fn=None, + padding='VALID' + ) + tree['tfname'] = net.name + tree['tfvar'] = net + + elif 
tree['name'] == 'nn.SpatialBatchNormalization': + net = slim.nn.batch_normalization(net, + tree['running_mean'], + tree['running_var'], + tree['bias'], + tree['weight'], + tree['eps']) + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.ReLU': + net = slim.nn.relu(net) + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.Sigmoid': + net = slim.nn.sigmoid(net) + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.SpatialMaxPooling': + net = slim.max_pool2d( + tf.pad( + net, [ + [0,0], + [int(tree['padH']),int(tree['padH'])], + [int(tree['padW']),int(tree['padW'])], + [0,0] + ]), + (int(tree['kH']),int(tree['kW'])), + (int(tree['dH']),int(tree['dW'])) + ) + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.Identity': + pass + else: + raise Exception(tree['name']) + + return net + +def build_graph_old(inputs, tree, transpose=(2,3,1,0)): + net = inputs + + if tree['name'] == 'nn.Sequential': + with tf.name_scope('nn.Sequential'): + for tr in tree['children']: + net = build_graph(net, tr, transpose, layers) + elif tree['name'] == 'nn.ConcatTable': + net_table = [] + with tf.name_scope('nn.ConcatTable'): + for tr in tree['children']: + net_table.append(build_graph(net, tr, transpose, layers)) + net = net_table + elif tree['name'] == 'nn.JoinTable': + net = tf.concat(3, net) + elif tree['name'] == 'nn.CAddTable': + net = tf.add_n(net) + elif tree['name'] == 'nn.SpatialConvolution': + out_channel = int(tree['nOutputPlane']) + kernal_shape = (int(tree['kH']),int(tree['kW'])) + stride_shape = (int(tree['dH']),int(tree['dW'])) + net = tf.pad( + net, [ + [0,0], + [int(tree['padH']),int(tree['padH'])], + [int(tree['padW']),int(tree['padW'])], + [0,0] + ]) + if 'weight' in tree.keys() and 'bias' in tree.keys(): + net = slim.conv2d(net, + out_channel, + kernal_shape, + stride_shape, + activation_fn=None, + padding='VALID', + 
weights_initializer=tf.constant_initializer(tree['weight'].transpose(*transpose)), + biases_initializer=tf.constant_initializer(tree['bias']) + ) + else: + net = slim.conv2d(net, + out_channel, + kernal_shape, + stride_shape, + activation_fn=None, + padding='VALID' + ) + + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.SpatialFullConvolution': + out_channel = int(tree['nOutputPlane']) + kernal_shape = (int(tree['kH']),int(tree['kW'])) + rate = np.min(int(tree['dH']),int(tree['dW'])) + h,w = tf.shape(net)[1:3] + net = tf.image.resize_bilinear(net, (h,w,out_channel)) + + tree['tfname'] = net.name + tree['tfvar'] = net + + elif tree['name'] == 'nn.SpatialBatchNormalization': + net = slim.batch_norm(net) + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.ReLU': + net = slim.nn.relu(net) + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.Sigmoid': + net = slim.nn.sigmoid(net) + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.SpatialMaxPooling': + net = slim.max_pool2d( + tf.pad( + net, [ + [0,0], + [int(tree['padH']),int(tree['padH'])], + [int(tree['padW']),int(tree['padW'])], + [0,0] + ]), + (int(tree['kH']),int(tree['kW'])), + (int(tree['dH']),int(tree['dW'])) + ) + tree['tfname'] = net.name + tree['tfvar'] = net + elif tree['name'] == 'nn.Identity': + pass + else: + raise Exception(tree['name']) + + return net + +def keypts_encoding(keypoints, num_classes): + keypoints = tf.to_int32(keypoints) + keypoints = tf.reshape(keypoints, (-1,)) + keypoints = slim.layers.one_hot_encoding(keypoints, num_classes=num_classes+1) + return keypoints + +def get_weight(keypoints, mask=None, ng_w=0.01, ps_w=1.0): + is_background = tf.equal(keypoints, 0) + ones = tf.to_float(tf.ones_like(is_background)) + weights = tf.where(is_background, ones * ng_w, ones*ps_w) + # if mask is not None: + # weights *= tf.to_float(mask) + + return weights + +def ced_accuracy(t, dists): + # Head 
Shoulder Elbow Wrist Hip Knee Ankle + pts_r = tf.transpose(tf.gather(tf.transpose(dists), [8,12,11,10,2,1,0])) + pts_l = tf.transpose(tf.gather(tf.transpose(dists), [9,13,14,15,3,4,5])) + part_pckh = (tf.to_int32(pts_r <= t) + tf.to_int32(pts_l <= t)) / 2 + + return tf.concat(1, [part_pckh, tf.reduce_sum(tf.to_int32(dists <= t), 1)[...,None] / tf.shape(dists)[1]]) + +def pckh(preds, gts, scales): + t_range = np.arange(0,0.51,0.01) + dists = tf.sqrt(tf.reduce_sum(tf.pow(preds - gts, 2), reduction_indices=-1)) / scales + # pckh = [ced_accuracy(t, dists) for t in t_range] + # return pckh[-1] + return ced_accuracy(0.5, dists) + +def import_image(img_path): + img = cv2.imread(str(img_path)) + original_image = Image.init_from_channels_at_back(img[:,:,-1::-1]) + + try: + original_image_lms = mio.import_landmark_file('{}/{}.ljson'.format(img_path.parent, img_path.stem)).lms.points.astype(np.float32) + original_image.landmarks['LJSON'] = PointCloud(original_image_lms) + except: + pass + + return original_image + +def crop_image(img, center, scale, res, base=384): + h = base * scale + + t = Translation( + [ + res[0] * (-center[0] / h + .5), + res[1] * (-center[1] / h + .5) + ]).compose_after(Scale((res[0] / h, res[1] / h))).pseudoinverse() + + + # Upper left point + ul = np.floor(t.apply([0,0])) + # Bottom right point + br = np.ceil(t.apply(res).astype(np.int)) + + # crop and rescale + + cimg, trans = img.warp_to_shape(br-ul, Translation(-(br-ul)/2+(br+ul)/2) ,return_transform=True) + c_scale = np.min(cimg.shape) / np.mean(res) + new_img = cimg.rescale(1 / c_scale).resize(res) + return new_img, trans, c_scale diff --git a/insightface/reconstruction/ostec/external/stylegan2/Dockerfile b/insightface/reconstruction/ostec/external/stylegan2/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..ab45a553e0d49878585054e690aba74f2ca939ff --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/Dockerfile @@ -0,0 +1,11 @@ +# Copyright (c) 
2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +FROM tensorflow/tensorflow:1.15.0-gpu-py3 + +RUN pip install scipy==1.3.3 +RUN pip install requests==2.22.0 +RUN pip install Pillow==6.2.1 diff --git a/insightface/reconstruction/ostec/external/stylegan2/LICENSE.txt b/insightface/reconstruction/ostec/external/stylegan2/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..288fb3247529fc0d19ee2040c29adc65886d9426 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/LICENSE.txt @@ -0,0 +1,101 @@ +Copyright (c) 2019, NVIDIA Corporation. All rights reserved. + + +Nvidia Source Code License-NC + +======================================================================= + +1. Definitions + +"Licensor" means any person or entity that distributes its Work. + +"Software" means the original work of authorship made available under +this License. + +"Work" means the Software and any additions to or derivative works of +the Software that are made available under this License. + +"Nvidia Processors" means any central processing unit (CPU), graphics +processing unit (GPU), field-programmable gate array (FPGA), +application-specific integrated circuit (ASIC) or any combination +thereof designed, made, sold, or provided by Nvidia or its affiliates. + +The terms "reproduce," "reproduction," "derivative works," and +"distribution" have the meaning as provided under U.S. copyright law; +provided, however, that for the purposes of this License, derivative +works shall not include works that remain separable from, or merely +link (or bind by name) to the interfaces of, the Work. 
+ +Works, including the Software, are "made available" under this License +by including in or with the Work either (a) a copyright notice +referencing the applicability of this License to the Work, or (b) a +copy of this License. + +2. License Grants + + 2.1 Copyright Grant. Subject to the terms and conditions of this + License, each Licensor grants to you a perpetual, worldwide, + non-exclusive, royalty-free, copyright license to reproduce, + prepare derivative works of, publicly display, publicly perform, + sublicense and distribute its Work and any resulting derivative + works in any form. + +3. Limitations + + 3.1 Redistribution. You may reproduce or distribute the Work only + if (a) you do so under this License, (b) you include a complete + copy of this License with your distribution, and (c) you retain + without modification any copyright, patent, trademark, or + attribution notices that are present in the Work. + + 3.2 Derivative Works. You may specify that additional or different + terms apply to the use, reproduction, and distribution of your + derivative works of the Work ("Your Terms") only if (a) Your Terms + provide that the use limitation in Section 3.3 applies to your + derivative works, and (b) you identify the specific derivative + works that are subject to Your Terms. Notwithstanding Your Terms, + this License (including the redistribution requirements in Section + 3.1) will continue to apply to the Work itself. + + 3.3 Use Limitation. The Work and any derivative works thereof only + may be used or intended for use non-commercially. The Work or + derivative works thereof may be used or intended for use by Nvidia + or its affiliates commercially or non-commercially. As used herein, + "non-commercially" means for research or evaluation purposes only. + + 3.4 Patent Claims. 
If you bring or threaten to bring a patent claim + against any Licensor (including any claim, cross-claim or + counterclaim in a lawsuit) to enforce any patents that you allege + are infringed by any Work, then your rights under this License from + such Licensor (including the grants in Sections 2.1 and 2.2) will + terminate immediately. + + 3.5 Trademarks. This License does not grant any rights to use any + Licensor's or its affiliates' names, logos, or trademarks, except + as necessary to reproduce the notices described in this License. + + 3.6 Termination. If you violate any term of this License, then your + rights under this License (including the grants in Sections 2.1 and + 2.2) will terminate immediately. + +4. Disclaimer of Warranty. + +THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +THIS LICENSE. + +5. Limitation of Liability. + +EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES. 
+ +======================================================================= diff --git a/insightface/reconstruction/ostec/external/stylegan2/README.md b/insightface/reconstruction/ostec/external/stylegan2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..075f0b67d19fcfee5d45d60dc5da867d11dc2fe8 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/README.md @@ -0,0 +1,220 @@ +## StyleGAN2 — Official TensorFlow Implementation + +![Teaser image](./docs/stylegan2-teaser-1024x256.png) + +**Analyzing and Improving the Image Quality of StyleGAN**
+Tero Karras, Samuli Laine, Miika Aittala, Janne Hellsten, Jaakko Lehtinen, Timo Aila
+ +Paper: http://arxiv.org/abs/1912.04958
+Video: https://youtu.be/c-NJtV9Jvp0
+ +Abstract: *The style-based GAN architecture (StyleGAN) yields state-of-the-art results in data-driven unconditional generative image modeling. We expose and analyze several of its characteristic artifacts, and propose changes in both model architecture and training methods to address them. In particular, we redesign generator normalization, revisit progressive growing, and regularize the generator to encourage good conditioning in the mapping from latent vectors to images. In addition to improving image quality, this path length regularizer yields the additional benefit that the generator becomes significantly easier to invert. This makes it possible to reliably detect if an image is generated by a particular network. We furthermore visualize how well the generator utilizes its output resolution, and identify a capacity problem, motivating us to train larger models for additional quality improvements. Overall, our improved model redefines the state of the art in unconditional image modeling, both in terms of existing distribution quality metrics as well as perceived image quality.* + +For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/) + +**★★★ NEW: [StyleGAN2-ADA-PyTorch](https://github.com/NVlabs/stylegan2-ada-pytorch) is now available; see the full list of versions [here](https://nvlabs.github.io/stylegan2/versions.html) ★★★** + +| Additional material |   +| :--- | :---------- +| [StyleGAN2](https://drive.google.com/open?id=1QHc-yF5C3DChRwSdZKcx1w6K8JvSxQi7) | Main Google Drive folder +| ├  [stylegan2-paper.pdf](https://drive.google.com/open?id=1fnF-QsiQeKaxF-HbvFiGtzHF_Bf3CzJu) | High-quality version of the paper +| ├  [stylegan2-video.mp4](https://drive.google.com/open?id=1f_gbKW6FUUHKkUxciJ_lQx29mCq_fSBy) | High-quality version of the video +| ├  [images](https://drive.google.com/open?id=1Sak157_DLX84ytqHHqZaH_59HoEWzfB7) | Example images produced using our 
method +| │  ├  [curated-images](https://drive.google.com/open?id=1ydWb8xCHzDKMTW9kQ7sL-B1R0zATHVHp) | Hand-picked images showcasing our results +| │  └  [100k-generated-images](https://drive.google.com/open?id=1BA2OZ1GshdfFZGYZPob5QWOGBuJCdu5q) | Random images with and without truncation +| ├  [videos](https://drive.google.com/open?id=1yXDV96SFXoUiZKU7AyE6DyKgDpIk4wUZ) | Individual clips of the video as high-quality MP4 +| └  [networks](https://nvlabs-fi-cdn.nvidia.com/stylegan2/networks/) | Pre-trained networks +|    ├  stylegan2-ffhq-config-f.pkl | StyleGAN2 for FFHQ dataset at 1024×1024 +|    ├  stylegan2-car-config-f.pkl | StyleGAN2 for LSUN Car dataset at 512×384 +|    ├  stylegan2-cat-config-f.pkl | StyleGAN2 for LSUN Cat dataset at 256×256 +|    ├  stylegan2-church-config-f.pkl | StyleGAN2 for LSUN Church dataset at 256×256 +|    ├  stylegan2-horse-config-f.pkl | StyleGAN2 for LSUN Horse dataset at 256×256 +|    └ ⋯ | Other training configurations used in the paper + +## Requirements + +* Both Linux and Windows are supported. Linux is recommended for performance and compatibility reasons. +* 64-bit Python 3.6 installation. We recommend Anaconda3 with numpy 1.14.3 or newer. +* We recommend TensorFlow 1.14, which we used for all experiments in the paper, but TensorFlow 1.15 is also supported on Linux. TensorFlow 2.x is not supported. +* On Windows you need to use TensorFlow 1.14, as the standard 1.15 installation does not include necessary C++ headers. +* One or more high-end NVIDIA GPUs, NVIDIA drivers, CUDA 10.0 toolkit and cuDNN 7.5. To reproduce the results reported in the paper, you need an NVIDIA GPU with at least 16 GB of DRAM. +* Docker users: use the [provided Dockerfile](./Dockerfile) to build an image with the required library dependencies. + +StyleGAN2 relies on custom TensorFlow ops that are compiled on the fly using [NVCC](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html). 
To test that your NVCC installation is working correctly, run: + +```.bash +nvcc test_nvcc.cu -o test_nvcc -run +| CPU says hello. +| GPU says hello. +``` + +On Windows, the compilation requires Microsoft Visual Studio to be in `PATH`. We recommend installing [Visual Studio Community Edition](https://visualstudio.microsoft.com/vs/) and adding into `PATH` using `"C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"`. + +## Using pre-trained networks + +Pre-trained networks are stored as `*.pkl` files on the [StyleGAN2 Google Drive folder](https://drive.google.com/open?id=1QHc-yF5C3DChRwSdZKcx1w6K8JvSxQi7). Below, you can either reference them directly using the syntax `gdrive:networks/.pkl`, or download them manually and reference by filename. + +```.bash +# Generate uncurated ffhq images (matches paper Figure 12) +python run_generator.py generate-images --network=gdrive:networks/stylegan2-ffhq-config-f.pkl \ + --seeds=6600-6625 --truncation-psi=0.5 + +# Generate curated ffhq images (matches paper Figure 11) +python run_generator.py generate-images --network=gdrive:networks/stylegan2-ffhq-config-f.pkl \ + --seeds=66,230,389,1518 --truncation-psi=1.0 + +# Generate uncurated car images +python run_generator.py generate-images --network=gdrive:networks/stylegan2-car-config-f.pkl \ + --seeds=6000-6025 --truncation-psi=0.5 + +# Example of style mixing (matches the corresponding video clip) +python run_generator.py style-mixing-example --network=gdrive:networks/stylegan2-ffhq-config-f.pkl \ + --row-seeds=85,100,75,458,1500 --col-seeds=55,821,1789,293 --truncation-psi=1.0 +``` + +The results are placed in `results//*.png`. You can change the location with `--result-dir`. For example, `--result-dir=~/my-stylegan2-results`. + +You can import the networks in your own Python code using `pickle.load()`. 
For this to work, you need to include the `dnnlib` source directory in `PYTHONPATH` and create a default TensorFlow session by calling `dnnlib.tflib.init_tf()`. See [run_generator.py](./run_generator.py) and [pretrained_networks.py](./pretrained_networks.py) for examples. + +## Preparing datasets + +Datasets are stored as multi-resolution TFRecords, similar to the [original StyleGAN](https://github.com/NVlabs/stylegan). Each dataset consists of multiple `*.tfrecords` files stored under a common directory, e.g., `~/datasets/ffhq/ffhq-r*.tfrecords`. In the following sections, the datasets are referenced using a combination of `--dataset` and `--data-dir` arguments, e.g., `--dataset=ffhq --data-dir=~/datasets`. + +**FFHQ**. To download the [Flickr-Faces-HQ](https://github.com/NVlabs/ffhq-dataset) dataset as multi-resolution TFRecords, run: + +```.bash +pushd ~ +git clone https://github.com/NVlabs/ffhq-dataset.git +cd ffhq-dataset +python download_ffhq.py --tfrecords +popd +python dataset_tool.py display ~/ffhq-dataset/tfrecords/ffhq +``` + +**LSUN**. Download the desired LSUN categories in LMDB format from the [LSUN project page](https://www.yf.io/p/lsun). To convert the data to multi-resolution TFRecords, run: + +```.bash +python dataset_tool.py create_lsun_wide ~/datasets/car ~/lsun/car_lmdb --width=512 --height=384 +python dataset_tool.py create_lsun ~/datasets/cat ~/lsun/cat_lmdb --resolution=256 +python dataset_tool.py create_lsun ~/datasets/church ~/lsun/church_outdoor_train_lmdb --resolution=256 +python dataset_tool.py create_lsun ~/datasets/horse ~/lsun/horse_lmdb --resolution=256 +``` + +**Custom**. Create custom datasets by placing all training images under a single directory. The images must be square-shaped and they must all have the same power-of-two dimensions. 
To convert the images to multi-resolution TFRecords, run: + +```.bash +python dataset_tool.py create_from_images ~/datasets/my-custom-dataset ~/my-custom-images +python dataset_tool.py display ~/datasets/my-custom-dataset +``` + +## Projecting images to latent space + +To find the matching latent vectors for a set of images, run: + +```.bash +# Project generated images +python run_projector.py project-generated-images --network=gdrive:networks/stylegan2-car-config-f.pkl \ + --seeds=0,1,5 + +# Project real images +python run_projector.py project-real-images --network=gdrive:networks/stylegan2-car-config-f.pkl \ + --dataset=car --data-dir=~/datasets +``` + +## Training networks + +To reproduce the training runs for config F in Tables 1 and 3, run: + +```.bash +python run_training.py --num-gpus=8 --data-dir=~/datasets --config=config-f \ + --dataset=ffhq --mirror-augment=true +python run_training.py --num-gpus=8 --data-dir=~/datasets --config=config-f \ + --dataset=car --total-kimg=57000 +python run_training.py --num-gpus=8 --data-dir=~/datasets --config=config-f \ + --dataset=cat --total-kimg=88000 +python run_training.py --num-gpus=8 --data-dir=~/datasets --config=config-f \ + --dataset=church --total-kimg 88000 --gamma=100 +python run_training.py --num-gpus=8 --data-dir=~/datasets --config=config-f \ + --dataset=horse --total-kimg 100000 --gamma=100 +``` + +For other configurations, see `python run_training.py --help`. + +We have verified that the results match the paper when training with 1, 2, 4, or 8 GPUs. Note that training FFHQ at 1024×1024 resolution requires GPU(s) with at least 16 GB of memory. 
The following table lists typical training times using NVIDIA DGX-1 with 8 Tesla V100 GPUs: + +| Configuration | Resolution | Total kimg | 1 GPU | 2 GPUs | 4 GPUs | 8 GPUs | GPU mem | +| :------------ | :-------------: | :--------: | :-----: | :-----: | :-----: | :----: | :-----: | +| `config-f` | 1024×1024 | 25000 | 69d 23h | 36d 4h | 18d 14h | 9d 18h | 13.3 GB | +| `config-f` | 1024×1024 | 10000 | 27d 23h | 14d 11h | 7d 10h | 3d 22h | 13.3 GB | +| `config-e` | 1024×1024 | 25000 | 35d 11h | 18d 15h | 9d 15h | 5d 6h | 8.6 GB | +| `config-e` | 1024×1024 | 10000 | 14d 4h | 7d 11h | 3d 20h | 2d 3h | 8.6 GB | +| `config-f` | 256×256 | 25000 | 32d 13h | 16d 23h | 8d 21h | 4d 18h | 6.4 GB | +| `config-f` | 256×256 | 10000 | 13d 0h | 6d 19h | 3d 13h | 1d 22h | 6.4 GB | + +Training curves for FFHQ config F (StyleGAN2) compared to original StyleGAN using 8 GPUs: + +![Training curves](./docs/stylegan2-training-curves.png) + +After training, the resulting networks can be used the same way as the official pre-trained networks: + +```.bash +# Generate 1000 random images without truncation +python run_generator.py generate-images --seeds=0-999 --truncation-psi=1.0 \ + --network=results/00006-stylegan2-ffhq-8gpu-config-f/networks-final.pkl +``` + +## Evaluation metrics + +To reproduce the numbers for config F in Tables 1 and 3, run: + +```.bash +python run_metrics.py --data-dir=~/datasets --network=gdrive:networks/stylegan2-ffhq-config-f.pkl \ + --metrics=fid50k,ppl_wend --dataset=ffhq --mirror-augment=true +python run_metrics.py --data-dir=~/datasets --network=gdrive:networks/stylegan2-car-config-f.pkl \ + --metrics=fid50k,ppl2_wend --dataset=car +python run_metrics.py --data-dir=~/datasets --network=gdrive:networks/stylegan2-cat-config-f.pkl \ + --metrics=fid50k,ppl2_wend --dataset=cat +python run_metrics.py --data-dir=~/datasets --network=gdrive:networks/stylegan2-church-config-f.pkl \ + --metrics=fid50k,ppl2_wend --dataset=church +python run_metrics.py --data-dir=~/datasets 
--network=gdrive:networks/stylegan2-horse-config-f.pkl \ + --metrics=fid50k,ppl2_wend --dataset=horse +``` + +For other configurations, see the [StyleGAN2 Google Drive folder](https://drive.google.com/open?id=1QHc-yF5C3DChRwSdZKcx1w6K8JvSxQi7). + +Note that the metrics are evaluated using a different random seed each time, so the results will vary between runs. In the paper, we reported the average result of running each metric 10 times. The following table lists the available metrics along with their expected runtimes and random variation: + +| Metric | FFHQ config F | 1 GPU | 2 GPUs | 4 GPUs | Description | +| :---------- | :------------: | :----: | :-----: | :----: | :---------- | +| `fid50k` | 2.84 ± 0.03 | 22 min | 14 min | 10 min | [Fréchet Inception Distance](https://arxiv.org/abs/1706.08500) +| `is50k` | 5.13 ± 0.02 | 23 min | 14 min | 8 min | [Inception Score](https://arxiv.org/abs/1606.03498) +| `ppl_zfull` | 348.0 ± 3.8 | 41 min | 22 min | 14 min | [Perceptual Path Length](https://arxiv.org/abs/1812.04948) in Z, full paths +| `ppl_wfull` | 126.9 ± 0.2 | 42 min | 22 min | 13 min | [Perceptual Path Length](https://arxiv.org/abs/1812.04948) in W, full paths +| `ppl_zend` | 348.6 ± 3.0 | 41 min | 22 min | 14 min | [Perceptual Path Length](https://arxiv.org/abs/1812.04948) in Z, path endpoints +| `ppl_wend` | 129.4 ± 0.8 | 40 min | 23 min | 13 min | [Perceptual Path Length](https://arxiv.org/abs/1812.04948) in W, path endpoints +| `ppl2_wend` | 145.0 ± 0.5 | 41 min | 23 min | 14 min | [Perceptual Path Length](https://arxiv.org/abs/1812.04948) without center crop +| `ls` | 154.2 / 4.27 | 10 hrs | 6 hrs | 4 hrs | [Linear Separability](https://arxiv.org/abs/1812.04948) +| `pr50k3` | 0.689 / 0.492 | 26 min | 17 min | 12 min | [Precision and Recall](https://arxiv.org/abs/1904.06991) + +Note that some of the metrics cache dataset-specific data on the disk, and they will take somewhat longer when run for the first time. 
+ +## License + +Copyright © 2019, NVIDIA Corporation. All rights reserved. + +This work is made available under the Nvidia Source Code License-NC. To view a copy of this license, visit https://nvlabs.github.io/stylegan2/license.html + +## Citation + +``` +@inproceedings{Karras2019stylegan2, + title = {Analyzing and Improving the Image Quality of {StyleGAN}}, + author = {Tero Karras and Samuli Laine and Miika Aittala and Janne Hellsten and Jaakko Lehtinen and Timo Aila}, + booktitle = {Proc. CVPR}, + year = {2020} +} +``` + +## Acknowledgements + +We thank Ming-Yu Liu for an early review, Timo Viitanen for his help with code release, and Tero Kuosmanen for compute infrastructure. \ No newline at end of file diff --git a/insightface/reconstruction/ostec/external/stylegan2/dataset_tool.py b/insightface/reconstruction/ostec/external/stylegan2/dataset_tool.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c4dc280b4e40219f649c31e304c058da7ed043 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dataset_tool.py @@ -0,0 +1,644 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. 
#----------------------------------------------------------------------------

def error(msg):
    """Print an error message and terminate the process with exit status 1."""
    print('Error: ' + msg)
    sys.exit(1)  # sys.exit instead of the site-provided exit(); works under -S / frozen interpreters

#----------------------------------------------------------------------------

class TFRecordExporter:
    """Writes images (and optional labels) as a multi-resolution TFRecords dataset.

    One ``.tfrecords`` file is created per level of detail (LOD): the full
    resolution plus every 2x box-downscaled copy down to 4x4. Images must be
    added in shuffled order (see choose_shuffled_order) and must all share the
    shape of the first image added.
    """

    def __init__(self, tfrecord_dir, expected_images, print_progress=True, progress_interval=10):
        self.tfrecord_dir = tfrecord_dir
        # Record files are named <dir>/<basename(dir)>-rXX.tfrecords.
        self.tfr_prefix = os.path.join(self.tfrecord_dir, os.path.basename(self.tfrecord_dir))
        self.expected_images = expected_images
        self.cur_images = 0
        self.shape = None               # set from the first image: (channels, height, width)
        self.resolution_log2 = None
        self.tfr_writers = []           # one writer per LOD, created lazily on first add_image
        self.print_progress = print_progress
        self.progress_interval = progress_interval

        if self.print_progress:
            print('Creating dataset "%s"' % tfrecord_dir)
        if not os.path.isdir(self.tfrecord_dir):
            os.makedirs(self.tfrecord_dir)
        assert os.path.isdir(self.tfrecord_dir)

    def close(self):
        """Flush and close all LOD writers. Safe to call when nothing was written."""
        if self.print_progress:
            print('%-40s\r' % 'Flushing data...', end='', flush=True)
        for tfr_writer in self.tfr_writers:
            tfr_writer.close()
        self.tfr_writers = []
        if self.print_progress:
            print('%-40s\r' % '', end='', flush=True)
            print('Added %d images.' % self.cur_images)

    def choose_shuffled_order(self): # Note: Images and labels must be added in shuffled order.
        """Return a deterministic (seed 123) permutation of range(expected_images)."""
        order = np.arange(self.expected_images)
        np.random.RandomState(123).shuffle(order)
        return order

    def add_image(self, img):
        """Add one CHW uint8 image; writes it at every LOD via 2x box filtering.

        The first image fixes the dataset shape: 1 or 3 channels, square,
        power-of-two resolution.
        """
        if self.print_progress and self.cur_images % self.progress_interval == 0:
            print('%d / %d\r' % (self.cur_images, self.expected_images), end='', flush=True)
        if self.shape is None:
            # First image: fix the shape and open one writer per LOD.
            self.shape = img.shape
            self.resolution_log2 = int(np.log2(self.shape[1]))
            assert self.shape[0] in [1, 3]
            assert self.shape[1] == self.shape[2]
            assert self.shape[1] == 2**self.resolution_log2
            tfr_opt = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.NONE)
            for lod in range(self.resolution_log2 - 1):
                tfr_file = self.tfr_prefix + '-r%02d.tfrecords' % (self.resolution_log2 - lod)
                self.tfr_writers.append(tf.python_io.TFRecordWriter(tfr_file, tfr_opt))
        assert img.shape == self.shape
        for lod, tfr_writer in enumerate(self.tfr_writers):
            if lod:
                # 2x box downscale (average of each 2x2 pixel block) for every LOD > 0.
                img = img.astype(np.float32)
                img = (img[:, 0::2, 0::2] + img[:, 0::2, 1::2] + img[:, 1::2, 0::2] + img[:, 1::2, 1::2]) * 0.25
            quant = np.rint(img).clip(0, 255).astype(np.uint8)
            ex = tf.train.Example(features=tf.train.Features(feature={
                'shape': tf.train.Feature(int64_list=tf.train.Int64List(value=quant.shape)),
                # tobytes(): ndarray.tostring() was deprecated in NumPy 1.19 and removed in 2.0.
                'data': tf.train.Feature(bytes_list=tf.train.BytesList(value=[quant.tobytes()]))}))
            tfr_writer.write(ex.SerializeToString())
        self.cur_images += 1

    def add_labels(self, labels):
        """Save one float32 label row per added image to <prefix>-rxx.labels."""
        if self.print_progress:
            print('%-40s\r' % 'Saving labels...', end='', flush=True)
        assert labels.shape[0] == self.cur_images
        with open(self.tfr_prefix + '-rxx.labels', 'wb') as f:
            np.save(f, labels.astype(np.float32))

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

#----------------------------------------------------------------------------

class ExceptionInfo(object):
    """Snapshot of the exception currently being handled (value + formatted traceback)."""
    def __init__(self):
        self.value = sys.exc_info()[1]
        self.traceback = traceback.format_exc()
+#---------------------------------------------------------------------------- + +class WorkerThread(threading.Thread): + def __init__(self, task_queue): + threading.Thread.__init__(self) + self.task_queue = task_queue + + def run(self): + while True: + func, args, result_queue = self.task_queue.get() + if func is None: + break + try: + result = func(*args) + except: + result = ExceptionInfo() + result_queue.put((result, args)) + +#---------------------------------------------------------------------------- + +class ThreadPool(object): + def __init__(self, num_threads): + assert num_threads >= 1 + self.task_queue = Queue.Queue() + self.result_queues = dict() + self.num_threads = num_threads + for _idx in range(self.num_threads): + thread = WorkerThread(self.task_queue) + thread.daemon = True + thread.start() + + def add_task(self, func, args=()): + assert hasattr(func, '__call__') # must be a function + if func not in self.result_queues: + self.result_queues[func] = Queue.Queue() + self.task_queue.put((func, args, self.result_queues[func])) + + def get_result(self, func): # returns (result, args) + result, args = self.result_queues[func].get() + if isinstance(result, ExceptionInfo): + print('\n\nWorker thread caught an exception:\n' + result.traceback) + raise result.value + return result, args + + def finish(self): + for _idx in range(self.num_threads): + self.task_queue.put((None, (), None)) + + def __enter__(self): # for 'with' statement + return self + + def __exit__(self, *excinfo): + self.finish() + + def process_items_concurrently(self, item_iterator, process_func=lambda x: x, pre_func=lambda x: x, post_func=lambda x: x, max_items_in_flight=None): + if max_items_in_flight is None: max_items_in_flight = self.num_threads * 4 + assert max_items_in_flight >= 1 + results = [] + retire_idx = [0] + + def task_func(prepared, _idx): + return process_func(prepared) + + def retire_result(): + processed, (_prepared, idx) = self.get_result(task_func) + results[idx] = 
processed + while retire_idx[0] < len(results) and results[retire_idx[0]] is not None: + yield post_func(results[retire_idx[0]]) + results[retire_idx[0]] = None + retire_idx[0] += 1 + + for idx, item in enumerate(item_iterator): + prepared = pre_func(item) + results.append(None) + self.add_task(func=task_func, args=(prepared, idx)) + while retire_idx[0] < idx - max_items_in_flight + 2: + for res in retire_result(): yield res + while retire_idx[0] < len(results): + for res in retire_result(): yield res + +#---------------------------------------------------------------------------- + +def display(tfrecord_dir): + print('Loading dataset "%s"' % tfrecord_dir) + tflib.init_tf({'gpu_options.allow_growth': True}) + dset = dataset.TFRecordDataset(tfrecord_dir, max_label_size='full', repeat=False, shuffle_mb=0) + tflib.init_uninitialized_vars() + import cv2 # pip install opencv-python + + idx = 0 + while True: + try: + images, labels = dset.get_minibatch_np(1) + except tf.errors.OutOfRangeError: + break + if idx == 0: + print('Displaying images') + cv2.namedWindow('dataset_tool') + print('Press SPACE or ENTER to advance, ESC to exit') + print('\nidx = %-8d\nlabel = %s' % (idx, labels[0].tolist())) + cv2.imshow('dataset_tool', images[0].transpose(1, 2, 0)[:, :, ::-1]) # CHW => HWC, RGB => BGR + idx += 1 + if cv2.waitKey() == 27: + break + print('\nDisplayed %d images.' 
% idx) + +#---------------------------------------------------------------------------- + +def extract(tfrecord_dir, output_dir): + print('Loading dataset "%s"' % tfrecord_dir) + tflib.init_tf({'gpu_options.allow_growth': True}) + dset = dataset.TFRecordDataset(tfrecord_dir, max_label_size=0, repeat=False, shuffle_mb=0) + tflib.init_uninitialized_vars() + + print('Extracting images to "%s"' % output_dir) + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + idx = 0 + while True: + if idx % 10 == 0: + print('%d\r' % idx, end='', flush=True) + try: + images, _labels = dset.get_minibatch_np(1) + except tf.errors.OutOfRangeError: + break + if images.shape[1] == 1: + img = PIL.Image.fromarray(images[0][0], 'L') + else: + img = PIL.Image.fromarray(images[0].transpose(1, 2, 0), 'RGB') + img.save(os.path.join(output_dir, 'img%08d.png' % idx)) + idx += 1 + print('Extracted %d images.' % idx) + +#---------------------------------------------------------------------------- + +def compare(tfrecord_dir_a, tfrecord_dir_b, ignore_labels): + max_label_size = 0 if ignore_labels else 'full' + print('Loading dataset "%s"' % tfrecord_dir_a) + tflib.init_tf({'gpu_options.allow_growth': True}) + dset_a = dataset.TFRecordDataset(tfrecord_dir_a, max_label_size=max_label_size, repeat=False, shuffle_mb=0) + print('Loading dataset "%s"' % tfrecord_dir_b) + dset_b = dataset.TFRecordDataset(tfrecord_dir_b, max_label_size=max_label_size, repeat=False, shuffle_mb=0) + tflib.init_uninitialized_vars() + + print('Comparing datasets') + idx = 0 + identical_images = 0 + identical_labels = 0 + while True: + if idx % 100 == 0: + print('%d\r' % idx, end='', flush=True) + try: + images_a, labels_a = dset_a.get_minibatch_np(1) + except tf.errors.OutOfRangeError: + images_a, labels_a = None, None + try: + images_b, labels_b = dset_b.get_minibatch_np(1) + except tf.errors.OutOfRangeError: + images_b, labels_b = None, None + if images_a is None or images_b is None: + if images_a is not None or 
images_b is not None: + print('Datasets contain different number of images') + break + if images_a.shape == images_b.shape and np.all(images_a == images_b): + identical_images += 1 + else: + print('Image %d is different' % idx) + if labels_a.shape == labels_b.shape and np.all(labels_a == labels_b): + identical_labels += 1 + else: + print('Label %d is different' % idx) + idx += 1 + print('Identical images: %d / %d' % (identical_images, idx)) + if not ignore_labels: + print('Identical labels: %d / %d' % (identical_labels, idx)) + +#---------------------------------------------------------------------------- + +def create_mnist(tfrecord_dir, mnist_dir): + print('Loading MNIST from "%s"' % mnist_dir) + import gzip + with gzip.open(os.path.join(mnist_dir, 'train-images-idx3-ubyte.gz'), 'rb') as file: + images = np.frombuffer(file.read(), np.uint8, offset=16) + with gzip.open(os.path.join(mnist_dir, 'train-labels-idx1-ubyte.gz'), 'rb') as file: + labels = np.frombuffer(file.read(), np.uint8, offset=8) + images = images.reshape(-1, 1, 28, 28) + images = np.pad(images, [(0,0), (0,0), (2,2), (2,2)], 'constant', constant_values=0) + assert images.shape == (60000, 1, 32, 32) and images.dtype == np.uint8 + assert labels.shape == (60000,) and labels.dtype == np.uint8 + assert np.min(images) == 0 and np.max(images) == 255 + assert np.min(labels) == 0 and np.max(labels) == 9 + onehot = np.zeros((labels.size, np.max(labels) + 1), dtype=np.float32) + onehot[np.arange(labels.size), labels] = 1.0 + + with TFRecordExporter(tfrecord_dir, images.shape[0]) as tfr: + order = tfr.choose_shuffled_order() + for idx in range(order.size): + tfr.add_image(images[order[idx]]) + tfr.add_labels(onehot[order]) + +#---------------------------------------------------------------------------- + +def create_mnistrgb(tfrecord_dir, mnist_dir, num_images=1000000, random_seed=123): + print('Loading MNIST from "%s"' % mnist_dir) + import gzip + with gzip.open(os.path.join(mnist_dir, 
'train-images-idx3-ubyte.gz'), 'rb') as file: + images = np.frombuffer(file.read(), np.uint8, offset=16) + images = images.reshape(-1, 28, 28) + images = np.pad(images, [(0,0), (2,2), (2,2)], 'constant', constant_values=0) + assert images.shape == (60000, 32, 32) and images.dtype == np.uint8 + assert np.min(images) == 0 and np.max(images) == 255 + + with TFRecordExporter(tfrecord_dir, num_images) as tfr: + rnd = np.random.RandomState(random_seed) + for _idx in range(num_images): + tfr.add_image(images[rnd.randint(images.shape[0], size=3)]) + +#---------------------------------------------------------------------------- + +def create_cifar10(tfrecord_dir, cifar10_dir): + print('Loading CIFAR-10 from "%s"' % cifar10_dir) + import pickle + images = [] + labels = [] + for batch in range(1, 6): + with open(os.path.join(cifar10_dir, 'data_batch_%d' % batch), 'rb') as file: + data = pickle.load(file, encoding='latin1') + images.append(data['data'].reshape(-1, 3, 32, 32)) + labels.append(data['labels']) + images = np.concatenate(images) + labels = np.concatenate(labels) + assert images.shape == (50000, 3, 32, 32) and images.dtype == np.uint8 + assert labels.shape == (50000,) and labels.dtype == np.int32 + assert np.min(images) == 0 and np.max(images) == 255 + assert np.min(labels) == 0 and np.max(labels) == 9 + onehot = np.zeros((labels.size, np.max(labels) + 1), dtype=np.float32) + onehot[np.arange(labels.size), labels] = 1.0 + + with TFRecordExporter(tfrecord_dir, images.shape[0]) as tfr: + order = tfr.choose_shuffled_order() + for idx in range(order.size): + tfr.add_image(images[order[idx]]) + tfr.add_labels(onehot[order]) + +#---------------------------------------------------------------------------- + +def create_cifar100(tfrecord_dir, cifar100_dir): + print('Loading CIFAR-100 from "%s"' % cifar100_dir) + import pickle + with open(os.path.join(cifar100_dir, 'train'), 'rb') as file: + data = pickle.load(file, encoding='latin1') + images = data['data'].reshape(-1, 3, 
32, 32) + labels = np.array(data['fine_labels']) + assert images.shape == (50000, 3, 32, 32) and images.dtype == np.uint8 + assert labels.shape == (50000,) and labels.dtype == np.int32 + assert np.min(images) == 0 and np.max(images) == 255 + assert np.min(labels) == 0 and np.max(labels) == 99 + onehot = np.zeros((labels.size, np.max(labels) + 1), dtype=np.float32) + onehot[np.arange(labels.size), labels] = 1.0 + + with TFRecordExporter(tfrecord_dir, images.shape[0]) as tfr: + order = tfr.choose_shuffled_order() + for idx in range(order.size): + tfr.add_image(images[order[idx]]) + tfr.add_labels(onehot[order]) + +#---------------------------------------------------------------------------- + +def create_svhn(tfrecord_dir, svhn_dir): + print('Loading SVHN from "%s"' % svhn_dir) + import pickle + images = [] + labels = [] + for batch in range(1, 4): + with open(os.path.join(svhn_dir, 'train_%d.pkl' % batch), 'rb') as file: + data = pickle.load(file, encoding='latin1') + images.append(data[0]) + labels.append(data[1]) + images = np.concatenate(images) + labels = np.concatenate(labels) + assert images.shape == (73257, 3, 32, 32) and images.dtype == np.uint8 + assert labels.shape == (73257,) and labels.dtype == np.uint8 + assert np.min(images) == 0 and np.max(images) == 255 + assert np.min(labels) == 0 and np.max(labels) == 9 + onehot = np.zeros((labels.size, np.max(labels) + 1), dtype=np.float32) + onehot[np.arange(labels.size), labels] = 1.0 + + with TFRecordExporter(tfrecord_dir, images.shape[0]) as tfr: + order = tfr.choose_shuffled_order() + for idx in range(order.size): + tfr.add_image(images[order[idx]]) + tfr.add_labels(onehot[order]) + +#---------------------------------------------------------------------------- + +def create_lsun(tfrecord_dir, lmdb_dir, resolution=256, max_images=None): + print('Loading LSUN dataset from "%s"' % lmdb_dir) + import lmdb # pip install lmdb # pylint: disable=import-error + import cv2 # pip install opencv-python + import io + with 
lmdb.open(lmdb_dir, readonly=True).begin(write=False) as txn: + total_images = txn.stat()['entries'] # pylint: disable=no-value-for-parameter + if max_images is None: + max_images = total_images + with TFRecordExporter(tfrecord_dir, max_images) as tfr: + for _idx, (_key, value) in enumerate(txn.cursor()): + try: + try: + img = cv2.imdecode(np.fromstring(value, dtype=np.uint8), 1) + if img is None: + raise IOError('cv2.imdecode failed') + img = img[:, :, ::-1] # BGR => RGB + except IOError: + img = np.asarray(PIL.Image.open(io.BytesIO(value))) + crop = np.min(img.shape[:2]) + img = img[(img.shape[0] - crop) // 2 : (img.shape[0] + crop) // 2, (img.shape[1] - crop) // 2 : (img.shape[1] + crop) // 2] + img = PIL.Image.fromarray(img, 'RGB') + img = img.resize((resolution, resolution), PIL.Image.ANTIALIAS) + img = np.asarray(img) + img = img.transpose([2, 0, 1]) # HWC => CHW + tfr.add_image(img) + except: + print(sys.exc_info()[1]) + if tfr.cur_images == max_images: + break + +#---------------------------------------------------------------------------- + +def create_lsun_wide(tfrecord_dir, lmdb_dir, width=512, height=384, max_images=None): + assert width == 2 ** int(np.round(np.log2(width))) + assert height <= width + print('Loading LSUN dataset from "%s"' % lmdb_dir) + import lmdb # pip install lmdb # pylint: disable=import-error + import cv2 # pip install opencv-python + import io + with lmdb.open(lmdb_dir, readonly=True).begin(write=False) as txn: + total_images = txn.stat()['entries'] # pylint: disable=no-value-for-parameter + if max_images is None: + max_images = total_images + with TFRecordExporter(tfrecord_dir, max_images, print_progress=False) as tfr: + for idx, (_key, value) in enumerate(txn.cursor()): + try: + try: + img = cv2.imdecode(np.fromstring(value, dtype=np.uint8), 1) + if img is None: + raise IOError('cv2.imdecode failed') + img = img[:, :, ::-1] # BGR => RGB + except IOError: + img = np.asarray(PIL.Image.open(io.BytesIO(value))) + + ch = 
int(np.round(width * img.shape[0] / img.shape[1])) + if img.shape[1] < width or ch < height: + continue + + img = img[(img.shape[0] - ch) // 2 : (img.shape[0] + ch) // 2] + img = PIL.Image.fromarray(img, 'RGB') + img = img.resize((width, height), PIL.Image.ANTIALIAS) + img = np.asarray(img) + img = img.transpose([2, 0, 1]) # HWC => CHW + + canvas = np.zeros([3, width, width], dtype=np.uint8) + canvas[:, (width - height) // 2 : (width + height) // 2] = img + tfr.add_image(canvas) + print('\r%d / %d => %d ' % (idx + 1, total_images, tfr.cur_images), end='') + + except: + print(sys.exc_info()[1]) + if tfr.cur_images == max_images: + break + print() + +#---------------------------------------------------------------------------- + +def create_celeba(tfrecord_dir, celeba_dir, cx=89, cy=121): + print('Loading CelebA from "%s"' % celeba_dir) + glob_pattern = os.path.join(celeba_dir, 'img_align_celeba_png', '*.png') + image_filenames = sorted(glob.glob(glob_pattern)) + expected_images = 202599 + if len(image_filenames) != expected_images: + error('Expected to find %d images' % expected_images) + + with TFRecordExporter(tfrecord_dir, len(image_filenames)) as tfr: + order = tfr.choose_shuffled_order() + for idx in range(order.size): + img = np.asarray(PIL.Image.open(image_filenames[order[idx]])) + assert img.shape == (218, 178, 3) + img = img[cy - 64 : cy + 64, cx - 64 : cx + 64] + img = img.transpose(2, 0, 1) # HWC => CHW + tfr.add_image(img) + +#---------------------------------------------------------------------------- + +def create_from_images(tfrecord_dir, image_dir, shuffle): + print('Loading images from "%s"' % image_dir) + image_filenames = sorted(glob.glob(os.path.join(image_dir, '*'))) + if len(image_filenames) == 0: + error('No input images found') + + img = np.asarray(PIL.Image.open(image_filenames[0])) + resolution = img.shape[0] + channels = img.shape[2] if img.ndim == 3 else 1 + if img.shape[1] != resolution: + error('Input images must have the same width and 
height') + if resolution != 2 ** int(np.floor(np.log2(resolution))): + error('Input image resolution must be a power-of-two') + if channels not in [1, 3]: + error('Input images must be stored as RGB or grayscale') + + with TFRecordExporter(tfrecord_dir, len(image_filenames)) as tfr: + order = tfr.choose_shuffled_order() if shuffle else np.arange(len(image_filenames)) + for idx in range(order.size): + img = np.asarray(PIL.Image.open(image_filenames[order[idx]])) + if channels == 1: + img = img[np.newaxis, :, :] # HW => CHW + else: + img = img.transpose([2, 0, 1]) # HWC => CHW + tfr.add_image(img) + +#---------------------------------------------------------------------------- + +def create_from_hdf5(tfrecord_dir, hdf5_filename, shuffle): + print('Loading HDF5 archive from "%s"' % hdf5_filename) + import h5py # conda install h5py + with h5py.File(hdf5_filename, 'r') as hdf5_file: + hdf5_data = max([value for key, value in hdf5_file.items() if key.startswith('data')], key=lambda lod: lod.shape[3]) + with TFRecordExporter(tfrecord_dir, hdf5_data.shape[0]) as tfr: + order = tfr.choose_shuffled_order() if shuffle else np.arange(hdf5_data.shape[0]) + for idx in range(order.size): + tfr.add_image(hdf5_data[order[idx]]) + npy_filename = os.path.splitext(hdf5_filename)[0] + '-labels.npy' + if os.path.isfile(npy_filename): + tfr.add_labels(np.load(npy_filename)[order]) + +#---------------------------------------------------------------------------- + +def execute_cmdline(argv): + prog = argv[0] + parser = argparse.ArgumentParser( + prog = prog, + description = 'Tool for creating multi-resolution TFRecords datasets for StyleGAN and ProGAN.', + epilog = 'Type "%s -h" for more information.' 
% prog) + + subparsers = parser.add_subparsers(dest='command') + subparsers.required = True + def add_command(cmd, desc, example=None): + epilog = 'Example: %s %s' % (prog, example) if example is not None else None + return subparsers.add_parser(cmd, description=desc, help=desc, epilog=epilog) + + p = add_command( 'display', 'Display images in dataset.', + 'display datasets/mnist') + p.add_argument( 'tfrecord_dir', help='Directory containing dataset') + + p = add_command( 'extract', 'Extract images from dataset.', + 'extract datasets/mnist mnist-images') + p.add_argument( 'tfrecord_dir', help='Directory containing dataset') + p.add_argument( 'output_dir', help='Directory to extract the images into') + + p = add_command( 'compare', 'Compare two datasets.', + 'compare datasets/mydataset datasets/mnist') + p.add_argument( 'tfrecord_dir_a', help='Directory containing first dataset') + p.add_argument( 'tfrecord_dir_b', help='Directory containing second dataset') + p.add_argument( '--ignore_labels', help='Ignore labels (default: 0)', type=int, default=0) + + p = add_command( 'create_mnist', 'Create dataset for MNIST.', + 'create_mnist datasets/mnist ~/downloads/mnist') + p.add_argument( 'tfrecord_dir', help='New dataset directory to be created') + p.add_argument( 'mnist_dir', help='Directory containing MNIST') + + p = add_command( 'create_mnistrgb', 'Create dataset for MNIST-RGB.', + 'create_mnistrgb datasets/mnistrgb ~/downloads/mnist') + p.add_argument( 'tfrecord_dir', help='New dataset directory to be created') + p.add_argument( 'mnist_dir', help='Directory containing MNIST') + p.add_argument( '--num_images', help='Number of composite images to create (default: 1000000)', type=int, default=1000000) + p.add_argument( '--random_seed', help='Random seed (default: 123)', type=int, default=123) + + p = add_command( 'create_cifar10', 'Create dataset for CIFAR-10.', + 'create_cifar10 datasets/cifar10 ~/downloads/cifar10') + p.add_argument( 'tfrecord_dir', help='New dataset 
directory to be created') + p.add_argument( 'cifar10_dir', help='Directory containing CIFAR-10') + + p = add_command( 'create_cifar100', 'Create dataset for CIFAR-100.', + 'create_cifar100 datasets/cifar100 ~/downloads/cifar100') + p.add_argument( 'tfrecord_dir', help='New dataset directory to be created') + p.add_argument( 'cifar100_dir', help='Directory containing CIFAR-100') + + p = add_command( 'create_svhn', 'Create dataset for SVHN.', + 'create_svhn datasets/svhn ~/downloads/svhn') + p.add_argument( 'tfrecord_dir', help='New dataset directory to be created') + p.add_argument( 'svhn_dir', help='Directory containing SVHN') + + p = add_command( 'create_lsun', 'Create dataset for single LSUN category.', + 'create_lsun datasets/lsun-car-100k ~/downloads/lsun/car_lmdb --resolution 256 --max_images 100000') + p.add_argument( 'tfrecord_dir', help='New dataset directory to be created') + p.add_argument( 'lmdb_dir', help='Directory containing LMDB database') + p.add_argument( '--resolution', help='Output resolution (default: 256)', type=int, default=256) + p.add_argument( '--max_images', help='Maximum number of images (default: none)', type=int, default=None) + + p = add_command( 'create_lsun_wide', 'Create LSUN dataset with non-square aspect ratio.', + 'create_lsun_wide datasets/lsun-car-512x384 ~/downloads/lsun/car_lmdb --width 512 --height 384') + p.add_argument( 'tfrecord_dir', help='New dataset directory to be created') + p.add_argument( 'lmdb_dir', help='Directory containing LMDB database') + p.add_argument( '--width', help='Output width (default: 512)', type=int, default=512) + p.add_argument( '--height', help='Output height (default: 384)', type=int, default=384) + p.add_argument( '--max_images', help='Maximum number of images (default: none)', type=int, default=None) + + p = add_command( 'create_celeba', 'Create dataset for CelebA.', + 'create_celeba datasets/celeba ~/downloads/celeba') + p.add_argument( 'tfrecord_dir', help='New dataset directory to be 
created') + p.add_argument( 'celeba_dir', help='Directory containing CelebA') + p.add_argument( '--cx', help='Center X coordinate (default: 89)', type=int, default=89) + p.add_argument( '--cy', help='Center Y coordinate (default: 121)', type=int, default=121) + + p = add_command( 'create_from_images', 'Create dataset from a directory full of images.', + 'create_from_images datasets/mydataset myimagedir') + p.add_argument( 'tfrecord_dir', help='New dataset directory to be created') + p.add_argument( 'image_dir', help='Directory containing the images') + p.add_argument( '--shuffle', help='Randomize image order (default: 1)', type=int, default=1) + + p = add_command( 'create_from_hdf5', 'Create dataset from legacy HDF5 archive.', + 'create_from_hdf5 datasets/celebahq ~/downloads/celeba-hq-1024x1024.h5') + p.add_argument( 'tfrecord_dir', help='New dataset directory to be created') + p.add_argument( 'hdf5_filename', help='HDF5 archive containing the images') + p.add_argument( '--shuffle', help='Randomize image order (default: 1)', type=int, default=1) + + args = parser.parse_args(argv[1:] if len(argv) > 1 else ['-h']) + func = globals()[args.command] + del args.command + func(**vars(args)) + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + execute_cmdline(sys.argv) + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/__init__.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e34112b628e3d526739681eac984c5c2db704814 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. 
+# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +from . import submission + +from .submission.run_context import RunContext + +from .submission.submit import SubmitTarget +from .submission.submit import PathType +from .submission.submit import SubmitConfig +from .submission.submit import submit_run +from .submission.submit import get_path_from_template +from .submission.submit import convert_path +from .submission.submit import make_run_dir_path + +from .util import EasyDict + +submit_config: SubmitConfig = None # Package level variable for SubmitConfig which is only valid when inside the run function. diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/submission/__init__.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/submission/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..acf2fbee4b216cb9f2a0b73993fd1c7042e2248d --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/submission/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +from . import run_context +from . import submit diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/submission/internal/__init__.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/submission/internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0f11279893d6056e8cb6f9e04e12aad07a776496 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/submission/internal/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. 
# dnnlib/submission/internal/local.py — local (same-machine) submit target.

class TargetOptions():
    """Option flags for the local submit target."""

    def __init__(self):
        # When True, skip copying source files into the run directory.
        self.do_not_copy_source_files = False

class Target():
    """Submit target that executes the run directly on the local machine."""

    def __init__(self):
        pass

    def finalize_submit_config(self, submit_config, host_run_dir):
        """Point the submit config at the host-local run directory."""
        print('Local submit ', end='', flush=True)
        submit_config.run_dir = host_run_dir

    def submit(self, submit_config, host_run_dir):
        """Announce the run directory and execute the run synchronously."""
        from ..submit import run_wrapper, convert_path
        print('- run_dir: %s' % convert_path(submit_config.run_dir), flush=True)
        return run_wrapper(submit_config)
# To view a copy of this license, visit
# https://nvlabs.github.io/stylegan2/license.html

"""Helpers for managing the run/training loop."""

import datetime
import os
import pprint
import time
import types

from typing import Any

from . import submit

# Singleton RunContext
_run_context = None

class RunContext(object):
    """Helper class for managing the run/training loop.

    The context will hide the implementation details of a basic run/training loop.
    It will set things up properly, tell if run should be stopped, and then cleans up.
    User should call update periodically and use should_stop to determine if run should be stopped.

    Args:
        submit_config: The SubmitConfig that is used for the current run.
        config_module: (deprecated) The whole config module that is used for the current run.
    """

    def __init__(self, submit_config: submit.SubmitConfig, config_module: types.ModuleType = None):
        global _run_context
        # Only a single RunContext can be alive
        assert _run_context is None
        _run_context = self
        self.submit_config = submit_config
        self.should_stop_flag = False
        self.has_closed = False
        self.start_time = time.time()
        self.last_update_time = time.time()
        self.last_update_interval = 0.0
        # NOTE(review): never assigned anywhere else in this class — appears vestigial.
        self.progress_monitor_file_path = None

        # vestigial config_module support just prints a warning
        if config_module is not None:
            print("RunContext.config_module parameter support has been removed.")

        # write out details about the run to a text file
        # (requires submit_config.run_dir to already exist and be writable)
        self.run_txt_data = {"task_name": submit_config.task_name, "host_name": submit_config.host_name, "start_time": datetime.datetime.now().isoformat(sep=" ")}
        with open(os.path.join(submit_config.run_dir, "run.txt"), "w") as f:
            pprint.pprint(self.run_txt_data, stream=f, indent=4, width=200, compact=False)

    def __enter__(self) -> "RunContext":
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.close()

    def update(self, loss: Any = 0, cur_epoch: Any = 0, max_epoch: Any = None) -> None:
        """Do general housekeeping and keep the state of the context up-to-date.
        Should be called often enough but not in a tight loop."""
        # NOTE(review): loss, cur_epoch and max_epoch are accepted for API
        # compatibility but are not used by this implementation.
        assert not self.has_closed

        self.last_update_interval = time.time() - self.last_update_time
        self.last_update_time = time.time()

        # An "abort.txt" file dropped into the run dir acts as an external stop request.
        if os.path.exists(os.path.join(self.submit_config.run_dir, "abort.txt")):
            self.should_stop_flag = True

    def should_stop(self) -> bool:
        """Tell whether a stopping condition has been triggered one way or another."""
        return self.should_stop_flag

    def get_time_since_start(self) -> float:
        """How much time has passed since the creation of the context."""
        return time.time() - self.start_time

    def get_time_since_last_update(self) -> float:
        """How much time has passed since the last call to update."""
        return time.time() - self.last_update_time

    def get_last_update_interval(self) -> float:
        """How much time passed between the previous two calls to update."""
        return self.last_update_interval

    def close(self) -> None:
        """Close the context and clean up.
        Should only be called once."""
        # Idempotent: guarded by has_closed.
        if not self.has_closed:
            # update the run.txt with stopping time
            self.run_txt_data["stop_time"] = datetime.datetime.now().isoformat(sep=" ")
            with open(os.path.join(self.submit_config.run_dir, "run.txt"), "w") as f:
                pprint.pprint(self.run_txt_data, stream=f, indent=4, width=200, compact=False)
            self.has_closed = True

            # detach the global singleton
            global _run_context
            if _run_context is self:
                _run_context = None

    @staticmethod
    def get():
        # Return the live singleton, or create a fresh context from the
        # package-level dnnlib.submit_config if none is alive.
        # NOTE(review): if dnnlib.submit_config is still None here, the
        # RunContext constructor will fail on run_dir — confirm callers.
        import dnnlib
        if _run_context is not None:
            return _run_context
        return RunContext(dnnlib.submit_config)

# --- diff continues with a new file: dnnlib/submission/submit.py ---

# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, visit
# https://nvlabs.github.io/stylegan2/license.html

"""Submit a function to be run either locally or in a computing cluster."""

import copy
import inspect
import os
import pathlib
import pickle
import platform
import pprint
import re
import shutil
import sys
import time
import traceback

from enum import Enum

from .. import util


class SubmitTarget(Enum):
    """The target where the function should be run.

    LOCAL: Run it locally.
    """
    LOCAL = 1

# class PathType(Enum): ...  (definition is truncated at the edge of this chunk)
+ """ + WINDOWS = 1 + LINUX = 2 + AUTO = 3 + + +class PlatformExtras: + """A mixed bag of values used by dnnlib heuristics. + + Attributes: + + data_reader_buffer_size: Used by DataReader to size internal shared memory buffers. + data_reader_process_count: Number of worker processes to spawn (zero for single thread operation) + """ + def __init__(self): + self.data_reader_buffer_size = 1<<30 # 1 GB + self.data_reader_process_count = 0 # single threaded default + + +_user_name_override = None + +class SubmitConfig(util.EasyDict): + """Strongly typed config dict needed to submit runs. + + Attributes: + run_dir_root: Path to the run dir root. Can be optionally templated with tags. Needs to always be run through get_path_from_template. + run_desc: Description of the run. Will be used in the run dir and task name. + run_dir_ignore: List of file patterns used to ignore files when copying files to the run dir. + run_dir_extra_files: List of (abs_path, rel_path) tuples of file paths. rel_path root will be the src directory inside the run dir. + submit_target: Submit target enum value. Used to select where the run is actually launched. + num_gpus: Number of GPUs used/requested for the run. + print_info: Whether to print debug information when submitting. + local.do_not_copy_source_files: Do not copy source files from the working directory to the run dir. + run_id: Automatically populated value during submit. + run_name: Automatically populated value during submit. + run_dir: Automatically populated value during submit. + run_func_name: Automatically populated value during submit. + run_func_kwargs: Automatically populated value during submit. + user_name: Automatically populated value during submit. Can be set by the user which will then override the automatic value. + task_name: Automatically populated value during submit. + host_name: Automatically populated value during submit. + platform_extras: Automatically populated values during submit. 
Used by various dnnlib libraries such as the DataReader class. + """ + + def __init__(self): + super().__init__() + + # run (set these) + self.run_dir_root = "" # should always be passed through get_path_from_template + self.run_desc = "" + self.run_dir_ignore = ["__pycache__", "*.pyproj", "*.sln", "*.suo", ".cache", ".idea", ".vs", ".vscode", "_cudacache"] + self.run_dir_extra_files = [] + + # submit (set these) + self.submit_target = SubmitTarget.LOCAL + self.num_gpus = 1 + self.print_info = False + self.nvprof = False + self.local = external.stylegan2.dnnlib.submission.internal.local.TargetOptions() + self.datasets = [] + + # (automatically populated) + self.run_id = None + self.run_name = None + self.run_dir = None + self.run_func_name = None + self.run_func_kwargs = None + self.user_name = None + self.task_name = None + self.host_name = "localhost" + self.platform_extras = PlatformExtras() + + +def get_path_from_template(path_template: str, path_type: PathType = PathType.AUTO) -> str: + """Replace tags in the given path template and return either Windows or Linux formatted path.""" + # automatically select path type depending on running OS + if path_type == PathType.AUTO: + if platform.system() == "Windows": + path_type = PathType.WINDOWS + elif platform.system() == "Linux": + path_type = PathType.LINUX + else: + raise RuntimeError("Unknown platform") + + path_template = path_template.replace("", get_user_name()) + + # return correctly formatted path + if path_type == PathType.WINDOWS: + return str(pathlib.PureWindowsPath(path_template)) + elif path_type == PathType.LINUX: + return str(pathlib.PurePosixPath(path_template)) + else: + raise RuntimeError("Unknown platform") + + +def get_template_from_path(path: str) -> str: + """Convert a normal path back to its template representation.""" + path = path.replace("\\", "/") + return path + + +def convert_path(path: str, path_type: PathType = PathType.AUTO) -> str: + """Convert a normal path to template and the 
convert it back to a normal path with given path type.""" + path_template = get_template_from_path(path) + path = get_path_from_template(path_template, path_type) + return path + + +def set_user_name_override(name: str) -> None: + """Set the global username override value.""" + global _user_name_override + _user_name_override = name + + +def get_user_name(): + """Get the current user name.""" + if _user_name_override is not None: + return _user_name_override + elif platform.system() == "Windows": + return os.getlogin() + elif platform.system() == "Linux": + try: + import pwd + return pwd.getpwuid(os.geteuid()).pw_name + except: + return "unknown" + else: + raise RuntimeError("Unknown platform") + + +def make_run_dir_path(*paths): + """Make a path/filename that resides under the current submit run_dir. + + Args: + *paths: Path components to be passed to os.path.join + + Returns: + A file/dirname rooted at submit_config.run_dir. If there's no + submit_config or run_dir, the base directory is the current + working directory. + + E.g., `os.path.join(dnnlib.submit_config.run_dir, "output.txt"))` + """ + import dnnlib + if (dnnlib.submit_config is None) or (dnnlib.submit_config.run_dir is None): + return os.path.join(os.getcwd(), *paths) + return os.path.join(dnnlib.submit_config.run_dir, *paths) + + +def _create_run_dir_local(submit_config: SubmitConfig) -> str: + """Create a new run dir with increasing ID number at the start.""" + run_dir_root = get_path_from_template(submit_config.run_dir_root, PathType.AUTO) + + if not os.path.exists(run_dir_root): + os.makedirs(run_dir_root) + + submit_config.run_id = _get_next_run_id_local(run_dir_root) + submit_config.run_name = "{0:05d}-{1}".format(submit_config.run_id, submit_config.run_desc) + run_dir = os.path.join(run_dir_root, submit_config.run_name) + + if os.path.exists(run_dir): + raise RuntimeError("The run dir already exists! 
({0})".format(run_dir)) + + os.makedirs(run_dir) + + return run_dir + + +def _get_next_run_id_local(run_dir_root: str) -> int: + """Reads all directory names in a given directory (non-recursive) and returns the next (increasing) run id. Assumes IDs are numbers at the start of the directory names.""" + dir_names = [d for d in os.listdir(run_dir_root) if os.path.isdir(os.path.join(run_dir_root, d))] + r = re.compile("^\\d+") # match one or more digits at the start of the string + run_id = 0 + + for dir_name in dir_names: + m = r.match(dir_name) + + if m is not None: + i = int(m.group()) + run_id = max(run_id, i + 1) + + return run_id + + +def _populate_run_dir(submit_config: SubmitConfig, run_dir: str) -> None: + """Copy all necessary files into the run dir. Assumes that the dir exists, is local, and is writable.""" + pickle.dump(submit_config, open(os.path.join(run_dir, "submit_config.pkl"), "wb")) + with open(os.path.join(run_dir, "submit_config.txt"), "w") as f: + pprint.pprint(submit_config, stream=f, indent=4, width=200, compact=False) + + if (submit_config.submit_target == SubmitTarget.LOCAL) and submit_config.local.do_not_copy_source_files: + return + + files = [] + + run_func_module_dir_path = util.get_module_dir_by_obj_name(submit_config.run_func_name) + assert '.' 
in submit_config.run_func_name + for _idx in range(submit_config.run_func_name.count('.') - 1): + run_func_module_dir_path = os.path.dirname(run_func_module_dir_path) + files += util.list_dir_recursively_with_ignore(run_func_module_dir_path, ignores=submit_config.run_dir_ignore, add_base_to_relative=False) + + dnnlib_module_dir_path = util.get_module_dir_by_obj_name("dnnlib") + files += util.list_dir_recursively_with_ignore(dnnlib_module_dir_path, ignores=submit_config.run_dir_ignore, add_base_to_relative=True) + + files += submit_config.run_dir_extra_files + + files = [(f[0], os.path.join(run_dir, "src", f[1])) for f in files] + files += [(os.path.join(dnnlib_module_dir_path, "submission", "internal", "run.py"), os.path.join(run_dir, "run.py"))] + + util.copy_files_and_create_dirs(files) + + + +def run_wrapper(submit_config: SubmitConfig) -> None: + """Wrap the actual run function call for handling logging, exceptions, typing, etc.""" + is_local = submit_config.submit_target == SubmitTarget.LOCAL + + # when running locally, redirect stderr to stdout, log stdout to a file, and force flushing + if is_local: + logger = util.Logger(file_name=os.path.join(submit_config.run_dir, "log.txt"), file_mode="w", should_flush=True) + else: # when running in a cluster, redirect stderr to stdout, and just force flushing (log writing is handled by run.sh) + logger = util.Logger(file_name=None, should_flush=True) + + import dnnlib + dnnlib.submit_config = submit_config + + exit_with_errcode = False + try: + print("dnnlib: Running {0}() on {1}...".format(submit_config.run_func_name, submit_config.host_name)) + start_time = time.time() + + run_func_obj = util.get_obj_by_name(submit_config.run_func_name) + assert callable(run_func_obj) + sig = inspect.signature(run_func_obj) + if 'submit_config' in sig.parameters: + run_func_obj(submit_config=submit_config, **submit_config.run_func_kwargs) + else: + run_func_obj(**submit_config.run_func_kwargs) + + print("dnnlib: Finished {0}() in 
{1}.".format(submit_config.run_func_name, util.format_time(time.time() - start_time))) + except: + if is_local: + raise + else: + traceback.print_exc() + + log_src = os.path.join(submit_config.run_dir, "log.txt") + log_dst = os.path.join(get_path_from_template(submit_config.run_dir_root), "{0}-error.txt".format(submit_config.run_name)) + shutil.copyfile(log_src, log_dst) + + # Defer sys.exit(1) to happen after we close the logs and create a _finished.txt + exit_with_errcode = True + finally: + open(os.path.join(submit_config.run_dir, "_finished.txt"), "w").close() + + dnnlib.RunContext.get().close() + dnnlib.submit_config = None + logger.close() + + # If we hit an error, get out of the script now and signal the error + # to whatever process that started this script. + if exit_with_errcode: + sys.exit(1) + + return submit_config + + +def submit_run(submit_config: SubmitConfig, run_func_name: str, **run_func_kwargs) -> None: + """Create a run dir, gather files related to the run, copy files to the run dir, and launch the run in appropriate place.""" + submit_config = copy.deepcopy(submit_config) + + submit_target = submit_config.submit_target + farm = None + if submit_target == SubmitTarget.LOCAL: + farm = external.stylegan2.dnnlib.submission.internal.local.Target() + assert farm is not None # unknown target + + # Disallow submitting jobs with zero num_gpus. 
+ if (submit_config.num_gpus is None) or (submit_config.num_gpus == 0): + raise RuntimeError("submit_config.num_gpus must be set to a non-zero value") + + if submit_config.user_name is None: + submit_config.user_name = get_user_name() + + submit_config.run_func_name = run_func_name + submit_config.run_func_kwargs = run_func_kwargs + + #-------------------------------------------------------------------- + # Prepare submission by populating the run dir + #-------------------------------------------------------------------- + host_run_dir = _create_run_dir_local(submit_config) + + submit_config.task_name = "{0}-{1:05d}-{2}".format(submit_config.user_name, submit_config.run_id, submit_config.run_desc) + docker_valid_name_regex = "^[a-zA-Z0-9][a-zA-Z0-9_.-]+$" + if not re.match(docker_valid_name_regex, submit_config.task_name): + raise RuntimeError("Invalid task name. Probable reason: unacceptable characters in your submit_config.run_desc. Task name must be accepted by the following regex: " + docker_valid_name_regex + ", got " + submit_config.task_name) + + # Farm specific preparations for a submit + farm.finalize_submit_config(submit_config, host_run_dir) + _populate_run_dir(submit_config, host_run_dir) + return farm.submit(submit_config, host_run_dir) diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/__init__.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..02c25173d3f2391c88b142cf80af02cd93b0b5a0 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +from . import autosummary +from . import network +from . import optimizer +from . 
import tfutil +from . import custom_ops + +from .tfutil import * +from .network import Network + +from .optimizer import Optimizer + +from .custom_ops import get_plugin diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/autosummary.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/autosummary.py new file mode 100644 index 0000000000000000000000000000000000000000..6b0d80b371620bedadf8164772b7d6f87806fc11 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/autosummary.py @@ -0,0 +1,191 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Helper for adding automatically tracked values to Tensorboard. + +Autosummary creates an identity op that internally keeps track of the input +values and automatically shows up in TensorBoard. The reported value +represents an average over input components. The average is accumulated +constantly over time and flushed when save_summaries() is called. + +Notes: +- The output tensor must be used as an input for something else in the + graph. Otherwise, the autosummary op will not get executed, and the average + value will not get accumulated. +- It is perfectly fine to include autosummaries with the same name in + several places throughout the graph, even if they are executed concurrently. +- It is ok to also pass in a python scalar or numpy array. In this case, it + is added to the average immediately. +""" + +from collections import OrderedDict +import numpy as np +import tensorflow as tf +from tensorboard import summary as summary_lib +from tensorboard.plugins.custom_scalar import layout_pb2 + +from . import tfutil +from .tfutil import TfExpression +from .tfutil import TfExpressionEx + +# Enable "Custom scalars" tab in TensorBoard for advanced formatting. 
+# Disabled by default to reduce tfevents file size. +enable_custom_scalars = False + +_dtype = tf.float64 +_vars = OrderedDict() # name => [var, ...] +_immediate = OrderedDict() # name => update_op, update_value +_finalized = False +_merge_op = None + + +def _create_var(name: str, value_expr: TfExpression) -> TfExpression: + """Internal helper for creating autosummary accumulators.""" + assert not _finalized + name_id = name.replace("/", "_") + v = tf.cast(value_expr, _dtype) + + if v.shape.is_fully_defined(): + size = np.prod(v.shape.as_list()) + size_expr = tf.constant(size, dtype=_dtype) + else: + size = None + size_expr = tf.reduce_prod(tf.cast(tf.shape(v), _dtype)) + + if size == 1: + if v.shape.ndims != 0: + v = tf.reshape(v, []) + v = [size_expr, v, tf.square(v)] + else: + v = [size_expr, tf.reduce_sum(v), tf.reduce_sum(tf.square(v))] + v = tf.cond(tf.is_finite(v[1]), lambda: tf.stack(v), lambda: tf.zeros(3, dtype=_dtype)) + + with tfutil.absolute_name_scope("Autosummary/" + name_id), tf.control_dependencies(None): + var = tf.Variable(tf.zeros(3, dtype=_dtype), trainable=False) # [sum(1), sum(x), sum(x**2)] + update_op = tf.cond(tf.is_variable_initialized(var), lambda: tf.assign_add(var, v), lambda: tf.assign(var, v)) + + if name in _vars: + _vars[name].append(var) + else: + _vars[name] = [var] + return update_op + + +def autosummary(name: str, value: TfExpressionEx, passthru: TfExpressionEx = None, condition: TfExpressionEx = True) -> TfExpressionEx: + """Create a new autosummary. + + Args: + name: Name to use in TensorBoard + value: TensorFlow expression or python value to track + passthru: Optionally return this TF node without modifications but tack an autosummary update side-effect to this node. 
+ + Example use of the passthru mechanism: + + n = autosummary('l2loss', loss, passthru=n) + + This is a shorthand for the following code: + + with tf.control_dependencies([autosummary('l2loss', loss)]): + n = tf.identity(n) + """ + tfutil.assert_tf_initialized() + name_id = name.replace("/", "_") + + if tfutil.is_tf_expression(value): + with tf.name_scope("summary_" + name_id), tf.device(value.device): + condition = tf.convert_to_tensor(condition, name='condition') + update_op = tf.cond(condition, lambda: tf.group(_create_var(name, value)), tf.no_op) + with tf.control_dependencies([update_op]): + return tf.identity(value if passthru is None else passthru) + + else: # python scalar or numpy array + assert not tfutil.is_tf_expression(passthru) + assert not tfutil.is_tf_expression(condition) + if condition: + if name not in _immediate: + with tfutil.absolute_name_scope("Autosummary/" + name_id), tf.device(None), tf.control_dependencies(None): + update_value = tf.placeholder(_dtype) + update_op = _create_var(name, update_value) + _immediate[name] = update_op, update_value + update_op, update_value = _immediate[name] + tfutil.run(update_op, {update_value: value}) + return value if passthru is None else passthru + + +def finalize_autosummaries() -> None: + """Create the necessary ops to include autosummaries in TensorBoard report. + Note: This should be done only once per graph. + """ + global _finalized + tfutil.assert_tf_initialized() + + if _finalized: + return None + + _finalized = True + tfutil.init_uninitialized_vars([var for vars_list in _vars.values() for var in vars_list]) + + # Create summary ops. 
+ with tf.device(None), tf.control_dependencies(None): + for name, vars_list in _vars.items(): + name_id = name.replace("/", "_") + with tfutil.absolute_name_scope("Autosummary/" + name_id): + moments = tf.add_n(vars_list) + moments /= moments[0] + with tf.control_dependencies([moments]): # read before resetting + reset_ops = [tf.assign(var, tf.zeros(3, dtype=_dtype)) for var in vars_list] + with tf.name_scope(None), tf.control_dependencies(reset_ops): # reset before reporting + mean = moments[1] + std = tf.sqrt(moments[2] - tf.square(moments[1])) + tf.summary.scalar(name, mean) + if enable_custom_scalars: + tf.summary.scalar("xCustomScalars/" + name + "/margin_lo", mean - std) + tf.summary.scalar("xCustomScalars/" + name + "/margin_hi", mean + std) + + # Setup layout for custom scalars. + layout = None + if enable_custom_scalars: + cat_dict = OrderedDict() + for series_name in sorted(_vars.keys()): + p = series_name.split("/") + cat = p[0] if len(p) >= 2 else "" + chart = "/".join(p[1:-1]) if len(p) >= 3 else p[-1] + if cat not in cat_dict: + cat_dict[cat] = OrderedDict() + if chart not in cat_dict[cat]: + cat_dict[cat][chart] = [] + cat_dict[cat][chart].append(series_name) + categories = [] + for cat_name, chart_dict in cat_dict.items(): + charts = [] + for chart_name, series_names in chart_dict.items(): + series = [] + for series_name in series_names: + series.append(layout_pb2.MarginChartContent.Series( + value=series_name, + lower="xCustomScalars/" + series_name + "/margin_lo", + upper="xCustomScalars/" + series_name + "/margin_hi")) + margin = layout_pb2.MarginChartContent(series=series) + charts.append(layout_pb2.Chart(title=chart_name, margin=margin)) + categories.append(layout_pb2.Category(title=cat_name, chart=charts)) + layout = summary_lib.custom_scalar_pb(layout_pb2.Layout(category=categories)) + return layout + +def save_summaries(file_writer, global_step=None): + """Call FileWriter.add_summary() with all summaries in the default graph, + 
automatically finalizing and merging them on the first call. + """ + global _merge_op + tfutil.assert_tf_initialized() + + if _merge_op is None: + layout = finalize_autosummaries() + if layout is not None: + file_writer.add_summary(layout) + with tf.device(None), tf.control_dependencies(None): + _merge_op = tf.summary.merge_all() + + file_writer.add_summary(_merge_op.eval(), global_step) diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/custom_ops.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/custom_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..02434417f1078f966c7219764debf77d4e1983af --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/custom_ops.py @@ -0,0 +1,169 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""TensorFlow custom ops builder. +""" + +import os +import re +import uuid +import hashlib +import tempfile +import shutil +import tensorflow as tf +from tensorflow.python.client import device_lib # pylint: disable=no-name-in-module + +#---------------------------------------------------------------------------- +# Global options. + +cuda_cache_path = os.path.join(os.path.dirname(__file__), '_cudacache') +cuda_cache_version_tag = 'v1' +do_not_hash_included_headers = False # Speed up compilation by assuming that headers included by the CUDA code never change. Unsafe! +verbose = True # Print status messages to stdout. 
+ +compiler_bindir_search_path = [ + 'C:/Program Files (x86)/Microsoft Visual Studio/2017/Community/VC/Tools/MSVC/14.14.26428/bin/Hostx64/x64', + 'C:/Program Files (x86)/Microsoft Visual Studio/2019/Community/VC/Tools/MSVC/14.23.28105/bin/Hostx64/x64', + 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/vc/bin', +] + +#---------------------------------------------------------------------------- +# Internal helper funcs. + +def _find_compiler_bindir(): + for compiler_path in compiler_bindir_search_path: + if os.path.isdir(compiler_path): + return compiler_path + return None + +def _get_compute_cap(device): + caps_str = device.physical_device_desc + m = re.search('compute capability: (\\d+).(\\d+)', caps_str) + major = m.group(1) + minor = m.group(2) + return (major, minor) + +def _get_cuda_gpu_arch_string(): + gpus = [x for x in device_lib.list_local_devices() if x.device_type == 'GPU'] + if len(gpus) == 0: + raise RuntimeError('No GPU devices found') + (major, minor) = _get_compute_cap(gpus[0]) + return 'sm_%s%s' % (major, minor) + +def _run_cmd(cmd): + with os.popen(cmd) as pipe: + output = pipe.read() + status = pipe.close() + if status is not None: + raise RuntimeError('NVCC returned an error. See below for full command line and output log:\n\n%s\n\n%s' % (cmd, output)) + +def _prepare_nvcc_cli(opts): + cmd = 'nvcc --std=c++11 -DNDEBUG ' + opts.strip() + cmd += ' --disable-warnings' + cmd += ' --include-path "%s"' % tf.sysconfig.get_include() + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src') + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl') + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive') + + compiler_bindir = _find_compiler_bindir() + if compiler_bindir is None: + # Require that _find_compiler_bindir succeeds on Windows. Allow + # nvcc to use whatever is the default on Linux. 
def _prepare_nvcc_cli(opts):
    """Build the full nvcc command line: common flags, TF include paths, then `opts`."""
    cmd = 'nvcc --std=c++11 -DNDEBUG ' + opts.strip()
    cmd += ' --disable-warnings'
    cmd += ' --include-path "%s"' % tf.sysconfig.get_include()
    cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src')
    cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl')
    cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive')

    compiler_bindir = _find_compiler_bindir()
    if compiler_bindir is None:
        # Require that _find_compiler_bindir succeeds on Windows.  Allow
        # nvcc to use whatever is the default on Linux.
        if os.name == 'nt':
            raise RuntimeError('Could not find MSVC/GCC/CLANG installation on this computer. Check compiler_bindir_search_path list in "%s".' % __file__)
    else:
        cmd += ' --compiler-bindir "%s"' % compiler_bindir
    cmd += ' 2>&1'  # capture stderr together with stdout for error reporting
    return cmd

#----------------------------------------------------------------------------
# Main entry point.

# cuda_file path => loaded tf op library, so each kernel is compiled/loaded once per process.
_plugin_cache = dict()

def get_plugin(cuda_file):
    """Compile (if needed) and load a custom TensorFlow op from a .cu file.

    The compiled binary is cached on disk under cuda_cache_path, keyed by an
    MD5 hash of the CUDA source, its preprocessed headers (unless
    do_not_hash_included_headers is set), the nvcc command line, the TF
    version, and cuda_cache_version_tag — so any relevant change triggers a
    rebuild.  Results are also memoized in-process via _plugin_cache.
    """
    cuda_file_base = os.path.basename(cuda_file)
    cuda_file_name, cuda_file_ext = os.path.splitext(cuda_file_base)

    # Already in cache?
    if cuda_file in _plugin_cache:
        return _plugin_cache[cuda_file]

    # Setup plugin.
    if verbose:
        print('Setting up TensorFlow plugin "%s": ' % cuda_file_base, end='', flush=True)
    try:
        # Hash CUDA source.
        md5 = hashlib.md5()
        with open(cuda_file, 'rb') as f:
            md5.update(f.read())
        md5.update(b'\n')

        # Hash headers included by the CUDA code by running it through the preprocessor.
        if not do_not_hash_included_headers:
            if verbose:
                print('Preprocessing... ', end='', flush=True)
            with tempfile.TemporaryDirectory() as tmp_dir:
                tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + cuda_file_ext)
                _run_cmd(_prepare_nvcc_cli('"%s" --preprocess -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir)))
                with open(tmp_file, 'rb') as f:
                    # Normalize absolute paths that __FILE__ leaves in error-check
                    # macros, so the hash is stable across checkout locations.
                    bad_file_str = ('"' + cuda_file.replace('\\', '/') + '"').encode('utf-8')  # __FILE__ in error check macros
                    good_file_str = ('"' + cuda_file_base + '"').encode('utf-8')
                    for ln in f:
                        if not ln.startswith(b'# ') and not ln.startswith(b'#line '):  # ignore line number pragmas
                            ln = ln.replace(bad_file_str, good_file_str)
                            md5.update(ln)
                    md5.update(b'\n')

        # Select compiler options.
        compile_opts = ''
        if os.name == 'nt':
            compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.lib')
        elif os.name == 'posix':
            compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.so')
            compile_opts += ' --compiler-options \'-fPIC -D_GLIBCXX_USE_CXX11_ABI=0\''
        else:
            assert False  # not Windows or Linux, w00t?
        compile_opts += ' --gpu-architecture=%s' % _get_cuda_gpu_arch_string()
        compile_opts += ' --use_fast_math'
        nvcc_cmd = _prepare_nvcc_cli(compile_opts)

        # Hash build configuration.
        md5.update(('nvcc_cmd: ' + nvcc_cmd).encode('utf-8') + b'\n')
        md5.update(('tf.VERSION: ' + tf.VERSION).encode('utf-8') + b'\n')
        md5.update(('cuda_cache_version_tag: ' + cuda_cache_version_tag).encode('utf-8') + b'\n')

        # Compile if not already compiled.
        bin_file_ext = '.dll' if os.name == 'nt' else '.so'
        bin_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + md5.hexdigest() + bin_file_ext)
        if not os.path.isfile(bin_file):
            if verbose:
                print('Compiling... ', end='', flush=True)
            with tempfile.TemporaryDirectory() as tmp_dir:
                tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + bin_file_ext)
                _run_cmd(nvcc_cmd + ' "%s" --shared -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir))
                os.makedirs(cuda_cache_path, exist_ok=True)
                # Copy to a unique temp name first, then rename: os.rename is
                # atomic, so concurrent processes never observe a partial file.
                intermediate_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + uuid.uuid4().hex + '_tmp' + bin_file_ext)
                shutil.copyfile(tmp_file, intermediate_file)
                os.rename(intermediate_file, bin_file)  # atomic

        # Load.
        if verbose:
            print('Loading... ', end='', flush=True)
        plugin = tf.load_op_library(bin_file)

        # Add to cache.
        _plugin_cache[cuda_file] = plugin
        if verbose:
            print('Done.', flush=True)
        return plugin

    except:
        if verbose:
            print('Failed!', flush=True)
        raise

#----------------------------------------------------------------------------
The pickling works reliably as long as the underlying + network construction function is defined in a standalone Python module + that has no side effects or application-specific imports. + + Args: + name: Network name. Used to select TensorFlow name and variable scopes. + func_name: Fully qualified name of the underlying network construction function, or a top-level function object. + static_kwargs: Keyword arguments to be passed in to the network construction function. + + Attributes: + name: User-specified name, defaults to build func name if None. + scope: Unique TensorFlow scope containing template graph and variables, derived from the user-specified name. + static_kwargs: Arguments passed to the user-supplied build func. + components: Container for sub-networks. Passed to the build func, and retained between calls. + num_inputs: Number of input tensors. + num_outputs: Number of output tensors. + input_shapes: Input tensor shapes (NC or NCHW), including minibatch dimension. + output_shapes: Output tensor shapes (NC or NCHW), including minibatch dimension. + input_shape: Short-hand for input_shapes[0]. + output_shape: Short-hand for output_shapes[0]. + input_templates: Input placeholders in the template graph. + output_templates: Output tensors in the template graph. + input_names: Name string for each input. + output_names: Name string for each output. + own_vars: Variables defined by this network (local_name => var), excluding sub-networks. + vars: All variables (local_name => var). + trainables: All trainable variables (local_name => var). + var_global_to_local: Mapping from variable global names to local names. 
+ """ + + def __init__(self, name: str = None, func_name: Any = None, **static_kwargs): + tfutil.assert_tf_initialized() + assert isinstance(name, str) or name is None + assert func_name is not None + assert isinstance(func_name, str) or util.is_top_level_function(func_name) + assert util.is_pickleable(static_kwargs) + + self._init_fields() + self.name = name + self.static_kwargs = util.EasyDict(static_kwargs) + + # Locate the user-specified network build function. + if util.is_top_level_function(func_name): + func_name = util.get_top_level_function_name(func_name) + module, self._build_func_name = util.get_module_from_obj_name(func_name) + self._build_func = util.get_obj_from_module(module, self._build_func_name) + assert callable(self._build_func) + + # Dig up source code for the module containing the build function. + self._build_module_src = _import_module_src.get(module, None) + if self._build_module_src is None: + self._build_module_src = inspect.getsource(module) + + # Init TensorFlow graph. + self._init_graph() + self.reset_own_vars() + + def _init_fields(self) -> None: + self.name = None + self.scope = None + self.static_kwargs = util.EasyDict() + self.components = util.EasyDict() + self.num_inputs = 0 + self.num_outputs = 0 + self.input_shapes = [[]] + self.output_shapes = [[]] + self.input_shape = [] + self.output_shape = [] + self.input_templates = [] + self.output_templates = [] + self.input_names = [] + self.output_names = [] + self.own_vars = OrderedDict() + self.vars = OrderedDict() + self.trainables = OrderedDict() + self.var_global_to_local = OrderedDict() + + self._build_func = None # User-supplied build function that constructs the network. + self._build_func_name = None # Name of the build function. + self._build_module_src = None # Full source code of the module containing the build function. + self._run_cache = dict() # Cached graph data for Network.run(). + + def _init_graph(self) -> None: + # Collect inputs. 
+ self.input_names = [] + + for param in inspect.signature(self._build_func).parameters.values(): + if param.kind == param.POSITIONAL_OR_KEYWORD and param.default is param.empty: + self.input_names.append(param.name) + + self.num_inputs = len(self.input_names) + assert self.num_inputs >= 1 + + # Choose name and scope. + if self.name is None: + self.name = self._build_func_name + assert re.match("^[A-Za-z0-9_.\\-]*$", self.name) + with tf.name_scope(None): + self.scope = tf.get_default_graph().unique_name(self.name, mark_as_used=True) + + # Finalize build func kwargs. + build_kwargs = dict(self.static_kwargs) + build_kwargs["is_template_graph"] = True + build_kwargs["components"] = self.components + + # Build template graph. + with tfutil.absolute_variable_scope(self.scope, reuse=False), tfutil.absolute_name_scope(self.scope): # ignore surrounding scopes + assert tf.get_variable_scope().name == self.scope + assert tf.get_default_graph().get_name_scope() == self.scope + with tf.control_dependencies(None): # ignore surrounding control dependencies + self.input_templates = [tf.placeholder(tf.float32, name=name) for name in self.input_names] + out_expr = self._build_func(*self.input_templates, **build_kwargs) + + # Collect outputs. + assert tfutil.is_tf_expression(out_expr) or isinstance(out_expr, tuple) + self.output_templates = [out_expr] if tfutil.is_tf_expression(out_expr) else list(out_expr) + self.num_outputs = len(self.output_templates) + assert self.num_outputs >= 1 + assert all(tfutil.is_tf_expression(t) for t in self.output_templates) + + # Perform sanity checks. + if any(t.shape.ndims is None for t in self.input_templates): + raise ValueError("Network input shapes not defined. Please call x.set_shape() for each input.") + if any(t.shape.ndims is None for t in self.output_templates): + raise ValueError("Network output shapes not defined. 
Please call x.set_shape() where applicable.") + if any(not isinstance(comp, Network) for comp in self.components.values()): + raise ValueError("Components of a Network must be Networks themselves.") + if len(self.components) != len(set(comp.name for comp in self.components.values())): + raise ValueError("Components of a Network must have unique names.") + + # List inputs and outputs. + self.input_shapes = [t.shape.as_list() for t in self.input_templates] + self.output_shapes = [t.shape.as_list() for t in self.output_templates] + self.input_shape = self.input_shapes[0] + self.output_shape = self.output_shapes[0] + self.output_names = [t.name.split("/")[-1].split(":")[0] for t in self.output_templates] + + # List variables. + self.own_vars = OrderedDict((var.name[len(self.scope) + 1:].split(":")[0], var) for var in tf.global_variables(self.scope + "/")) + self.vars = OrderedDict(self.own_vars) + self.vars.update((comp.name + "/" + name, var) for comp in self.components.values() for name, var in comp.vars.items()) + self.trainables = OrderedDict((name, var) for name, var in self.vars.items() if var.trainable) + self.var_global_to_local = OrderedDict((var.name.split(":")[0], name) for name, var in self.vars.items()) + + def reset_own_vars(self) -> None: + """Re-initialize all variables of this network, excluding sub-networks.""" + tfutil.run([var.initializer for var in self.own_vars.values()]) + + def reset_vars(self) -> None: + """Re-initialize all variables of this network, including sub-networks.""" + tfutil.run([var.initializer for var in self.vars.values()]) + + def reset_trainables(self) -> None: + """Re-initialize all trainable variables of this network, including sub-networks.""" + tfutil.run([var.initializer for var in self.trainables.values()]) + + def get_output_for(self, *in_expr: TfExpression, return_as_list: bool = False, **dynamic_kwargs) -> Union[TfExpression, List[TfExpression]]: + """Construct TensorFlow expression(s) for the output(s) of this 
network, given the input expression(s).""" + assert len(in_expr) == self.num_inputs + assert not all(expr is None for expr in in_expr) + + # Finalize build func kwargs. + build_kwargs = dict(self.static_kwargs) + build_kwargs.update(dynamic_kwargs) + build_kwargs["is_template_graph"] = False + build_kwargs["components"] = self.components + + # Build TensorFlow graph to evaluate the network. + with tfutil.absolute_variable_scope(self.scope, reuse=True), tf.name_scope(self.name): + assert tf.get_variable_scope().name == self.scope + valid_inputs = [expr for expr in in_expr if expr is not None] + final_inputs = [] + for expr, name, shape in zip(in_expr, self.input_names, self.input_shapes): + if expr is not None: + expr = tf.identity(expr, name=name) + else: + expr = tf.zeros([tf.shape(valid_inputs[0])[0]] + shape[1:], name=name) + final_inputs.append(expr) + out_expr = self._build_func(*final_inputs, **build_kwargs) + + # Propagate input shapes back to the user-specified expressions. + for expr, final in zip(in_expr, final_inputs): + if isinstance(expr, tf.Tensor): + expr.set_shape(final.shape) + + # Express outputs in the desired format. 
+ assert tfutil.is_tf_expression(out_expr) or isinstance(out_expr, tuple) + if return_as_list: + out_expr = [out_expr] if tfutil.is_tf_expression(out_expr) else list(out_expr) + return out_expr + + def get_var_local_name(self, var_or_global_name: Union[TfExpression, str]) -> str: + """Get the local name of a given variable, without any surrounding name scopes.""" + assert tfutil.is_tf_expression(var_or_global_name) or isinstance(var_or_global_name, str) + global_name = var_or_global_name if isinstance(var_or_global_name, str) else var_or_global_name.name + return self.var_global_to_local[global_name] + + def find_var(self, var_or_local_name: Union[TfExpression, str]) -> TfExpression: + """Find variable by local or global name.""" + assert tfutil.is_tf_expression(var_or_local_name) or isinstance(var_or_local_name, str) + return self.vars[var_or_local_name] if isinstance(var_or_local_name, str) else var_or_local_name + + def get_var(self, var_or_local_name: Union[TfExpression, str]) -> np.ndarray: + """Get the value of a given variable as NumPy array. + Note: This method is very inefficient -- prefer to use tflib.run(list_of_vars) whenever possible.""" + return self.find_var(var_or_local_name).eval() + + def set_var(self, var_or_local_name: Union[TfExpression, str], new_value: Union[int, float, np.ndarray]) -> None: + """Set the value of a given variable based on the given NumPy array. 
+ Note: This method is very inefficient -- prefer to use tflib.set_vars() whenever possible.""" + tfutil.set_vars({self.find_var(var_or_local_name): new_value}) + + def __getstate__(self) -> dict: + """Pickle export.""" + state = dict() + state["version"] = 4 + state["name"] = self.name + state["static_kwargs"] = dict(self.static_kwargs) + state["components"] = dict(self.components) + state["build_module_src"] = self._build_module_src + state["build_func_name"] = self._build_func_name + state["variables"] = list(zip(self.own_vars.keys(), tfutil.run(list(self.own_vars.values())))) + return state + + def __setstate__(self, state: dict) -> None: + """Pickle import.""" + # pylint: disable=attribute-defined-outside-init + tfutil.assert_tf_initialized() + self._init_fields() + + # Execute custom import handlers. + for handler in _import_handlers: + state = handler(state) + + # Set basic fields. + assert state["version"] in [2, 3, 4] + self.name = state["name"] + self.static_kwargs = util.EasyDict(state["static_kwargs"]) + self.components = util.EasyDict(state.get("components", {})) + self._build_module_src = state["build_module_src"] + self._build_func_name = state["build_func_name"] + + # Create temporary module from the imported source code. + module_name = "_tflib_network_import_" + uuid.uuid4().hex + module = types.ModuleType(module_name) + sys.modules[module_name] = module + _import_module_src[module] = self._build_module_src + exec(self._build_module_src, module.__dict__) # pylint: disable=exec-used + + # Locate network build function in the temporary module. + self._build_func = util.get_obj_from_module(module, self._build_func_name) + assert callable(self._build_func) + + # Init TensorFlow graph. 
+ self._init_graph() + self.reset_own_vars() + tfutil.set_vars({self.find_var(name): value for name, value in state["variables"]}) + + def clone(self, name: str = None, **new_static_kwargs) -> "Network": + """Create a clone of this network with its own copy of the variables.""" + # pylint: disable=protected-access + net = object.__new__(Network) + net._init_fields() + net.name = name if name is not None else self.name + net.static_kwargs = util.EasyDict(self.static_kwargs) + net.static_kwargs.update(new_static_kwargs) + net._build_module_src = self._build_module_src + net._build_func_name = self._build_func_name + net._build_func = self._build_func + net._init_graph() + net.copy_vars_from(self) + return net + + def copy_own_vars_from(self, src_net: "Network") -> None: + """Copy the values of all variables from the given network, excluding sub-networks.""" + names = [name for name in self.own_vars.keys() if name in src_net.own_vars] + tfutil.set_vars(tfutil.run({self.vars[name]: src_net.vars[name] for name in names})) + + def copy_vars_from(self, src_net: "Network") -> None: + """Copy the values of all variables from the given network, including sub-networks.""" + names = [name for name in self.vars.keys() if name in src_net.vars] + tfutil.set_vars(tfutil.run({self.vars[name]: src_net.vars[name] for name in names})) + + def copy_trainables_from(self, src_net: "Network") -> None: + """Copy the values of all trainable variables from the given network, including sub-networks.""" + names = [name for name in self.trainables.keys() if name in src_net.trainables] + tfutil.set_vars(tfutil.run({self.vars[name]: src_net.vars[name] for name in names})) + + def convert(self, new_func_name: str, new_name: str = None, **new_static_kwargs) -> "Network": + """Create new network with the given parameters, and copy all variables from this network.""" + if new_name is None: + new_name = self.name + static_kwargs = dict(self.static_kwargs) + static_kwargs.update(new_static_kwargs) + 
net = Network(name=new_name, func_name=new_func_name, **static_kwargs) + net.copy_vars_from(self) + return net + + def setup_as_moving_average_of(self, src_net: "Network", beta: TfExpressionEx = 0.99, beta_nontrainable: TfExpressionEx = 0.0) -> tf.Operation: + """Construct a TensorFlow op that updates the variables of this network + to be slightly closer to those of the given network.""" + with tfutil.absolute_name_scope(self.scope + "/_MovingAvg"): + ops = [] + for name, var in self.vars.items(): + if name in src_net.vars: + cur_beta = beta if name in self.trainables else beta_nontrainable + new_value = tfutil.lerp(src_net.vars[name], var, cur_beta) + ops.append(var.assign(new_value)) + return tf.group(*ops) + + def run(self, + *in_arrays: Tuple[Union[np.ndarray, None], ...], + input_transform: dict = None, + output_transform: dict = None, + return_as_list: bool = False, + print_progress: bool = False, + minibatch_size: int = None, + num_gpus: int = 1, + assume_frozen: bool = False, + custom_inputs: Any = None, + **dynamic_kwargs) -> Union[np.ndarray, Tuple[np.ndarray, ...], List[np.ndarray]]: + """Run this network for the given NumPy array(s), and return the output(s) as NumPy array(s). + + Args: + input_transform: A dict specifying a custom transformation to be applied to the input tensor(s) before evaluating the network. + The dict must contain a 'func' field that points to a top-level function. The function is called with the input + TensorFlow expression(s) as positional arguments. Any remaining fields of the dict will be passed in as kwargs. + output_transform: A dict specifying a custom transformation to be applied to the output tensor(s) after evaluating the network. + The dict must contain a 'func' field that points to a top-level function. The function is called with the output + TensorFlow expression(s) as positional arguments. Any remaining fields of the dict will be passed in as kwargs. 
+ return_as_list: True = return a list of NumPy arrays, False = return a single NumPy array, or a tuple if there are multiple outputs. + print_progress: Print progress to the console? Useful for very large input arrays. + minibatch_size: Maximum minibatch size to use, None = disable batching. + num_gpus: Number of GPUs to use. + assume_frozen: Improve multi-GPU performance by assuming that the trainable parameters will remain changed between calls. + custom_inputs: Allow to use another tensor as input instead of default placeholders. + dynamic_kwargs: Additional keyword arguments to be passed into the network build function. + """ + assert len(in_arrays) == self.num_inputs + assert not all(arr is None for arr in in_arrays) + assert input_transform is None or util.is_top_level_function(input_transform["func"]) + assert output_transform is None or util.is_top_level_function(output_transform["func"]) + output_transform, dynamic_kwargs = _handle_legacy_output_transforms(output_transform, dynamic_kwargs) + num_items = in_arrays[0].shape[0] + if minibatch_size is None: + minibatch_size = num_items + + # Construct unique hash key from all arguments that affect the TensorFlow graph. + key = dict(input_transform=input_transform, output_transform=output_transform, num_gpus=num_gpus, assume_frozen=assume_frozen, dynamic_kwargs=dynamic_kwargs) + def unwind_key(obj): + if isinstance(obj, dict): + return [(key, unwind_key(value)) for key, value in sorted(obj.items())] + if callable(obj): + return util.get_top_level_function_name(obj) + return obj + key = repr(unwind_key(key)) + + # Build graph. 
+ if key not in self._run_cache: + with tfutil.absolute_name_scope(self.scope + "/_Run"), tf.control_dependencies(None): + if custom_inputs is not None: + with tf.device("/gpu:0"): + in_expr = [input_builder(name) for input_builder, name in zip(custom_inputs, self.input_names)] + in_split = list(zip(*[tf.split(x, num_gpus) for x in in_expr])) + else: + with tf.device("/cpu:0"): + in_expr = [tf.placeholder(tf.float32, name=name) for name in self.input_names] + in_split = list(zip(*[tf.split(x, num_gpus) for x in in_expr])) + + out_split = [] + for gpu in range(num_gpus): + with tf.device("/gpu:%d" % gpu): + net_gpu = self.clone() if assume_frozen else self + in_gpu = in_split[gpu] + + if input_transform is not None: + in_kwargs = dict(input_transform) + in_gpu = in_kwargs.pop("func")(*in_gpu, **in_kwargs) + in_gpu = [in_gpu] if tfutil.is_tf_expression(in_gpu) else list(in_gpu) + + assert len(in_gpu) == self.num_inputs + out_gpu = net_gpu.get_output_for(*in_gpu, return_as_list=True, **dynamic_kwargs) + + if output_transform is not None: + out_kwargs = dict(output_transform) + out_gpu = out_kwargs.pop("func")(*out_gpu, **out_kwargs) + out_gpu = [out_gpu] if tfutil.is_tf_expression(out_gpu) else list(out_gpu) + + assert len(out_gpu) == self.num_outputs + out_split.append(out_gpu) + + with tf.device("/cpu:0"): + out_expr = [tf.concat(outputs, axis=0) for outputs in zip(*out_split)] + self._run_cache[key] = in_expr, out_expr + + # Run minibatches. 
+ in_expr, out_expr = self._run_cache[key] + out_arrays = [np.empty([num_items] + expr.shape.as_list()[1:], expr.dtype.name) for expr in out_expr] + + for mb_begin in range(0, num_items, minibatch_size): + if print_progress: + print("\r%d / %d" % (mb_begin, num_items), end="") + + mb_end = min(mb_begin + minibatch_size, num_items) + mb_num = mb_end - mb_begin + mb_in = [src[mb_begin : mb_end] if src is not None else np.zeros([mb_num] + shape[1:]) for src, shape in zip(in_arrays, self.input_shapes)] + mb_out = tf.get_default_session().run(out_expr, dict(zip(in_expr, mb_in))) + + for dst, src in zip(out_arrays, mb_out): + dst[mb_begin: mb_end] = src + + # Done. + if print_progress: + print("\r%d / %d" % (num_items, num_items)) + + if not return_as_list: + out_arrays = out_arrays[0] if len(out_arrays) == 1 else tuple(out_arrays) + return out_arrays + + def list_ops(self) -> List[TfExpression]: + include_prefix = self.scope + "/" + exclude_prefix = include_prefix + "_" + ops = tf.get_default_graph().get_operations() + ops = [op for op in ops if op.name.startswith(include_prefix)] + ops = [op for op in ops if not op.name.startswith(exclude_prefix)] + return ops + + def list_layers(self) -> List[Tuple[str, TfExpression, List[TfExpression]]]: + """Returns a list of (layer_name, output_expr, trainable_vars) tuples corresponding to + individual layers of the network. Mainly intended to be used for reporting.""" + layers = [] + + def recurse(scope, parent_ops, parent_vars, level): + # Ignore specific patterns. + if any(p in scope for p in ["/Shape", "/strided_slice", "/Cast", "/concat", "/Assign"]): + return + + # Filter ops and vars by scope. 
+ global_prefix = scope + "/" + local_prefix = global_prefix[len(self.scope) + 1:] + cur_ops = [op for op in parent_ops if op.name.startswith(global_prefix) or op.name == global_prefix[:-1]] + cur_vars = [(name, var) for name, var in parent_vars if name.startswith(local_prefix) or name == local_prefix[:-1]] + if not cur_ops and not cur_vars: + return + + # Filter out all ops related to variables. + for var in [op for op in cur_ops if op.type.startswith("Variable")]: + var_prefix = var.name + "/" + cur_ops = [op for op in cur_ops if not op.name.startswith(var_prefix)] + + # Scope does not contain ops as immediate children => recurse deeper. + contains_direct_ops = any("/" not in op.name[len(global_prefix):] and op.type not in ["Identity", "Cast", "Transpose"] for op in cur_ops) + if (level == 0 or not contains_direct_ops) and (len(cur_ops) + len(cur_vars)) > 1: + visited = set() + for rel_name in [op.name[len(global_prefix):] for op in cur_ops] + [name[len(local_prefix):] for name, _var in cur_vars]: + token = rel_name.split("/")[0] + if token not in visited: + recurse(global_prefix + token, cur_ops, cur_vars, level + 1) + visited.add(token) + return + + # Report layer. 
+ layer_name = scope[len(self.scope) + 1:] + layer_output = cur_ops[-1].outputs[0] if cur_ops else cur_vars[-1][1] + layer_trainables = [var for _name, var in cur_vars if var.trainable] + layers.append((layer_name, layer_output, layer_trainables)) + + recurse(self.scope, self.list_ops(), list(self.vars.items()), 0) + return layers + + def print_layers(self, title: str = None, hide_layers_with_no_params: bool = False) -> None: + """Print a summary table of the network structure.""" + rows = [[title if title is not None else self.name, "Params", "OutputShape", "WeightShape"]] + rows += [["---"] * 4] + total_params = 0 + + for layer_name, layer_output, layer_trainables in self.list_layers(): + num_params = sum(int(np.prod(var.shape.as_list())) for var in layer_trainables) + weights = [var for var in layer_trainables if var.name.endswith("/weight:0")] + weights.sort(key=lambda x: len(x.name)) + if len(weights) == 0 and len(layer_trainables) == 1: + weights = layer_trainables + total_params += num_params + + if not hide_layers_with_no_params or num_params != 0: + num_params_str = str(num_params) if num_params > 0 else "-" + output_shape_str = str(layer_output.shape) + weight_shape_str = str(weights[0].shape) if len(weights) >= 1 else "-" + rows += [[layer_name, num_params_str, output_shape_str, weight_shape_str]] + + rows += [["---"] * 4] + rows += [["Total", str(total_params), "", ""]] + + widths = [max(len(cell) for cell in column) for column in zip(*rows)] + print() + for row in rows: + print(" ".join(cell + " " * (width - len(cell)) for cell, width in zip(row, widths))) + print() + + def setup_weight_histograms(self, title: str = None) -> None: + """Construct summary ops to include histograms of all trainable parameters in TensorBoard.""" + if title is None: + title = self.name + + with tf.name_scope(None), tf.device(None), tf.control_dependencies(None): + for local_name, var in self.trainables.items(): + if "/" in local_name: + p = local_name.split("/") + name = 
title + "_" + p[-1] + "/" + "_".join(p[:-1]) + else: + name = title + "_toplevel/" + local_name + + tf.summary.histogram(name, var) + +#---------------------------------------------------------------------------- +# Backwards-compatible emulation of legacy output transformation in Network.run(). + +_print_legacy_warning = True + +def _handle_legacy_output_transforms(output_transform, dynamic_kwargs): + global _print_legacy_warning + legacy_kwargs = ["out_mul", "out_add", "out_shrink", "out_dtype"] + if not any(kwarg in dynamic_kwargs for kwarg in legacy_kwargs): + return output_transform, dynamic_kwargs + + if _print_legacy_warning: + _print_legacy_warning = False + print() + print("WARNING: Old-style output transformations in Network.run() are deprecated.") + print("Consider using 'output_transform=dict(func=tflib.convert_images_to_uint8)'") + print("instead of 'out_mul=127.5, out_add=127.5, out_dtype=np.uint8'.") + print() + assert output_transform is None + + new_kwargs = dict(dynamic_kwargs) + new_transform = {kwarg: new_kwargs.pop(kwarg) for kwarg in legacy_kwargs if kwarg in dynamic_kwargs} + new_transform["func"] = _legacy_output_transform_func + return new_transform, new_kwargs + +def _legacy_output_transform_func(*expr, out_mul=1.0, out_add=0.0, out_shrink=1, out_dtype=None): + if out_mul != 1.0: + expr = [x * out_mul for x in expr] + + if out_add != 0.0: + expr = [x + out_add for x in expr] + + if out_shrink > 1: + ksize = [1, 1, out_shrink, out_shrink] + expr = [tf.nn.avg_pool(x, ksize=ksize, strides=ksize, padding="VALID", data_format="NCHW") for x in expr] + + if out_dtype is not None: + if tf.as_dtype(out_dtype).is_integer: + expr = [tf.round(x) for x in expr] + expr = [tf.saturate_cast(x, out_dtype) for x in expr] + return expr diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/__init__.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..9ab9908efa3cb38af52e8d5bcaa8acffde5a8875 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +# empty diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/fused_bias_act.cu b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/fused_bias_act.cu new file mode 100644 index 0000000000000000000000000000000000000000..1102f624fadd0b803bdfb99fecfe145d7ec8abc4 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/fused_bias_act.cu @@ -0,0 +1,188 @@ +// Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +// +// This work is made available under the Nvidia Source Code License-NC. +// To view a copy of this license, visit +// https://nvlabs.github.io/stylegan2/license.html + +#define EIGEN_USE_GPU +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include + +using namespace tensorflow; +using namespace tensorflow::shape_inference; + +#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal(cudaGetErrorName(err))); } while (false) + +//------------------------------------------------------------------------ +// CUDA kernel. 
+ +template +struct FusedBiasActKernelParams +{ + const T* x; // [sizeX] + const T* b; // [sizeB] or NULL + const T* ref; // [sizeX] or NULL + T* y; // [sizeX] + + int grad; + int axis; + int act; + float alpha; + float gain; + + int sizeX; + int sizeB; + int stepB; + int loopX; +}; + +template +static __global__ void FusedBiasActKernel(const FusedBiasActKernelParams p) +{ + const float expRange = 80.0f; + const float halfExpRange = 40.0f; + const float seluScale = 1.0507009873554804934193349852946f; + const float seluAlpha = 1.6732632423543772848170429916717f; + + // Loop over elements. + int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x; + for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX; loopIdx++, xi += blockDim.x) + { + // Load and apply bias. + float x = (float)p.x[xi]; + if (p.b) + x += (float)p.b[(xi / p.stepB) % p.sizeB]; + float ref = (p.ref) ? (float)p.ref[xi] : 0.0f; + if (p.gain != 0.0f & p.act != 9) + ref /= p.gain; + + // Evaluate activation func. + float y; + switch (p.act * 10 + p.grad) + { + // linear + default: + case 10: y = x; break; + case 11: y = x; break; + case 12: y = 0.0f; break; + + // relu + case 20: y = (x > 0.0f) ? x : 0.0f; break; + case 21: y = (ref > 0.0f) ? x : 0.0f; break; + case 22: y = 0.0f; break; + + // lrelu + case 30: y = (x > 0.0f) ? x : x * p.alpha; break; + case 31: y = (ref > 0.0f) ? x : x * p.alpha; break; + case 32: y = 0.0f; break; + + // tanh + case 40: { float c = expf(x); float d = 1.0f / c; y = (x < -expRange) ? -1.0f : (x > expRange) ? 1.0f : (c - d) / (c + d); } break; + case 41: y = x * (1.0f - ref * ref); break; + case 42: y = x * (1.0f - ref * ref) * (-2.0f * ref); break; + + // sigmoid + case 50: y = (x < -expRange) ? 0.0f : 1.0f / (expf(-x) + 1.0f); break; + case 51: y = x * ref * (1.0f - ref); break; + case 52: y = x * ref * (1.0f - ref) * (1.0f - 2.0f * ref); break; + + // elu + case 60: y = (x >= 0.0f) ? x : expf(x) - 1.0f; break; + case 61: y = (ref >= 0.0f) ? 
x : x * (ref + 1.0f); break; + case 62: y = (ref >= 0.0f) ? 0.0f : x * (ref + 1.0f); break; + + // selu + case 70: y = (x >= 0.0f) ? seluScale * x : (seluScale * seluAlpha) * (expf(x) - 1.0f); break; + case 71: y = (ref >= 0.0f) ? x * seluScale : x * (ref + seluScale * seluAlpha); break; + case 72: y = (ref >= 0.0f) ? 0.0f : x * (ref + seluScale * seluAlpha); break; + + // softplus + case 80: y = (x > expRange) ? x : logf(expf(x) + 1.0f); break; + case 81: y = x * (1.0f - expf(-ref)); break; + case 82: { float c = expf(-ref); y = x * c * (1.0f - c); } break; + + // swish + case 90: y = (x < -expRange) ? 0.0f : x / (expf(-x) + 1.0f); break; + case 91: { float c = expf(ref); float d = c + 1.0f; y = (ref > halfExpRange) ? x : x * c * (ref + d) / (d * d); } break; + case 92: { float c = expf(ref); float d = c + 1.0f; y = (ref > halfExpRange) ? 0.0f : x * c * (ref * (2.0f - d) + 2.0f * d) / (d * d * d); } break; + } + + // Apply gain and store. + p.y[xi] = (T)(y * p.gain); + } +} + +//------------------------------------------------------------------------ +// TensorFlow op. 
+ +template +struct FusedBiasActOp : public OpKernel +{ + FusedBiasActKernelParams m_attribs; + + FusedBiasActOp(OpKernelConstruction* ctx) : OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("grad", &m_attribs.grad)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("axis", &m_attribs.axis)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("act", &m_attribs.act)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", &m_attribs.alpha)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("gain", &m_attribs.gain)); + OP_REQUIRES(ctx, m_attribs.grad >= 0, errors::InvalidArgument("grad must be non-negative")); + OP_REQUIRES(ctx, m_attribs.axis >= 0, errors::InvalidArgument("axis must be non-negative")); + OP_REQUIRES(ctx, m_attribs.act >= 0, errors::InvalidArgument("act must be non-negative")); + } + + void Compute(OpKernelContext* ctx) + { + FusedBiasActKernelParams p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + const Tensor& x = ctx->input(0); // [...] + const Tensor& b = ctx->input(1); // [sizeB] or [0] + const Tensor& ref = ctx->input(2); // x.shape or [0] + p.x = x.flat().data(); + p.b = (b.NumElements()) ? b.flat().data() : NULL; + p.ref = (ref.NumElements()) ? ref.flat().data() : NULL; + OP_REQUIRES(ctx, b.NumElements() == 0 || m_attribs.axis < x.dims(), errors::InvalidArgument("axis out of bounds")); + OP_REQUIRES(ctx, b.dims() == 1, errors::InvalidArgument("b must have rank 1")); + OP_REQUIRES(ctx, b.NumElements() == 0 || b.NumElements() == x.dim_size(m_attribs.axis), errors::InvalidArgument("b has wrong number of elements")); + OP_REQUIRES(ctx, ref.NumElements() == ((p.grad == 0) ? 
0 : x.NumElements()), errors::InvalidArgument("ref has wrong number of elements")); + OP_REQUIRES(ctx, x.NumElements() <= kint32max, errors::InvalidArgument("x is too large")); + + p.sizeX = (int)x.NumElements(); + p.sizeB = (int)b.NumElements(); + p.stepB = 1; + for (int i = m_attribs.axis + 1; i < x.dims(); i++) + p.stepB *= (int)x.dim_size(i); + + Tensor* y = NULL; // x.shape + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, x.shape(), &y)); + p.y = y->flat().data(); + + p.loopX = 4; + int blockSize = 4 * 32; + int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1; + void* args[] = {&p}; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)FusedBiasActKernel, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("FusedBiasAct") + .Input ("x: T") + .Input ("b: T") + .Input ("ref: T") + .Output ("y: T") + .Attr ("T: {float, half}") + .Attr ("grad: int = 0") + .Attr ("axis: int = 1") + .Attr ("act: int = 0") + .Attr ("alpha: float = 0.0") + .Attr ("gain: float = 1.0"); +REGISTER_KERNEL_BUILDER(Name("FusedBiasAct").Device(DEVICE_GPU).TypeConstraint("T"), FusedBiasActOp); +REGISTER_KERNEL_BUILDER(Name("FusedBiasAct").Device(DEVICE_GPU).TypeConstraint("T"), FusedBiasActOp); + +//------------------------------------------------------------------------ diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/fused_bias_act.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/fused_bias_act.py new file mode 100644 index 0000000000000000000000000000000000000000..52f6bfd77a4b0151103c1a76fa877e084831f7c4 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/fused_bias_act.py @@ -0,0 +1,196 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. 
+# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Custom TensorFlow ops for efficient bias and activation.""" + +import os +import numpy as np +import tensorflow as tf +from .. import custom_ops +from ...util import EasyDict + +def _get_plugin(): + return custom_ops.get_plugin(os.path.splitext(__file__)[0] + '.cu') + +#---------------------------------------------------------------------------- + +activation_funcs = { + 'linear': EasyDict(func=lambda x, **_: x, def_alpha=None, def_gain=1.0, cuda_idx=1, ref='y', zero_2nd_grad=True), + 'relu': EasyDict(func=lambda x, **_: tf.nn.relu(x), def_alpha=None, def_gain=np.sqrt(2), cuda_idx=2, ref='y', zero_2nd_grad=True), + 'lrelu': EasyDict(func=lambda x, alpha, **_: tf.nn.leaky_relu(x, alpha), def_alpha=0.2, def_gain=np.sqrt(2), cuda_idx=3, ref='y', zero_2nd_grad=True), + 'tanh': EasyDict(func=lambda x, **_: tf.nn.tanh(x), def_alpha=None, def_gain=1.0, cuda_idx=4, ref='y', zero_2nd_grad=False), + 'sigmoid': EasyDict(func=lambda x, **_: tf.nn.sigmoid(x), def_alpha=None, def_gain=1.0, cuda_idx=5, ref='y', zero_2nd_grad=False), + 'elu': EasyDict(func=lambda x, **_: tf.nn.elu(x), def_alpha=None, def_gain=1.0, cuda_idx=6, ref='y', zero_2nd_grad=False), + 'selu': EasyDict(func=lambda x, **_: tf.nn.selu(x), def_alpha=None, def_gain=1.0, cuda_idx=7, ref='y', zero_2nd_grad=False), + 'softplus': EasyDict(func=lambda x, **_: tf.nn.softplus(x), def_alpha=None, def_gain=1.0, cuda_idx=8, ref='y', zero_2nd_grad=False), + 'swish': EasyDict(func=lambda x, **_: tf.nn.sigmoid(x) * x, def_alpha=None, def_gain=np.sqrt(2), cuda_idx=9, ref='x', zero_2nd_grad=False), +} + +#---------------------------------------------------------------------------- + +def fused_bias_act(x, b=None, axis=1, act='linear', alpha=None, gain=None, impl='cuda'): + r"""Fused bias and activation function. + + Adds bias `b` to activation tensor `x`, evaluates activation function `act`, + and scales the result by `gain`. 
Each of the steps is optional. In most cases, + the fused op is considerably more efficient than performing the same calculation + using standard TensorFlow ops. It supports first and second order gradients, + but not third order gradients. + + Args: + x: Input activation tensor. Can have any shape, but if `b` is defined, the + dimension corresponding to `axis`, as well as the rank, must be known. + b: Bias vector, or `None` to disable. Must be a 1D tensor of the same type + as `x`. The shape must be known, and it must match the dimension of `x` + corresponding to `axis`. + axis: The dimension in `x` corresponding to the elements of `b`. + The value of `axis` is ignored if `b` is not specified. + act: Name of the activation function to evaluate, or `"linear"` to disable. + Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc. + See `activation_funcs` for a full list. `None` is not allowed. + alpha: Shape parameter for the activation function, or `None` to use the default. + gain: Scaling factor for the output tensor, or `None` to use default. + See `activation_funcs` for the default scaling of each activation function. + If unsure, consider specifying `1.0`. + impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default). + + Returns: + Tensor of the same shape and datatype as `x`. + """ + + impl_dict = { + 'ref': _fused_bias_act_ref, + 'cuda': _fused_bias_act_cuda, + } + return impl_dict[impl](x=x, b=b, axis=axis, act=act, alpha=alpha, gain=gain) + +#---------------------------------------------------------------------------- + +def _fused_bias_act_ref(x, b, axis, act, alpha, gain): + """Slow reference implementation of `fused_bias_act()` using standard TensorFlow ops.""" + + # Validate arguments. 
+ x = tf.convert_to_tensor(x) + b = tf.convert_to_tensor(b) if b is not None else tf.constant([], dtype=x.dtype) + act_spec = activation_funcs[act] + assert b.shape.rank == 1 and (b.shape[0] == 0 or b.shape[0] == x.shape[axis]) + assert b.shape[0] == 0 or 0 <= axis < x.shape.rank + if alpha is None: + alpha = act_spec.def_alpha + if gain is None: + gain = act_spec.def_gain + + # Add bias. + if b.shape[0] != 0: + x += tf.reshape(b, [-1 if i == axis else 1 for i in range(x.shape.rank)]) + + # Evaluate activation function. + x = act_spec.func(x, alpha=alpha) + + # Scale by gain. + if gain != 1: + x *= gain + return x + +#---------------------------------------------------------------------------- + +def _fused_bias_act_cuda(x, b, axis, act, alpha, gain): + """Fast CUDA implementation of `fused_bias_act()` using custom ops.""" + + # Validate arguments. + x = tf.convert_to_tensor(x) + empty_tensor = tf.constant([], dtype=x.dtype) + b = tf.convert_to_tensor(b) if b is not None else empty_tensor + act_spec = activation_funcs[act] + assert b.shape.rank == 1 and (b.shape[0] == 0 or b.shape[0] == x.shape[axis]) + assert b.shape[0] == 0 or 0 <= axis < x.shape.rank + if alpha is None: + alpha = act_spec.def_alpha + if gain is None: + gain = act_spec.def_gain + + # Special cases. + if act == 'linear' and b is None and gain == 1.0: + return x + if act_spec.cuda_idx is None: + return _fused_bias_act_ref(x=x, b=b, axis=axis, act=act, alpha=alpha, gain=gain) + + # CUDA kernel. + cuda_kernel = _get_plugin().fused_bias_act + cuda_kwargs = dict(axis=axis, act=act_spec.cuda_idx, alpha=alpha, gain=gain) + + # Forward pass: y = func(x, b). 
+ def func_y(x, b): + y = cuda_kernel(x=x, b=b, ref=empty_tensor, grad=0, **cuda_kwargs) + y.set_shape(x.shape) + return y + + # Backward pass: dx, db = grad(dy, x, y) + def grad_dx(dy, x, y): + ref = {'x': x, 'y': y}[act_spec.ref] + dx = cuda_kernel(x=dy, b=empty_tensor, ref=ref, grad=1, **cuda_kwargs) + dx.set_shape(x.shape) + return dx + def grad_db(dx): + if b.shape[0] == 0: + return empty_tensor + db = dx + if axis < x.shape.rank - 1: + db = tf.reduce_sum(db, list(range(axis + 1, x.shape.rank))) + if axis > 0: + db = tf.reduce_sum(db, list(range(axis))) + db.set_shape(b.shape) + return db + + # Second order gradients: d_dy, d_x = grad2(d_dx, d_db, x, y) + def grad2_d_dy(d_dx, d_db, x, y): + ref = {'x': x, 'y': y}[act_spec.ref] + d_dy = cuda_kernel(x=d_dx, b=d_db, ref=ref, grad=1, **cuda_kwargs) + d_dy.set_shape(x.shape) + return d_dy + def grad2_d_x(d_dx, d_db, x, y): + ref = {'x': x, 'y': y}[act_spec.ref] + d_x = cuda_kernel(x=d_dx, b=d_db, ref=ref, grad=2, **cuda_kwargs) + d_x.set_shape(x.shape) + return d_x + + # Fast version for piecewise-linear activation funcs. + @tf.custom_gradient + def func_zero_2nd_grad(x, b): + y = func_y(x, b) + @tf.custom_gradient + def grad(dy): + dx = grad_dx(dy, x, y) + db = grad_db(dx) + def grad2(d_dx, d_db): + d_dy = grad2_d_dy(d_dx, d_db, x, y) + return d_dy + return (dx, db), grad2 + return y, grad + + # Slow version for general activation funcs. + @tf.custom_gradient + def func_nonzero_2nd_grad(x, b): + y = func_y(x, b) + def grad_wrap(dy): + @tf.custom_gradient + def grad_impl(dy, x): + dx = grad_dx(dy, x, y) + db = grad_db(dx) + def grad2(d_dx, d_db): + d_dy = grad2_d_dy(d_dx, d_db, x, y) + d_x = grad2_d_x(d_dx, d_db, x, y) + return d_dy, d_x + return (dx, db), grad2 + return grad_impl(dy, x) + return y, grad_wrap + + # Which version to use? 
+ if act_spec.zero_2nd_grad: + return func_zero_2nd_grad(x, b) + return func_nonzero_2nd_grad(x, b) + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/upfirdn_2d.cu b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/upfirdn_2d.cu new file mode 100644 index 0000000000000000000000000000000000000000..b97ef36c9e5ba46a92a380dbc687e275235a1ccf --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/upfirdn_2d.cu @@ -0,0 +1,326 @@ +// Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +// +// This work is made available under the Nvidia Source Code License-NC. +// To view a copy of this license, visit +// https://nvlabs.github.io/stylegan2/license.html + +#define EIGEN_USE_GPU +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include + +using namespace tensorflow; +using namespace tensorflow::shape_inference; + +//------------------------------------------------------------------------ +// Helpers. + +#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal(cudaGetErrorName(err))); } while (false) + +static __host__ __device__ __forceinline__ int floorDiv(int a, int b) +{ + int c = a / b; + if (c * b > a) + c--; + return c; +} + +//------------------------------------------------------------------------ +// CUDA kernel params. 
+ +template +struct UpFirDn2DKernelParams +{ + const T* x; // [majorDim, inH, inW, minorDim] + const T* k; // [kernelH, kernelW] + T* y; // [majorDim, outH, outW, minorDim] + + int upx; + int upy; + int downx; + int downy; + int padx0; + int padx1; + int pady0; + int pady1; + + int majorDim; + int inH; + int inW; + int minorDim; + int kernelH; + int kernelW; + int outH; + int outW; + int loopMajor; + int loopX; +}; + +//------------------------------------------------------------------------ +// General CUDA implementation for large filter kernels. + +template +static __global__ void UpFirDn2DKernel_large(const UpFirDn2DKernelParams p) +{ + // Calculate thread index. + int minorIdx = blockIdx.x * blockDim.x + threadIdx.x; + int outY = minorIdx / p.minorDim; + minorIdx -= outY * p.minorDim; + int outXBase = blockIdx.y * p.loopX * blockDim.y + threadIdx.y; + int majorIdxBase = blockIdx.z * p.loopMajor; + if (outXBase >= p.outW || outY >= p.outH || majorIdxBase >= p.majorDim) + return; + + // Setup Y receptive field. + int midY = outY * p.downy + p.upy - 1 - p.pady0; + int inY = min(max(floorDiv(midY, p.upy), 0), p.inH); + int h = min(max(floorDiv(midY + p.kernelH, p.upy), 0), p.inH) - inY; + int kernelY = midY + p.kernelH - (inY + 1) * p.upy; + + // Loop over majorDim and outX. + for (int loopMajor = 0, majorIdx = majorIdxBase; loopMajor < p.loopMajor && majorIdx < p.majorDim; loopMajor++, majorIdx++) + for (int loopX = 0, outX = outXBase; loopX < p.loopX && outX < p.outW; loopX++, outX += blockDim.y) + { + // Setup X receptive field. + int midX = outX * p.downx + p.upx - 1 - p.padx0; + int inX = min(max(floorDiv(midX, p.upx), 0), p.inW); + int w = min(max(floorDiv(midX + p.kernelW, p.upx), 0), p.inW) - inX; + int kernelX = midX + p.kernelW - (inX + 1) * p.upx; + + // Initialize pointers. 
+ const T* xp = &p.x[((majorIdx * p.inH + inY) * p.inW + inX) * p.minorDim + minorIdx]; + const T* kp = &p.k[kernelY * p.kernelW + kernelX]; + int xpx = p.minorDim; + int kpx = -p.upx; + int xpy = p.inW * p.minorDim; + int kpy = -p.upy * p.kernelW; + + // Inner loop. + float v = 0.0f; + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + v += (float)(*xp) * (float)(*kp); + xp += xpx; + kp += kpx; + } + xp += xpy - w * xpx; + kp += kpy - w * kpx; + } + + // Store result. + p.y[((majorIdx * p.outH + outY) * p.outW + outX) * p.minorDim + minorIdx] = (T)v; + } +} + +//------------------------------------------------------------------------ +// Specialized CUDA implementation for small filter kernels. + +template +static __global__ void UpFirDn2DKernel_small(const UpFirDn2DKernelParams p) +{ + //assert(kernelW % upx == 0); + //assert(kernelH % upy == 0); + const int tileInW = ((tileOutW - 1) * downx + kernelW - 1) / upx + 1; + const int tileInH = ((tileOutH - 1) * downy + kernelH - 1) / upy + 1; + __shared__ volatile float sk[kernelH][kernelW]; + __shared__ volatile float sx[tileInH][tileInW]; + + // Calculate tile index. + int minorIdx = blockIdx.x; + int tileOutY = minorIdx / p.minorDim; + minorIdx -= tileOutY * p.minorDim; + tileOutY *= tileOutH; + int tileOutXBase = blockIdx.y * p.loopX * tileOutW; + int majorIdxBase = blockIdx.z * p.loopMajor; + if (tileOutXBase >= p.outW | tileOutY >= p.outH | majorIdxBase >= p.majorDim) + return; + + // Load filter kernel (flipped). + for (int tapIdx = threadIdx.x; tapIdx < kernelH * kernelW; tapIdx += blockDim.x) + { + int ky = tapIdx / kernelW; + int kx = tapIdx - ky * kernelW; + float v = 0.0f; + if (kx < p.kernelW & ky < p.kernelH) + v = (float)p.k[(p.kernelH - 1 - ky) * p.kernelW + (p.kernelW - 1 - kx)]; + sk[ky][kx] = v; + } + + // Loop over majorDim and outX. 
+ for (int loopMajor = 0, majorIdx = majorIdxBase; loopMajor < p.loopMajor & majorIdx < p.majorDim; loopMajor++, majorIdx++) + for (int loopX = 0, tileOutX = tileOutXBase; loopX < p.loopX & tileOutX < p.outW; loopX++, tileOutX += tileOutW) + { + // Load input pixels. + int tileMidX = tileOutX * downx + upx - 1 - p.padx0; + int tileMidY = tileOutY * downy + upy - 1 - p.pady0; + int tileInX = floorDiv(tileMidX, upx); + int tileInY = floorDiv(tileMidY, upy); + __syncthreads(); + for (int inIdx = threadIdx.x; inIdx < tileInH * tileInW; inIdx += blockDim.x) + { + int relInY = inIdx / tileInW; + int relInX = inIdx - relInY * tileInW; + int inX = relInX + tileInX; + int inY = relInY + tileInY; + float v = 0.0f; + if (inX >= 0 & inY >= 0 & inX < p.inW & inY < p.inH) + v = (float)p.x[((majorIdx * p.inH + inY) * p.inW + inX) * p.minorDim + minorIdx]; + sx[relInY][relInX] = v; + } + + // Loop over output pixels. + __syncthreads(); + for (int outIdx = threadIdx.x; outIdx < tileOutH * tileOutW; outIdx += blockDim.x) + { + int relOutY = outIdx / tileOutW; + int relOutX = outIdx - relOutY * tileOutW; + int outX = relOutX + tileOutX; + int outY = relOutY + tileOutY; + + // Setup receptive field. + int midX = tileMidX + relOutX * downx; + int midY = tileMidY + relOutY * downy; + int inX = floorDiv(midX, upx); + int inY = floorDiv(midY, upy); + int relInX = inX - tileInX; + int relInY = inY - tileInY; + int kernelX = (inX + 1) * upx - midX - 1; // flipped + int kernelY = (inY + 1) * upy - midY - 1; // flipped + + // Inner loop. + float v = 0.0f; + #pragma unroll + for (int y = 0; y < kernelH / upy; y++) + #pragma unroll + for (int x = 0; x < kernelW / upx; x++) + v += sx[relInY + y][relInX + x] * sk[kernelY + y * upy][kernelX + x * upx]; + + // Store result. + if (outX < p.outW & outY < p.outH) + p.y[((majorIdx * p.outH + outY) * p.outW + outX) * p.minorDim + minorIdx] = (T)v; + } + } +} + +//------------------------------------------------------------------------ +// TensorFlow op. 
+ +template +struct UpFirDn2DOp : public OpKernel +{ + UpFirDn2DKernelParams m_attribs; + + UpFirDn2DOp(OpKernelConstruction* ctx) : OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("upx", &m_attribs.upx)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("upy", &m_attribs.upy)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("downx", &m_attribs.downx)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("downy", &m_attribs.downy)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("padx0", &m_attribs.padx0)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("padx1", &m_attribs.padx1)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pady0", &m_attribs.pady0)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("pady1", &m_attribs.pady1)); + OP_REQUIRES(ctx, m_attribs.upx >= 1 && m_attribs.upy >= 1, errors::InvalidArgument("upx and upy must be at least 1x1")); + OP_REQUIRES(ctx, m_attribs.downx >= 1 && m_attribs.downy >= 1, errors::InvalidArgument("downx and downy must be at least 1x1")); + } + + void Compute(OpKernelContext* ctx) + { + UpFirDn2DKernelParams p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + const Tensor& x = ctx->input(0); // [majorDim, inH, inW, minorDim] + const Tensor& k = ctx->input(1); // [kernelH, kernelW] + p.x = x.flat().data(); + p.k = k.flat().data(); + OP_REQUIRES(ctx, x.dims() == 4, errors::InvalidArgument("input must have rank 4")); + OP_REQUIRES(ctx, k.dims() == 2, errors::InvalidArgument("kernel must have rank 2")); + OP_REQUIRES(ctx, x.NumElements() <= kint32max, errors::InvalidArgument("input too large")); + OP_REQUIRES(ctx, k.NumElements() <= kint32max, errors::InvalidArgument("kernel too large")); + + p.majorDim = (int)x.dim_size(0); + p.inH = (int)x.dim_size(1); + p.inW = (int)x.dim_size(2); + p.minorDim = (int)x.dim_size(3); + p.kernelH = (int)k.dim_size(0); + p.kernelW = (int)k.dim_size(1); + OP_REQUIRES(ctx, p.kernelW >= 1 && p.kernelH >= 1, errors::InvalidArgument("kernel must be at least 1x1")); + + p.outW = (p.inW * p.upx + p.padx0 + p.padx1 - 
p.kernelW + p.downx) / p.downx; + p.outH = (p.inH * p.upy + p.pady0 + p.pady1 - p.kernelH + p.downy) / p.downy; + OP_REQUIRES(ctx, p.outW >= 1 && p.outH >= 1, errors::InvalidArgument("output must be at least 1x1")); + + Tensor* y = NULL; // [majorDim, outH, outW, minorDim] + TensorShape ys; + ys.AddDim(p.majorDim); + ys.AddDim(p.outH); + ys.AddDim(p.outW); + ys.AddDim(p.minorDim); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, ys, &y)); + p.y = y->flat().data(); + OP_REQUIRES(ctx, y->NumElements() <= kint32max, errors::InvalidArgument("output too large")); + + // Choose CUDA kernel to use. + void* cudaKernel = (void*)UpFirDn2DKernel_large; + int tileOutW = -1; + int tileOutH = -1; + if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 7 && p.kernelH <= 7) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 64; tileOutH = 16; } + if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 6 && p.kernelH <= 6) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 64; tileOutH = 16; } + if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 5 && p.kernelH <= 5) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 64; tileOutH = 16; } + if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 4 && p.kernelH <= 4) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 64; tileOutH = 16; } + if (p.upx == 1 && p.upy == 1 && p.downx == 1 && p.downy == 1 && p.kernelW <= 3 && p.kernelH <= 3) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 64; tileOutH = 16; } + if (p.upx == 2 && p.upy == 2 && p.downx == 1 && p.downy == 1 && p.kernelW <= 8 && p.kernelH <= 8) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 64; tileOutH = 16; } + if (p.upx == 2 && p.upy == 2 && p.downx == 1 && p.downy == 1 && p.kernelW <= 6 && p.kernelH <= 6) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 64; tileOutH = 16; } + if (p.upx == 2 && p.upy == 2 && p.downx == 1 && p.downy == 1 && 
p.kernelW <= 4 && p.kernelH <= 4) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 64; tileOutH = 16; } + if (p.upx == 2 && p.upy == 2 && p.downx == 1 && p.downy == 1 && p.kernelW <= 2 && p.kernelH <= 2) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 64; tileOutH = 16; } + if (p.upx == 1 && p.upy == 1 && p.downx == 2 && p.downy == 2 && p.kernelW <= 8 && p.kernelH <= 8) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 32; tileOutH = 8; } + if (p.upx == 1 && p.upy == 1 && p.downx == 2 && p.downy == 2 && p.kernelW <= 6 && p.kernelH <= 6) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 32; tileOutH = 8; } + if (p.upx == 1 && p.upy == 1 && p.downx == 2 && p.downy == 2 && p.kernelW <= 4 && p.kernelH <= 4) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 32; tileOutH = 8; } + if (p.upx == 1 && p.upy == 1 && p.downx == 2 && p.downy == 2 && p.kernelW <= 2 && p.kernelH <= 2) { cudaKernel = (void*)UpFirDn2DKernel_small; tileOutW = 32; tileOutH = 8; } + + // Choose launch params. + dim3 blockSize; + dim3 gridSize; + if (tileOutW > 0 && tileOutH > 0) // small + { + p.loopMajor = (p.majorDim - 1) / 16384 + 1; + p.loopX = 1; + blockSize = dim3(32 * 8, 1, 1); + gridSize = dim3(((p.outH - 1) / tileOutH + 1) * p.minorDim, (p.outW - 1) / (p.loopX * tileOutW) + 1, (p.majorDim - 1) / p.loopMajor + 1); + } + else // large + { + p.loopMajor = (p.majorDim - 1) / 16384 + 1; + p.loopX = 4; + blockSize = dim3(4, 32, 1); + gridSize = dim3((p.outH * p.minorDim - 1) / blockSize.x + 1, (p.outW - 1) / (p.loopX * blockSize.y) + 1, (p.majorDim - 1) / p.loopMajor + 1); + } + + // Launch CUDA kernel. 
+ void* args[] = {&p}; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(cudaKernel, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("UpFirDn2D") + .Input ("x: T") + .Input ("k: T") + .Output ("y: T") + .Attr ("T: {float, half}") + .Attr ("upx: int = 1") + .Attr ("upy: int = 1") + .Attr ("downx: int = 1") + .Attr ("downy: int = 1") + .Attr ("padx0: int = 0") + .Attr ("padx1: int = 0") + .Attr ("pady0: int = 0") + .Attr ("pady1: int = 0"); +REGISTER_KERNEL_BUILDER(Name("UpFirDn2D").Device(DEVICE_GPU).TypeConstraint("T"), UpFirDn2DOp); +REGISTER_KERNEL_BUILDER(Name("UpFirDn2D").Device(DEVICE_GPU).TypeConstraint("T"), UpFirDn2DOp); + +//------------------------------------------------------------------------ diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/upfirdn_2d.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/upfirdn_2d.py new file mode 100644 index 0000000000000000000000000000000000000000..fd23777ebb87bc83e8728d6fe3904fbbfb5c524c --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/ops/upfirdn_2d.py @@ -0,0 +1,364 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Custom TensorFlow ops for efficient resampling of 2D images.""" + +import os +import numpy as np +import tensorflow as tf +from .. import custom_ops + +def _get_plugin(): + return custom_ops.get_plugin(os.path.splitext(__file__)[0] + '.cu') + +#---------------------------------------------------------------------------- + +def upfirdn_2d(x, k, upx=1, upy=1, downx=1, downy=1, padx0=0, padx1=0, pady0=0, pady1=0, impl='cuda'): + r"""Pad, upsample, FIR filter, and downsample a batch of 2D images. 
+ + Accepts a batch of 2D images of the shape `[majorDim, inH, inW, minorDim]` + and performs the following operations for each image, batched across + `majorDim` and `minorDim`: + + 1. Pad the image with zeros by the specified number of pixels on each side + (`padx0`, `padx1`, `pady0`, `pady1`). Specifying a negative value + corresponds to cropping the image. + + 2. Upsample the image by inserting the zeros after each pixel (`upx`, `upy`). + + 3. Convolve the image with the specified 2D FIR filter (`k`), shrinking the + image so that the footprint of all output pixels lies within the input image. + + 4. Downsample the image by throwing away pixels (`downx`, `downy`). + + This sequence of operations bears close resemblance to scipy.signal.upfirdn(). + The fused op is considerably more efficient than performing the same calculation + using standard TensorFlow ops. It supports gradients of arbitrary order. + + Args: + x: Input tensor of the shape `[majorDim, inH, inW, minorDim]`. + k: 2D FIR filter of the shape `[firH, firW]`. + upx: Integer upsampling factor along the X-axis (default: 1). + upy: Integer upsampling factor along the Y-axis (default: 1). + downx: Integer downsampling factor along the X-axis (default: 1). + downy: Integer downsampling factor along the Y-axis (default: 1). + padx0: Number of pixels to pad on the left side (default: 0). + padx1: Number of pixels to pad on the right side (default: 0). + pady0: Number of pixels to pad on the top side (default: 0). + pady1: Number of pixels to pad on the bottom side (default: 0). + impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default). + + Returns: + Tensor of the shape `[majorDim, outH, outW, minorDim]`, and same datatype as `x`. 
+ """ + + impl_dict = { + 'ref': _upfirdn_2d_ref, + 'cuda': _upfirdn_2d_cuda, + } + return impl_dict[impl](x=x, k=k, upx=upx, upy=upy, downx=downx, downy=downy, padx0=padx0, padx1=padx1, pady0=pady0, pady1=pady1) + +#---------------------------------------------------------------------------- + +def _upfirdn_2d_ref(x, k, upx, upy, downx, downy, padx0, padx1, pady0, pady1): + """Slow reference implementation of `upfirdn_2d()` using standard TensorFlow ops.""" + + x = tf.convert_to_tensor(x) + k = np.asarray(k, dtype=np.float32) + assert x.shape.rank == 4 + inH = x.shape[1].value + inW = x.shape[2].value + minorDim = _shape(x, 3) + kernelH, kernelW = k.shape + assert inW >= 1 and inH >= 1 + assert kernelW >= 1 and kernelH >= 1 + assert isinstance(upx, int) and isinstance(upy, int) + assert isinstance(downx, int) and isinstance(downy, int) + assert isinstance(padx0, int) and isinstance(padx1, int) + assert isinstance(pady0, int) and isinstance(pady1, int) + + # Upsample (insert zeros). + x = tf.reshape(x, [-1, inH, 1, inW, 1, minorDim]) + x = tf.pad(x, [[0, 0], [0, 0], [0, upy - 1], [0, 0], [0, upx - 1], [0, 0]]) + x = tf.reshape(x, [-1, inH * upy, inW * upx, minorDim]) + + # Pad (crop if negative). + x = tf.pad(x, [[0, 0], [max(pady0, 0), max(pady1, 0)], [max(padx0, 0), max(padx1, 0)], [0, 0]]) + x = x[:, max(-pady0, 0) : x.shape[1].value - max(-pady1, 0), max(-padx0, 0) : x.shape[2].value - max(-padx1, 0), :] + + # Convolve with filter. + x = tf.transpose(x, [0, 3, 1, 2]) + x = tf.reshape(x, [-1, 1, inH * upy + pady0 + pady1, inW * upx + padx0 + padx1]) + w = tf.constant(k[::-1, ::-1, np.newaxis, np.newaxis], dtype=x.dtype) + x = tf.nn.conv2d(x, w, strides=[1,1,1,1], padding='VALID', data_format='NCHW') + x = tf.reshape(x, [-1, minorDim, inH * upy + pady0 + pady1 - kernelH + 1, inW * upx + padx0 + padx1 - kernelW + 1]) + x = tf.transpose(x, [0, 2, 3, 1]) + + # Downsample (throw away pixels). 
+ return x[:, ::downy, ::downx, :] + +#---------------------------------------------------------------------------- + +def _upfirdn_2d_cuda(x, k, upx, upy, downx, downy, padx0, padx1, pady0, pady1): + """Fast CUDA implementation of `upfirdn_2d()` using custom ops.""" + + x = tf.convert_to_tensor(x) + k = np.asarray(k, dtype=np.float32) + majorDim, inH, inW, minorDim = x.shape.as_list() + kernelH, kernelW = k.shape + assert inW >= 1 and inH >= 1 + assert kernelW >= 1 and kernelH >= 1 + assert isinstance(upx, int) and isinstance(upy, int) + assert isinstance(downx, int) and isinstance(downy, int) + assert isinstance(padx0, int) and isinstance(padx1, int) + assert isinstance(pady0, int) and isinstance(pady1, int) + + outW = (inW * upx + padx0 + padx1 - kernelW) // downx + 1 + outH = (inH * upy + pady0 + pady1 - kernelH) // downy + 1 + assert outW >= 1 and outH >= 1 + + kc = tf.constant(k, dtype=x.dtype) + gkc = tf.constant(k[::-1, ::-1], dtype=x.dtype) + gpadx0 = kernelW - padx0 - 1 + gpady0 = kernelH - pady0 - 1 + gpadx1 = inW * upx - outW * downx + padx0 - upx + 1 + gpady1 = inH * upy - outH * downy + pady0 - upy + 1 + + @tf.custom_gradient + def func(x): + y = _get_plugin().up_fir_dn2d(x=x, k=kc, upx=upx, upy=upy, downx=downx, downy=downy, padx0=padx0, padx1=padx1, pady0=pady0, pady1=pady1) + y.set_shape([majorDim, outH, outW, minorDim]) + @tf.custom_gradient + def grad(dy): + dx = _get_plugin().up_fir_dn2d(x=dy, k=gkc, upx=downx, upy=downy, downx=upx, downy=upy, padx0=gpadx0, padx1=gpadx1, pady0=gpady0, pady1=gpady1) + dx.set_shape([majorDim, inH, inW, minorDim]) + return dx, func + return y, grad + return func(x) + +#---------------------------------------------------------------------------- + +def filter_2d(x, k, gain=1, data_format='NCHW', impl='cuda'): + r"""Filter a batch of 2D images with the given FIR filter. + + Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` + and filters each image with the given filter. 
The filter is normalized so that + if the input pixels are constant, they will be scaled by the specified `gain`. + Pixels outside the image are assumed to be zero. + + Args: + x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. + k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable). + gain: Scaling factor for signal magnitude (default: 1.0). + data_format: `'NCHW'` or `'NHWC'` (default: `'NCHW'`). + impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default). + + Returns: + Tensor of the same shape and datatype as `x`. + """ + + k = _setup_kernel(k) * gain + p = k.shape[0] - 1 + return _simple_upfirdn_2d(x, k, pad0=(p+1)//2, pad1=p//2, data_format=data_format, impl=impl) + +#---------------------------------------------------------------------------- + +def upsample_2d(x, k=None, factor=2, gain=1, data_format='NCHW', impl='cuda'): + r"""Upsample a batch of 2D images with the given filter. + + Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` + and upsamples each image with the given filter. The filter is normalized so that + if the input pixels are constant, they will be scaled by the specified `gain`. + Pixels outside the image are assumed to be zero, and the filter is padded with + zeros so that its shape is a multiple of the upsampling factor. + + Args: + x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. + k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable). + The default is `[1] * factor`, which corresponds to nearest-neighbor + upsampling. + factor: Integer upsampling factor (default: 2). + gain: Scaling factor for signal magnitude (default: 1.0). + data_format: `'NCHW'` or `'NHWC'` (default: `'NCHW'`). + impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default). + + Returns: + Tensor of the shape `[N, C, H * factor, W * factor]` or + `[N, H * factor, W * factor, C]`, and same datatype as `x`. 
+ """ + + assert isinstance(factor, int) and factor >= 1 + if k is None: + k = [1] * factor + k = _setup_kernel(k) * (gain * (factor ** 2)) + p = k.shape[0] - factor + return _simple_upfirdn_2d(x, k, up=factor, pad0=(p+1)//2+factor-1, pad1=p//2, data_format=data_format, impl=impl) + +#---------------------------------------------------------------------------- + +def downsample_2d(x, k=None, factor=2, gain=1, data_format='NCHW', impl='cuda'): + r"""Downsample a batch of 2D images with the given filter. + + Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` + and downsamples each image with the given filter. The filter is normalized so that + if the input pixels are constant, they will be scaled by the specified `gain`. + Pixels outside the image are assumed to be zero, and the filter is padded with + zeros so that its shape is a multiple of the downsampling factor. + + Args: + x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. + k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable). + The default is `[1] * factor`, which corresponds to average pooling. + factor: Integer downsampling factor (default: 2). + gain: Scaling factor for signal magnitude (default: 1.0). + data_format: `'NCHW'` or `'NHWC'` (default: `'NCHW'`). + impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default). + + Returns: + Tensor of the shape `[N, C, H // factor, W // factor]` or + `[N, H // factor, W // factor, C]`, and same datatype as `x`. + """ + + assert isinstance(factor, int) and factor >= 1 + if k is None: + k = [1] * factor + k = _setup_kernel(k) * gain + p = k.shape[0] - factor + return _simple_upfirdn_2d(x, k, down=factor, pad0=(p+1)//2, pad1=p//2, data_format=data_format, impl=impl) + +#---------------------------------------------------------------------------- + +def upsample_conv_2d(x, w, k=None, factor=2, gain=1, data_format='NCHW', impl='cuda'): + r"""Fused `upsample_2d()` followed by `tf.nn.conv2d()`. 
+ + Padding is performed only once at the beginning, not between the operations. + The fused op is considerably more efficient than performing the same calculation + using standard TensorFlow ops. It supports gradients of arbitrary order. + + Args: + x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. + w: Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. + Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. + k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable). + The default is `[1] * factor`, which corresponds to nearest-neighbor + upsampling. + factor: Integer upsampling factor (default: 2). + gain: Scaling factor for signal magnitude (default: 1.0). + data_format: `'NCHW'` or `'NHWC'` (default: `'NCHW'`). + impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default). + + Returns: + Tensor of the shape `[N, C, H * factor, W * factor]` or + `[N, H * factor, W * factor, C]`, and same datatype as `x`. + """ + + assert isinstance(factor, int) and factor >= 1 + + # Check weight shape. + w = tf.convert_to_tensor(w) + assert w.shape.rank == 4 + convH = w.shape[0].value + convW = w.shape[1].value + inC = _shape(w, 2) + outC = _shape(w, 3) + assert convW == convH + + # Setup filter kernel. + if k is None: + k = [1] * factor + k = _setup_kernel(k) * (gain * (factor ** 2)) + p = (k.shape[0] - factor) - (convW - 1) + + # Determine data dimensions. + if data_format == 'NCHW': + stride = [1, 1, factor, factor] + output_shape = [_shape(x, 0), outC, (_shape(x, 2) - 1) * factor + convH, (_shape(x, 3) - 1) * factor + convW] + num_groups = _shape(x, 1) // inC + else: + stride = [1, factor, factor, 1] + output_shape = [_shape(x, 0), (_shape(x, 1) - 1) * factor + convH, (_shape(x, 2) - 1) * factor + convW, outC] + num_groups = _shape(x, 3) // inC + + # Transpose weights. 
+ w = tf.reshape(w, [convH, convW, inC, num_groups, -1]) + w = tf.transpose(w[::-1, ::-1], [0, 1, 4, 3, 2]) + w = tf.reshape(w, [convH, convW, -1, num_groups * inC]) + + # Execute. + x = tf.nn.conv2d_transpose(x, w, output_shape=output_shape, strides=stride, padding='VALID', data_format=data_format) + return _simple_upfirdn_2d(x, k, pad0=(p+1)//2+factor-1, pad1=p//2+1, data_format=data_format, impl=impl) + +#---------------------------------------------------------------------------- + +def conv_downsample_2d(x, w, k=None, factor=2, gain=1, data_format='NCHW', impl='cuda'): + r"""Fused `tf.nn.conv2d()` followed by `downsample_2d()`. + + Padding is performed only once at the beginning, not between the operations. + The fused op is considerably more efficient than performing the same calculation + using standard TensorFlow ops. It supports gradients of arbitrary order. + + Args: + x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. + w: Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. + Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. + k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable). + The default is `[1] * factor`, which corresponds to average pooling. + factor: Integer downsampling factor (default: 2). + gain: Scaling factor for signal magnitude (default: 1.0). + data_format: `'NCHW'` or `'NHWC'` (default: `'NCHW'`). + impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default). + + Returns: + Tensor of the shape `[N, C, H // factor, W // factor]` or + `[N, H // factor, W // factor, C]`, and same datatype as `x`. 
+ """ + + assert isinstance(factor, int) and factor >= 1 + w = tf.convert_to_tensor(w) + convH, convW, _inC, _outC = w.shape.as_list() + assert convW == convH + if k is None: + k = [1] * factor + k = _setup_kernel(k) * gain + p = (k.shape[0] - factor) + (convW - 1) + if data_format == 'NCHW': + s = [1, 1, factor, factor] + else: + s = [1, factor, factor, 1] + x = _simple_upfirdn_2d(x, k, pad0=(p+1)//2, pad1=p//2, data_format=data_format, impl=impl) + return tf.nn.conv2d(x, w, strides=s, padding='VALID', data_format=data_format) + +#---------------------------------------------------------------------------- +# Internal helper funcs. + +def _shape(tf_expr, dim_idx): + if tf_expr.shape.rank is not None: + dim = tf_expr.shape[dim_idx].value + if dim is not None: + return dim + return tf.shape(tf_expr)[dim_idx] + +def _setup_kernel(k): + k = np.asarray(k, dtype=np.float32) + if k.ndim == 1: + k = np.outer(k, k) + k /= np.sum(k) + assert k.ndim == 2 + assert k.shape[0] == k.shape[1] + return k + +def _simple_upfirdn_2d(x, k, up=1, down=1, pad0=0, pad1=0, data_format='NCHW', impl='cuda'): + assert data_format in ['NCHW', 'NHWC'] + assert x.shape.rank == 4 + y = x + if data_format == 'NCHW': + y = tf.reshape(y, [-1, _shape(y, 2), _shape(y, 3), 1]) + y = upfirdn_2d(y, k, upx=up, upy=up, downx=down, downy=down, padx0=pad0, padx1=pad1, pady0=pad0, pady1=pad1, impl=impl) + if data_format == 'NCHW': + y = tf.reshape(y, [-1, _shape(x, 1), _shape(y, 1), _shape(y, 2)]) + return y + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/optimizer.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..889e9011d481aec33e9d96bafd081db6b1e0eec3 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/optimizer.py @@ -0,0 +1,336 @@ +# Copyright (c) 2019, NVIDIA 
Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Helper wrapper for a Tensorflow optimizer.""" + +import numpy as np +import tensorflow as tf + +from collections import OrderedDict +from typing import List, Union + +from . import autosummary +from . import tfutil +from .. import util + +from .tfutil import TfExpression, TfExpressionEx + +try: + # TensorFlow 1.13 + from tensorflow.python.ops import nccl_ops +except: + # Older TensorFlow versions + import tensorflow.contrib.nccl as nccl_ops + +class Optimizer: + """A Wrapper for tf.train.Optimizer. + + Automatically takes care of: + - Gradient averaging for multi-GPU training. + - Gradient accumulation for arbitrarily large minibatches. + - Dynamic loss scaling and typecasts for FP16 training. + - Ignoring corrupted gradients that contain NaNs/Infs. + - Reporting statistics. + - Well-chosen default settings. + """ + + def __init__(self, + name: str = "Train", # Name string that will appear in TensorFlow graph. + tf_optimizer: str = "tf.train.AdamOptimizer", # Underlying optimizer class. + learning_rate: TfExpressionEx = 0.001, # Learning rate. Can vary over time. + minibatch_multiplier: TfExpressionEx = None, # Treat N consecutive minibatches as one by accumulating gradients. + share: "Optimizer" = None, # Share internal state with a previously created optimizer? + use_loss_scaling: bool = False, # Enable dynamic loss scaling for robust mixed-precision training? + loss_scaling_init: float = 64.0, # Log2 of initial loss scaling factor. + loss_scaling_inc: float = 0.0005, # Log2 of per-minibatch loss scaling increment when there is no overflow. + loss_scaling_dec: float = 1.0, # Log2 of per-minibatch loss scaling decrement when there is an overflow. + report_mem_usage: bool = False, # Report fine-grained memory usage statistics in TensorBoard? 
+ **kwargs): + + # Public fields. + self.name = name + self.learning_rate = learning_rate + self.minibatch_multiplier = minibatch_multiplier + self.id = self.name.replace("/", ".") + self.scope = tf.get_default_graph().unique_name(self.id) + self.optimizer_class = util.get_obj_by_name(tf_optimizer) + self.optimizer_kwargs = dict(kwargs) + self.use_loss_scaling = use_loss_scaling + self.loss_scaling_init = loss_scaling_init + self.loss_scaling_inc = loss_scaling_inc + self.loss_scaling_dec = loss_scaling_dec + + # Private fields. + self._updates_applied = False + self._devices = OrderedDict() # device_name => EasyDict() + self._shared_optimizers = OrderedDict() # device_name => optimizer_class + self._gradient_shapes = None # [shape, ...] + self._report_mem_usage = report_mem_usage + + # Validate arguments. + assert callable(self.optimizer_class) + + # Share internal state if requested. + if share is not None: + assert isinstance(share, Optimizer) + assert self.optimizer_class is share.optimizer_class + assert self.learning_rate is share.learning_rate + assert self.optimizer_kwargs == share.optimizer_kwargs + self._shared_optimizers = share._shared_optimizers # pylint: disable=protected-access + + def _get_device(self, device_name: str): + """Get internal state for the given TensorFlow device.""" + tfutil.assert_tf_initialized() + if device_name in self._devices: + return self._devices[device_name] + + # Initialize fields. + device = util.EasyDict() + device.name = device_name + device.optimizer = None # Underlying optimizer: optimizer_class + device.loss_scaling_var = None # Log2 of loss scaling: tf.Variable + device.grad_raw = OrderedDict() # Raw gradients: var => [grad, ...] 
+ device.grad_clean = OrderedDict() # Clean gradients: var => grad + device.grad_acc_vars = OrderedDict() # Accumulation sums: var => tf.Variable + device.grad_acc_count = None # Accumulation counter: tf.Variable + device.grad_acc = OrderedDict() # Accumulated gradients: var => grad + + # Setup TensorFlow objects. + with tfutil.absolute_name_scope(self.scope + "/Devices"), tf.device(device_name), tf.control_dependencies(None): + if device_name not in self._shared_optimizers: + optimizer_name = self.scope.replace("/", "_") + "_opt%d" % len(self._shared_optimizers) + self._shared_optimizers[device_name] = self.optimizer_class(name=optimizer_name, learning_rate=self.learning_rate, **self.optimizer_kwargs) + device.optimizer = self._shared_optimizers[device_name] + if self.use_loss_scaling: + device.loss_scaling_var = tf.Variable(np.float32(self.loss_scaling_init), trainable=False, name="loss_scaling_var") + + # Register device. + self._devices[device_name] = device + return device + + def register_gradients(self, loss: TfExpression, trainable_vars: Union[List, dict]) -> None: + """Register the gradients of the given loss function with respect to the given variables. + Intended to be called once per GPU.""" + tfutil.assert_tf_initialized() + assert not self._updates_applied + device = self._get_device(loss.device) + + # Validate trainables. + if isinstance(trainable_vars, dict): + trainable_vars = list(trainable_vars.values()) # allow passing in Network.trainables as vars + assert isinstance(trainable_vars, list) and len(trainable_vars) >= 1 + assert all(tfutil.is_tf_expression(expr) for expr in trainable_vars + [loss]) + assert all(var.device == device.name for var in trainable_vars) + + # Validate shapes. 
+ if self._gradient_shapes is None: + self._gradient_shapes = [var.shape.as_list() for var in trainable_vars] + assert len(trainable_vars) == len(self._gradient_shapes) + assert all(var.shape.as_list() == var_shape for var, var_shape in zip(trainable_vars, self._gradient_shapes)) + + # Report memory usage if requested. + deps = [] + if self._report_mem_usage: + self._report_mem_usage = False + try: + with tf.name_scope(self.id + '_mem'), tf.device(device.name), tf.control_dependencies([loss]): + deps.append(autosummary.autosummary(self.id + "/mem_usage_gb", tf.contrib.memory_stats.BytesInUse() / 2 ** 30)) + except tf.errors.NotFoundError: + pass + + # Compute gradients. + with tf.name_scope(self.id + "_grad"), tf.device(device.name), tf.control_dependencies(deps): + loss = self.apply_loss_scaling(tf.cast(loss, tf.float32)) + gate = tf.train.Optimizer.GATE_NONE # disable gating to reduce memory usage + grad_list = device.optimizer.compute_gradients(loss=loss, var_list=trainable_vars, gate_gradients=gate) + + # Register gradients. + for grad, var in grad_list: + if var not in device.grad_raw: + device.grad_raw[var] = [] + device.grad_raw[var].append(grad) + + def apply_updates(self, allow_no_op: bool = False) -> tf.Operation: + """Construct training op to update the registered variables based on their gradients.""" + tfutil.assert_tf_initialized() + assert not self._updates_applied + self._updates_applied = True + all_ops = [] + + # Check for no-op. + if allow_no_op and len(self._devices) == 0: + with tfutil.absolute_name_scope(self.scope): + return tf.no_op(name='TrainingOp') + + # Clean up gradients. + for device_idx, device in enumerate(self._devices.values()): + with tfutil.absolute_name_scope(self.scope + "/Clean%d" % device_idx), tf.device(device.name): + for var, grad in device.grad_raw.items(): + + # Filter out disconnected gradients and convert to float32. 
+ grad = [g for g in grad if g is not None] + grad = [tf.cast(g, tf.float32) for g in grad] + + # Sum within the device. + if len(grad) == 0: + grad = tf.zeros(var.shape) # No gradients => zero. + elif len(grad) == 1: + grad = grad[0] # Single gradient => use as is. + else: + grad = tf.add_n(grad) # Multiple gradients => sum. + + # Scale as needed. + scale = 1.0 / len(device.grad_raw[var]) / len(self._devices) + scale = tf.constant(scale, dtype=tf.float32, name="scale") + if self.minibatch_multiplier is not None: + scale /= tf.cast(self.minibatch_multiplier, tf.float32) + scale = self.undo_loss_scaling(scale) + device.grad_clean[var] = grad * scale + + # Sum gradients across devices. + if len(self._devices) > 1: + with tfutil.absolute_name_scope(self.scope + "/Broadcast"), tf.device(None): + for all_vars in zip(*[device.grad_clean.keys() for device in self._devices.values()]): + if len(all_vars) > 0 and all(dim > 0 for dim in all_vars[0].shape.as_list()): # NCCL does not support zero-sized tensors. + all_grads = [device.grad_clean[var] for device, var in zip(self._devices.values(), all_vars)] + all_grads = nccl_ops.all_sum(all_grads) + for device, var, grad in zip(self._devices.values(), all_vars, all_grads): + device.grad_clean[var] = grad + + # Apply updates separately on each device. + for device_idx, device in enumerate(self._devices.values()): + with tfutil.absolute_name_scope(self.scope + "/Apply%d" % device_idx), tf.device(device.name): + # pylint: disable=cell-var-from-loop + + # Accumulate gradients over time. + if self.minibatch_multiplier is None: + acc_ok = tf.constant(True, name='acc_ok') + device.grad_acc = OrderedDict(device.grad_clean) + else: + # Create variables. 
+ with tf.control_dependencies(None): + for var in device.grad_clean.keys(): + device.grad_acc_vars[var] = tf.Variable(tf.zeros(var.shape), trainable=False, name="grad_acc_var") + device.grad_acc_count = tf.Variable(tf.zeros([]), trainable=False, name="grad_acc_count") + + # Track counter. + count_cur = device.grad_acc_count + 1.0 + count_inc_op = lambda: tf.assign(device.grad_acc_count, count_cur) + count_reset_op = lambda: tf.assign(device.grad_acc_count, tf.zeros([])) + acc_ok = (count_cur >= tf.cast(self.minibatch_multiplier, tf.float32)) + all_ops.append(tf.cond(acc_ok, count_reset_op, count_inc_op)) + + # Track gradients. + for var, grad in device.grad_clean.items(): + acc_var = device.grad_acc_vars[var] + acc_cur = acc_var + grad + device.grad_acc[var] = acc_cur + with tf.control_dependencies([acc_cur]): + acc_inc_op = lambda: tf.assign(acc_var, acc_cur) + acc_reset_op = lambda: tf.assign(acc_var, tf.zeros(var.shape)) + all_ops.append(tf.cond(acc_ok, acc_reset_op, acc_inc_op)) + + # No overflow => apply gradients. + all_ok = tf.reduce_all(tf.stack([acc_ok] + [tf.reduce_all(tf.is_finite(g)) for g in device.grad_acc.values()])) + apply_op = lambda: device.optimizer.apply_gradients([(tf.cast(grad, var.dtype), var) for var, grad in device.grad_acc.items()]) + all_ops.append(tf.cond(all_ok, apply_op, tf.no_op)) + + # Adjust loss scaling. + if self.use_loss_scaling: + ls_inc_op = lambda: tf.assign_add(device.loss_scaling_var, self.loss_scaling_inc) + ls_dec_op = lambda: tf.assign_sub(device.loss_scaling_var, self.loss_scaling_dec) + ls_update_op = lambda: tf.group(tf.cond(all_ok, ls_inc_op, ls_dec_op)) + all_ops.append(tf.cond(acc_ok, ls_update_op, tf.no_op)) + + # Last device => report statistics. 
+ if device_idx == len(self._devices) - 1: + all_ops.append(autosummary.autosummary(self.id + "/learning_rate", self.learning_rate)) + all_ops.append(autosummary.autosummary(self.id + "/overflow_frequency", tf.where(all_ok, 0, 1), condition=acc_ok)) + if self.use_loss_scaling: + all_ops.append(autosummary.autosummary(self.id + "/loss_scaling_log2", device.loss_scaling_var)) + + # Initialize variables. + self.reset_optimizer_state() + if self.use_loss_scaling: + tfutil.init_uninitialized_vars([device.loss_scaling_var for device in self._devices.values()]) + if self.minibatch_multiplier is not None: + tfutil.run([var.initializer for device in self._devices.values() for var in list(device.grad_acc_vars.values()) + [device.grad_acc_count]]) + + # Group everything into a single op. + with tfutil.absolute_name_scope(self.scope): + return tf.group(*all_ops, name="TrainingOp") + + def reset_optimizer_state(self) -> None: + """Reset internal state of the underlying optimizer.""" + tfutil.assert_tf_initialized() + tfutil.run([var.initializer for device in self._devices.values() for var in device.optimizer.variables()]) + + def get_loss_scaling_var(self, device: str) -> Union[tf.Variable, None]: + """Get or create variable representing log2 of the current dynamic loss scaling factor.""" + return self._get_device(device).loss_scaling_var + + def apply_loss_scaling(self, value: TfExpression) -> TfExpression: + """Apply dynamic loss scaling for the given expression.""" + assert tfutil.is_tf_expression(value) + if not self.use_loss_scaling: + return value + return value * tfutil.exp2(self.get_loss_scaling_var(value.device)) + + def undo_loss_scaling(self, value: TfExpression) -> TfExpression: + """Undo the effect of dynamic loss scaling for the given expression.""" + assert tfutil.is_tf_expression(value) + if not self.use_loss_scaling: + return value + return value * tfutil.exp2(-self.get_loss_scaling_var(value.device)) # pylint: disable=invalid-unary-operand-type + + +class 
SimpleAdam: + """Simplified version of tf.train.AdamOptimizer that behaves identically when used with dnnlib.tflib.Optimizer.""" + + def __init__(self, name="Adam", learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8): + self.name = name + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.all_state_vars = [] + + def variables(self): + return self.all_state_vars + + def compute_gradients(self, loss, var_list, gate_gradients=tf.train.Optimizer.GATE_NONE): + assert gate_gradients == tf.train.Optimizer.GATE_NONE + return list(zip(tf.gradients(loss, var_list), var_list)) + + def apply_gradients(self, grads_and_vars): + with tf.name_scope(self.name): + state_vars = [] + update_ops = [] + + # Adjust learning rate to deal with startup bias. + with tf.control_dependencies(None): + b1pow_var = tf.Variable(dtype=tf.float32, initial_value=1, trainable=False) + b2pow_var = tf.Variable(dtype=tf.float32, initial_value=1, trainable=False) + state_vars += [b1pow_var, b2pow_var] + b1pow_new = b1pow_var * self.beta1 + b2pow_new = b2pow_var * self.beta2 + update_ops += [tf.assign(b1pow_var, b1pow_new), tf.assign(b2pow_var, b2pow_new)] + lr_new = self.learning_rate * tf.sqrt(1 - b2pow_new) / (1 - b1pow_new) + + # Construct ops to update each variable. + for grad, var in grads_and_vars: + with tf.control_dependencies(None): + m_var = tf.Variable(dtype=tf.float32, initial_value=tf.zeros_like(var), trainable=False) + v_var = tf.Variable(dtype=tf.float32, initial_value=tf.zeros_like(var), trainable=False) + state_vars += [m_var, v_var] + m_new = self.beta1 * m_var + (1 - self.beta1) * grad + v_new = self.beta2 * v_var + (1 - self.beta2) * tf.square(grad) + var_delta = lr_new * m_new / (tf.sqrt(v_new) + self.epsilon) + update_ops += [tf.assign(m_var, m_new), tf.assign(v_var, v_new), tf.assign_sub(var, var_delta)] + + # Group everything together. 
+ self.all_state_vars += state_vars + return tf.group(*update_ops) diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/tfutil.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/tfutil.py new file mode 100644 index 0000000000000000000000000000000000000000..ee972821770a8eca32c339f08d90066088f54d79 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/tflib/tfutil.py @@ -0,0 +1,254 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Miscellaneous helper utils for Tensorflow.""" + +import os +import numpy as np +import tensorflow as tf + +# Silence deprecation warnings from TensorFlow 1.13 onwards +import logging +logging.getLogger('tensorflow').setLevel(logging.ERROR) +import tensorflow.contrib # requires TensorFlow 1.x! +tf.contrib = tensorflow.contrib + +from typing import Any, Iterable, List, Union + +TfExpression = Union[tf.Tensor, tf.Variable, tf.Operation] +"""A type that represents a valid Tensorflow expression.""" + +TfExpressionEx = Union[TfExpression, int, float, np.ndarray] +"""A type that can be converted to a valid Tensorflow expression.""" + + +def run(*args, **kwargs) -> Any: + """Run the specified ops in the default session.""" + assert_tf_initialized() + return tf.get_default_session().run(*args, **kwargs) + + +def is_tf_expression(x: Any) -> bool: + """Check whether the input is a valid Tensorflow expression, i.e., Tensorflow Tensor, Variable, or Operation.""" + return isinstance(x, (tf.Tensor, tf.Variable, tf.Operation)) + + +def shape_to_list(shape: Iterable[tf.Dimension]) -> List[Union[int, None]]: + """Convert a Tensorflow shape to a list of ints. 
Retained for backwards compatibility -- use TensorShape.as_list() in new code.""" + return [dim.value for dim in shape] + + +def flatten(x: TfExpressionEx) -> TfExpression: + """Shortcut function for flattening a tensor.""" + with tf.name_scope("Flatten"): + return tf.reshape(x, [-1]) + + +def log2(x: TfExpressionEx) -> TfExpression: + """Logarithm in base 2.""" + with tf.name_scope("Log2"): + return tf.log(x) * np.float32(1.0 / np.log(2.0)) + + +def exp2(x: TfExpressionEx) -> TfExpression: + """Exponent in base 2.""" + with tf.name_scope("Exp2"): + return tf.exp(x * np.float32(np.log(2.0))) + + +def lerp(a: TfExpressionEx, b: TfExpressionEx, t: TfExpressionEx) -> TfExpressionEx: + """Linear interpolation.""" + with tf.name_scope("Lerp"): + return a + (b - a) * t + + +def lerp_clip(a: TfExpressionEx, b: TfExpressionEx, t: TfExpressionEx) -> TfExpression: + """Linear interpolation with clip.""" + with tf.name_scope("LerpClip"): + return a + (b - a) * tf.clip_by_value(t, 0.0, 1.0) + + +def absolute_name_scope(scope: str) -> tf.name_scope: + """Forcefully enter the specified name scope, ignoring any surrounding scopes.""" + return tf.name_scope(scope + "/") + + +def absolute_variable_scope(scope: str, **kwargs) -> tf.variable_scope: + """Forcefully enter the specified variable scope, ignoring any surrounding scopes.""" + return tf.variable_scope(tf.VariableScope(name=scope, **kwargs), auxiliary_name_scope=False) + + +def _sanitize_tf_config(config_dict: dict = None) -> dict: + # Defaults. + cfg = dict() + cfg["rnd.np_random_seed"] = None # Random seed for NumPy. None = keep as is. + cfg["rnd.tf_random_seed"] = "auto" # Random seed for TensorFlow. 'auto' = derive from NumPy random state. None = keep as is. + cfg["env.TF_CPP_MIN_LOG_LEVEL"] = "1" # 0 = Print all available debug info from TensorFlow. 1 = Print warnings and errors, but disable debug info. + cfg["graph_options.place_pruned_graph"] = True # False = Check that all ops are available on the designated device. 
True = Skip the check for ops that are not used. + cfg["gpu_options.allow_growth"] = True # False = Allocate all GPU memory at the beginning. True = Allocate only as much GPU memory as needed. + + # Remove defaults for environment variables that are already set. + for key in list(cfg): + fields = key.split(".") + if fields[0] == "env": + assert len(fields) == 2 + if fields[1] in os.environ: + del cfg[key] + + # User overrides. + if config_dict is not None: + cfg.update(config_dict) + return cfg + + +def init_tf(config_dict: dict = None) -> None: + """Initialize TensorFlow session using good default settings.""" + # Skip if already initialized. + if tf.get_default_session() is not None: + return + + # Setup config dict and random seeds. + cfg = _sanitize_tf_config(config_dict) + np_random_seed = cfg["rnd.np_random_seed"] + if np_random_seed is not None: + np.random.seed(np_random_seed) + tf_random_seed = cfg["rnd.tf_random_seed"] + if tf_random_seed == "auto": + tf_random_seed = np.random.randint(1 << 31) + if tf_random_seed is not None: + tf.set_random_seed(tf_random_seed) + + # Setup environment variables. + for key, value in cfg.items(): + fields = key.split(".") + if fields[0] == "env": + assert len(fields) == 2 + os.environ[fields[1]] = str(value) + + # Create default TensorFlow session. + create_session(cfg, force_as_default=True) + + +def assert_tf_initialized(): + """Check that TensorFlow session has been initialized.""" + if tf.get_default_session() is None: + raise RuntimeError("No default TensorFlow session found. Please call dnnlib.tflib.init_tf().") + + +def create_session(config_dict: dict = None, force_as_default: bool = False) -> tf.Session: + """Create tf.Session based on config dict.""" + # Setup TensorFlow config proto. 
+ cfg = _sanitize_tf_config(config_dict) + config_proto = tf.ConfigProto() + for key, value in cfg.items(): + fields = key.split(".") + if fields[0] not in ["rnd", "env"]: + obj = config_proto + for field in fields[:-1]: + obj = getattr(obj, field) + setattr(obj, fields[-1], value) + + # Create session. + session = tf.Session(config=config_proto) + if force_as_default: + # pylint: disable=protected-access + session._default_session = session.as_default() + session._default_session.enforce_nesting = False + session._default_session.__enter__() + return session + + +def init_uninitialized_vars(target_vars: List[tf.Variable] = None) -> None: + """Initialize all tf.Variables that have not already been initialized. + + Equivalent to the following, but more efficient and does not bloat the tf graph: + tf.variables_initializer(tf.report_uninitialized_variables()).run() + """ + assert_tf_initialized() + if target_vars is None: + target_vars = tf.global_variables() + + test_vars = [] + test_ops = [] + + with tf.control_dependencies(None): # ignore surrounding control_dependencies + for var in target_vars: + assert is_tf_expression(var) + + try: + tf.get_default_graph().get_tensor_by_name(var.name.replace(":0", "/IsVariableInitialized:0")) + except KeyError: + # Op does not exist => variable may be uninitialized. + test_vars.append(var) + + with absolute_name_scope(var.name.split(":")[0]): + test_ops.append(tf.is_variable_initialized(var)) + + init_vars = [var for var, inited in zip(test_vars, run(test_ops)) if not inited] + run([var.initializer for var in init_vars]) + + +def set_vars(var_to_value_dict: dict) -> None: + """Set the values of given tf.Variables. 
+ + Equivalent to the following, but more efficient and does not bloat the tf graph: + tflib.run([tf.assign(var, value) for var, value in var_to_value_dict.items()] + """ + assert_tf_initialized() + ops = [] + feed_dict = {} + + for var, value in var_to_value_dict.items(): + assert is_tf_expression(var) + + try: + setter = tf.get_default_graph().get_tensor_by_name(var.name.replace(":0", "/setter:0")) # look for existing op + except KeyError: + with absolute_name_scope(var.name.split(":")[0]): + with tf.control_dependencies(None): # ignore surrounding control_dependencies + setter = tf.assign(var, tf.placeholder(var.dtype, var.shape, "new_value"), name="setter") # create new setter + + ops.append(setter) + feed_dict[setter.op.inputs[1]] = value + + run(ops, feed_dict) + + +def create_var_with_large_initial_value(initial_value: np.ndarray, *args, **kwargs): + """Create tf.Variable with large initial value without bloating the tf graph.""" + assert_tf_initialized() + assert isinstance(initial_value, np.ndarray) + zeros = tf.zeros(initial_value.shape, initial_value.dtype) + var = tf.Variable(zeros, *args, **kwargs) + set_vars({var: initial_value}) + return var + + +def convert_images_from_uint8(images, drange=[-1,1], nhwc_to_nchw=False): + """Convert a minibatch of images from uint8 to float32 with configurable dynamic range. + Can be used as an input transformation for Network.run(). + """ + images = tf.cast(images, tf.float32) + if nhwc_to_nchw: + images = tf.transpose(images, [0, 3, 1, 2]) + return images * ((drange[1] - drange[0]) / 255) + drange[0] + + +def convert_images_to_uint8(images, drange=[-1,1], nchw_to_nhwc=False, shrink=1, uint8_cast=True): + """Convert a minibatch of images from float32 to uint8 with configurable dynamic range. + Can be used as an output transformation for Network.run(). 
+ """ + images = tf.cast(images, tf.float32) + if shrink > 1: + ksize = [1, 1, shrink, shrink] + images = tf.nn.avg_pool(images, ksize=ksize, strides=ksize, padding="VALID", data_format="NCHW") + if nchw_to_nhwc: + images = tf.transpose(images, [0, 2, 3, 1]) + scale = 255 / (drange[1] - drange[0]) + images = images * scale + (0.5 - drange[0] * scale) + if uint8_cast: + images = tf.saturate_cast(images, tf.uint8) + return images diff --git a/insightface/reconstruction/ostec/external/stylegan2/dnnlib/util.py b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/util.py new file mode 100644 index 0000000000000000000000000000000000000000..73c98d73c26e03800a39c62386c322bb518bbd18 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/dnnlib/util.py @@ -0,0 +1,410 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Miscellaneous utility classes and functions.""" + +import ctypes +import fnmatch +import importlib +import inspect +import numpy as np +import os +import shutil +import sys +import types +import io +import pickle +import re +import requests +import html +import hashlib +import glob +import uuid + +from distutils.util import strtobool +from typing import Any, List, Tuple, Union + + +# Util classes +# ------------------------------------------------------------------------------------------ + + +class EasyDict(dict): + """Convenience class that behaves like a dict but allows access with the attribute syntax.""" + + def __getattr__(self, name: str) -> Any: + try: + return self[name] + except KeyError: + raise AttributeError(name) + + def __setattr__(self, name: str, value: Any) -> None: + self[name] = value + + def __delattr__(self, name: str) -> None: + del self[name] + + +class Logger(object): + """Redirect stderr to stdout, optionally print stdout to a 
file, and optionally force flushing on both stdout and the file.""" + + def __init__(self, file_name: str = None, file_mode: str = "w", should_flush: bool = True): + self.file = None + + if file_name is not None: + self.file = open(file_name, file_mode) + + self.should_flush = should_flush + self.stdout = sys.stdout + self.stderr = sys.stderr + + sys.stdout = self + sys.stderr = self + + def __enter__(self) -> "Logger": + return self + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + self.close() + + def write(self, text: str) -> None: + """Write text to stdout (and a file) and optionally flush.""" + if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash + return + + if self.file is not None: + self.file.write(text) + + self.stdout.write(text) + + if self.should_flush: + self.flush() + + def flush(self) -> None: + """Flush written text to both stdout and a file, if open.""" + if self.file is not None: + self.file.flush() + + self.stdout.flush() + + def close(self) -> None: + """Flush, close possible files, and remove stdout/stderr mirroring.""" + self.flush() + + # if using multiple loggers, prevent closing in wrong order + if sys.stdout is self: + sys.stdout = self.stdout + if sys.stderr is self: + sys.stderr = self.stderr + + if self.file is not None: + self.file.close() + + +# Small util functions +# ------------------------------------------------------------------------------------------ + + +def format_time(seconds: Union[int, float]) -> str: + """Convert the seconds to human readable string with days, hours, minutes and seconds.""" + s = int(np.rint(seconds)) + + if s < 60: + return "{0}s".format(s) + elif s < 60 * 60: + return "{0}m {1:02}s".format(s // 60, s % 60) + elif s < 24 * 60 * 60: + return "{0}h {1:02}m {2:02}s".format(s // (60 * 60), (s // 60) % 60, s % 60) + else: + return "{0}d {1:02}h {2:02}m".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24, (s // 60) % 60) 
+ + +def ask_yes_no(question: str) -> bool: + """Ask the user the question until the user inputs a valid answer.""" + while True: + try: + print("{0} [y/n]".format(question)) + return strtobool(input().lower()) + except ValueError: + pass + + +def tuple_product(t: Tuple) -> Any: + """Calculate the product of the tuple elements.""" + result = 1 + + for v in t: + result *= v + + return result + + +_str_to_ctype = { + "uint8": ctypes.c_ubyte, + "uint16": ctypes.c_uint16, + "uint32": ctypes.c_uint32, + "uint64": ctypes.c_uint64, + "int8": ctypes.c_byte, + "int16": ctypes.c_int16, + "int32": ctypes.c_int32, + "int64": ctypes.c_int64, + "float32": ctypes.c_float, + "float64": ctypes.c_double +} + + +def get_dtype_and_ctype(type_obj: Any) -> Tuple[np.dtype, Any]: + """Given a type name string (or an object having a __name__ attribute), return matching Numpy and ctypes types that have the same size in bytes.""" + type_str = None + + if isinstance(type_obj, str): + type_str = type_obj + elif hasattr(type_obj, "__name__"): + type_str = type_obj.__name__ + elif hasattr(type_obj, "name"): + type_str = type_obj.name + else: + raise RuntimeError("Cannot infer type name from input") + + assert type_str in _str_to_ctype.keys() + + my_dtype = np.dtype(type_str) + my_ctype = _str_to_ctype[type_str] + + assert my_dtype.itemsize == ctypes.sizeof(my_ctype) + + return my_dtype, my_ctype + + +def is_pickleable(obj: Any) -> bool: + try: + with io.BytesIO() as stream: + pickle.dump(obj, stream) + return True + except: + return False + + +# Functionality to import modules/objects by name, and call functions by name +# ------------------------------------------------------------------------------------------ + +def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, str]: + """Searches for the underlying module behind the name to some python object. 
+ Returns the module and the object name (original name with module part removed).""" + + # allow convenience shorthands, substitute them by full names + obj_name = re.sub("^np.", "numpy.", obj_name) + obj_name = re.sub("^tf.", "tensorflow.", obj_name) + + # list alternatives for (module_name, local_obj_name) + parts = obj_name.split(".") + name_pairs = [(".".join(parts[:i]), ".".join(parts[i:])) for i in range(len(parts), 0, -1)] + + # try each alternative in turn + for module_name, local_obj_name in name_pairs: + try: + module = importlib.import_module(module_name) # may raise ImportError + get_obj_from_module(module, local_obj_name) # may raise AttributeError + return module, local_obj_name + except: + pass + + # maybe some of the modules themselves contain errors? + for module_name, _local_obj_name in name_pairs: + try: + importlib.import_module(module_name) # may raise ImportError + except ImportError: + if not str(sys.exc_info()[1]).startswith("No module named '" + module_name + "'"): + raise + + # maybe the requested attribute is missing? 
+ for module_name, local_obj_name in name_pairs: + try: + module = importlib.import_module(module_name) # may raise ImportError + get_obj_from_module(module, local_obj_name) # may raise AttributeError + except ImportError: + pass + + # we are out of luck, but we have no idea why + raise ImportError(obj_name) + + +def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any: + """Traverses the object name and returns the last (rightmost) python object.""" + if obj_name == '': + return module + obj = module + for part in obj_name.split("."): + obj = getattr(obj, part) + return obj + + +def get_obj_by_name(name: str) -> Any: + """Finds the python object with the given name.""" + module, obj_name = get_module_from_obj_name(name) + return get_obj_from_module(module, obj_name) + + +def call_func_by_name(*args, func_name: str = None, **kwargs) -> Any: + """Finds the python object with the given name and calls it as a function.""" + assert func_name is not None + func_obj = get_obj_by_name(func_name) + assert callable(func_obj) + return func_obj(*args, **kwargs) + + +def get_module_dir_by_obj_name(obj_name: str) -> str: + """Get the directory path of the module containing the given object name.""" + module, _ = get_module_from_obj_name(obj_name) + return os.path.dirname(inspect.getfile(module)) + + +def is_top_level_function(obj: Any) -> bool: + """Determine whether the given object is a top-level function, i.e., defined at module scope using 'def'.""" + return callable(obj) and obj.__name__ in sys.modules[obj.__module__].__dict__ + + +def get_top_level_function_name(obj: Any) -> str: + """Return the fully-qualified name of a top-level function.""" + assert is_top_level_function(obj) + return obj.__module__ + "." 
+ obj.__name__ + + +# File system helpers +# ------------------------------------------------------------------------------------------ + +def list_dir_recursively_with_ignore(dir_path: str, ignores: List[str] = None, add_base_to_relative: bool = False) -> List[Tuple[str, str]]: + """List all files recursively in a given directory while ignoring given file and directory names. + Returns list of tuples containing both absolute and relative paths.""" + assert os.path.isdir(dir_path) + base_name = os.path.basename(os.path.normpath(dir_path)) + + if ignores is None: + ignores = [] + + result = [] + + for root, dirs, files in os.walk(dir_path, topdown=True): + for ignore_ in ignores: + dirs_to_remove = [d for d in dirs if fnmatch.fnmatch(d, ignore_)] + + # dirs need to be edited in-place + for d in dirs_to_remove: + dirs.remove(d) + + files = [f for f in files if not fnmatch.fnmatch(f, ignore_)] + + absolute_paths = [os.path.join(root, f) for f in files] + relative_paths = [os.path.relpath(p, dir_path) for p in absolute_paths] + + if add_base_to_relative: + relative_paths = [os.path.join(base_name, p) for p in relative_paths] + + assert len(absolute_paths) == len(relative_paths) + result += zip(absolute_paths, relative_paths) + + return result + + +def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None: + """Takes in a list of tuples of (src, dst) paths and copies files. 
+ Will create all necessary directories.""" + for file in files: + target_dir_name = os.path.dirname(file[1]) + + # will create all intermediate-level directories + if not os.path.exists(target_dir_name): + os.makedirs(target_dir_name) + + shutil.copyfile(file[0], file[1]) + + +# URL helpers +# ------------------------------------------------------------------------------------------ + +def is_url(obj: Any, allow_file_urls: bool = False) -> bool: + """Determine whether the given object is a valid URL string.""" + if not isinstance(obj, str) or not "://" in obj: + return False + if allow_file_urls and obj.startswith('file:///'): + return True + try: + res = requests.compat.urlparse(obj) + if not res.scheme or not res.netloc or not "." in res.netloc: + return False + res = requests.compat.urlparse(requests.compat.urljoin(obj, "/")) + if not res.scheme or not res.netloc or not "." in res.netloc: + return False + except: + return False + return True + + +def open_url(url: str, cache_dir: str = None, num_attempts: int = 10, verbose: bool = True) -> Any: + """Download the given URL and return a binary-mode file object to access the data.""" + assert is_url(url, allow_file_urls=True) + assert num_attempts >= 1 + + # Handle file URLs. + if url.startswith('file:///'): + return open(url[len('file:///'):], "rb") + + # Lookup from cache. + url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest() + if cache_dir is not None: + cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*")) + if len(cache_files) == 1: + return open(cache_files[0], "rb") + + # Download. + url_name = None + url_data = None + with requests.Session() as session: + if verbose: + print("Downloading %s ..." 
% url, end="", flush=True) + for attempts_left in reversed(range(num_attempts)): + try: + with session.get(url) as res: + res.raise_for_status() + if len(res.content) == 0: + raise IOError("No data received") + + if len(res.content) < 8192: + content_str = res.content.decode("utf-8") + if "download_warning" in res.headers.get("Set-Cookie", ""): + links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link] + if len(links) == 1: + url = requests.compat.urljoin(url, links[0]) + raise IOError("Google Drive virus checker nag") + if "Google Drive - Quota exceeded" in content_str: + raise IOError("Google Drive download quota exceeded -- please try again later") + + match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", "")) + url_name = match[1] if match else url + url_data = res.content + if verbose: + print(" done") + break + except: + if not attempts_left: + if verbose: + print(" failed") + raise + if verbose: + print(".", end="", flush=True) + + # Save to cache. + if cache_dir is not None: + safe_name = re.sub(r"[^0-9a-zA-Z-._]", "_", url_name) + cache_file = os.path.join(cache_dir, url_md5 + "_" + safe_name) + temp_file = os.path.join(cache_dir, "tmp_" + uuid.uuid4().hex + "_" + url_md5 + "_" + safe_name) + os.makedirs(cache_dir, exist_ok=True) + with open(temp_file, "wb") as f: + f.write(url_data) + os.replace(temp_file, cache_file) # atomic + + # Return data as file object. + return io.BytesIO(url_data) diff --git a/insightface/reconstruction/ostec/external/stylegan2/docs/license.html b/insightface/reconstruction/ostec/external/stylegan2/docs/license.html new file mode 100644 index 0000000000000000000000000000000000000000..583f7f89044f07e94e84a3beb6dae914e5e78cd7 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/docs/license.html @@ -0,0 +1,160 @@ + + + + + + Nvidia Source Code License-NC + + + + + +

Nvidia Source Code License-NC

+ +
+ +

1. Definitions

+ +

“Licensor” means any person or entity that distributes its Work.

+ +

“Software” means the original work of authorship made available under +this License.

+ +

“Work” means the Software and any additions to or derivative works of +the Software that are made available under this License.

+ +

“Nvidia Processors” means any central processing unit (CPU), graphics +processing unit (GPU), field-programmable gate array (FPGA), +application-specific integrated circuit (ASIC) or any combination +thereof designed, made, sold, or provided by Nvidia or its affiliates.

+ +

The terms “reproduce,” “reproduction,” “derivative works,” and +“distribution” have the meaning as provided under U.S. copyright law; +provided, however, that for the purposes of this License, derivative +works shall not include works that remain separable from, or merely +link (or bind by name) to the interfaces of, the Work.

+ +

Works, including the Software, are “made available” under this License +by including in or with the Work either (a) a copyright notice +referencing the applicability of this License to the Work, or (b) a +copy of this License.

+ +

2. License Grants

+ +

2.1 Copyright Grant. Subject to the terms and conditions of this +License, each Licensor grants to you a perpetual, worldwide, +non-exclusive, royalty-free, copyright license to reproduce, +prepare derivative works of, publicly display, publicly perform, +sublicense and distribute its Work and any resulting derivative +works in any form.

+ +

3. Limitations

+ +

3.1 Redistribution. You may reproduce or distribute the Work only +if (a) you do so under this License, (b) you include a complete +copy of this License with your distribution, and (c) you retain +without modification any copyright, patent, trademark, or +attribution notices that are present in the Work.

+ +

3.2 Derivative Works. You may specify that additional or different +terms apply to the use, reproduction, and distribution of your +derivative works of the Work (“Your Terms”) only if (a) Your Terms +provide that the use limitation in Section 3.3 applies to your +derivative works, and (b) you identify the specific derivative +works that are subject to Your Terms. Notwithstanding Your Terms, +this License (including the redistribution requirements in Section +3.1) will continue to apply to the Work itself.

+ +

3.3 Use Limitation. The Work and any derivative works thereof only +may be used or intended for use non-commercially. The Work or +derivative works thereof may be used or intended for use by Nvidia +or its affiliates commercially or non-commercially. As used herein, +“non-commercially” means for research or evaluation purposes only.

+ +

3.4 Patent Claims. If you bring or threaten to bring a patent claim +against any Licensor (including any claim, cross-claim or +counterclaim in a lawsuit) to enforce any patents that you allege +are infringed by any Work, then your rights under this License from +such Licensor (including the grants in Sections 2.1 and 2.2) will +terminate immediately.

+ +

3.5 Trademarks. This License does not grant any rights to use any +Licensor’s or its affiliates’ names, logos, or trademarks, except +as necessary to reproduce the notices described in this License.

+ +

3.6 Termination. If you violate any term of this License, then your +rights under this License (including the grants in Sections 2.1 and +2.2) will terminate immediately.

+ +

4. Disclaimer of Warranty.

+ +

THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +THIS LICENSE.

+ +

5. Limitation of Liability.

+ +

EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES.

+ +
+
+ + + diff --git a/insightface/reconstruction/ostec/external/stylegan2/docs/stylegan2-teaser-1024x256.png b/insightface/reconstruction/ostec/external/stylegan2/docs/stylegan2-teaser-1024x256.png new file mode 100644 index 0000000000000000000000000000000000000000..5454cf10bf33112a1bb5e721996099e3bae392a8 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/docs/stylegan2-teaser-1024x256.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a38ad060b4c1ce4e474f98587f88e31943ffc22b6139c8d90ad6c8dc5c4bc684 +size 431014 diff --git a/insightface/reconstruction/ostec/external/stylegan2/docs/stylegan2-training-curves.png b/insightface/reconstruction/ostec/external/stylegan2/docs/stylegan2-training-curves.png new file mode 100644 index 0000000000000000000000000000000000000000..694abe5b3b81d399bc05f7a1513fe444205e363b --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/docs/stylegan2-training-curves.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e1741f99f0b5680eea3b29932c1ca1d281732bd75f2cd8c27fc00a437ad00d7 +size 46577 diff --git a/insightface/reconstruction/ostec/external/stylegan2/docs/stylegan2encoder-teaser-1024x256.png b/insightface/reconstruction/ostec/external/stylegan2/docs/stylegan2encoder-teaser-1024x256.png new file mode 100644 index 0000000000000000000000000000000000000000..489072010aaddf626ed55846a6fb8d3706897e23 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/docs/stylegan2encoder-teaser-1024x256.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71a35d90c876a7bc5f52d2771598321ae583cc1241f712e1769c61b91a5442c9 +size 498184 diff --git a/insightface/reconstruction/ostec/external/stylegan2/docs/versions.html b/insightface/reconstruction/ostec/external/stylegan2/docs/versions.html new file mode 100644 index 0000000000000000000000000000000000000000..a78bef3d8db9ba70fe83b4990bfc7f7c21bfd980 --- /dev/null +++ 
b/insightface/reconstruction/ostec/external/stylegan2/docs/versions.html @@ -0,0 +1,64 @@ + + + + + + StyleGAN versions + + + + + +

StyleGAN2

+ + +

Original StyleGAN

+ + + + diff --git a/insightface/reconstruction/ostec/external/stylegan2/metrics/__init__.py b/insightface/reconstruction/ostec/external/stylegan2/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9ab9908efa3cb38af52e8d5bcaa8acffde5a8875 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/metrics/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +# empty diff --git a/insightface/reconstruction/ostec/external/stylegan2/metrics/frechet_inception_distance.py b/insightface/reconstruction/ostec/external/stylegan2/metrics/frechet_inception_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..ace0d6ec253bd9e7e1d3ec16710e70243a7f3427 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/metrics/frechet_inception_distance.py @@ -0,0 +1,73 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. 
+# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Frechet Inception Distance (FID).""" + +import os +import numpy as np +import scipy +import tensorflow as tf +import dnnlib.tflib as tflib + +from metrics import metric_base +from training import misc + +#---------------------------------------------------------------------------- + +class FID(metric_base.MetricBase): + def __init__(self, num_images, minibatch_per_gpu, **kwargs): + super().__init__(**kwargs) + self.num_images = num_images + self.minibatch_per_gpu = minibatch_per_gpu + + def _evaluate(self, Gs, Gs_kwargs, num_gpus): + minibatch_size = num_gpus * self.minibatch_per_gpu + inception = misc.load_pkl('https://drive.google.com/uc?id=1MzTY44rLToO5APn8TZmfR7_ENSe5aZUn') # inception_v3_features.pkl + activations = np.empty([self.num_images, inception.output_shape[1]], dtype=np.float32) + + # Calculate statistics for reals. + cache_file = self._get_cache_file_for_reals(num_images=self.num_images) + os.makedirs(os.path.dirname(cache_file), exist_ok=True) + if os.path.isfile(cache_file): + mu_real, sigma_real = misc.load_pkl(cache_file) + else: + for idx, images in enumerate(self._iterate_reals(minibatch_size=minibatch_size)): + begin = idx * minibatch_size + end = min(begin + minibatch_size, self.num_images) + activations[begin:end] = inception.run(images[:end-begin], num_gpus=num_gpus, assume_frozen=True) + if end == self.num_images: + break + mu_real = np.mean(activations, axis=0) + sigma_real = np.cov(activations, rowvar=False) + misc.save_pkl((mu_real, sigma_real), cache_file) + + # Construct TensorFlow graph. 
+ result_expr = [] + for gpu_idx in range(num_gpus): + with tf.device('/gpu:%d' % gpu_idx): + Gs_clone = Gs.clone() + inception_clone = inception.clone() + latents = tf.random_normal([self.minibatch_per_gpu] + Gs_clone.input_shape[1:]) + labels = self._get_random_labels_tf(self.minibatch_per_gpu) + images = Gs_clone.get_output_for(latents, labels, **Gs_kwargs) + images = tflib.convert_images_to_uint8(images) + result_expr.append(inception_clone.get_output_for(images)) + + # Calculate statistics for fakes. + for begin in range(0, self.num_images, minibatch_size): + self._report_progress(begin, self.num_images) + end = min(begin + minibatch_size, self.num_images) + activations[begin:end] = np.concatenate(tflib.run(result_expr), axis=0)[:end-begin] + mu_fake = np.mean(activations, axis=0) + sigma_fake = np.cov(activations, rowvar=False) + + # Calculate FID. + m = np.square(mu_fake - mu_real).sum() + s, _ = scipy.linalg.sqrtm(np.dot(sigma_fake, sigma_real), disp=False) # pylint: disable=no-member + dist = m + np.trace(sigma_fake + sigma_real - 2*s) + self._report_result(np.real(dist)) + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/metrics/inception_score.py b/insightface/reconstruction/ostec/external/stylegan2/metrics/inception_score.py new file mode 100644 index 0000000000000000000000000000000000000000..ff0543d412cea7fa520eba65118a2b0dae58759e --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/metrics/inception_score.py @@ -0,0 +1,58 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. 
+# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Inception Score (IS).""" + +import numpy as np +import tensorflow as tf +import dnnlib.tflib as tflib + +from metrics import metric_base +from training import misc + +#---------------------------------------------------------------------------- + +class IS(metric_base.MetricBase): + def __init__(self, num_images, num_splits, minibatch_per_gpu, **kwargs): + super().__init__(**kwargs) + self.num_images = num_images + self.num_splits = num_splits + self.minibatch_per_gpu = minibatch_per_gpu + + def _evaluate(self, Gs, Gs_kwargs, num_gpus): + minibatch_size = num_gpus * self.minibatch_per_gpu + inception = misc.load_pkl('https://drive.google.com/uc?id=1Mz9zQnIrusm3duZB91ng_aUIePFNI6Jx') # inception_v3_softmax.pkl + activations = np.empty([self.num_images, inception.output_shape[1]], dtype=np.float32) + + # Construct TensorFlow graph. + result_expr = [] + for gpu_idx in range(num_gpus): + with tf.device('/gpu:%d' % gpu_idx): + Gs_clone = Gs.clone() + inception_clone = inception.clone() + latents = tf.random_normal([self.minibatch_per_gpu] + Gs_clone.input_shape[1:]) + labels = self._get_random_labels_tf(self.minibatch_per_gpu) + images = Gs_clone.get_output_for(latents, labels, **Gs_kwargs) + images = tflib.convert_images_to_uint8(images) + result_expr.append(inception_clone.get_output_for(images)) + + # Calculate activations for fakes. + for begin in range(0, self.num_images, minibatch_size): + self._report_progress(begin, self.num_images) + end = min(begin + minibatch_size, self.num_images) + activations[begin:end] = np.concatenate(tflib.run(result_expr), axis=0)[:end-begin] + + # Calculate IS. 
+ scores = [] + for i in range(self.num_splits): + part = activations[i * self.num_images // self.num_splits : (i + 1) * self.num_images // self.num_splits] + kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0))) + kl = np.mean(np.sum(kl, 1)) + scores.append(np.exp(kl)) + self._report_result(np.mean(scores), suffix='_mean') + self._report_result(np.std(scores), suffix='_std') + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/metrics/linear_separability.py b/insightface/reconstruction/ostec/external/stylegan2/metrics/linear_separability.py new file mode 100644 index 0000000000000000000000000000000000000000..14bfb9958a9b07c72b26fa545dbedcaeb136bd82 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/metrics/linear_separability.py @@ -0,0 +1,178 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. 
+# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Linear Separability (LS).""" + +from collections import defaultdict +import numpy as np +import sklearn.svm +import tensorflow as tf +import dnnlib.tflib as tflib + +from metrics import metric_base +from training import misc + +#---------------------------------------------------------------------------- + +classifier_urls = [ + 'https://drive.google.com/uc?id=1Q5-AI6TwWhCVM7Muu4tBM7rp5nG_gmCX', # celebahq-classifier-00-male.pkl + 'https://drive.google.com/uc?id=1Q5c6HE__ReW2W8qYAXpao68V1ryuisGo', # celebahq-classifier-01-smiling.pkl + 'https://drive.google.com/uc?id=1Q7738mgWTljPOJQrZtSMLxzShEhrvVsU', # celebahq-classifier-02-attractive.pkl + 'https://drive.google.com/uc?id=1QBv2Mxe7ZLvOv1YBTLq-T4DS3HjmXV0o', # celebahq-classifier-03-wavy-hair.pkl + 'https://drive.google.com/uc?id=1QIvKTrkYpUrdA45nf7pspwAqXDwWOLhV', # celebahq-classifier-04-young.pkl + 'https://drive.google.com/uc?id=1QJPH5rW7MbIjFUdZT7vRYfyUjNYDl4_L', # celebahq-classifier-05-5-o-clock-shadow.pkl + 'https://drive.google.com/uc?id=1QPZXSYf6cptQnApWS_T83sqFMun3rULY', # celebahq-classifier-06-arched-eyebrows.pkl + 'https://drive.google.com/uc?id=1QPgoAZRqINXk_PFoQ6NwMmiJfxc5d2Pg', # celebahq-classifier-07-bags-under-eyes.pkl + 'https://drive.google.com/uc?id=1QQPQgxgI6wrMWNyxFyTLSgMVZmRr1oO7', # celebahq-classifier-08-bald.pkl + 'https://drive.google.com/uc?id=1QcSphAmV62UrCIqhMGgcIlZfoe8hfWaF', # celebahq-classifier-09-bangs.pkl + 'https://drive.google.com/uc?id=1QdWTVwljClTFrrrcZnPuPOR4mEuz7jGh', # celebahq-classifier-10-big-lips.pkl + 'https://drive.google.com/uc?id=1QgvEWEtr2mS4yj1b_Y3WKe6cLWL3LYmK', # celebahq-classifier-11-big-nose.pkl + 'https://drive.google.com/uc?id=1QidfMk9FOKgmUUIziTCeo8t-kTGwcT18', # celebahq-classifier-12-black-hair.pkl + 'https://drive.google.com/uc?id=1QthrJt-wY31GPtV8SbnZQZ0_UEdhasHO', # celebahq-classifier-13-blond-hair.pkl + 
'https://drive.google.com/uc?id=1QvCAkXxdYT4sIwCzYDnCL9Nb5TDYUxGW', # celebahq-classifier-14-blurry.pkl + 'https://drive.google.com/uc?id=1QvLWuwSuWI9Ln8cpxSGHIciUsnmaw8L0', # celebahq-classifier-15-brown-hair.pkl + 'https://drive.google.com/uc?id=1QxW6THPI2fqDoiFEMaV6pWWHhKI_OoA7', # celebahq-classifier-16-bushy-eyebrows.pkl + 'https://drive.google.com/uc?id=1R71xKw8oTW2IHyqmRDChhTBkW9wq4N9v', # celebahq-classifier-17-chubby.pkl + 'https://drive.google.com/uc?id=1RDn_fiLfEGbTc7JjazRXuAxJpr-4Pl67', # celebahq-classifier-18-double-chin.pkl + 'https://drive.google.com/uc?id=1RGBuwXbaz5052bM4VFvaSJaqNvVM4_cI', # celebahq-classifier-19-eyeglasses.pkl + 'https://drive.google.com/uc?id=1RIxOiWxDpUwhB-9HzDkbkLegkd7euRU9', # celebahq-classifier-20-goatee.pkl + 'https://drive.google.com/uc?id=1RPaNiEnJODdr-fwXhUFdoSQLFFZC7rC-', # celebahq-classifier-21-gray-hair.pkl + 'https://drive.google.com/uc?id=1RQH8lPSwOI2K_9XQCZ2Ktz7xm46o80ep', # celebahq-classifier-22-heavy-makeup.pkl + 'https://drive.google.com/uc?id=1RXZM61xCzlwUZKq-X7QhxOg0D2telPow', # celebahq-classifier-23-high-cheekbones.pkl + 'https://drive.google.com/uc?id=1RgASVHW8EWMyOCiRb5fsUijFu-HfxONM', # celebahq-classifier-24-mouth-slightly-open.pkl + 'https://drive.google.com/uc?id=1RkC8JLqLosWMaRne3DARRgolhbtg_wnr', # celebahq-classifier-25-mustache.pkl + 'https://drive.google.com/uc?id=1RqtbtFT2EuwpGTqsTYJDyXdnDsFCPtLO', # celebahq-classifier-26-narrow-eyes.pkl + 'https://drive.google.com/uc?id=1Rs7hU-re8bBMeRHR-fKgMbjPh-RIbrsh', # celebahq-classifier-27-no-beard.pkl + 'https://drive.google.com/uc?id=1RynDJQWdGOAGffmkPVCrLJqy_fciPF9E', # celebahq-classifier-28-oval-face.pkl + 'https://drive.google.com/uc?id=1S0TZ_Hdv5cb06NDaCD8NqVfKy7MuXZsN', # celebahq-classifier-29-pale-skin.pkl + 'https://drive.google.com/uc?id=1S3JPhZH2B4gVZZYCWkxoRP11q09PjCkA', # celebahq-classifier-30-pointy-nose.pkl + 'https://drive.google.com/uc?id=1S3pQuUz-Jiywq_euhsfezWfGkfzLZ87W', # celebahq-classifier-31-receding-hairline.pkl + 
'https://drive.google.com/uc?id=1S6nyIl_SEI3M4l748xEdTV2vymB_-lrY', # celebahq-classifier-32-rosy-cheeks.pkl + 'https://drive.google.com/uc?id=1S9P5WCi3GYIBPVYiPTWygrYIUSIKGxbU', # celebahq-classifier-33-sideburns.pkl + 'https://drive.google.com/uc?id=1SANviG-pp08n7AFpE9wrARzozPIlbfCH', # celebahq-classifier-34-straight-hair.pkl + 'https://drive.google.com/uc?id=1SArgyMl6_z7P7coAuArqUC2zbmckecEY', # celebahq-classifier-35-wearing-earrings.pkl + 'https://drive.google.com/uc?id=1SC5JjS5J-J4zXFO9Vk2ZU2DT82TZUza_', # celebahq-classifier-36-wearing-hat.pkl + 'https://drive.google.com/uc?id=1SDAQWz03HGiu0MSOKyn7gvrp3wdIGoj-', # celebahq-classifier-37-wearing-lipstick.pkl + 'https://drive.google.com/uc?id=1SEtrVK-TQUC0XeGkBE9y7L8VXfbchyKX', # celebahq-classifier-38-wearing-necklace.pkl + 'https://drive.google.com/uc?id=1SF_mJIdyGINXoV-I6IAxHB_k5dxiF6M-', # celebahq-classifier-39-wearing-necktie.pkl +] + +#---------------------------------------------------------------------------- + +def prob_normalize(p): + p = np.asarray(p).astype(np.float32) + assert len(p.shape) == 2 + return p / np.sum(p) + +def mutual_information(p): + p = prob_normalize(p) + px = np.sum(p, axis=1) + py = np.sum(p, axis=0) + result = 0.0 + for x in range(p.shape[0]): + p_x = px[x] + for y in range(p.shape[1]): + p_xy = p[x][y] + p_y = py[y] + if p_xy > 0.0: + result += p_xy * np.log2(p_xy / (p_x * p_y)) # get bits as output + return result + +def entropy(p): + p = prob_normalize(p) + result = 0.0 + for x in range(p.shape[0]): + for y in range(p.shape[1]): + p_xy = p[x][y] + if p_xy > 0.0: + result -= p_xy * np.log2(p_xy) + return result + +def conditional_entropy(p): + # H(Y|X) where X corresponds to axis 0, Y to axis 1 + # i.e., How many bits of additional information are needed to where we are on axis 1 if we know where we are on axis 0? 
+ p = prob_normalize(p) + y = np.sum(p, axis=0, keepdims=True) # marginalize to calculate H(Y) + return max(0.0, entropy(y) - mutual_information(p)) # can slip just below 0 due to FP inaccuracies, clean those up. + +#---------------------------------------------------------------------------- + +class LS(metric_base.MetricBase): + def __init__(self, num_samples, num_keep, attrib_indices, minibatch_per_gpu, **kwargs): + assert num_keep <= num_samples + super().__init__(**kwargs) + self.num_samples = num_samples + self.num_keep = num_keep + self.attrib_indices = attrib_indices + self.minibatch_per_gpu = minibatch_per_gpu + + def _evaluate(self, Gs, Gs_kwargs, num_gpus): + minibatch_size = num_gpus * self.minibatch_per_gpu + + # Construct TensorFlow graph for each GPU. + result_expr = [] + for gpu_idx in range(num_gpus): + with tf.device('/gpu:%d' % gpu_idx): + Gs_clone = Gs.clone() + + # Generate images. + latents = tf.random_normal([self.minibatch_per_gpu] + Gs_clone.input_shape[1:]) + labels = self._get_random_labels_tf(self.minibatch_per_gpu) + dlatents = Gs_clone.components.mapping.get_output_for(latents, labels, **Gs_kwargs) + images = Gs_clone.get_output_for(latents, None, **Gs_kwargs) + + # Downsample to 256x256. The attribute classifiers were built for 256x256. + if images.shape[2] > 256: + factor = images.shape[2] // 256 + images = tf.reshape(images, [-1, images.shape[1], images.shape[2] // factor, factor, images.shape[3] // factor, factor]) + images = tf.reduce_mean(images, axis=[3, 5]) + + # Run classifier for each attribute. + result_dict = dict(latents=latents, dlatents=dlatents[:,-1]) + for attrib_idx in self.attrib_indices: + classifier = misc.load_pkl(classifier_urls[attrib_idx]) + logits = classifier.get_output_for(images, None) + predictions = tf.nn.softmax(tf.concat([logits, -logits], axis=1)) + result_dict[attrib_idx] = predictions + result_expr.append(result_dict) + + # Sampling loop. 
+ results = [] + for begin in range(0, self.num_samples, minibatch_size): + self._report_progress(begin, self.num_samples) + results += tflib.run(result_expr) + results = {key: np.concatenate([value[key] for value in results], axis=0) for key in results[0].keys()} + + # Calculate conditional entropy for each attribute. + conditional_entropies = defaultdict(list) + for attrib_idx in self.attrib_indices: + # Prune the least confident samples. + pruned_indices = list(range(self.num_samples)) + pruned_indices = sorted(pruned_indices, key=lambda i: -np.max(results[attrib_idx][i])) + pruned_indices = pruned_indices[:self.num_keep] + + # Fit SVM to the remaining samples. + svm_targets = np.argmax(results[attrib_idx][pruned_indices], axis=1) + for space in ['latents', 'dlatents']: + svm_inputs = results[space][pruned_indices] + try: + svm = sklearn.svm.LinearSVC() + svm.fit(svm_inputs, svm_targets) + svm.score(svm_inputs, svm_targets) + svm_outputs = svm.predict(svm_inputs) + except: + svm_outputs = svm_targets # assume perfect prediction + + # Calculate conditional entropy. + p = [[np.mean([case == (row, col) for case in zip(svm_outputs, svm_targets)]) for col in (0, 1)] for row in (0, 1)] + conditional_entropies[space].append(conditional_entropy(p)) + + # Calculate separability scores. 
+ scores = {key: 2**np.sum(values) for key, values in conditional_entropies.items()} + self._report_result(scores['latents'], suffix='_z') + self._report_result(scores['dlatents'], suffix='_w') + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/metrics/metric_base.py b/insightface/reconstruction/ostec/external/stylegan2/metrics/metric_base.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd0276d28d31ce3f3cec99b2142ed11d9bef340 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/metrics/metric_base.py @@ -0,0 +1,168 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Common definitions for GAN metrics.""" + +import os +import time +import hashlib +import numpy as np +import tensorflow as tf +import dnnlib +import dnnlib.tflib as tflib + +from training import misc +from training import dataset + +#---------------------------------------------------------------------------- +# Base class for metrics. 
class MetricBase:
    """Base class for GAN evaluation metrics.

    Subclasses implement _evaluate() and report values via _report_result().
    run() loads a pickled (G, D, Gs) network triple, evaluates the metric in
    a fresh TensorFlow graph/session, and optionally logs a result line.
    """

    def __init__(self, name):
        self.name = name
        self._dataset_obj = None
        self._progress_lo = None
        self._progress_hi = None
        self._progress_max = None
        self._progress_sec = None
        self._progress_time = None
        self._reset()

    def close(self):
        self._reset()

    def _reset(self, network_pkl=None, run_dir=None, data_dir=None, dataset_args=None, mirror_augment=None):
        """Release any open dataset and re-initialize evaluation state."""
        if self._dataset_obj is not None:
            self._dataset_obj.close()

        self._network_pkl = network_pkl
        self._data_dir = data_dir
        self._dataset_args = dataset_args
        self._dataset_obj = None
        self._mirror_augment = mirror_augment
        self._eval_time = 0
        self._results = []

        # If dataset options were not given explicitly, recover them from the
        # config of a previous training run (when a run_dir is available).
        if (dataset_args is None or mirror_augment is None) and run_dir is not None:
            run_config = misc.parse_config_for_previous_run(run_dir)
            self._dataset_args = dict(run_config['dataset'])
            self._dataset_args['shuffle_mb'] = 0
            self._mirror_augment = run_config['train'].get('mirror_augment', False)

    def configure_progress_reports(self, plo, phi, pmax, psec=15):
        """Map this metric's [0, 1] progress onto [plo, phi] of a pmax-unit
        progress bar, reporting at most once every psec seconds."""
        self._progress_lo = plo
        self._progress_hi = phi
        self._progress_max = pmax
        self._progress_sec = psec

    def run(self, network_pkl, run_dir=None, data_dir=None, dataset_args=None, mirror_augment=None, num_gpus=1, tf_config=None, log_results=True, Gs_kwargs=None):
        """Evaluate the metric for the given network pickle.

        Gs_kwargs: extra kwargs forwarded to the generator; None selects the
            default dict(is_validation=True). (A literal dict default was a
            mutable-default-argument bug: callers mutating it would leak the
            change into every later call.)
        """
        if Gs_kwargs is None:
            Gs_kwargs = dict(is_validation=True)
        self._reset(network_pkl=network_pkl, run_dir=run_dir, data_dir=data_dir, dataset_args=dataset_args, mirror_augment=mirror_augment)
        time_begin = time.time()
        with tf.Graph().as_default(), tflib.create_session(tf_config).as_default(): # pylint: disable=not-context-manager
            self._report_progress(0, 1)
            _G, _D, Gs = misc.load_pkl(self._network_pkl)
            self._evaluate(Gs, Gs_kwargs=Gs_kwargs, num_gpus=num_gpus)
            self._report_progress(1, 1)
        self._eval_time = time.time() - time_begin # pylint: disable=attribute-defined-outside-init

        if log_results:
            if run_dir is not None:
                log_file = os.path.join(run_dir, 'metric-%s.txt' % self.name)
                with dnnlib.util.Logger(log_file, 'a'):
                    print(self.get_result_str().strip())
            else:
                print(self.get_result_str().strip())

    def get_result_str(self):
        """Format '<network> time <t> <metric><suffix> <value> ...' on one line."""
        network_name = os.path.splitext(os.path.basename(self._network_pkl))[0]
        if len(network_name) > 29:
            # Keep the tail of long names; it carries the distinguishing part.
            network_name = '...' + network_name[-26:]
        result_str = '%-30s' % network_name
        result_str += ' time %-12s' % dnnlib.util.format_time(self._eval_time)
        for res in self._results:
            result_str += ' ' + self.name + res.suffix + ' '
            result_str += res.fmt % res.value
        return result_str

    def update_autosummaries(self):
        """Push every reported value into TensorBoard autosummaries."""
        for res in self._results:
            tflib.autosummary.autosummary('Metrics/' + self.name + res.suffix, res.value)

    def _evaluate(self, Gs, Gs_kwargs, num_gpus):
        raise NotImplementedError # to be overridden by subclasses

    def _report_result(self, value, suffix='', fmt='%-10.4f'):
        """Record one scalar result; suffix distinguishes multi-value metrics."""
        self._results += [dnnlib.EasyDict(value=value, suffix=suffix, fmt=fmt)]

    def _report_progress(self, pcur, pmax, status_str=''):
        """Forward pcur/pmax progress to the RunContext, rate-limited to
        self._progress_sec; no-op until configure_progress_reports() is called."""
        if self._progress_lo is None or self._progress_hi is None or self._progress_max is None:
            return
        t = time.time()
        if self._progress_sec is not None and self._progress_time is not None and t < self._progress_time + self._progress_sec:
            return
        self._progress_time = t
        val = self._progress_lo + (pcur / pmax) * (self._progress_hi - self._progress_lo)
        dnnlib.RunContext.get().update(status_str, int(val), self._progress_max)

    def _get_cache_file_for_reals(self, extension='pkl', **kwargs):
        """Deterministic cache path for statistics computed from real images.

        The MD5 covers metric name, mirror augment, dataset args and any extra
        kwargs, so the cache invalidates automatically when any of them change.
        """
        all_args = dnnlib.EasyDict(metric_name=self.name, mirror_augment=self._mirror_augment)
        all_args.update(self._dataset_args)
        all_args.update(kwargs)
        md5 = hashlib.md5(repr(sorted(all_args.items())).encode('utf-8'))
        dataset_name = self._dataset_args.get('tfrecord_dir', None) or self._dataset_args.get('h5_file', None)
        dataset_name = os.path.splitext(os.path.basename(dataset_name))[0]
        return os.path.join('.stylegan2-cache', '%s-%s-%s.%s' % (md5.hexdigest(), self.name, dataset_name, extension))

    def _get_dataset_obj(self):
        """Lazily open the dataset configured in _reset()."""
        if self._dataset_obj is None:
            self._dataset_obj = dataset.load_dataset(data_dir=self._data_dir, **self._dataset_args)
        return self._dataset_obj

    def _iterate_reals(self, minibatch_size):
        """Endless generator of real image minibatches (mirrored if configured)."""
        dataset_obj = self._get_dataset_obj()
        while True:
            images, _labels = dataset_obj.get_minibatch_np(minibatch_size)
            if self._mirror_augment:
                images = misc.apply_mirror_augment(images)
            yield images

    def _iterate_fakes(self, Gs, minibatch_size, num_gpus):
        """Endless generator of uint8 NHWC fakes from random latents."""
        while True:
            latents = np.random.randn(minibatch_size, *Gs.input_shape[1:])
            fmt = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
            images = Gs.run(latents, None, output_transform=fmt, is_validation=True, num_gpus=num_gpus, assume_frozen=True)
            yield images

    def _get_random_labels_tf(self, minibatch_size):
        return self._get_dataset_obj().get_random_labels_tf(minibatch_size)

#----------------------------------------------------------------------------
# Group of multiple metrics.

class MetricGroup:
    """Convenience wrapper that runs several metrics back to back."""

    def __init__(self, metric_kwarg_list):
        self.metrics = [dnnlib.util.call_func_by_name(**kwargs) for kwargs in metric_kwarg_list]

    def run(self, *args, **kwargs):
        for metric in self.metrics:
            metric.run(*args, **kwargs)

    def get_result_str(self):
        return ' '.join(metric.get_result_str() for metric in self.metrics)

    def update_autosummaries(self):
        for metric in self.metrics:
            metric.update_autosummaries()

#----------------------------------------------------------------------------
# Dummy metric for debugging purposes.
class DummyMetric(MetricBase):
    """Metric stub that always reports 0.0 -- useful for debugging the harness."""

    def _evaluate(self, Gs, Gs_kwargs, num_gpus):
        _ = Gs, Gs_kwargs, num_gpus  # intentionally unused
        self._report_result(0.0)

#----------------------------------------------------------------------------

# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, visit
# https://nvlabs.github.io/stylegan2/license.html

"""Default metric definitions."""

from dnnlib import EasyDict

#----------------------------------------------------------------------------

# Gs override shared by every PPL variant: evaluate the generator in fp32.
_FP32_OVERRIDES = dict(dtype='float32', mapping_dtype='float32')

def _ppl(name, **kwargs):
    """Preset for one Perceptual Path Length variant (space/sampling/crop vary)."""
    return EasyDict(name=name, func_name='metrics.perceptual_path_length.PPL',
                    num_samples=50000, epsilon=1e-4, minibatch_per_gpu=4,
                    Gs_overrides=dict(_FP32_OVERRIDES), **kwargs)

_presets = [
    EasyDict(name='fid50k', func_name='metrics.frechet_inception_distance.FID', num_images=50000, minibatch_per_gpu=8),
    EasyDict(name='is50k', func_name='metrics.inception_score.IS', num_images=50000, num_splits=10, minibatch_per_gpu=8),
    _ppl('ppl_zfull', space='z', sampling='full', crop=True),
    _ppl('ppl_wfull', space='w', sampling='full', crop=True),
    _ppl('ppl_zend', space='z', sampling='end', crop=True),
    _ppl('ppl_wend', space='w', sampling='end', crop=True),
    _ppl('ppl2_wend', space='w', sampling='end', crop=False),
    EasyDict(name='ls', func_name='metrics.linear_separability.LS', num_samples=200000, num_keep=100000, attrib_indices=range(40), minibatch_per_gpu=4),
    EasyDict(name='pr50k3', func_name='metrics.precision_recall.PR', num_images=50000, nhood_size=3, minibatch_per_gpu=8, row_batch_size=10000, col_batch_size=10000),
]

# Keyed by metric name for lookup from the command line / training config.
metric_defaults = EasyDict((preset.name, preset) for preset in _presets)

#----------------------------------------------------------------------------

# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, visit
# https://nvlabs.github.io/stylegan2/license.html

"""Perceptual Path Length (PPL)."""

import numpy as np
import tensorflow as tf
import dnnlib.tflib as tflib

from metrics import metric_base
from training import misc

#----------------------------------------------------------------------------

def normalize(v):
    """Scale each vector in the batch to unit L2 norm."""
    lengths = tf.sqrt(tf.reduce_sum(tf.square(v), axis=-1, keepdims=True))
    return v / lengths

# Spherical interpolation of a batch of vectors.
def slerp(a, b, t):
    """Spherical interpolation between batches of vectors a and b at fractions t."""
    a = normalize(a)
    b = normalize(b)
    cos_omega = tf.reduce_sum(a * b, axis=-1, keepdims=True)
    angles = t * tf.math.acos(cos_omega)
    # Unit vector orthogonal to a, in the plane spanned by a and b.
    ortho = normalize(b - cos_omega * a)
    interpolated = a * tf.math.cos(angles) + ortho * tf.math.sin(angles)
    return normalize(interpolated)

#----------------------------------------------------------------------------

class PPL(metric_base.MetricBase):
    """Perceptual Path Length (PPL).

    Measures the average LPIPS distance between images generated from latent
    codes an epsilon apart, either in Z (slerp) or W (lerp), sampled over the
    full interpolation path ('full') or near an endpoint ('end').
    """

    def __init__(self, num_samples, epsilon, space, sampling, crop, minibatch_per_gpu, Gs_overrides, **kwargs):
        assert space in ['z', 'w']
        assert sampling in ['full', 'end']
        super().__init__(**kwargs)
        self.num_samples = num_samples
        self.epsilon = epsilon
        self.space = space
        self.sampling = sampling
        self.crop = crop
        self.minibatch_per_gpu = minibatch_per_gpu
        self.Gs_overrides = Gs_overrides

    def _evaluate(self, Gs, Gs_kwargs, num_gpus):
        Gs_kwargs = dict(Gs_kwargs)
        Gs_kwargs.update(self.Gs_overrides)
        minibatch_size = num_gpus * self.minibatch_per_gpu

        # Build one evaluation sub-graph per GPU.
        distance_expr = []
        for gpu in range(num_gpus):
            with tf.device('/gpu:%d' % gpu):
                gen = Gs.clone()
                noise_vars = [var for name, var in gen.components.synthesis.vars.items() if name.startswith('noise')]

                # Interleaved random endpoint pairs plus interpolation offsets.
                # 'end' sampling pins t to 0 so only the epsilon step is measured.
                lat_t01 = tf.random_normal([self.minibatch_per_gpu * 2] + gen.input_shape[1:])
                lerp_t = tf.random_uniform([self.minibatch_per_gpu], 0.0, 1.0 if self.sampling == 'full' else 0.0)
                labels = tf.reshape(tf.tile(self._get_random_labels_tf(self.minibatch_per_gpu), [1, 2]), [self.minibatch_per_gpu * 2, -1])

                if self.space == 'w':
                    # Map both endpoints to W first, then lerp at t and t + eps.
                    dlat_t01 = gen.components.mapping.get_output_for(lat_t01, labels, **Gs_kwargs)
                    dlat_t01 = tf.cast(dlat_t01, tf.float32)
                    dlat_t0, dlat_t1 = dlat_t01[0::2], dlat_t01[1::2]
                    dlat_e0 = tflib.lerp(dlat_t0, dlat_t1, lerp_t[:, np.newaxis, np.newaxis])
                    dlat_e1 = tflib.lerp(dlat_t0, dlat_t1, lerp_t[:, np.newaxis, np.newaxis] + self.epsilon)
                    dlat_e01 = tf.reshape(tf.stack([dlat_e0, dlat_e1], axis=1), dlat_t01.shape)
                else: # space == 'z'
                    # Slerp in Z at t and t + eps, then map both to W.
                    lat_t0, lat_t1 = lat_t01[0::2], lat_t01[1::2]
                    lat_e0 = slerp(lat_t0, lat_t1, lerp_t[:, np.newaxis])
                    lat_e1 = slerp(lat_t0, lat_t1, lerp_t[:, np.newaxis] + self.epsilon)
                    lat_e01 = tf.reshape(tf.stack([lat_e0, lat_e1], axis=1), lat_t01.shape)
                    dlat_e01 = gen.components.mapping.get_output_for(lat_e01, labels, **Gs_kwargs)

                # Synthesize; re-initializing the noise vars here gives the
                # whole minibatch one shared noise realization.
                with tf.control_dependencies([var.initializer for var in noise_vars]):
                    images = gen.components.synthesis.get_output_for(dlat_e01, randomize_noise=False, **Gs_kwargs)
                    images = tf.cast(images, tf.float32)

                # Crop only the face region.
                if self.crop:
                    c = int(images.shape[2] // 8)
                    images = images[:, :, c*3 : c*7, c*2 : c*6]

                # Box-downsample to at most 256x256 (VGG was built for 224x224).
                factor = images.shape[2] // 256
                if factor > 1:
                    images = tf.reshape(images, [-1, images.shape[1], images.shape[2] // factor, factor, images.shape[3] // factor, factor])
                    images = tf.reduce_mean(images, axis=[3,5])

                # Scale dynamic range from [-1,1] to [0,255] for VGG.
                images = (images + 1) * (255 / 2)

                # LPIPS distance between the paired images, normalized by eps^2.
                img_e0, img_e1 = images[0::2], images[1::2]
                distance_measure = misc.load_pkl('https://drive.google.com/uc?id=1N2-m9qszOeVC9Tq77WxsLnuWwOedQiD2') # vgg16_zhang_perceptual.pkl
                distance_expr.append(distance_measure.get_output_for(img_e0, img_e1) * (1 / self.epsilon**2))

        # Sampling loop.
        all_distances = []
        for begin in range(0, self.num_samples, minibatch_size):
            self._report_progress(begin, self.num_samples)
            all_distances += tflib.run(distance_expr)
        all_distances = np.concatenate(all_distances, axis=0)

        # Reject the extreme 1% tails before averaging.
        lo = np.percentile(all_distances, 1, interpolation='lower')
        hi = np.percentile(all_distances, 99, interpolation='higher')
        kept = np.extract(np.logical_and(lo <= all_distances, all_distances <= hi), all_distances)
        self._report_result(np.mean(kept))

#----------------------------------------------------------------------------
def batch_pairwise_distances(U, V):
    """Pairwise squared Euclidean distances between rows of U and rows of V."""
    with tf.variable_scope('pairwise_dist_block'):
        # ||u||^2 as a column vector, ||v||^2 as a row vector.
        norm_u = tf.reshape(tf.reduce_sum(tf.square(U), 1), [-1, 1])
        norm_v = tf.reshape(tf.reduce_sum(tf.square(V), 1), [1, -1])
        # ||u - v||^2 = ||u||^2 - 2 u.v + ||v||^2, clamped at 0 for stability.
        return tf.maximum(norm_u - 2 * tf.matmul(U, V, False, True) + norm_v, 0.0)

#----------------------------------------------------------------------------

class DistanceBlock():
    """Multi-GPU evaluator for pairwise distances between feature batches."""

    def __init__(self, num_features, num_gpus):
        self.num_features = num_features
        self.num_gpus = num_gpus

        # Split the second batch column-wise across GPUs and concatenate results.
        with tf.device('/cpu:0'):
            self._features_batch1 = tf.placeholder(tf.float16, shape=[None, self.num_features])
            self._features_batch2 = tf.placeholder(tf.float16, shape=[None, self.num_features])
            features_split2 = tf.split(self._features_batch2, self.num_gpus, axis=0)
            per_gpu = []
            for gpu in range(self.num_gpus):
                with tf.device('/gpu:%d' % gpu):
                    per_gpu.append(batch_pairwise_distances(self._features_batch1, features_split2[gpu]))
            self._distance_block = tf.concat(per_gpu, axis=1)

    def pairwise_distances(self, U, V):
        """Evaluate pairwise distances between two batches of feature vectors."""
        return self._distance_block.eval(feed_dict={self._features_batch1: U, self._features_batch2: V})

#----------------------------------------------------------------------------

class ManifoldEstimator():
    """Estimates a manifold from feature vectors via kth-NN hypersphere radii."""

    def __init__(self, distance_block, features, row_batch_size, col_batch_size, nhood_sizes, clamp_to_percentile=None):
        """Precompute, for every reference sample, its distance to the kth
        nearest neighbor for each requested neighborhood size."""
        num_images = features.shape[0]
        self.nhood_sizes = nhood_sizes
        self.num_nhoods = len(nhood_sizes)
        self.row_batch_size = row_batch_size
        self.col_batch_size = col_batch_size
        self._ref_features = features
        self._distance_block = distance_block

        self.D = np.zeros([num_images, self.num_nhoods], dtype=np.float16)
        dist_buf = np.zeros([row_batch_size, num_images], dtype=np.float16)
        kth = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32)

        for row_begin in range(0, num_images, row_batch_size):
            row_end = min(row_begin + row_batch_size, num_images)
            rows = row_end - row_begin
            row_batch = features[row_begin:row_end]

            for col_begin in range(0, num_images, col_batch_size):
                col_end = min(col_begin + col_batch_size, num_images)
                dist_buf[0:rows, col_begin:col_end] = self._distance_block.pairwise_distances(row_batch, features[col_begin:col_end])

            # kth-smallest distance per row for every neighborhood size.
            self.D[row_begin:row_end, :] = np.partition(dist_buf[0:rows, :], kth, axis=1)[:, self.nhood_sizes]

        if clamp_to_percentile is not None:
            # Zero out radii above the given percentile to prune outliers.
            max_distances = np.percentile(self.D, clamp_to_percentile, axis=0)
            self.D[self.D > max_distances] = 0

    def evaluate(self, eval_features, return_realism=False, return_neighbors=False):
        """Decide, per query vector, whether it lies on the estimated manifold.

        A query is on the manifold if it falls inside the hypersphere of at
        least one reference sample (radius = that reference's kth-NN distance).
        """
        num_eval = eval_features.shape[0]
        num_ref = self.D.shape[0]
        dist_buf = np.zeros([self.row_batch_size, num_ref], dtype=np.float16)
        predictions = np.zeros([num_eval, self.num_nhoods], dtype=np.int32)
        realism_score = np.zeros([num_eval,], dtype=np.float32)
        nearest_indices = np.zeros([num_eval,], dtype=np.int32)

        for row_begin in range(0, num_eval, self.row_batch_size):
            row_end = min(row_begin + self.row_batch_size, num_eval)
            rows = row_end - row_begin
            query_batch = eval_features[row_begin:row_end]

            for col_begin in range(0, num_ref, self.col_batch_size):
                col_end = min(col_begin + self.col_batch_size, num_ref)
                dist_buf[0:rows, col_begin:col_end] = self._distance_block.pairwise_distances(query_batch, self._ref_features[col_begin:col_end])

            inside = dist_buf[0:rows, :, None] <= self.D
            predictions[row_begin:row_end] = np.any(inside, axis=1).astype(np.int32)

            # Realism: radius of the nearest reference's first-nhood hypersphere
            # divided by the distance to that reference.
            nearest_indices[row_begin:row_end] = np.argmin(dist_buf[0:rows, :], axis=1)
            realism_score[row_begin:row_end] = self.D[nearest_indices[row_begin:row_end], 0] / np.min(dist_buf[0:rows, :], axis=1)

        if return_realism and return_neighbors:
            return predictions, realism_score, nearest_indices
        if return_realism:
            return predictions, realism_score
        if return_neighbors:
            return predictions, nearest_indices
        return predictions

#----------------------------------------------------------------------------
def knn_precision_recall_features(ref_features, eval_features, feature_net, nhood_sizes,
                                  row_batch_size, col_batch_size, num_gpus):
    """Calculates k-NN precision and recall for two sets of feature vectors.

    Returns an EasyDict with per-sample predictions, realism scores, nearest
    neighbors, and the aggregated knn_precision / knn_recall arrays (one entry
    per neighborhood size).
    """
    state = dnnlib.EasyDict()
    num_features = feature_net.output_shape[1]
    state.ref_features = ref_features
    state.eval_features = eval_features

    # Initialize DistanceBlock and ManifoldEstimators.
    distance_block = DistanceBlock(num_features, num_gpus)
    state.ref_manifold = ManifoldEstimator(distance_block, state.ref_features, row_batch_size, col_batch_size, nhood_sizes)
    state.eval_manifold = ManifoldEstimator(distance_block, state.eval_features, row_batch_size, col_batch_size, nhood_sizes)

    # Precision: how many eval (generated) samples fall on the ref (real) manifold.
    state.precision, state.realism_scores, state.nearest_neighbors = state.ref_manifold.evaluate(state.eval_features, return_realism=True, return_neighbors=True)
    state.knn_precision = state.precision.mean(axis=0)

    # Recall: how many ref (real) samples fall on the eval (generated) manifold.
    state.recall = state.eval_manifold.evaluate(state.ref_features)
    state.knn_recall = state.recall.mean(axis=0)

    return state

#----------------------------------------------------------------------------

class PR(metric_base.MetricBase):
    """Precision/Recall (PR) metric for generated images."""

    def __init__(self, num_images, nhood_size, minibatch_per_gpu, row_batch_size, col_batch_size, **kwargs):
        super().__init__(**kwargs)
        self.num_images = num_images
        self.nhood_size = nhood_size
        self.minibatch_per_gpu = minibatch_per_gpu
        self.row_batch_size = row_batch_size
        self.col_batch_size = col_batch_size

    def _evaluate(self, Gs, Gs_kwargs, num_gpus):
        minibatch_size = num_gpus * self.minibatch_per_gpu
        feature_net = misc.load_pkl('https://drive.google.com/uc?id=1MzY4MFpZzE-mNS26pzhYlWN-4vMm2ytu') # vgg16.pkl

        # Calculate features for reals, caching them on disk keyed by dataset args.
        cache_file = self._get_cache_file_for_reals(num_images=self.num_images)
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        if os.path.isfile(cache_file):
            ref_features = misc.load_pkl(cache_file)
        else:
            ref_features = np.empty([self.num_images, feature_net.output_shape[1]], dtype=np.float32)
            for idx, images in enumerate(self._iterate_reals(minibatch_size=minibatch_size)):
                begin = idx * minibatch_size
                end = min(begin + minibatch_size, self.num_images)
                ref_features[begin:end] = feature_net.run(images[:end-begin], num_gpus=num_gpus, assume_frozen=True)
                if end == self.num_images:
                    break
            misc.save_pkl(ref_features, cache_file)

        # Construct TensorFlow graph: one generator + feature-net clone per GPU.
        result_expr = []
        for gpu_idx in range(num_gpus):
            with tf.device('/gpu:%d' % gpu_idx):
                Gs_clone = Gs.clone()
                feature_net_clone = feature_net.clone()
                latents = tf.random_normal([self.minibatch_per_gpu] + Gs_clone.input_shape[1:])
                labels = self._get_random_labels_tf(self.minibatch_per_gpu)
                images = Gs_clone.get_output_for(latents, labels, **Gs_kwargs)
                images = tflib.convert_images_to_uint8(images)
                result_expr.append(feature_net_clone.get_output_for(images))

        # Calculate features for fakes.
        eval_features = np.empty([self.num_images, feature_net.output_shape[1]], dtype=np.float32)
        for begin in range(0, self.num_images, minibatch_size):
            self._report_progress(begin, self.num_images)
            end = min(begin + minibatch_size, self.num_images)
            eval_features[begin:end] = np.concatenate(tflib.run(result_expr), axis=0)[:end-begin]

        # Calculate precision and recall.
        # BUGFIX: col_batch_size previously received self.row_batch_size, so the
        # configured self.col_batch_size was stored but never used. With the
        # shipped defaults (row == col == 10000) the result is unchanged.
        state = knn_precision_recall_features(ref_features=ref_features, eval_features=eval_features, feature_net=feature_net,
                                              nhood_sizes=[self.nhood_size], row_batch_size=self.row_batch_size, col_batch_size=self.col_batch_size, num_gpus=num_gpus)
        self._report_result(state.knn_precision[0], suffix='_precision')
        self._report_result(state.knn_recall[0], suffix='_recall')

#----------------------------------------------------------------------------
# Every entry maps a 'gdrive:' pseudo-path to the same CloudFront mirror, so the
# table is generated from the relative paths instead of spelling out each URL.
_URL_BASE = 'http://d36zk2xti64re0.cloudfront.net/stylegan2/'

# Main pre-trained networks: per-dataset config variants.
_NETWORK_PATHS = ['networks/stylegan2-%s-config-%s.pkl' % (model, cfg)
                  for model, cfgs in [('car', 'abcdef'), ('cat', 'af'), ('church', 'af'),
                                      ('ffhq', 'abcdef'), ('horse', 'af')]
                  for cfg in cfgs]

# Table-2 ablations: config-e with every generator/discriminator architecture combo.
_NETWORK_PATHS += ['networks/table2/stylegan2-%s-config-e-G%s-D%s.pkl' % (model, g_arch, d_arch)
                   for model in ('car', 'ffhq')
                   for g_arch in ('orig', 'resnet', 'skip')
                   for d_arch in ('orig', 'resnet', 'skip')]

gdrive_urls = {'gdrive:' + path: _URL_BASE + path for path in _NETWORK_PATHS}

#----------------------------------------------------------------------------

def get_path_or_url(path_or_gdrive_path):
    """Resolve a 'gdrive:' pseudo-path to its download URL; other values pass through."""
    return gdrive_urls.get(path_or_gdrive_path, path_or_gdrive_path)

#----------------------------------------------------------------------------

_cached_networks = dict()

def load_networks(path_or_gdrive_path):
    """Load a (G, D, Gs) network triple from a local pickle or known URL.

    Results are memoized per resolved path, so repeated loads are free.
    """
    path_or_url = get_path_or_url(path_or_gdrive_path)
    if path_or_url in _cached_networks:
        return _cached_networks[path_or_url]

    # URLs go through dnnlib's cached downloader; local paths are opened directly.
    if external.stylegan2.dnnlib.util.is_url(path_or_url):
        stream = external.stylegan2.dnnlib.util.open_url(path_or_url, cache_dir='.stylegan2-cache')
    else:
        stream = open(path_or_url, 'rb')

    tflib.init_tf()
    with stream:
        G, D, Gs = pickle.load(stream, encoding='latin1')
    _cached_networks[path_or_url] = G, D, Gs
    return G, D, Gs

#----------------------------------------------------------------------------
+#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/projector.py b/insightface/reconstruction/ostec/external/stylegan2/projector.py new file mode 100644 index 0000000000000000000000000000000000000000..7a2f989961c7c784fd0a16e2bd4c74152fe6e953 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/projector.py @@ -0,0 +1,206 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +import numpy as np +import tensorflow as tf +import dnnlib +import dnnlib.tflib as tflib + +from training import misc + +#---------------------------------------------------------------------------- + +class Projector: + def __init__(self): + self.num_steps = 1000 + self.dlatent_avg_samples = 10000 + self.initial_learning_rate = 0.1 + self.initial_noise_factor = 0.05 + self.lr_rampdown_length = 0.25 + self.lr_rampup_length = 0.05 + self.noise_ramp_length = 0.75 + self.regularize_noise_weight = 1e5 + self.verbose = False + self.clone_net = True + + self._Gs = None + self._minibatch_size = None + self._dlatent_avg = None + self._dlatent_std = None + self._noise_vars = None + self._noise_init_op = None + self._noise_normalize_op = None + self._dlatents_var = None + self._noise_in = None + self._dlatents_expr = None + self._images_expr = None + self._target_images_var = None + self._lpips = None + self._dist = None + self._loss = None + self._reg_sizes = None + self._lrate_in = None + self._opt = None + self._opt_step = None + self._cur_step = None + + def _info(self, *args): + if self.verbose: + print('Projector:', *args) + + def set_network(self, Gs, minibatch_size=1): + assert minibatch_size == 1 + self._Gs = Gs + self._minibatch_size = minibatch_size + if self._Gs is None: + return + if self.clone_net: + 
self._Gs = self._Gs.clone() + + # Find dlatent stats. + self._info('Finding W midpoint and stddev using %d samples...' % self.dlatent_avg_samples) + latent_samples = np.random.RandomState(123).randn(self.dlatent_avg_samples, *self._Gs.input_shapes[0][1:]) + dlatent_samples = self._Gs.components.mapping.run(latent_samples, None)[:, :1, :] # [N, 1, 512] + self._dlatent_avg = np.mean(dlatent_samples, axis=0, keepdims=True) # [1, 1, 512] + self._dlatent_std = (np.sum((dlatent_samples - self._dlatent_avg) ** 2) / self.dlatent_avg_samples) ** 0.5 + self._info('std = %g' % self._dlatent_std) + + # Find noise inputs. + self._info('Setting up noise inputs...') + self._noise_vars = [] + noise_init_ops = [] + noise_normalize_ops = [] + while True: + n = 'G_synthesis/noise%d' % len(self._noise_vars) + if not n in self._Gs.vars: + break + v = self._Gs.vars[n] + self._noise_vars.append(v) + noise_init_ops.append(tf.assign(v, tf.random_normal(tf.shape(v), dtype=tf.float32))) + noise_mean = tf.reduce_mean(v) + noise_std = tf.reduce_mean((v - noise_mean)**2)**0.5 + noise_normalize_ops.append(tf.assign(v, (v - noise_mean) / noise_std)) + self._info(n, v) + self._noise_init_op = tf.group(*noise_init_ops) + self._noise_normalize_op = tf.group(*noise_normalize_ops) + + # Image output graph. + self._info('Building image output graph...') + self._dlatents_var = tf.Variable(tf.zeros([self._minibatch_size] + list(self._dlatent_avg.shape[1:])), name='dlatents_var') + self._noise_in = tf.placeholder(tf.float32, [], name='noise_in') + dlatents_noise = tf.random.normal(shape=self._dlatents_var.shape) * self._noise_in + self._dlatents_expr = tf.tile(self._dlatents_var + dlatents_noise, [1, self._Gs.components.synthesis.input_shape[1], 1]) + self._images_expr = self._Gs.components.synthesis.get_output_for(self._dlatents_expr, randomize_noise=False) + + # Downsample image to 256x256 if it's larger than that. VGG was built for 224x224 images. 
+ proc_images_expr = (self._images_expr + 1) * (255 / 2) + sh = proc_images_expr.shape.as_list() + if sh[2] > 256: + factor = sh[2] // 256 + proc_images_expr = tf.reduce_mean(tf.reshape(proc_images_expr, [-1, sh[1], sh[2] // factor, factor, sh[2] // factor, factor]), axis=[3,5]) + + # Loss graph. + self._info('Building loss graph...') + self._target_images_var = tf.Variable(tf.zeros(proc_images_expr.shape), name='target_images_var') + if self._lpips is None: + self._lpips = misc.load_pkl('https://drive.google.com/uc?id=1N2-m9qszOeVC9Tq77WxsLnuWwOedQiD2') # vgg16_zhang_perceptual.pkl + self._dist = self._lpips.get_output_for(proc_images_expr, self._target_images_var) + self._loss = tf.reduce_sum(self._dist) + + # Noise regularization graph. + self._info('Building noise regularization graph...') + reg_loss = 0.0 + for v in self._noise_vars: + sz = v.shape[2] + while True: + reg_loss += tf.reduce_mean(v * tf.roll(v, shift=1, axis=3))**2 + tf.reduce_mean(v * tf.roll(v, shift=1, axis=2))**2 + if sz <= 8: + break # Small enough already + v = tf.reshape(v, [1, 1, sz//2, 2, sz//2, 2]) # Downscale + v = tf.reduce_mean(v, axis=[3, 5]) + sz = sz // 2 + self._loss += reg_loss * self.regularize_noise_weight + + # Optimizer. + self._info('Setting up optimizer...') + self._lrate_in = tf.placeholder(tf.float32, [], name='lrate_in') + self._opt = dnnlib.tflib.Optimizer(learning_rate=self._lrate_in) + self._opt.register_gradients(self._loss, [self._dlatents_var] + self._noise_vars) + self._opt_step = self._opt.apply_updates() + + def run(self, target_images): + # Run to completion. + self.start(target_images) + while self._cur_step < self.num_steps: + self.step() + + # Collect results. + pres = dnnlib.EasyDict() + pres.dlatents = self.get_dlatents() + pres.noises = self.get_noises() + pres.images = self.get_images() + return pres + + def start(self, target_images): + assert self._Gs is not None + + # Prepare target images. 
+ self._info('Preparing target images...') + target_images = np.asarray(target_images, dtype='float32') + target_images = (target_images + 1) * (255 / 2) + sh = target_images.shape + assert sh[0] == self._minibatch_size + if sh[2] > self._target_images_var.shape[2]: + factor = sh[2] // self._target_images_var.shape[2] + target_images = np.reshape(target_images, [-1, sh[1], sh[2] // factor, factor, sh[3] // factor, factor]).mean((3, 5)) + + # Initialize optimization state. + self._info('Initializing optimization state...') + tflib.set_vars({self._target_images_var: target_images, self._dlatents_var: np.tile(self._dlatent_avg, [self._minibatch_size, 1, 1])}) + tflib.run(self._noise_init_op) + self._opt.reset_optimizer_state() + self._cur_step = 0 + + def step(self): + assert self._cur_step is not None + if self._cur_step >= self.num_steps: + return + if self._cur_step == 0: + self._info('Running...') + + # Hyperparameters. + t = self._cur_step / self.num_steps + noise_strength = self._dlatent_std * self.initial_noise_factor * max(0.0, 1.0 - t / self.noise_ramp_length) ** 2 + lr_ramp = min(1.0, (1.0 - t) / self.lr_rampdown_length) + lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi) + lr_ramp = lr_ramp * min(1.0, t / self.lr_rampup_length) + learning_rate = self.initial_learning_rate * lr_ramp + + # Train. + feed_dict = {self._noise_in: noise_strength, self._lrate_in: learning_rate} + _, dist_value, loss_value = tflib.run([self._opt_step, self._dist, self._loss], feed_dict) + tflib.run(self._noise_normalize_op) + + # Print status. 
+ self._cur_step += 1 + if self._cur_step == self.num_steps or self._cur_step % 10 == 0: + self._info('%-8d%-12g%-12g' % (self._cur_step, dist_value, loss_value)) + if self._cur_step == self.num_steps: + self._info('Done.') + + def get_cur_step(self): + return self._cur_step + + def get_dlatents(self): + return tflib.run(self._dlatents_expr, {self._noise_in: 0}) + + def get_noises(self): + return tflib.run(self._noise_vars) + + def get_images(self): + return tflib.run(self._images_expr, {self._noise_in: 0}) + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/run_generator.py b/insightface/reconstruction/ostec/external/stylegan2/run_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..1cd52c78731831989f3ec2baf6dd45a43ad03a91 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/run_generator.py @@ -0,0 +1,173 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +import argparse +import numpy as np +import PIL.Image +import dnnlib +import dnnlib.tflib as tflib +import re +import sys + +import pretrained_networks + +#---------------------------------------------------------------------------- + +def generate_images(network_pkl, seeds, truncation_psi): + print('Loading networks from "%s"...' 
% network_pkl) + _G, _D, Gs = pretrained_networks.load_networks(network_pkl) + noise_vars = [var for name, var in Gs.components.synthesis.vars.items() if name.startswith('noise')] + + Gs_kwargs = dnnlib.EasyDict() + Gs_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True) + Gs_kwargs.randomize_noise = False + if truncation_psi is not None: + Gs_kwargs.truncation_psi = truncation_psi + + for seed_idx, seed in enumerate(seeds): + print('Generating image for seed %d (%d/%d) ...' % (seed, seed_idx, len(seeds))) + rnd = np.random.RandomState(seed) + z = rnd.randn(1, *Gs.input_shape[1:]) # [minibatch, component] + tflib.set_vars({var: rnd.randn(*var.shape.as_list()) for var in noise_vars}) # [height, width] + images = Gs.run(z, None, **Gs_kwargs) # [minibatch, height, width, channel] + PIL.Image.fromarray(images[0], 'RGB').save(dnnlib.make_run_dir_path('seed%04d.png' % seed)) + +#---------------------------------------------------------------------------- + +def style_mixing_example(network_pkl, row_seeds, col_seeds, truncation_psi, col_styles, minibatch_size=4): + print('Loading networks from "%s"...' 
% network_pkl) + _G, _D, Gs = pretrained_networks.load_networks(network_pkl) + w_avg = Gs.get_var('dlatent_avg') # [component] + + Gs_syn_kwargs = dnnlib.EasyDict() + Gs_syn_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True) + Gs_syn_kwargs.randomize_noise = False + Gs_syn_kwargs.minibatch_size = minibatch_size + + print('Generating W vectors...') + all_seeds = list(set(row_seeds + col_seeds)) + all_z = np.stack([np.random.RandomState(seed).randn(*Gs.input_shape[1:]) for seed in all_seeds]) # [minibatch, component] + all_w = Gs.components.mapping.run(all_z, None) # [minibatch, layer, component] + w1 = np.load('latent_representations/im_right_01.npy') + w2 = np.load('latent_representations/04753.000002.02_C_01.npy') + all_w = np.stack([w1, w2],0) + all_w = w_avg + (all_w - w_avg) * truncation_psi # [minibatch, layer, component] + w_dict = {seed: w for seed, w in zip(all_seeds, list(all_w))} # [layer, component] + + print('Generating images...') + all_images = Gs.components.synthesis.run(all_w, **Gs_syn_kwargs) # [minibatch, height, width, channel] + image_dict = {(seed, seed): image for seed, image in zip(all_seeds, list(all_images))} + + print('Generating style-mixed images...') + for row_seed in row_seeds: + for col_seed in col_seeds: + w = w_dict[row_seed].copy() + w[col_styles] = w_dict[col_seed][col_styles] + image = Gs.components.synthesis.run(w[np.newaxis], **Gs_syn_kwargs)[0] + image_dict[(row_seed, col_seed)] = image + + print('Saving images...') + for (row_seed, col_seed), image in image_dict.items(): + PIL.Image.fromarray(image, 'RGB').save(dnnlib.make_run_dir_path('%d-%d.png' % (row_seed, col_seed))) + + print('Saving image grid...') + _N, _C, H, W = Gs.output_shape + canvas = PIL.Image.new('RGB', (W * (len(col_seeds) + 1), H * (len(row_seeds) + 1)), 'black') + for row_idx, row_seed in enumerate([None] + row_seeds): + for col_idx, col_seed in enumerate([None] + col_seeds): + if row_seed is None and col_seed is None: + 
continue + key = (row_seed, col_seed) + if row_seed is None: + key = (col_seed, col_seed) + if col_seed is None: + key = (row_seed, row_seed) + canvas.paste(PIL.Image.fromarray(image_dict[key], 'RGB'), (W * col_idx, H * row_idx)) + canvas.save(dnnlib.make_run_dir_path('grid.png')) + +#---------------------------------------------------------------------------- + +def _parse_num_range(s): + '''Accept either a comma separated list of numbers 'a,b,c' or a range 'a-c' and return as a list of ints.''' + + range_re = re.compile(r'^(\d+)-(\d+)$') + m = range_re.match(s) + if m: + return range(int(m.group(1)), int(m.group(2))+1) + vals = s.split(',') + return [int(x) for x in vals] + +#---------------------------------------------------------------------------- + +_examples = '''examples: + + # Generate ffhq uncurated images (matches paper Figure 12) + python %(prog)s generate-images --network=gdrive:networks/stylegan2-ffhq-config-f.pkl --seeds=6600-6625 --truncation-psi=0.5 + + # Generate ffhq curated images (matches paper Figure 11) + python %(prog)s generate-images --network=gdrive:networks/stylegan2-ffhq-config-f.pkl --seeds=66,230,389,1518 --truncation-psi=1.0 + + # Generate uncurated car images (matches paper Figure 12) + python %(prog)s generate-images --network=gdrive:networks/stylegan2-car-config-f.pkl --seeds=6000-6025 --truncation-psi=0.5 + + # Generate style mixing example (matches style mixing video clip) + python %(prog)s style-mixing-example --network=gdrive:networks/stylegan2-ffhq-config-f.pkl --row-seeds=85,100,75,458,1500 --col-seeds=55,821,1789,293 --truncation-psi=1.0 +''' + +#---------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description='''StyleGAN2 generator. 
+ +Run 'python %(prog)s --help' for subcommand help.''', + epilog=_examples, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + subparsers = parser.add_subparsers(help='Sub-commands', dest='command') + + parser_generate_images = subparsers.add_parser('generate-images', help='Generate images') + parser_generate_images.add_argument('--network', help='Network pickle filename', dest='network_pkl', required=True) + parser_generate_images.add_argument('--seeds', type=_parse_num_range, help='List of random seeds', required=True) + parser_generate_images.add_argument('--truncation-psi', type=float, help='Truncation psi (default: %(default)s)', default=0.5) + parser_generate_images.add_argument('--result-dir', help='Root directory for run results (default: %(default)s)', default='results', metavar='DIR') + + parser_style_mixing_example = subparsers.add_parser('style-mixing-example', help='Generate style mixing video') + parser_style_mixing_example.add_argument('--network', help='Network pickle filename', dest='network_pkl', required=True) + parser_style_mixing_example.add_argument('--row-seeds', type=_parse_num_range, help='Random seeds to use for image rows', required=True) + parser_style_mixing_example.add_argument('--col-seeds', type=_parse_num_range, help='Random seeds to use for image columns', required=True) + parser_style_mixing_example.add_argument('--col-styles', type=_parse_num_range, help='Style layer range (default: %(default)s)', default='0-6') + parser_style_mixing_example.add_argument('--truncation-psi', type=float, help='Truncation psi (default: %(default)s)', default=0.5) + parser_style_mixing_example.add_argument('--result-dir', help='Root directory for run results (default: %(default)s)', default='results', metavar='DIR') + + args = parser.parse_args() + kwargs = vars(args) + subcmd = kwargs.pop('command') + + if subcmd is None: + print ('Error: missing subcommand. 
Re-run with --help for usage.') + sys.exit(1) + + sc = dnnlib.SubmitConfig() + sc.num_gpus = 1 + sc.submit_target = dnnlib.SubmitTarget.LOCAL + sc.local.do_not_copy_source_files = True + sc.run_dir_root = kwargs.pop('result_dir') + sc.run_desc = subcmd + + func_name_map = { + 'generate-images': 'run_generator.generate_images', + 'style-mixing-example': 'run_generator.style_mixing_example' + } + dnnlib.submit_run(sc, func_name_map[subcmd], **kwargs) + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/run_metrics.py b/insightface/reconstruction/ostec/external/stylegan2/run_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..5043b100faf3f58273cdf00239611d950962324c --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/run_metrics.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +import argparse +import os +import sys + +import dnnlib +import dnnlib.tflib as tflib + +import pretrained_networks +from metrics import metric_base +from metrics.metric_defaults import metric_defaults + +#---------------------------------------------------------------------------- + +def run(network_pkl, metrics, dataset, data_dir, mirror_augment): + print('Evaluating metrics "%s" for "%s"...' 
% (','.join(metrics), network_pkl)) + tflib.init_tf() + network_pkl = pretrained_networks.get_path_or_url(network_pkl) + dataset_args = dnnlib.EasyDict(tfrecord_dir=dataset, shuffle_mb=0) + num_gpus = dnnlib.submit_config.num_gpus + metric_group = metric_base.MetricGroup([metric_defaults[metric] for metric in metrics]) + metric_group.run(network_pkl, data_dir=data_dir, dataset_args=dataset_args, mirror_augment=mirror_augment, num_gpus=num_gpus) + +#---------------------------------------------------------------------------- + +def _str_to_bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + +#---------------------------------------------------------------------------- + +_examples = '''examples: + + python %(prog)s --data-dir=~/datasets --network=gdrive:networks/stylegan2-ffhq-config-f.pkl --metrics=fid50k,ppl_wend --dataset=ffhq --mirror-augment=true + +valid metrics: + + ''' + ', '.join(sorted([x for x in metric_defaults.keys()])) + ''' +''' + +def main(): + parser = argparse.ArgumentParser( + description='Run StyleGAN2 metrics.', + epilog=_examples, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument('--result-dir', help='Root directory for run results (default: %(default)s)', default='results', metavar='DIR') + parser.add_argument('--network', help='Network pickle filename', dest='network_pkl', required=True) + parser.add_argument('--metrics', help='Metrics to compute (default: %(default)s)', default='fid50k', type=lambda x: x.split(',')) + parser.add_argument('--dataset', help='Training dataset', required=True) + parser.add_argument('--data-dir', help='Dataset root directory', required=True) + parser.add_argument('--mirror-augment', help='Mirror augment (default: %(default)s)', default=False, type=_str_to_bool, metavar='BOOL') + 
parser.add_argument('--num-gpus', help='Number of GPUs to use', type=int, default=1, metavar='N') + + args = parser.parse_args() + + if not os.path.exists(args.data_dir): + print ('Error: dataset root directory does not exist.') + sys.exit(1) + + kwargs = vars(args) + sc = dnnlib.SubmitConfig() + sc.num_gpus = kwargs.pop('num_gpus') + sc.submit_target = dnnlib.SubmitTarget.LOCAL + sc.local.do_not_copy_source_files = True + sc.run_dir_root = kwargs.pop('result_dir') + sc.run_desc = 'run-metrics' + dnnlib.submit_run(sc, 'run_metrics.run', **kwargs) + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/run_projector.py b/insightface/reconstruction/ostec/external/stylegan2/run_projector.py new file mode 100644 index 0000000000000000000000000000000000000000..a2f7a68004e0ef50f551a26de277b1df32a59f8a --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/run_projector.py @@ -0,0 +1,148 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +import argparse +import numpy as np +import dnnlib +import dnnlib.tflib as tflib +import re +import sys + +import projector +import pretrained_networks +from training import dataset +from training import misc + +#---------------------------------------------------------------------------- + +def project_image(proj, targets, png_prefix, num_snapshots): + snapshot_steps = set(proj.num_steps - np.linspace(0, proj.num_steps, num_snapshots, endpoint=False, dtype=int)) + misc.save_image_grid(targets, png_prefix + 'target.png', drange=[-1, 1]) + proj.start(targets) + while proj.get_cur_step() < proj.num_steps: + print('\r%d / %d ... 
' % (proj.get_cur_step(), proj.num_steps), end='', flush=True) + proj.step() + if proj.get_cur_step() in snapshot_steps: + misc.save_image_grid(proj.get_images(), png_prefix + 'step%04d.png' % proj.get_cur_step(), drange=[-1, 1]) + print('\r%-30s\r' % '', end='', flush=True) + +#---------------------------------------------------------------------------- + +def project_generated_images(network_pkl, seeds, num_snapshots, truncation_psi): + print('Loading networks from "%s"...' % network_pkl) + _G, _D, Gs = pretrained_networks.load_networks(network_pkl) + proj = projector.Projector() + proj.set_network(Gs) + noise_vars = [var for name, var in Gs.components.synthesis.vars.items() if name.startswith('noise')] + + Gs_kwargs = dnnlib.EasyDict() + Gs_kwargs.randomize_noise = False + Gs_kwargs.truncation_psi = truncation_psi + + for seed_idx, seed in enumerate(seeds): + print('Projecting seed %d (%d/%d) ...' % (seed, seed_idx, len(seeds))) + rnd = np.random.RandomState(seed) + z = rnd.randn(1, *Gs.input_shape[1:]) + tflib.set_vars({var: rnd.randn(*var.shape.as_list()) for var in noise_vars}) + images = Gs.run(z, None, **Gs_kwargs) + project_image(proj, targets=images, png_prefix=dnnlib.make_run_dir_path('seed%04d-' % seed), num_snapshots=num_snapshots) + +#---------------------------------------------------------------------------- + +def project_real_images(network_pkl, dataset_name, data_dir, num_images, num_snapshots): + print('Loading networks from "%s"...' % network_pkl) + _G, _D, Gs = pretrained_networks.load_networks(network_pkl) + proj = projector.Projector() + proj.set_network(Gs) + + print('Loading images from "%s"...' % dataset_name) + dataset_obj = dataset.load_dataset(data_dir=data_dir, tfrecord_dir=dataset_name, max_label_size=0, repeat=False, shuffle_mb=0) + assert dataset_obj.shape == Gs.output_shape[1:] + + for image_idx in range(num_images): + print('Projecting image %d/%d ...' 
% (image_idx, num_images)) + images, _labels = dataset_obj.get_minibatch_np(1) + images = misc.adjust_dynamic_range(images, [0, 255], [-1, 1]) + project_image(proj, targets=images, png_prefix=dnnlib.make_run_dir_path('image%04d-' % image_idx), num_snapshots=num_snapshots) + +#---------------------------------------------------------------------------- + +def _parse_num_range(s): + '''Accept either a comma separated list of numbers 'a,b,c' or a range 'a-c' and return as a list of ints.''' + + range_re = re.compile(r'^(\d+)-(\d+)$') + m = range_re.match(s) + if m: + return range(int(m.group(1)), int(m.group(2))+1) + vals = s.split(',') + return [int(x) for x in vals] + +#---------------------------------------------------------------------------- + +_examples = '''examples: + + # Project generated images + python %(prog)s project-generated-images --network=gdrive:networks/stylegan2-car-config-f.pkl --seeds=0,1,5 + + # Project real images + python %(prog)s project-real-images --network=gdrive:networks/stylegan2-car-config-f.pkl --dataset=car --data-dir=~/datasets + +''' + +#---------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description='''StyleGAN2 projector. 
+ +Run 'python %(prog)s --help' for subcommand help.''', + epilog=_examples, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + subparsers = parser.add_subparsers(help='Sub-commands', dest='command') + + project_generated_images_parser = subparsers.add_parser('project-generated-images', help='Project generated images') + project_generated_images_parser.add_argument('--network', help='Network pickle filename', dest='network_pkl', required=True) + project_generated_images_parser.add_argument('--seeds', type=_parse_num_range, help='List of random seeds', default=range(3)) + project_generated_images_parser.add_argument('--num-snapshots', type=int, help='Number of snapshots (default: %(default)s)', default=5) + project_generated_images_parser.add_argument('--truncation-psi', type=float, help='Truncation psi (default: %(default)s)', default=1.0) + project_generated_images_parser.add_argument('--result-dir', help='Root directory for run results (default: %(default)s)', default='results', metavar='DIR') + + project_real_images_parser = subparsers.add_parser('project-real-images', help='Project real images') + project_real_images_parser.add_argument('--network', help='Network pickle filename', dest='network_pkl', required=True) + project_real_images_parser.add_argument('--data-dir', help='Dataset root directory', required=True) + project_real_images_parser.add_argument('--dataset', help='Training dataset', dest='dataset_name', required=True) + project_real_images_parser.add_argument('--num-snapshots', type=int, help='Number of snapshots (default: %(default)s)', default=5) + project_real_images_parser.add_argument('--num-images', type=int, help='Number of images to project (default: %(default)s)', default=3) + project_real_images_parser.add_argument('--result-dir', help='Root directory for run results (default: %(default)s)', default='results', metavar='DIR') + + args = parser.parse_args() + subcmd = args.command + if subcmd is None: + print ('Error: missing 
subcommand. Re-run with --help for usage.') + sys.exit(1) + + kwargs = vars(args) + sc = dnnlib.SubmitConfig() + sc.num_gpus = 1 + sc.submit_target = dnnlib.SubmitTarget.LOCAL + sc.local.do_not_copy_source_files = True + sc.run_dir_root = kwargs.pop('result_dir') + sc.run_desc = kwargs.pop('command') + + func_name_map = { + 'project-generated-images': 'run_projector.project_generated_images', + 'project-real-images': 'run_projector.project_real_images' + } + dnnlib.submit_run(sc, func_name_map[subcmd], **kwargs) + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/run_training.py b/insightface/reconstruction/ostec/external/stylegan2/run_training.py new file mode 100644 index 0000000000000000000000000000000000000000..bc4c0a2bd414d8ddfe0edaa4b29db5532c440ef1 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/run_training.py @@ -0,0 +1,195 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +import argparse +import copy +import os +import sys + +import dnnlib +from dnnlib import EasyDict + +from metrics.metric_defaults import metric_defaults + +#---------------------------------------------------------------------------- + +_valid_configs = [ + # Table 1 + 'config-a', # Baseline StyleGAN + 'config-b', # + Weight demodulation + 'config-c', # + Lazy regularization + 'config-d', # + Path length regularization + 'config-e', # + No growing, new G & D arch. 
+ 'config-f', # + Large networks (default) + + # Table 2 + 'config-e-Gorig-Dorig', 'config-e-Gorig-Dresnet', 'config-e-Gorig-Dskip', + 'config-e-Gresnet-Dorig', 'config-e-Gresnet-Dresnet', 'config-e-Gresnet-Dskip', + 'config-e-Gskip-Dorig', 'config-e-Gskip-Dresnet', 'config-e-Gskip-Dskip', +] + +#---------------------------------------------------------------------------- + +def run(dataset, data_dir, result_dir, config_id, num_gpus, total_kimg, gamma, mirror_augment, metrics): + train = EasyDict(run_func_name='training.training_loop.training_loop') # Options for training loop. + G = EasyDict(func_name='training.networks_stylegan2.G_main') # Options for generator network. + D = EasyDict(func_name='training.networks_stylegan2.D_stylegan2') # Options for discriminator network. + G_opt = EasyDict(beta1=0.0, beta2=0.99, epsilon=1e-8) # Options for generator optimizer. + D_opt = EasyDict(beta1=0.0, beta2=0.99, epsilon=1e-8) # Options for discriminator optimizer. + G_loss = EasyDict(func_name='training.loss.G_logistic_ns_pathreg') # Options for generator loss. + D_loss = EasyDict(func_name='training.loss.D_logistic_r1') # Options for discriminator loss. + sched = EasyDict() # Options for TrainingSchedule. + grid = EasyDict(size='8k', layout='random') # Options for setup_snapshot_image_grid(). + sc = dnnlib.SubmitConfig() # Options for dnnlib.submit_run(). + tf_config = {'rnd.np_random_seed': 1000} # Options for tflib.init_tf(). 
+ + train.data_dir = data_dir + train.total_kimg = total_kimg + train.mirror_augment = mirror_augment + train.image_snapshot_ticks = train.network_snapshot_ticks = 10 + sched.G_lrate_base = sched.D_lrate_base = 0.002 + sched.minibatch_size_base = 32 + sched.minibatch_gpu_base = 4 + D_loss.gamma = 10 + metrics = [metric_defaults[x] for x in metrics] + desc = 'stylegan2' + + desc += '-' + dataset + dataset_args = EasyDict(tfrecord_dir=dataset) + + assert num_gpus in [1, 2, 4, 8] + sc.num_gpus = num_gpus + desc += '-%dgpu' % num_gpus + + assert config_id in _valid_configs + desc += '-' + config_id + + # Configs A-E: Shrink networks to match original StyleGAN. + if config_id != 'config-f': + G.fmap_base = D.fmap_base = 8 << 10 + + # Config E: Set gamma to 100 and override G & D architecture. + if config_id.startswith('config-e'): + D_loss.gamma = 100 + if 'Gorig' in config_id: G.architecture = 'orig' + if 'Gskip' in config_id: G.architecture = 'skip' # (default) + if 'Gresnet' in config_id: G.architecture = 'resnet' + if 'Dorig' in config_id: D.architecture = 'orig' + if 'Dskip' in config_id: D.architecture = 'skip' + if 'Dresnet' in config_id: D.architecture = 'resnet' # (default) + + # Configs A-D: Enable progressive growing and switch to networks that support it. + if config_id in ['config-a', 'config-b', 'config-c', 'config-d']: + sched.lod_initial_resolution = 8 + sched.G_lrate_base = sched.D_lrate_base = 0.001 + sched.G_lrate_dict = sched.D_lrate_dict = {128: 0.0015, 256: 0.002, 512: 0.003, 1024: 0.003} + sched.minibatch_size_base = 32 # (default) + sched.minibatch_size_dict = {8: 256, 16: 128, 32: 64, 64: 32} + sched.minibatch_gpu_base = 4 # (default) + sched.minibatch_gpu_dict = {8: 32, 16: 16, 32: 8, 64: 4} + G.synthesis_func = 'G_synthesis_stylegan_revised' + D.func_name = 'training.networks_stylegan2.D_stylegan' + + # Configs A-C: Disable path length regularization. 
+ if config_id in ['config-a', 'config-b', 'config-c']: + G_loss = EasyDict(func_name='training.loss.G_logistic_ns') + + # Configs A-B: Disable lazy regularization. + if config_id in ['config-a', 'config-b']: + train.lazy_regularization = False + + # Config A: Switch to original StyleGAN networks. + if config_id == 'config-a': + G = EasyDict(func_name='training.networks_stylegan.G_style') + D = EasyDict(func_name='training.networks_stylegan.D_basic') + + if gamma is not None: + D_loss.gamma = gamma + + sc.submit_target = dnnlib.SubmitTarget.LOCAL + sc.local.do_not_copy_source_files = True + kwargs = EasyDict(train) + kwargs.update(G_args=G, D_args=D, G_opt_args=G_opt, D_opt_args=D_opt, G_loss_args=G_loss, D_loss_args=D_loss) + kwargs.update(dataset_args=dataset_args, sched_args=sched, grid_args=grid, metric_arg_list=metrics, tf_config=tf_config) + kwargs.submit_config = copy.deepcopy(sc) + kwargs.submit_config.run_dir_root = result_dir + kwargs.submit_config.run_desc = desc + dnnlib.submit_run(**kwargs) + +#---------------------------------------------------------------------------- + +def _str_to_bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + +def _parse_comma_sep(s): + if s is None or s.lower() == 'none' or s == '': + return [] + return s.split(',') + +#---------------------------------------------------------------------------- + +_examples = '''examples: + + # Train StyleGAN2 using the FFHQ dataset + python %(prog)s --num-gpus=8 --data-dir=~/datasets --config=config-f --dataset=ffhq --mirror-augment=true + +valid configs: + + ''' + ', '.join(_valid_configs) + ''' + +valid metrics: + + ''' + ', '.join(sorted([x for x in metric_defaults.keys()])) + ''' + +''' + +def main(): + parser = argparse.ArgumentParser( + description='Train StyleGAN2.', + epilog=_examples, + 
formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument('--result-dir', help='Root directory for run results (default: %(default)s)', default='results', metavar='DIR') + parser.add_argument('--data-dir', help='Dataset root directory', required=True) + parser.add_argument('--dataset', help='Training dataset', required=True) + parser.add_argument('--config', help='Training config (default: %(default)s)', default='config-f', required=True, dest='config_id', metavar='CONFIG') + parser.add_argument('--num-gpus', help='Number of GPUs (default: %(default)s)', default=1, type=int, metavar='N') + parser.add_argument('--total-kimg', help='Training length in thousands of images (default: %(default)s)', metavar='KIMG', default=25000, type=int) + parser.add_argument('--gamma', help='R1 regularization weight (default is config dependent)', default=None, type=float) + parser.add_argument('--mirror-augment', help='Mirror augment (default: %(default)s)', default=False, metavar='BOOL', type=_str_to_bool) + parser.add_argument('--metrics', help='Comma-separated list of metrics or "none" (default: %(default)s)', default='fid50k', type=_parse_comma_sep) + + args = parser.parse_args() + + if not os.path.exists(args.data_dir): + print ('Error: dataset root directory does not exist.') + sys.exit(1) + + if args.config_id not in _valid_configs: + print ('Error: --config value must be one of: ', ', '.join(_valid_configs)) + sys.exit(1) + + for metric in args.metrics: + if metric not in metric_defaults: + print ('Error: unknown metric \'%s\'' % metric) + sys.exit(1) + + run(**vars(args)) + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- + diff --git a/insightface/reconstruction/ostec/external/stylegan2/test_nvcc.cu b/insightface/reconstruction/ostec/external/stylegan2/test_nvcc.cu new file mode 100644 index 
0000000000000000000000000000000000000000..8b09bbfe01b0a404f32558b4708efa6ece6ddf9f --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/test_nvcc.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +// +// This work is made available under the Nvidia Source Code License-NC. +// To view a copy of this license, visit +// https://nvlabs.github.io/stylegan2/license.html + +#include <cstdio> + +void checkCudaError(cudaError_t err) +{ + if (err != cudaSuccess) + { + printf("%s: %s\n", cudaGetErrorName(err), cudaGetErrorString(err)); + exit(1); + } +} + +__global__ void cudaKernel(void) +{ + printf("GPU says hello.\n"); +} + +int main(void) +{ + printf("CPU says hello.\n"); + checkCudaError(cudaLaunchKernel((void*)cudaKernel, 1, 1, NULL, 0, NULL)); + checkCudaError(cudaDeviceSynchronize()); + return 0; +} diff --git a/insightface/reconstruction/ostec/external/stylegan2/training/__init__.py b/insightface/reconstruction/ostec/external/stylegan2/training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9ab9908efa3cb38af52e8d5bcaa8acffde5a8875 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/training/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +# empty diff --git a/insightface/reconstruction/ostec/external/stylegan2/training/dataset.py b/insightface/reconstruction/ostec/external/stylegan2/training/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2d1059838ab5ec95d5acce62abac6dc93313ba4a --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/training/dataset.py @@ -0,0 +1,199 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. 
+# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Multi-resolution input data pipeline.""" + +import os +import glob +import numpy as np +import tensorflow as tf +import dnnlib +import dnnlib.tflib as tflib + +#---------------------------------------------------------------------------- +# Dataset class that loads data from tfrecords files. + +class TFRecordDataset: + def __init__(self, + tfrecord_dir, # Directory containing a collection of tfrecords files. + resolution = None, # Dataset resolution, None = autodetect. + label_file = None, # Relative path of the labels file, None = autodetect. + max_label_size = 0, # 0 = no labels, 'full' = full labels, <int> = N first label components. + max_images = None, # Maximum number of images to use, None = use all images. + repeat = True, # Repeat dataset indefinitely? + shuffle_mb = 4096, # Shuffle data within specified window (megabytes), 0 = disable shuffling. + prefetch_mb = 2048, # Amount of data to prefetch (megabytes), 0 = disable prefetching. + buffer_mb = 256, # Read buffer size (megabytes). + num_threads = 2): # Number of concurrent threads. + + self.tfrecord_dir = tfrecord_dir + self.resolution = None + self.resolution_log2 = None + self.shape = [] # [channels, height, width] + self.dtype = 'uint8' + self.dynamic_range = [0, 255] + self.label_file = label_file + self.label_size = None # components + self.label_dtype = None + self._np_labels = None + self._tf_minibatch_in = None + self._tf_labels_var = None + self._tf_labels_dataset = None + self._tf_datasets = dict() + self._tf_iterator = None + self._tf_init_ops = dict() + self._tf_minibatch_np = None + self._cur_minibatch = -1 + self._cur_lod = -1 + + # List tfrecords files and inspect their shapes. 
+ assert os.path.isdir(self.tfrecord_dir) + tfr_files = sorted(glob.glob(os.path.join(self.tfrecord_dir, '*.tfrecords'))) + assert len(tfr_files) >= 1 + tfr_shapes = [] + for tfr_file in tfr_files: + tfr_opt = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.NONE) + for record in tf.python_io.tf_record_iterator(tfr_file, tfr_opt): + tfr_shapes.append(self.parse_tfrecord_np(record).shape) + break + + # Autodetect label filename. + if self.label_file is None: + guess = sorted(glob.glob(os.path.join(self.tfrecord_dir, '*.labels'))) + if len(guess): + self.label_file = guess[0] + elif not os.path.isfile(self.label_file): + guess = os.path.join(self.tfrecord_dir, self.label_file) + if os.path.isfile(guess): + self.label_file = guess + + # Determine shape and resolution. + max_shape = max(tfr_shapes, key=np.prod) + self.resolution = resolution if resolution is not None else max_shape[1] + self.resolution_log2 = int(np.log2(self.resolution)) + self.shape = [max_shape[0], self.resolution, self.resolution] + tfr_lods = [self.resolution_log2 - int(np.log2(shape[1])) for shape in tfr_shapes] + assert all(shape[0] == max_shape[0] for shape in tfr_shapes) + assert all(shape[1] == shape[2] for shape in tfr_shapes) + assert all(shape[1] == self.resolution // (2**lod) for shape, lod in zip(tfr_shapes, tfr_lods)) + assert all(lod in tfr_lods for lod in range(self.resolution_log2 - 1)) + + # Load labels. 
+ assert max_label_size == 'full' or max_label_size >= 0 + self._np_labels = np.zeros([1<<30, 0], dtype=np.float32) + if self.label_file is not None and max_label_size != 0: + self._np_labels = np.load(self.label_file) + assert self._np_labels.ndim == 2 + if max_label_size != 'full' and self._np_labels.shape[1] > max_label_size: + self._np_labels = self._np_labels[:, :max_label_size] + if max_images is not None and self._np_labels.shape[0] > max_images: + self._np_labels = self._np_labels[:max_images] + self.label_size = self._np_labels.shape[1] + self.label_dtype = self._np_labels.dtype.name + + # Build TF expressions. + with tf.name_scope('Dataset'), tf.device('/cpu:0'): + self._tf_minibatch_in = tf.placeholder(tf.int64, name='minibatch_in', shape=[]) + self._tf_labels_var = tflib.create_var_with_large_initial_value(self._np_labels, name='labels_var') + self._tf_labels_dataset = tf.data.Dataset.from_tensor_slices(self._tf_labels_var) + for tfr_file, tfr_shape, tfr_lod in zip(tfr_files, tfr_shapes, tfr_lods): + if tfr_lod < 0: + continue + dset = tf.data.TFRecordDataset(tfr_file, compression_type='', buffer_size=buffer_mb<<20) + if max_images is not None: + dset = dset.take(max_images) + dset = dset.map(self.parse_tfrecord_tf, num_parallel_calls=num_threads) + dset = tf.data.Dataset.zip((dset, self._tf_labels_dataset)) + bytes_per_item = np.prod(tfr_shape) * np.dtype(self.dtype).itemsize + if shuffle_mb > 0: + dset = dset.shuffle(((shuffle_mb << 20) - 1) // bytes_per_item + 1) + if repeat: + dset = dset.repeat() + if prefetch_mb > 0: + dset = dset.prefetch(((prefetch_mb << 20) - 1) // bytes_per_item + 1) + dset = dset.batch(self._tf_minibatch_in) + self._tf_datasets[tfr_lod] = dset + self._tf_iterator = tf.data.Iterator.from_structure(self._tf_datasets[0].output_types, self._tf_datasets[0].output_shapes) + self._tf_init_ops = {lod: self._tf_iterator.make_initializer(dset) for lod, dset in self._tf_datasets.items()} + + def close(self): + pass + + # Use the given 
minibatch size and level-of-detail for the data returned by get_minibatch_tf(). + def configure(self, minibatch_size, lod=0): + lod = int(np.floor(lod)) + assert minibatch_size >= 1 and lod in self._tf_datasets + if self._cur_minibatch != minibatch_size or self._cur_lod != lod: + self._tf_init_ops[lod].run({self._tf_minibatch_in: minibatch_size}) + self._cur_minibatch = minibatch_size + self._cur_lod = lod + + # Get next minibatch as TensorFlow expressions. + def get_minibatch_tf(self): # => images, labels + return self._tf_iterator.get_next() + + # Get next minibatch as NumPy arrays. + def get_minibatch_np(self, minibatch_size, lod=0): # => images, labels + self.configure(minibatch_size, lod) + with tf.name_scope('Dataset'): + if self._tf_minibatch_np is None: + self._tf_minibatch_np = self.get_minibatch_tf() + return tflib.run(self._tf_minibatch_np) + + # Get random labels as TensorFlow expression. + def get_random_labels_tf(self, minibatch_size): # => labels + with tf.name_scope('Dataset'): + if self.label_size > 0: + with tf.device('/cpu:0'): + return tf.gather(self._tf_labels_var, tf.random_uniform([minibatch_size], 0, self._np_labels.shape[0], dtype=tf.int32)) + return tf.zeros([minibatch_size, 0], self.label_dtype) + + # Get random labels as NumPy array. + def get_random_labels_np(self, minibatch_size): # => labels + if self.label_size > 0: + return self._np_labels[np.random.randint(self._np_labels.shape[0], size=[minibatch_size])] + return np.zeros([minibatch_size, 0], self.label_dtype) + + # Parse individual image from a tfrecords file into TensorFlow expression. + @staticmethod + def parse_tfrecord_tf(record): + features = tf.parse_single_example(record, features={ + 'shape': tf.FixedLenFeature([3], tf.int64), + 'data': tf.FixedLenFeature([], tf.string)}) + data = tf.decode_raw(features['data'], tf.uint8) + return tf.reshape(data, features['shape']) + + # Parse individual image from a tfrecords file into NumPy array. 
+ @staticmethod + def parse_tfrecord_np(record): + ex = tf.train.Example() + ex.ParseFromString(record) + shape = ex.features.feature['shape'].int64_list.value # pylint: disable=no-member + data = ex.features.feature['data'].bytes_list.value[0] # pylint: disable=no-member + return np.fromstring(data, np.uint8).reshape(shape) + +#---------------------------------------------------------------------------- +# Helper func for constructing a dataset object using the given options. + +def load_dataset(class_name=None, data_dir=None, verbose=False, **kwargs): + kwargs = dict(kwargs) + if 'tfrecord_dir' in kwargs: + if class_name is None: + class_name = __name__ + '.TFRecordDataset' + if data_dir is not None: + kwargs['tfrecord_dir'] = os.path.join(data_dir, kwargs['tfrecord_dir']) + + assert class_name is not None + if verbose: + print('Streaming data using %s...' % class_name) + dataset = dnnlib.util.get_obj_by_name(class_name)(**kwargs) + if verbose: + print('Dataset shape =', np.int32(dataset.shape).tolist()) + print('Dynamic range =', dataset.dynamic_range) + print('Label size =', dataset.label_size) + return dataset + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/training/loss.py b/insightface/reconstruction/ostec/external/stylegan2/training/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..7ad2fe16fb657627bfaa547f28653a615ec395c2 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/training/loss.py @@ -0,0 +1,197 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. 
+# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Loss functions.""" + +import numpy as np +import tensorflow as tf +import dnnlib.tflib as tflib +from dnnlib.tflib.autosummary import autosummary + +#---------------------------------------------------------------------------- +# Logistic loss from the paper +# "Generative Adversarial Nets", Goodfellow et al. 2014 + +def G_logistic(G, D, opt, training_set, minibatch_size): + _ = opt + latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:]) + labels = training_set.get_random_labels_tf(minibatch_size) + fake_images_out = G.get_output_for(latents, labels, is_training=True) + fake_scores_out = D.get_output_for(fake_images_out, labels, is_training=True) + loss = -tf.nn.softplus(fake_scores_out) # log(1-sigmoid(fake_scores_out)) # pylint: disable=invalid-unary-operand-type + return loss, None + +def G_logistic_ns(G, D, opt, training_set, minibatch_size): + _ = opt + latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:]) + labels = training_set.get_random_labels_tf(minibatch_size) + fake_images_out = G.get_output_for(latents, labels, is_training=True) + fake_scores_out = D.get_output_for(fake_images_out, labels, is_training=True) + loss = tf.nn.softplus(-fake_scores_out) # -log(sigmoid(fake_scores_out)) + return loss, None + +def D_logistic(G, D, opt, training_set, minibatch_size, reals, labels): + _ = opt, training_set + latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:]) + fake_images_out = G.get_output_for(latents, labels, is_training=True) + real_scores_out = D.get_output_for(reals, labels, is_training=True) + fake_scores_out = D.get_output_for(fake_images_out, labels, is_training=True) + real_scores_out = autosummary('Loss/scores/real', real_scores_out) + fake_scores_out = autosummary('Loss/scores/fake', fake_scores_out) + loss = tf.nn.softplus(fake_scores_out) # -log(1-sigmoid(fake_scores_out)) + loss += 
tf.nn.softplus(-real_scores_out) # -log(sigmoid(real_scores_out)) # pylint: disable=invalid-unary-operand-type + return loss, None + +#---------------------------------------------------------------------------- +# R1 and R2 regularizers from the paper +# "Which Training Methods for GANs do actually Converge?", Mescheder et al. 2018 + +def D_logistic_r1(G, D, opt, training_set, minibatch_size, reals, labels, gamma=10.0): + _ = opt, training_set + latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:]) + fake_images_out = G.get_output_for(latents, labels, is_training=True) + real_scores_out = D.get_output_for(reals, labels, is_training=True) + fake_scores_out = D.get_output_for(fake_images_out, labels, is_training=True) + real_scores_out = autosummary('Loss/scores/real', real_scores_out) + fake_scores_out = autosummary('Loss/scores/fake', fake_scores_out) + loss = tf.nn.softplus(fake_scores_out) # -log(1-sigmoid(fake_scores_out)) + loss += tf.nn.softplus(-real_scores_out) # -log(sigmoid(real_scores_out)) # pylint: disable=invalid-unary-operand-type + + with tf.name_scope('GradientPenalty'): + real_grads = tf.gradients(tf.reduce_sum(real_scores_out), [reals])[0] + gradient_penalty = tf.reduce_sum(tf.square(real_grads), axis=[1,2,3]) + gradient_penalty = autosummary('Loss/gradient_penalty', gradient_penalty) + reg = gradient_penalty * (gamma * 0.5) + return loss, reg + +def D_logistic_r2(G, D, opt, training_set, minibatch_size, reals, labels, gamma=10.0): + _ = opt, training_set + latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:]) + fake_images_out = G.get_output_for(latents, labels, is_training=True) + real_scores_out = D.get_output_for(reals, labels, is_training=True) + fake_scores_out = D.get_output_for(fake_images_out, labels, is_training=True) + real_scores_out = autosummary('Loss/scores/real', real_scores_out) + fake_scores_out = autosummary('Loss/scores/fake', fake_scores_out) + loss = tf.nn.softplus(fake_scores_out) # 
-log(1-sigmoid(fake_scores_out)) + loss += tf.nn.softplus(-real_scores_out) # -log(sigmoid(real_scores_out)) # pylint: disable=invalid-unary-operand-type + + with tf.name_scope('GradientPenalty'): + fake_grads = tf.gradients(tf.reduce_sum(fake_scores_out), [fake_images_out])[0] + gradient_penalty = tf.reduce_sum(tf.square(fake_grads), axis=[1,2,3]) + gradient_penalty = autosummary('Loss/gradient_penalty', gradient_penalty) + reg = gradient_penalty * (gamma * 0.5) + return loss, reg + +#---------------------------------------------------------------------------- +# WGAN loss from the paper +# "Wasserstein Generative Adversarial Networks", Arjovsky et al. 2017 + +def G_wgan(G, D, opt, training_set, minibatch_size): + _ = opt + latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:]) + labels = training_set.get_random_labels_tf(minibatch_size) + fake_images_out = G.get_output_for(latents, labels, is_training=True) + fake_scores_out = D.get_output_for(fake_images_out, labels, is_training=True) + loss = -fake_scores_out + return loss, None + +def D_wgan(G, D, opt, training_set, minibatch_size, reals, labels, wgan_epsilon=0.001): + _ = opt, training_set + latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:]) + fake_images_out = G.get_output_for(latents, labels, is_training=True) + real_scores_out = D.get_output_for(reals, labels, is_training=True) + fake_scores_out = D.get_output_for(fake_images_out, labels, is_training=True) + real_scores_out = autosummary('Loss/scores/real', real_scores_out) + fake_scores_out = autosummary('Loss/scores/fake', fake_scores_out) + loss = fake_scores_out - real_scores_out + with tf.name_scope('EpsilonPenalty'): + epsilon_penalty = autosummary('Loss/epsilon_penalty', tf.square(real_scores_out)) + loss += epsilon_penalty * wgan_epsilon + return loss, None + +#---------------------------------------------------------------------------- +# WGAN-GP loss from the paper +# "Improved Training of Wasserstein GANs", 
Gulrajani et al. 2017 + +def D_wgan_gp(G, D, opt, training_set, minibatch_size, reals, labels, wgan_lambda=10.0, wgan_epsilon=0.001, wgan_target=1.0): + _ = opt, training_set + latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:]) + fake_images_out = G.get_output_for(latents, labels, is_training=True) + real_scores_out = D.get_output_for(reals, labels, is_training=True) + fake_scores_out = D.get_output_for(fake_images_out, labels, is_training=True) + real_scores_out = autosummary('Loss/scores/real', real_scores_out) + fake_scores_out = autosummary('Loss/scores/fake', fake_scores_out) + loss = fake_scores_out - real_scores_out + with tf.name_scope('EpsilonPenalty'): + epsilon_penalty = autosummary('Loss/epsilon_penalty', tf.square(real_scores_out)) + loss += epsilon_penalty * wgan_epsilon + + with tf.name_scope('GradientPenalty'): + mixing_factors = tf.random_uniform([minibatch_size, 1, 1, 1], 0.0, 1.0, dtype=fake_images_out.dtype) + mixed_images_out = tflib.lerp(tf.cast(reals, fake_images_out.dtype), fake_images_out, mixing_factors) + mixed_scores_out = D.get_output_for(mixed_images_out, labels, is_training=True) + mixed_scores_out = autosummary('Loss/scores/mixed', mixed_scores_out) + mixed_grads = tf.gradients(tf.reduce_sum(mixed_scores_out), [mixed_images_out])[0] + mixed_norms = tf.sqrt(tf.reduce_sum(tf.square(mixed_grads), axis=[1,2,3])) + mixed_norms = autosummary('Loss/mixed_norms', mixed_norms) + gradient_penalty = tf.square(mixed_norms - wgan_target) + reg = gradient_penalty * (wgan_lambda / (wgan_target**2)) + return loss, reg + +#---------------------------------------------------------------------------- +# Non-saturating logistic loss with path length regularizer from the paper +# "Analyzing and Improving the Image Quality of StyleGAN", Karras et al. 
2019 + +def G_logistic_ns_pathreg(G, D, opt, training_set, minibatch_size, pl_minibatch_shrink=2, pl_decay=0.01, pl_weight=2.0): + _ = opt + latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:]) + labels = training_set.get_random_labels_tf(minibatch_size) + fake_images_out, fake_dlatents_out = G.get_output_for(latents, labels, is_training=True, return_dlatents=True) + fake_scores_out = D.get_output_for(fake_images_out, labels, is_training=True) + loss = tf.nn.softplus(-fake_scores_out) # -log(sigmoid(fake_scores_out)) + + # Path length regularization. + with tf.name_scope('PathReg'): + + # Evaluate the regularization term using a smaller minibatch to conserve memory. + if pl_minibatch_shrink > 1: + pl_minibatch = minibatch_size // pl_minibatch_shrink + pl_latents = tf.random_normal([pl_minibatch] + G.input_shapes[0][1:]) + pl_labels = training_set.get_random_labels_tf(pl_minibatch) + fake_images_out, fake_dlatents_out = G.get_output_for(pl_latents, pl_labels, is_training=True, return_dlatents=True) + + # Compute |J*y|. + pl_noise = tf.random_normal(tf.shape(fake_images_out)) / np.sqrt(np.prod(G.output_shape[2:])) + pl_grads = tf.gradients(tf.reduce_sum(fake_images_out * pl_noise), [fake_dlatents_out])[0] + pl_lengths = tf.sqrt(tf.reduce_mean(tf.reduce_sum(tf.square(pl_grads), axis=2), axis=1)) + pl_lengths = autosummary('Loss/pl_lengths', pl_lengths) + + # Track exponential moving average of |J*y|. + with tf.control_dependencies(None): + pl_mean_var = tf.Variable(name='pl_mean', trainable=False, initial_value=0.0, dtype=tf.float32) + pl_mean = pl_mean_var + pl_decay * (tf.reduce_mean(pl_lengths) - pl_mean_var) + pl_update = tf.assign(pl_mean_var, pl_mean) + + # Calculate (|J*y|-a)^2. + with tf.control_dependencies([pl_update]): + pl_penalty = tf.square(pl_lengths - pl_mean) + pl_penalty = autosummary('Loss/pl_penalty', pl_penalty) + + # Apply weight. 
+ # + # Note: The division in pl_noise decreases the weight by num_pixels, and the reduce_mean + # in pl_lengths decreases it by num_affine_layers. The effective weight then becomes: + # + # gamma_pl = pl_weight / num_pixels / num_affine_layers + # = 2 / (r^2) / (log2(r) * 2 - 2) + # = 1 / (r^2 * (log2(r) - 1)) + # = ln(2) / (r^2 * (ln(r) - ln(2)) + # + reg = pl_penalty * pl_weight + + return loss, reg + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/training/misc.py b/insightface/reconstruction/ostec/external/stylegan2/training/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..9b3444e85c70d9fe742bd2e8055a42210d857f8b --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/training/misc.py @@ -0,0 +1,145 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Miscellaneous utility functions.""" + +import os +import pickle +import numpy as np +import PIL.Image +import PIL.ImageFont +import dnnlib + +#---------------------------------------------------------------------------- +# Convenience wrappers for pickle that are able to load data produced by +# older versions of the code, and from external URLs. + +def open_file_or_url(file_or_url): + if dnnlib.util.is_url(file_or_url): + return dnnlib.util.open_url(file_or_url, cache_dir='.stylegan2-cache') + return open(file_or_url, 'rb') + +def load_pkl(file_or_url): + with open_file_or_url(file_or_url) as file: + return pickle.load(file, encoding='latin1') + +def save_pkl(obj, filename): + with open(filename, 'wb') as file: + pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL) + +#---------------------------------------------------------------------------- +# Image utils. 
+ +def adjust_dynamic_range(data, drange_in, drange_out): + if drange_in != drange_out: + scale = (np.float32(drange_out[1]) - np.float32(drange_out[0])) / (np.float32(drange_in[1]) - np.float32(drange_in[0])) + bias = (np.float32(drange_out[0]) - np.float32(drange_in[0]) * scale) + data = data * scale + bias + return data + +def create_image_grid(images, grid_size=None): + assert images.ndim == 3 or images.ndim == 4 + num, img_w, img_h = images.shape[0], images.shape[-1], images.shape[-2] + + if grid_size is not None: + grid_w, grid_h = tuple(grid_size) + else: + grid_w = max(int(np.ceil(np.sqrt(num))), 1) + grid_h = max((num - 1) // grid_w + 1, 1) + + grid = np.zeros(list(images.shape[1:-2]) + [grid_h * img_h, grid_w * img_w], dtype=images.dtype) + for idx in range(num): + x = (idx % grid_w) * img_w + y = (idx // grid_w) * img_h + grid[..., y : y + img_h, x : x + img_w] = images[idx] + return grid + +def convert_to_pil_image(image, drange=[0,1]): + assert image.ndim == 2 or image.ndim == 3 + if image.ndim == 3: + if image.shape[0] == 1: + image = image[0] # grayscale CHW => HW + else: + image = image.transpose(1, 2, 0) # CHW -> HWC + + image = adjust_dynamic_range(image, drange, [0,255]) + image = np.rint(image).clip(0, 255).astype(np.uint8) + fmt = 'RGB' if image.ndim == 3 else 'L' + return PIL.Image.fromarray(image, fmt) + +def save_image_grid(images, filename, drange=[0,1], grid_size=None): + convert_to_pil_image(create_image_grid(images, grid_size), drange).save(filename) + +def apply_mirror_augment(minibatch): + mask = np.random.rand(minibatch.shape[0]) < 0.5 + minibatch = np.array(minibatch) + minibatch[mask] = minibatch[mask, :, :, ::-1] + return minibatch + +#---------------------------------------------------------------------------- +# Loading data from previous training runs. 
+ +def parse_config_for_previous_run(run_dir): + with open(os.path.join(run_dir, 'submit_config.pkl'), 'rb') as f: + data = pickle.load(f) + data = data.get('run_func_kwargs', {}) + return dict(train=data, dataset=data.get('dataset_args', {})) + +#---------------------------------------------------------------------------- +# Size and contents of the image snapshot grids that are exported +# periodically during training. + +def setup_snapshot_image_grid(training_set, + size = '1080p', # '1080p' = to be viewed on 1080p display, '4k' = to be viewed on 4k display. + layout = 'random'): # 'random' = grid contents are selected randomly, 'row_per_class' = each row corresponds to one class label. + + # Select size. + gw = 1; gh = 1 + if size == '1080p': + gw = np.clip(1920 // training_set.shape[2], 3, 32) + gh = np.clip(1080 // training_set.shape[1], 2, 32) + if size == '4k': + gw = np.clip(3840 // training_set.shape[2], 7, 32) + gh = np.clip(2160 // training_set.shape[1], 4, 32) + if size == '8k': + gw = np.clip(7680 // training_set.shape[2], 7, 32) + gh = np.clip(4320 // training_set.shape[1], 4, 32) + + # Initialize data arrays. + reals = np.zeros([gw * gh] + training_set.shape, dtype=training_set.dtype) + labels = np.zeros([gw * gh, training_set.label_size], dtype=training_set.label_dtype) + + # Random layout. + if layout == 'random': + reals[:], labels[:] = training_set.get_minibatch_np(gw * gh) + + # Class-conditional layouts. 
+ class_layouts = dict(row_per_class=[gw,1], col_per_class=[1,gh], class4x4=[4,4]) + if layout in class_layouts: + bw, bh = class_layouts[layout] + nw = (gw - 1) // bw + 1 + nh = (gh - 1) // bh + 1 + blocks = [[] for _i in range(nw * nh)] + for _iter in range(1000000): + real, label = training_set.get_minibatch_np(1) + idx = np.argmax(label[0]) + while idx < len(blocks) and len(blocks[idx]) >= bw * bh: + idx += training_set.label_size + if idx < len(blocks): + blocks[idx].append((real, label)) + if all(len(block) >= bw * bh for block in blocks): + break + for i, block in enumerate(blocks): + for j, (real, label) in enumerate(block): + x = (i % nw) * bw + j % bw + y = (i // nw) * bh + j // bw + if x < gw and y < gh: + reals[x + y * gw] = real[0] + labels[x + y * gw] = label[0] + + return (gw, gh), reals, labels + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/external/stylegan2/training/networks_stylegan.py b/insightface/reconstruction/ostec/external/stylegan2/training/networks_stylegan.py new file mode 100644 index 0000000000000000000000000000000000000000..76ce31caa0890becebdfc481d92ec81d0023f999 --- /dev/null +++ b/insightface/reconstruction/ostec/external/stylegan2/training/networks_stylegan.py @@ -0,0 +1,660 @@ +# Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +# +# This work is made available under the Nvidia Source Code License-NC. +# To view a copy of this license, visit +# https://nvlabs.github.io/stylegan2/license.html + +"""Network architectures used in the StyleGAN paper.""" + +import numpy as np +import tensorflow as tf +import dnnlib +import dnnlib.tflib as tflib + +# NOTE: Do not import any application-specific modules here! +# Specify all network parameters as kwargs. + +#---------------------------------------------------------------------------- +# Primitive ops for manipulating 4D activation tensors. 
+# The gradients of these are not necessary efficient or even meaningful. + +def _blur2d(x, f=[1,2,1], normalize=True, flip=False, stride=1): + assert x.shape.ndims == 4 and all(dim.value is not None for dim in x.shape[1:]) + assert isinstance(stride, int) and stride >= 1 + + # Finalize filter kernel. + f = np.array(f, dtype=np.float32) + if f.ndim == 1: + f = f[:, np.newaxis] * f[np.newaxis, :] + assert f.ndim == 2 + if normalize: + f /= np.sum(f) + if flip: + f = f[::-1, ::-1] + f = f[:, :, np.newaxis, np.newaxis] + f = np.tile(f, [1, 1, int(x.shape[1]), 1]) + + # No-op => early exit. + if f.shape == (1, 1) and f[0,0] == 1: + return x + + # Convolve using depthwise_conv2d. + orig_dtype = x.dtype + x = tf.cast(x, tf.float32) # tf.nn.depthwise_conv2d() doesn't support fp16 + f = tf.constant(f, dtype=x.dtype, name='filter') + strides = [1, 1, stride, stride] + x = tf.nn.depthwise_conv2d(x, f, strides=strides, padding='SAME', data_format='NCHW') + x = tf.cast(x, orig_dtype) + return x + +def _upscale2d(x, factor=2, gain=1): + assert x.shape.ndims == 4 and all(dim.value is not None for dim in x.shape[1:]) + assert isinstance(factor, int) and factor >= 1 + + # Apply gain. + if gain != 1: + x *= gain + + # No-op => early exit. + if factor == 1: + return x + + # Upscale using tf.tile(). + s = x.shape + x = tf.reshape(x, [-1, s[1], s[2], 1, s[3], 1]) + x = tf.tile(x, [1, 1, 1, factor, 1, factor]) + x = tf.reshape(x, [-1, s[1], s[2] * factor, s[3] * factor]) + return x + +def _downscale2d(x, factor=2, gain=1): + assert x.shape.ndims == 4 and all(dim.value is not None for dim in x.shape[1:]) + assert isinstance(factor, int) and factor >= 1 + + # 2x2, float32 => downscale using _blur2d(). + if factor == 2 and x.dtype == tf.float32: + f = [np.sqrt(gain) / factor] * factor + return _blur2d(x, f=f, normalize=False, stride=factor) + + # Apply gain. + if gain != 1: + x *= gain + + # No-op => early exit. 
    if factor == 1:
        return x

    # Large factor => downscale using tf.nn.avg_pool().
    # NOTE: Requires tf_config['graph_options.place_pruned_graph']=True to work.
    ksize = [1, 1, factor, factor]
    return tf.nn.avg_pool(x, ksize=ksize, strides=ksize, padding='VALID', data_format='NCHW')

#----------------------------------------------------------------------------
# High-level ops for manipulating 4D activation tensors.
# The gradients of these are meant to be as efficient as possible.

def blur2d(x, f=[1,2,1], normalize=True):
    """Blur an NCHW activation tensor with separable filter `f`.

    Wraps the raw `_blur2d` op (defined earlier in this file) with a
    custom gradient so that backprop reuses the (flipped) blur instead of
    TF's autodiff graph. NOTE(review): `f` is a mutable default argument;
    it is only read here, never mutated, so the shared default is benign.
    """
    with tf.variable_scope('Blur2D'):
        @tf.custom_gradient
        def func(x):
            y = _blur2d(x, f, normalize)
            # First-order gradient: blur with the flipped filter.
            # Second-order gradient (grad-of-grad): blur again, un-flipped.
            @tf.custom_gradient
            def grad(dy):
                dx = _blur2d(dy, f, normalize, flip=True)
                return dx, lambda ddx: _blur2d(ddx, f, normalize)
            return y, grad
        return func(x)

def upscale2d(x, factor=2):
    """Nearest-neighbor upscale of an NCHW tensor by an integer `factor`.

    Custom gradient: the adjoint of upscaling is downscaling with
    gain=factor**2 (sum over the replicated pixels).
    """
    with tf.variable_scope('Upscale2D'):
        @tf.custom_gradient
        def func(x):
            y = _upscale2d(x, factor)
            @tf.custom_gradient
            def grad(dy):
                dx = _downscale2d(dy, factor, gain=factor**2)
                return dx, lambda ddx: _upscale2d(ddx, factor)
            return y, grad
        return func(x)

def downscale2d(x, factor=2):
    """Average-pool downscale of an NCHW tensor by an integer `factor`.

    Custom gradient: the adjoint of average-pooling is upscaling with
    gain=1/factor**2 (spread the gradient over the pooled pixels).
    """
    with tf.variable_scope('Downscale2D'):
        @tf.custom_gradient
        def func(x):
            y = _downscale2d(x, factor)
            @tf.custom_gradient
            def grad(dy):
                dx = _upscale2d(dy, factor, gain=1/factor**2)
                return dx, lambda ddx: _downscale2d(ddx, factor)
            return y, grad
        return func(x)

#----------------------------------------------------------------------------
# Get/create weight tensor for a convolutional or fully-connected layer.

def get_weight(shape, gain=np.sqrt(2), use_wscale=False, lrmul=1):
    """Create (or reuse, under variable reuse) a weight variable named 'weight'.

    Implements He initialization with optional equalized learning rate
    (use_wscale) and a per-layer learning-rate multiplier (lrmul); the
    init_std / runtime_coef split keeps the *effective* weight statistics
    identical in both modes while changing what the optimizer sees.
    """
    fan_in = np.prod(shape[:-1]) # [kernel, kernel, fmaps_in, fmaps_out] or [in, out]
    he_std = gain / np.sqrt(fan_in) # He init

    # Equalized learning rate and custom learning rate multiplier.
    if use_wscale:
        init_std = 1.0 / lrmul
        runtime_coef = he_std * lrmul
    else:
        init_std = he_std / lrmul
        runtime_coef = lrmul

    # Create variable.
    init = tf.initializers.random_normal(0, init_std)
    return tf.get_variable('weight', shape=shape, initializer=init) * runtime_coef

#----------------------------------------------------------------------------
# Fully-connected layer.

def dense(x, fmaps, **kwargs):
    """Fully-connected layer; flattens inputs of rank > 2 first. No bias."""
    if len(x.shape) > 2:
        x = tf.reshape(x, [-1, np.prod([d.value for d in x.shape[1:]])])
    w = get_weight([x.shape[1].value, fmaps], **kwargs)
    w = tf.cast(w, x.dtype)
    return tf.matmul(x, w)

#----------------------------------------------------------------------------
# Convolutional layer.

def conv2d(x, fmaps, kernel, **kwargs):
    """Same-padded, stride-1, odd-kernel NCHW convolution. No bias."""
    assert kernel >= 1 and kernel % 2 == 1
    w = get_weight([kernel, kernel, x.shape[1].value, fmaps], **kwargs)
    w = tf.cast(w, x.dtype)
    return tf.nn.conv2d(x, w, strides=[1,1,1,1], padding='SAME', data_format='NCHW')

#----------------------------------------------------------------------------
# Fused convolution + scaling.
# Faster and uses less memory than performing the operations separately.

def upscale2d_conv2d(x, fmaps, kernel, fused_scale='auto', **kwargs):
    """2x upscale followed by conv2d, optionally fused into one conv2d_transpose.

    'auto' fuses only for large activations (output side >= 128), where the
    fused op saves memory; small activations use the separate ops.
    """
    assert kernel >= 1 and kernel % 2 == 1
    assert fused_scale in [True, False, 'auto']
    if fused_scale == 'auto':
        fused_scale = min(x.shape[2:]) * 2 >= 128

    # Not fused => call the individual ops directly.
    if not fused_scale:
        return conv2d(upscale2d(x), fmaps, kernel, **kwargs)

    # Fused => perform both ops simultaneously using tf.nn.conv2d_transpose().
    # The pad + 4-way shifted sum turns the kernel into its 2x-upsampled
    # equivalent so the transpose conv matches upscale2d + conv2d exactly.
    w = get_weight([kernel, kernel, x.shape[1].value, fmaps], **kwargs)
    w = tf.transpose(w, [0, 1, 3, 2]) # [kernel, kernel, fmaps_out, fmaps_in]
    w = tf.pad(w, [[1,1], [1,1], [0,0], [0,0]], mode='CONSTANT')
    w = tf.add_n([w[1:, 1:], w[:-1, 1:], w[1:, :-1], w[:-1, :-1]])
    w = tf.cast(w, x.dtype)
    os = [tf.shape(x)[0], fmaps, x.shape[2] * 2, x.shape[3] * 2]
    return tf.nn.conv2d_transpose(x, w, os, strides=[1,1,2,2], padding='SAME', data_format='NCHW')

def conv2d_downscale2d(x, fmaps, kernel, fused_scale='auto', **kwargs):
    """conv2d followed by 2x downscale, optionally fused into one strided conv.

    Mirror image of upscale2d_conv2d; the 0.25 factor reproduces the
    average-pooling of downscale2d inside the fused kernel.
    """
    assert kernel >= 1 and kernel % 2 == 1
    assert fused_scale in [True, False, 'auto']
    if fused_scale == 'auto':
        fused_scale = min(x.shape[2:]) >= 128

    # Not fused => call the individual ops directly.
    if not fused_scale:
        return downscale2d(conv2d(x, fmaps, kernel, **kwargs))

    # Fused => perform both ops simultaneously using tf.nn.conv2d().
    w = get_weight([kernel, kernel, x.shape[1].value, fmaps], **kwargs)
    w = tf.pad(w, [[1,1], [1,1], [0,0], [0,0]], mode='CONSTANT')
    w = tf.add_n([w[1:, 1:], w[:-1, 1:], w[1:, :-1], w[:-1, :-1]]) * 0.25
    w = tf.cast(w, x.dtype)
    return tf.nn.conv2d(x, w, strides=[1,1,2,2], padding='SAME', data_format='NCHW')

#----------------------------------------------------------------------------
# Apply bias to the given activation tensor.

def apply_bias(x, lrmul=1):
    """Add a learned per-channel bias (variable 'bias'); reshapes for NCHW."""
    b = tf.get_variable('bias', shape=[x.shape[1]], initializer=tf.initializers.zeros()) * lrmul
    b = tf.cast(b, x.dtype)
    if len(x.shape) == 2:
        return x + b
    return x + tf.reshape(b, [1, -1, 1, 1])

#----------------------------------------------------------------------------
# Leaky ReLU activation. More efficient than tf.nn.leaky_relu() and supports FP16.
+ +def leaky_relu(x, alpha=0.2): + with tf.variable_scope('LeakyReLU'): + alpha = tf.constant(alpha, dtype=x.dtype, name='alpha') + @tf.custom_gradient + def func(x): + y = tf.maximum(x, x * alpha) + @tf.custom_gradient + def grad(dy): + dx = tf.where(y >= 0, dy, dy * alpha) + return dx, lambda ddx: tf.where(y >= 0, ddx, ddx * alpha) + return y, grad + return func(x) + +#---------------------------------------------------------------------------- +# Pixelwise feature vector normalization. + +def pixel_norm(x, epsilon=1e-8): + with tf.variable_scope('PixelNorm'): + epsilon = tf.constant(epsilon, dtype=x.dtype, name='epsilon') + return x * tf.rsqrt(tf.reduce_mean(tf.square(x), axis=1, keepdims=True) + epsilon) + +#---------------------------------------------------------------------------- +# Instance normalization. + +def instance_norm(x, epsilon=1e-8): + assert len(x.shape) == 4 # NCHW + with tf.variable_scope('InstanceNorm'): + orig_dtype = x.dtype + x = tf.cast(x, tf.float32) + x -= tf.reduce_mean(x, axis=[2,3], keepdims=True) + epsilon = tf.constant(epsilon, dtype=x.dtype, name='epsilon') + x *= tf.rsqrt(tf.reduce_mean(tf.square(x), axis=[2,3], keepdims=True) + epsilon) + x = tf.cast(x, orig_dtype) + return x + +#---------------------------------------------------------------------------- +# Style modulation. + +def style_mod(x, dlatent, **kwargs): + with tf.variable_scope('StyleMod'): + style = apply_bias(dense(dlatent, fmaps=x.shape[1]*2, gain=1, **kwargs)) + style = tf.reshape(style, [-1, 2, x.shape[1]] + [1] * (len(x.shape) - 2)) + return x * (style[:,0] + 1) + style[:,1] + +#---------------------------------------------------------------------------- +# Noise input. 
+ +def apply_noise(x, noise_var=None, randomize_noise=True): + assert len(x.shape) == 4 # NCHW + with tf.variable_scope('Noise'): + if noise_var is None or randomize_noise: + noise = tf.random_normal([tf.shape(x)[0], 1, x.shape[2], x.shape[3]], dtype=x.dtype) + else: + noise = tf.cast(noise_var, x.dtype) + weight = tf.get_variable('weight', shape=[x.shape[1].value], initializer=tf.initializers.zeros()) + return x + noise * tf.reshape(tf.cast(weight, x.dtype), [1, -1, 1, 1]) + +#---------------------------------------------------------------------------- +# Minibatch standard deviation. + +def minibatch_stddev_layer(x, group_size=4, num_new_features=1): + with tf.variable_scope('MinibatchStddev'): + group_size = tf.minimum(group_size, tf.shape(x)[0]) # Minibatch must be divisible by (or smaller than) group_size. + s = x.shape # [NCHW] Input shape. + y = tf.reshape(x, [group_size, -1, num_new_features, s[1]//num_new_features, s[2], s[3]]) # [GMncHW] Split minibatch into M groups of size G. Split channels into n channel groups c. + y = tf.cast(y, tf.float32) # [GMncHW] Cast to FP32. + y -= tf.reduce_mean(y, axis=0, keepdims=True) # [GMncHW] Subtract mean over group. + y = tf.reduce_mean(tf.square(y), axis=0) # [MncHW] Calc variance over group. + y = tf.sqrt(y + 1e-8) # [MncHW] Calc stddev over group. + y = tf.reduce_mean(y, axis=[2,3,4], keepdims=True) # [Mn111] Take average over fmaps and pixels. + y = tf.reduce_mean(y, axis=[2]) # [Mn11] Split channels into c channel groups + y = tf.cast(y, x.dtype) # [Mn11] Cast back to original data type. + y = tf.tile(y, [group_size, 1, s[2], s[3]]) # [NnHW] Replicate over group and pixels. + return tf.concat([x, y], axis=1) # [NCHW] Append as new fmap. + +#---------------------------------------------------------------------------- +# Style-based generator used in the StyleGAN paper. +# Composed of two sub-networks (G_mapping and G_synthesis) that are defined below. 

def G_style(
    latents_in,                                     # First input: Latent vectors (Z) [minibatch, latent_size].
    labels_in,                                      # Second input: Conditioning labels [minibatch, label_size].
    truncation_psi          = 0.7,                  # Style strength multiplier for the truncation trick. None = disable.
    truncation_cutoff       = 8,                    # Number of layers for which to apply the truncation trick. None = disable.
    truncation_psi_val      = None,                 # Value for truncation_psi to use during validation.
    truncation_cutoff_val   = None,                 # Value for truncation_cutoff to use during validation.
    dlatent_avg_beta        = 0.995,                # Decay for tracking the moving average of W during training. None = disable.
    style_mixing_prob       = 0.9,                  # Probability of mixing styles during training. None = disable.
    is_training             = False,                # Network is under training? Enables and disables specific features.
    is_validation           = False,                # Network is under validation? Chooses which value to use for truncation_psi.
    is_template_graph       = False,                # True = template graph constructed by the Network class, False = actual evaluation.
    components              = dnnlib.EasyDict(),    # Container for sub-networks. Retained between calls.
    **kwargs):                                      # Arguments for sub-networks (G_mapping and G_synthesis).
    """Full StyleGAN generator: mapping + synthesis with truncation/mixing.

    Returns images_out [minibatch, num_channels, H, W].
    NOTE(review): `components` as a mutable default is deliberate here — it
    is the mechanism that caches the sub-networks between calls.
    """

    # Validate arguments.
    assert not is_training or not is_validation
    assert isinstance(components, dnnlib.EasyDict)
    if is_validation:
        truncation_psi = truncation_psi_val
        truncation_cutoff = truncation_cutoff_val
    if is_training or (truncation_psi is not None and not tflib.is_tf_expression(truncation_psi) and truncation_psi == 1):
        truncation_psi = None
    if is_training or (truncation_cutoff is not None and not tflib.is_tf_expression(truncation_cutoff) and truncation_cutoff <= 0):
        truncation_cutoff = None
    if not is_training or (dlatent_avg_beta is not None and not tflib.is_tf_expression(dlatent_avg_beta) and dlatent_avg_beta == 1):
        dlatent_avg_beta = None
    if not is_training or (style_mixing_prob is not None and not tflib.is_tf_expression(style_mixing_prob) and style_mixing_prob <= 0):
        style_mixing_prob = None

    # Setup components.
    if 'synthesis' not in components:
        components.synthesis = tflib.Network('G_synthesis', func_name=G_synthesis, **kwargs)
    num_layers = components.synthesis.input_shape[1]
    dlatent_size = components.synthesis.input_shape[2]
    if 'mapping' not in components:
        components.mapping = tflib.Network('G_mapping', func_name=G_mapping, dlatent_broadcast=num_layers, **kwargs)

    # Setup variables.
    lod_in = tf.get_variable('lod', initializer=np.float32(0), trainable=False)
    dlatent_avg = tf.get_variable('dlatent_avg', shape=[dlatent_size], initializer=tf.initializers.zeros(), trainable=False)

    # Evaluate mapping network.
    dlatents = components.mapping.get_output_for(latents_in, labels_in, **kwargs)

    # Update moving average of W.
    if dlatent_avg_beta is not None:
        with tf.variable_scope('DlatentAvg'):
            batch_avg = tf.reduce_mean(dlatents[:, 0], axis=0)
            update_op = tf.assign(dlatent_avg, tflib.lerp(batch_avg, dlatent_avg, dlatent_avg_beta))
            with tf.control_dependencies([update_op]):
                dlatents = tf.identity(dlatents)

    # Perform style mixing regularization.
    if style_mixing_prob is not None:
        with tf.name_scope('StyleMix'):
            latents2 = tf.random_normal(tf.shape(latents_in))
            dlatents2 = components.mapping.get_output_for(latents2, labels_in, **kwargs)
            layer_idx = np.arange(num_layers)[np.newaxis, :, np.newaxis]
            cur_layers = num_layers - tf.cast(lod_in, tf.int32) * 2
            # With probability style_mixing_prob, pick a random crossover
            # layer; otherwise use all layers from the first latent.
            mixing_cutoff = tf.cond(
                tf.random_uniform([], 0.0, 1.0) < style_mixing_prob,
                lambda: tf.random_uniform([], 1, cur_layers, dtype=tf.int32),
                lambda: cur_layers)
            dlatents = tf.where(tf.broadcast_to(layer_idx < mixing_cutoff, tf.shape(dlatents)), dlatents, dlatents2)

    # Apply truncation trick.
    if truncation_psi is not None and truncation_cutoff is not None:
        with tf.variable_scope('Truncation'):
            layer_idx = np.arange(num_layers)[np.newaxis, :, np.newaxis]
            ones = np.ones(layer_idx.shape, dtype=np.float32)
            coefs = tf.where(layer_idx < truncation_cutoff, truncation_psi * ones, ones)
            dlatents = tflib.lerp(dlatent_avg, dlatents, coefs)

    # Evaluate synthesis network.
    with tf.control_dependencies([tf.assign(components.synthesis.find_var('lod'), lod_in)]):
        images_out = components.synthesis.get_output_for(dlatents, force_clean_graph=is_template_graph, **kwargs)
    return tf.identity(images_out, name='images_out')

#----------------------------------------------------------------------------
# Mapping network used in the StyleGAN paper.

def G_mapping(
    latents_in,                             # First input: Latent vectors (Z) [minibatch, latent_size].
    labels_in,                              # Second input: Conditioning labels [minibatch, label_size].
    latent_size             = 512,          # Latent vector (Z) dimensionality.
    label_size              = 0,            # Label dimensionality, 0 if no labels.
    dlatent_size            = 512,          # Disentangled latent (W) dimensionality.
    dlatent_broadcast       = None,         # Output disentangled latent (W) as [minibatch, dlatent_size] or [minibatch, dlatent_broadcast, dlatent_size].
    mapping_layers          = 8,            # Number of mapping layers.
    mapping_fmaps           = 512,          # Number of activations in the mapping layers.
    mapping_lrmul           = 0.01,         # Learning rate multiplier for the mapping layers.
    mapping_nonlinearity    = 'lrelu',      # Activation function: 'relu', 'lrelu'.
    use_wscale              = True,         # Enable equalized learning rate?
    normalize_latents       = True,         # Normalize latent vectors (Z) before feeding them to the mapping layers?
    dtype                   = 'float32',    # Data type to use for activations and outputs.
    **_kwargs):                             # Ignore unrecognized keyword args.
    """MLP mapping Z (+ optional label embedding) to disentangled latents W."""

    act, gain = {'relu': (tf.nn.relu, np.sqrt(2)), 'lrelu': (leaky_relu, np.sqrt(2))}[mapping_nonlinearity]

    # Inputs.
    latents_in.set_shape([None, latent_size])
    labels_in.set_shape([None, label_size])
    latents_in = tf.cast(latents_in, dtype)
    labels_in = tf.cast(labels_in, dtype)
    x = latents_in

    # Embed labels and concatenate them with latents.
    if label_size:
        with tf.variable_scope('LabelConcat'):
            w = tf.get_variable('weight', shape=[label_size, latent_size], initializer=tf.initializers.random_normal())
            y = tf.matmul(labels_in, tf.cast(w, dtype))
            x = tf.concat([x, y], axis=1)

    # Normalize latents.
    if normalize_latents:
        x = pixel_norm(x)

    # Mapping layers.
    for layer_idx in range(mapping_layers):
        with tf.variable_scope('Dense%d' % layer_idx):
            fmaps = dlatent_size if layer_idx == mapping_layers - 1 else mapping_fmaps
            x = dense(x, fmaps=fmaps, gain=gain, use_wscale=use_wscale, lrmul=mapping_lrmul)
            x = apply_bias(x, lrmul=mapping_lrmul)
            x = act(x)

    # Broadcast.
    if dlatent_broadcast is not None:
        with tf.variable_scope('Broadcast'):
            x = tf.tile(x[:, np.newaxis], [1, dlatent_broadcast, 1])

    # Output.
    assert x.dtype == tf.as_dtype(dtype)
    return tf.identity(x, name='dlatents_out')

#----------------------------------------------------------------------------
# Synthesis network used in the StyleGAN paper.

def G_synthesis(
    dlatents_in,                        # Input: Disentangled latents (W) [minibatch, num_layers, dlatent_size].
    dlatent_size        = 512,          # Disentangled latent (W) dimensionality.
    num_channels        = 3,            # Number of output color channels.
    resolution          = 1024,         # Output resolution.
    fmap_base           = 8192,         # Overall multiplier for the number of feature maps.
    fmap_decay          = 1.0,          # log2 feature map reduction when doubling the resolution.
    fmap_max            = 512,          # Maximum number of feature maps in any layer.
    use_styles          = True,         # Enable style inputs?
    const_input_layer   = True,         # First layer is a learned constant?
    use_noise           = True,         # Enable noise inputs?
    randomize_noise     = True,         # True = randomize noise inputs every time (non-deterministic), False = read noise inputs from variables.
    nonlinearity        = 'lrelu',      # Activation function: 'relu', 'lrelu'
    use_wscale          = True,         # Enable equalized learning rate?
    use_pixel_norm      = False,        # Enable pixelwise feature vector normalization?
    use_instance_norm   = True,         # Enable instance normalization?
    dtype               = 'float32',    # Data type to use for activations and outputs.
    fused_scale         = 'auto',       # True = fused convolution + scaling, False = separate ops, 'auto' = decide automatically.
    blur_filter         = [1,2,1],      # Low-pass filter to apply when resampling activations. None = no filtering.
    structure           = 'auto',       # 'fixed' = no progressive growing, 'linear' = human-readable, 'recursive' = efficient, 'auto' = select automatically.
    is_template_graph   = False,        # True = template graph constructed by the Network class, False = actual evaluation.
    force_clean_graph   = False,        # True = construct a clean graph that looks nice in TensorBoard, False = default behavior.
    **_kwargs):                         # Ignore unrecognized keyword args.
    """Progressive-growing synthesis network: W latents -> NCHW images."""

    resolution_log2 = int(np.log2(resolution))
    assert resolution == 2**resolution_log2 and resolution >= 4
    def nf(stage): return min(int(fmap_base / (2.0 ** (stage * fmap_decay))), fmap_max)
    def blur(x): return blur2d(x, blur_filter) if blur_filter else x
    if is_template_graph: force_clean_graph = True
    if force_clean_graph: randomize_noise = False
    if structure == 'auto': structure = 'linear' if force_clean_graph else 'recursive'
    act, gain = {'relu': (tf.nn.relu, np.sqrt(2)), 'lrelu': (leaky_relu, np.sqrt(2))}[nonlinearity]
    num_layers = resolution_log2 * 2 - 2
    num_styles = num_layers if use_styles else 1
    images_out = None

    # Primary inputs.
    dlatents_in.set_shape([None, num_styles, dlatent_size])
    dlatents_in = tf.cast(dlatents_in, dtype)
    lod_in = tf.cast(tf.get_variable('lod', initializer=np.float32(0), trainable=False), dtype)

    # Noise inputs.
    noise_inputs = []
    if use_noise:
        for layer_idx in range(num_layers):
            res = layer_idx // 2 + 2
            # NOTE(review): use_noise is True here (guarded above), so it
            # evaluates to 1 as the channel dimension.
            shape = [1, use_noise, 2**res, 2**res]
            noise_inputs.append(tf.get_variable('noise%d' % layer_idx, shape=shape, initializer=tf.initializers.random_normal(), trainable=False))

    # Things to do at the end of each layer.
    def layer_epilogue(x, layer_idx):
        if use_noise:
            x = apply_noise(x, noise_inputs[layer_idx], randomize_noise=randomize_noise)
        x = apply_bias(x)
        x = act(x)
        if use_pixel_norm:
            x = pixel_norm(x)
        if use_instance_norm:
            x = instance_norm(x)
        if use_styles:
            x = style_mod(x, dlatents_in[:, layer_idx], use_wscale=use_wscale)
        return x

    # Early layers.
    with tf.variable_scope('4x4'):
        if const_input_layer:
            with tf.variable_scope('Const'):
                x = tf.get_variable('const', shape=[1, nf(1), 4, 4], initializer=tf.initializers.ones())
                x = layer_epilogue(tf.tile(tf.cast(x, dtype), [tf.shape(dlatents_in)[0], 1, 1, 1]), 0)
        else:
            with tf.variable_scope('Dense'):
                x = dense(dlatents_in[:, 0], fmaps=nf(1)*16, gain=gain/4, use_wscale=use_wscale) # tweak gain to match the official implementation of Progressing GAN
                x = layer_epilogue(tf.reshape(x, [-1, nf(1), 4, 4]), 0)
        with tf.variable_scope('Conv'):
            x = layer_epilogue(conv2d(x, fmaps=nf(1), kernel=3, gain=gain, use_wscale=use_wscale), 1)

    # Building blocks for remaining layers.
    def block(res, x): # res = 3..resolution_log2
        with tf.variable_scope('%dx%d' % (2**res, 2**res)):
            with tf.variable_scope('Conv0_up'):
                x = layer_epilogue(blur(upscale2d_conv2d(x, fmaps=nf(res-1), kernel=3, gain=gain, use_wscale=use_wscale, fused_scale=fused_scale)), res*2-4)
            with tf.variable_scope('Conv1'):
                x = layer_epilogue(conv2d(x, fmaps=nf(res-1), kernel=3, gain=gain, use_wscale=use_wscale), res*2-3)
            return x
    def torgb(res, x): # res = 2..resolution_log2
        lod = resolution_log2 - res
        with tf.variable_scope('ToRGB_lod%d' % lod):
            return apply_bias(conv2d(x, fmaps=num_channels, kernel=1, gain=1, use_wscale=use_wscale))

    # Fixed structure: simple and efficient, but does not support progressive growing.
    if structure == 'fixed':
        for res in range(3, resolution_log2 + 1):
            x = block(res, x)
        images_out = torgb(resolution_log2, x)

    # Linear structure: simple but inefficient.
    if structure == 'linear':
        images_out = torgb(2, x)
        for res in range(3, resolution_log2 + 1):
            lod = resolution_log2 - res
            x = block(res, x)
            img = torgb(res, x)
            images_out = upscale2d(images_out)
            with tf.variable_scope('Grow_lod%d' % lod):
                images_out = tflib.lerp_clip(img, images_out, lod_in - lod)

    # Recursive structure: complex but efficient.
    if structure == 'recursive':
        def cset(cur_lambda, new_cond, new_lambda):
            return lambda: tf.cond(new_cond, new_lambda, cur_lambda)
        def grow(x, res, lod):
            y = block(res, x)
            img = lambda: upscale2d(torgb(res, y), 2**lod)
            img = cset(img, (lod_in > lod), lambda: upscale2d(tflib.lerp(torgb(res, y), upscale2d(torgb(res - 1, x)), lod_in - lod), 2**lod))
            if lod > 0: img = cset(img, (lod_in < lod), lambda: grow(y, res + 1, lod - 1))
            return img()
        images_out = grow(x, 3, resolution_log2 - 3)

    assert images_out.dtype == tf.as_dtype(dtype)
    return tf.identity(images_out, name='images_out')

#----------------------------------------------------------------------------
# Discriminator used in the StyleGAN paper.

def D_basic(
    images_in,                          # First input: Images [minibatch, channel, height, width].
    labels_in,                          # Second input: Labels [minibatch, label_size].
    num_channels        = 1,            # Number of input color channels. Overridden based on dataset.
    resolution          = 32,           # Input resolution. Overridden based on dataset.
    label_size          = 0,            # Dimensionality of the labels, 0 if no labels. Overridden based on dataset.
    fmap_base           = 8192,         # Overall multiplier for the number of feature maps.
    fmap_decay          = 1.0,          # log2 feature map reduction when doubling the resolution.
    fmap_max            = 512,          # Maximum number of feature maps in any layer.
    nonlinearity        = 'lrelu',      # Activation function: 'relu', 'lrelu',
    use_wscale          = True,         # Enable equalized learning rate?
    mbstd_group_size    = 4,            # Group size for the minibatch standard deviation layer, 0 = disable.
    mbstd_num_features  = 1,            # Number of features for the minibatch standard deviation layer.
    dtype               = 'float32',    # Data type to use for activations and outputs.
    fused_scale         = 'auto',       # True = fused convolution + scaling, False = separate ops, 'auto' = decide automatically.
    blur_filter         = [1,2,1],      # Low-pass filter to apply when resampling activations. None = no filtering.
    structure           = 'auto',       # 'fixed' = no progressive growing, 'linear' = human-readable, 'recursive' = efficient, 'auto' = select automatically.
    is_template_graph   = False,        # True = template graph constructed by the Network class, False = actual evaluation.
    **_kwargs):                         # Ignore unrecognized keyword args.
    """Progressive-growing discriminator: NCHW images -> realness scores."""

    resolution_log2 = int(np.log2(resolution))
    assert resolution == 2**resolution_log2 and resolution >= 4
    def nf(stage): return min(int(fmap_base / (2.0 ** (stage * fmap_decay))), fmap_max)
    def blur(x): return blur2d(x, blur_filter) if blur_filter else x
    if structure == 'auto': structure = 'linear' if is_template_graph else 'recursive'
    act, gain = {'relu': (tf.nn.relu, np.sqrt(2)), 'lrelu': (leaky_relu, np.sqrt(2))}[nonlinearity]

    images_in.set_shape([None, num_channels, resolution, resolution])
    labels_in.set_shape([None, label_size])
    images_in = tf.cast(images_in, dtype)
    labels_in = tf.cast(labels_in, dtype)
    lod_in = tf.cast(tf.get_variable('lod', initializer=np.float32(0.0), trainable=False), dtype)
    scores_out = None

    # Building blocks.
    def fromrgb(x, res): # res = 2..resolution_log2
        with tf.variable_scope('FromRGB_lod%d' % (resolution_log2 - res)):
            return act(apply_bias(conv2d(x, fmaps=nf(res-1), kernel=1, gain=gain, use_wscale=use_wscale)))
    def block(x, res): # res = 2..resolution_log2
        with tf.variable_scope('%dx%d' % (2**res, 2**res)):
            if res >= 3: # 8x8 and up
                with tf.variable_scope('Conv0'):
                    x = act(apply_bias(conv2d(x, fmaps=nf(res-1), kernel=3, gain=gain, use_wscale=use_wscale)))
                with tf.variable_scope('Conv1_down'):
                    x = act(apply_bias(conv2d_downscale2d(blur(x), fmaps=nf(res-2), kernel=3, gain=gain, use_wscale=use_wscale, fused_scale=fused_scale)))
            else: # 4x4
                if mbstd_group_size > 1:
                    x = minibatch_stddev_layer(x, mbstd_group_size, mbstd_num_features)
                with tf.variable_scope('Conv'):
                    x = act(apply_bias(conv2d(x, fmaps=nf(res-1), kernel=3, gain=gain, use_wscale=use_wscale)))
                with tf.variable_scope('Dense0'):
                    x = act(apply_bias(dense(x, fmaps=nf(res-2), gain=gain, use_wscale=use_wscale)))
                with tf.variable_scope('Dense1'):
                    x = apply_bias(dense(x, fmaps=max(label_size, 1), gain=1, use_wscale=use_wscale))
            return x

    # Fixed structure: simple and efficient, but does not support progressive growing.
    if structure == 'fixed':
        x = fromrgb(images_in, resolution_log2)
        for res in range(resolution_log2, 2, -1):
            x = block(x, res)
        scores_out = block(x, 2)

    # Linear structure: simple but inefficient.
    if structure == 'linear':
        img = images_in
        x = fromrgb(img, resolution_log2)
        for res in range(resolution_log2, 2, -1):
            lod = resolution_log2 - res
            x = block(x, res)
            img = downscale2d(img)
            y = fromrgb(img, res - 1)
            with tf.variable_scope('Grow_lod%d' % lod):
                x = tflib.lerp_clip(x, y, lod_in - lod)
        scores_out = block(x, 2)

    # Recursive structure: complex but efficient.
    if structure == 'recursive':
        def cset(cur_lambda, new_cond, new_lambda):
            return lambda: tf.cond(new_cond, new_lambda, cur_lambda)
        def grow(res, lod):
            x = lambda: fromrgb(downscale2d(images_in, 2**lod), res)
            if lod > 0: x = cset(x, (lod_in < lod), lambda: grow(res + 1, lod - 1))
            x = block(x(), res); y = lambda: x
            if res > 2: y = cset(y, (lod_in > lod), lambda: tflib.lerp(x, fromrgb(downscale2d(images_in, 2**(lod+1)), res - 1), lod_in - lod))
            return y()
        scores_out = grow(2, resolution_log2 - 2)

    # Label conditioning from "Which Training Methods for GANs do actually Converge?"
    if label_size:
        with tf.variable_scope('LabelSwitch'):
            scores_out = tf.reduce_sum(scores_out * labels_in, axis=1, keepdims=True)

    assert scores_out.dtype == tf.as_dtype(dtype)
    scores_out = tf.identity(scores_out, name='scores_out')
    return scores_out

#----------------------------------------------------------------------------
diff --git a/insightface/reconstruction/ostec/external/stylegan2/training/networks_stylegan2.py b/insightface/reconstruction/ostec/external/stylegan2/training/networks_stylegan2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c96fc19207c943da2fb2e4941816175299e79f8
--- /dev/null
+++ b/insightface/reconstruction/ostec/external/stylegan2/training/networks_stylegan2.py
@@ -0,0 +1,697 @@
# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, visit
# https://nvlabs.github.io/stylegan2/license.html

"""Network architectures used in the StyleGAN2 paper."""

import numpy as np
import tensorflow as tf
import dnnlib
import dnnlib.tflib as tflib
from dnnlib.tflib.ops.upfirdn_2d import upsample_2d, downsample_2d, upsample_conv_2d, conv_downsample_2d
from dnnlib.tflib.ops.fused_bias_act import fused_bias_act

# NOTE: Do not import any application-specific modules here!
+# Specify all network parameters as kwargs. + +#---------------------------------------------------------------------------- +# Get/create weight tensor for a convolution or fully-connected layer. + +def get_weight(shape, gain=1, use_wscale=True, lrmul=1, weight_var='weight'): + fan_in = np.prod(shape[:-1]) # [kernel, kernel, fmaps_in, fmaps_out] or [in, out] + he_std = gain / np.sqrt(fan_in) # He init + + # Equalized learning rate and custom learning rate multiplier. + if use_wscale: + init_std = 1.0 / lrmul + runtime_coef = he_std * lrmul + else: + init_std = he_std / lrmul + runtime_coef = lrmul + + # Create variable. + init = tf.initializers.random_normal(0, init_std) + return tf.get_variable(weight_var, shape=shape, initializer=init) * runtime_coef + +#---------------------------------------------------------------------------- +# Fully-connected layer. + +def dense_layer(x, fmaps, gain=1, use_wscale=True, lrmul=1, weight_var='weight'): + if len(x.shape) > 2: + x = tf.reshape(x, [-1, np.prod([d.value for d in x.shape[1:]])]) + w = get_weight([x.shape[1].value, fmaps], gain=gain, use_wscale=use_wscale, lrmul=lrmul, weight_var=weight_var) + w = tf.cast(w, x.dtype) + return tf.matmul(x, w) + +#---------------------------------------------------------------------------- +# Convolution layer with optional upsampling or downsampling. 
+ +def conv2d_layer(x, fmaps, kernel, up=False, down=False, resample_kernel=None, gain=1, use_wscale=True, lrmul=1, weight_var='weight'): + assert not (up and down) + assert kernel >= 1 and kernel % 2 == 1 + w = get_weight([kernel, kernel, x.shape[1].value, fmaps], gain=gain, use_wscale=use_wscale, lrmul=lrmul, weight_var=weight_var) + if up: + x = upsample_conv_2d(x, tf.cast(w, x.dtype), data_format='NCHW', k=resample_kernel) + elif down: + x = conv_downsample_2d(x, tf.cast(w, x.dtype), data_format='NCHW', k=resample_kernel) + else: + x = tf.nn.conv2d(x, tf.cast(w, x.dtype), data_format='NCHW', strides=[1,1,1,1], padding='SAME') + return x + +#---------------------------------------------------------------------------- +# Apply bias and activation func. + +def apply_bias_act(x, act='linear', alpha=None, gain=None, lrmul=1, bias_var='bias'): + b = tf.get_variable(bias_var, shape=[x.shape[1]], initializer=tf.initializers.zeros()) * lrmul + return fused_bias_act(x, b=tf.cast(b, x.dtype), act=act, alpha=alpha, gain=gain) + +#---------------------------------------------------------------------------- +# Naive upsampling (nearest neighbor) and downsampling (average pooling). + +def naive_upsample_2d(x, factor=2): + with tf.variable_scope('NaiveUpsample'): + _N, C, H, W = x.shape.as_list() + x = tf.reshape(x, [-1, C, H, 1, W, 1]) + x = tf.tile(x, [1, 1, 1, factor, 1, factor]) + return tf.reshape(x, [-1, C, H * factor, W * factor]) + +def naive_downsample_2d(x, factor=2): + with tf.variable_scope('NaiveDownsample'): + _N, C, H, W = x.shape.as_list() + x = tf.reshape(x, [-1, C, H // factor, factor, W // factor, factor]) + return tf.reduce_mean(x, axis=[3,5]) + +#---------------------------------------------------------------------------- +# Modulated convolution layer. 

def modulated_conv2d_layer(x, y, fmaps, kernel, up=False, down=False, demodulate=True, resample_kernel=None, gain=1, use_wscale=True, lrmul=1, fused_modconv=True, weight_var='weight', mod_weight_var='mod_weight', mod_bias_var='mod_bias'):
    """StyleGAN2 modulated convolution: scale weights per-sample by style `y`,
    optionally demodulate, then convolve with optional up/downsampling.

    fused_modconv folds the per-sample scaling into grouped-convolution
    weights (one group per sample); the non-fused path scales activations
    instead. Both paths are mathematically equivalent.
    """
    assert not (up and down)
    assert kernel >= 1 and kernel % 2 == 1

    # Get weight.
    w = get_weight([kernel, kernel, x.shape[1].value, fmaps], gain=gain, use_wscale=use_wscale, lrmul=lrmul, weight_var=weight_var)
    ww = w[np.newaxis] # [BkkIO] Introduce minibatch dimension.

    # Modulate.
    s = dense_layer(y, fmaps=x.shape[1].value, weight_var=mod_weight_var) # [BI] Transform incoming W to style.
    s = apply_bias_act(s, bias_var=mod_bias_var) + 1 # [BI] Add bias (initially 1).
    ww *= tf.cast(s[:, np.newaxis, np.newaxis, :, np.newaxis], w.dtype) # [BkkIO] Scale input feature maps.

    # Demodulate.
    if demodulate:
        d = tf.rsqrt(tf.reduce_sum(tf.square(ww), axis=[1,2,3]) + 1e-8) # [BO] Scaling factor.
        ww *= d[:, np.newaxis, np.newaxis, np.newaxis, :] # [BkkIO] Scale output feature maps.

    # Reshape/scale input.
    if fused_modconv:
        x = tf.reshape(x, [1, -1, x.shape[2], x.shape[3]]) # Fused => reshape minibatch to convolution groups.
        w = tf.reshape(tf.transpose(ww, [1, 2, 3, 0, 4]), [ww.shape[1], ww.shape[2], ww.shape[3], -1])
    else:
        x *= tf.cast(s[:, :, np.newaxis, np.newaxis], x.dtype) # [BIhw] Not fused => scale input activations.

    # Convolution with optional up/downsampling.
    if up:
        x = upsample_conv_2d(x, tf.cast(w, x.dtype), data_format='NCHW', k=resample_kernel)
    elif down:
        x = conv_downsample_2d(x, tf.cast(w, x.dtype), data_format='NCHW', k=resample_kernel)
    else:
        x = tf.nn.conv2d(x, tf.cast(w, x.dtype), data_format='NCHW', strides=[1,1,1,1], padding='SAME')

    # Reshape/scale output.
    if fused_modconv:
        x = tf.reshape(x, [-1, fmaps, x.shape[2], x.shape[3]]) # Fused => reshape convolution groups back to minibatch.
    elif demodulate:
        x *= tf.cast(d[:, :, np.newaxis, np.newaxis], x.dtype) # [BOhw] Not fused => scale output activations.
    return x

#----------------------------------------------------------------------------
# Minibatch standard deviation layer.

def minibatch_stddev_layer(x, group_size=4, num_new_features=1):
    """Append per-group stddev statistics as extra feature map(s)."""
    group_size = tf.minimum(group_size, tf.shape(x)[0])     # Minibatch must be divisible by (or smaller than) group_size.
    s = x.shape                                             # [NCHW]  Input shape.
    y = tf.reshape(x, [group_size, -1, num_new_features, s[1]//num_new_features, s[2], s[3]])   # [GMncHW] Split minibatch into M groups of size G. Split channels into n channel groups c.
    y = tf.cast(y, tf.float32)                              # [GMncHW] Cast to FP32.
    y -= tf.reduce_mean(y, axis=0, keepdims=True)           # [GMncHW] Subtract mean over group.
    y = tf.reduce_mean(tf.square(y), axis=0)                # [MncHW]  Calc variance over group.
    y = tf.sqrt(y + 1e-8)                                   # [MncHW]  Calc stddev over group.
    y = tf.reduce_mean(y, axis=[2,3,4], keepdims=True)      # [Mn111]  Take average over fmaps and pixels.
    y = tf.reduce_mean(y, axis=[2])                         # [Mn11] Split channels into c channel groups
    y = tf.cast(y, x.dtype)                                 # [Mn11]  Cast back to original data type.
    y = tf.tile(y, [group_size, 1, s[2], s[3]])             # [NnHW]  Replicate over group and pixels.
    return tf.concat([x, y], axis=1)                        # [NCHW]  Append as new fmap.

#----------------------------------------------------------------------------
# Main generator network.
# Composed of two sub-networks (mapping and synthesis) that are defined below.
# Used in configs B-F (Table 1).

def G_main(
    latents_in,                                     # First input: Latent vectors (Z) [minibatch, latent_size].
    labels_in,                                      # Second input: Conditioning labels [minibatch, label_size].
    truncation_psi          = 0.5,                  # Style strength multiplier for the truncation trick. None = disable.
    truncation_cutoff       = None,                 # Number of layers for which to apply the truncation trick. None = disable.
    truncation_psi_val      = None,                 # Value for truncation_psi to use during validation.
    truncation_cutoff_val   = None,                 # Value for truncation_cutoff to use during validation.
    dlatent_avg_beta        = 0.995,                # Decay for tracking the moving average of W during training. None = disable.
    style_mixing_prob       = 0.9,                  # Probability of mixing styles during training. None = disable.
    is_training             = False,                # Network is under training? Enables and disables specific features.
    is_validation           = False,                # Network is under validation? Chooses which value to use for truncation_psi.
    return_dlatents         = False,                # Return dlatents in addition to the images?
    is_template_graph       = False,                # True = template graph constructed by the Network class, False = actual evaluation.
    components              = dnnlib.EasyDict(),    # Container for sub-networks. Retained between calls.
    mapping_func            = 'G_mapping',          # Build func name for the mapping network.
    synthesis_func          = 'G_synthesis_stylegan2', # Build func name for the synthesis network.
    **kwargs):                                      # Arguments for sub-networks (mapping and synthesis).
    """Full StyleGAN2 generator: mapping + synthesis with truncation/mixing.

    NOTE(review): `components` as a mutable default is deliberate — it is
    the cache that persists the sub-networks between calls.
    """

    # Validate arguments.
    assert not is_training or not is_validation
    assert isinstance(components, dnnlib.EasyDict)
    if is_validation:
        truncation_psi = truncation_psi_val
        truncation_cutoff = truncation_cutoff_val
    if is_training or (truncation_psi is not None and not tflib.is_tf_expression(truncation_psi) and truncation_psi == 1):
        truncation_psi = None
    if is_training:
        truncation_cutoff = None
    if not is_training or (dlatent_avg_beta is not None and not tflib.is_tf_expression(dlatent_avg_beta) and dlatent_avg_beta == 1):
        dlatent_avg_beta = None
    if not is_training or (style_mixing_prob is not None and not tflib.is_tf_expression(style_mixing_prob) and style_mixing_prob <= 0):
        style_mixing_prob = None

    # Setup components.
    if 'synthesis' not in components:
        components.synthesis = tflib.Network('G_synthesis', func_name=globals()[synthesis_func], **kwargs)
    num_layers = components.synthesis.input_shape[1]
    dlatent_size = components.synthesis.input_shape[2]
    if 'mapping' not in components:
        components.mapping = tflib.Network('G_mapping', func_name=globals()[mapping_func], dlatent_broadcast=num_layers, **kwargs)

    # Setup variables.
    lod_in = tf.get_variable('lod', initializer=np.float32(0), trainable=False)
    dlatent_avg = tf.get_variable('dlatent_avg', shape=[dlatent_size], initializer=tf.initializers.zeros(), trainable=False)

    # Evaluate mapping network.
    dlatents = components.mapping.get_output_for(latents_in, labels_in, is_training=is_training, **kwargs)
    dlatents = tf.cast(dlatents, tf.float32)

    # Update moving average of W.
    if dlatent_avg_beta is not None:
        with tf.variable_scope('DlatentAvg'):
            batch_avg = tf.reduce_mean(dlatents[:, 0], axis=0)
            update_op = tf.assign(dlatent_avg, tflib.lerp(batch_avg, dlatent_avg, dlatent_avg_beta))
            with tf.control_dependencies([update_op]):
                dlatents = tf.identity(dlatents)

    # Perform style mixing regularization.
    if style_mixing_prob is not None:
        with tf.variable_scope('StyleMix'):
            latents2 = tf.random_normal(tf.shape(latents_in))
            dlatents2 = components.mapping.get_output_for(latents2, labels_in, is_training=is_training, **kwargs)
            dlatents2 = tf.cast(dlatents2, tf.float32)
            layer_idx = np.arange(num_layers)[np.newaxis, :, np.newaxis]
            cur_layers = num_layers - tf.cast(lod_in, tf.int32) * 2
            # With probability style_mixing_prob, pick a random crossover
            # layer; otherwise use all layers from the first latent.
            mixing_cutoff = tf.cond(
                tf.random_uniform([], 0.0, 1.0) < style_mixing_prob,
                lambda: tf.random_uniform([], 1, cur_layers, dtype=tf.int32),
                lambda: cur_layers)
            dlatents = tf.where(tf.broadcast_to(layer_idx < mixing_cutoff, tf.shape(dlatents)), dlatents, dlatents2)

    # Apply truncation trick.
    if truncation_psi is not None:
        with tf.variable_scope('Truncation'):
            layer_idx = np.arange(num_layers)[np.newaxis, :, np.newaxis]
            layer_psi = np.ones(layer_idx.shape, dtype=np.float32)
            if truncation_cutoff is None:
                layer_psi *= truncation_psi
            else:
                layer_psi = tf.where(layer_idx < truncation_cutoff, layer_psi * truncation_psi, layer_psi)
            dlatents = tflib.lerp(dlatent_avg, dlatents, layer_psi)

    # Evaluate synthesis network.
    deps = []
    if 'lod' in components.synthesis.vars:
        deps.append(tf.assign(components.synthesis.vars['lod'], lod_in))
    with tf.control_dependencies(deps):
        images_out = components.synthesis.get_output_for(dlatents, is_training=is_training, force_clean_graph=is_template_graph, **kwargs)

    # Return requested outputs.
    images_out = tf.identity(images_out, name='images_out')
    if return_dlatents:
        return images_out, dlatents
    return images_out

#----------------------------------------------------------------------------
# Mapping network.
# Transforms the input latent code (z) to the disentangled latent code (w).
# Used in configs B-F (Table 1).

def G_mapping(
    latents_in,                             # First input: Latent vectors (Z) [minibatch, latent_size].
    labels_in,                              # Second input: Conditioning labels [minibatch, label_size].
    latent_size             = 512,          # Latent vector (Z) dimensionality.
    label_size              = 0,            # Label dimensionality, 0 if no labels.
    dlatent_size            = 512,          # Disentangled latent (W) dimensionality.
    dlatent_broadcast       = None,         # Output disentangled latent (W) as [minibatch, dlatent_size] or [minibatch, dlatent_broadcast, dlatent_size].
    mapping_layers          = 8,            # Number of mapping layers.
    mapping_fmaps           = 512,          # Number of activations in the mapping layers.
    mapping_lrmul           = 0.01,         # Learning rate multiplier for the mapping layers.
    mapping_nonlinearity    = 'lrelu',      # Activation function: 'relu', 'lrelu', etc.
    normalize_latents       = True,         # Normalize latent vectors (Z) before feeding them to the mapping layers?
+    dtype = 'float32', # Data type to use for activations and outputs.
+    **_kwargs): # Ignore unrecognized keyword args.
+
+    act = mapping_nonlinearity
+
+    # Inputs.
+    latents_in.set_shape([None, latent_size])
+    labels_in.set_shape([None, label_size])
+    latents_in = tf.cast(latents_in, dtype)
+    labels_in = tf.cast(labels_in, dtype)
+    x = latents_in
+
+    # Embed labels and concatenate them with latents.
+    if label_size:
+        with tf.variable_scope('LabelConcat'):
+            w = tf.get_variable('weight', shape=[label_size, latent_size], initializer=tf.initializers.random_normal())
+            y = tf.matmul(labels_in, tf.cast(w, dtype))
+            x = tf.concat([x, y], axis=1)
+
+    # Normalize latents.
+    if normalize_latents:
+        with tf.variable_scope('Normalize'):
+            x *= tf.rsqrt(tf.reduce_mean(tf.square(x), axis=1, keepdims=True) + 1e-8)
+
+    # Mapping layers.
+    # Intermediate layers output mapping_fmaps features; only the final layer
+    # outputs dlatent_size features (the W code).
+    for layer_idx in range(mapping_layers):
+        with tf.variable_scope('Dense%d' % layer_idx):
+            fmaps = dlatent_size if layer_idx == mapping_layers - 1 else mapping_fmaps
+            x = apply_bias_act(dense_layer(x, fmaps=fmaps, lrmul=mapping_lrmul), act=act, lrmul=mapping_lrmul)
+
+    # Broadcast.
+    if dlatent_broadcast is not None:
+        with tf.variable_scope('Broadcast'):
+            x = tf.tile(x[:, np.newaxis], [1, dlatent_broadcast, 1])
+
+    # Output.
+    assert x.dtype == tf.as_dtype(dtype)
+    return tf.identity(x, name='dlatents_out')
+
+#----------------------------------------------------------------------------
+# StyleGAN synthesis network with revised architecture (Figure 2d).
+# Implements progressive growing, but no skip connections or residual nets (Figure 7).
+# Used in configs B-D (Table 1).
+
+def G_synthesis_stylegan_revised(
+    dlatents_in, # Input: Disentangled latents (W) [minibatch, num_layers, dlatent_size].
+    dlatent_size = 512, # Disentangled latent (W) dimensionality.
+    num_channels = 3, # Number of output color channels.
+    resolution = 1024, # Output resolution.
+    fmap_base = 16 << 10, # Overall multiplier for the number of feature maps.
+    fmap_decay = 1.0, # log2 feature map reduction when doubling the resolution.
+    fmap_min = 1, # Minimum number of feature maps in any layer.
+    fmap_max = 512, # Maximum number of feature maps in any layer.
+    randomize_noise = True, # True = randomize noise inputs every time (non-deterministic), False = read noise inputs from variables.
+    nonlinearity = 'lrelu', # Activation function: 'relu', 'lrelu', etc.
+    dtype = 'float32', # Data type to use for activations and outputs.
+    resample_kernel = [1,3,3,1], # Low-pass filter to apply when resampling activations. None = no filtering.
+    fused_modconv = True, # Implement modulated_conv2d_layer() as a single fused op?
+    structure = 'auto', # 'fixed' = no progressive growing, 'linear' = human-readable, 'recursive' = efficient, 'auto' = select automatically.
+    is_template_graph = False, # True = template graph constructed by the Network class, False = actual evaluation.
+    force_clean_graph = False, # True = construct a clean graph that looks nice in TensorBoard, False = default behavior.
+    **_kwargs): # Ignore unrecognized keyword args.
+
+    resolution_log2 = int(np.log2(resolution))
+    assert resolution == 2**resolution_log2 and resolution >= 4
+    def nf(stage): return np.clip(int(fmap_base / (2.0 ** (stage * fmap_decay))), fmap_min, fmap_max) # Feature-map count for a given resolution stage.
+    if is_template_graph: force_clean_graph = True
+    if force_clean_graph: randomize_noise = False
+    if structure == 'auto': structure = 'linear' if force_clean_graph else 'recursive'
+    act = nonlinearity
+    num_layers = resolution_log2 * 2 - 2
+    images_out = None
+
+    # Primary inputs.
+    dlatents_in.set_shape([None, num_layers, dlatent_size])
+    dlatents_in = tf.cast(dlatents_in, dtype)
+    lod_in = tf.cast(tf.get_variable('lod', initializer=np.float32(0), trainable=False), dtype)
+
+    # Noise inputs.
+    noise_inputs = []
+    for layer_idx in range(num_layers - 1):
+        res = (layer_idx + 5) // 2
+        shape = [1, 1, 2**res, 2**res]
+        noise_inputs.append(tf.get_variable('noise%d' % layer_idx, shape=shape, initializer=tf.initializers.random_normal(), trainable=False))
+
+    # Single convolution layer with all the bells and whistles.
+    def layer(x, layer_idx, fmaps, kernel, up=False):
+        x = modulated_conv2d_layer(x, dlatents_in[:, layer_idx], fmaps=fmaps, kernel=kernel, up=up, resample_kernel=resample_kernel, fused_modconv=fused_modconv)
+        if randomize_noise:
+            noise = tf.random_normal([tf.shape(x)[0], 1, x.shape[2], x.shape[3]], dtype=x.dtype)
+        else:
+            noise = tf.cast(noise_inputs[layer_idx], x.dtype)
+        noise_strength = tf.get_variable('noise_strength', shape=[], initializer=tf.initializers.zeros())
+        x += noise * tf.cast(noise_strength, x.dtype)
+        return apply_bias_act(x, act=act)
+
+    # Early layers.
+    with tf.variable_scope('4x4'):
+        with tf.variable_scope('Const'):
+            x = tf.get_variable('const', shape=[1, nf(1), 4, 4], initializer=tf.initializers.random_normal())
+            x = tf.tile(tf.cast(x, dtype), [tf.shape(dlatents_in)[0], 1, 1, 1])
+        with tf.variable_scope('Conv'):
+            x = layer(x, layer_idx=0, fmaps=nf(1), kernel=3)
+
+    # Building blocks for remaining layers.
+    def block(res, x): # res = 3..resolution_log2
+        with tf.variable_scope('%dx%d' % (2**res, 2**res)):
+            with tf.variable_scope('Conv0_up'):
+                x = layer(x, layer_idx=res*2-5, fmaps=nf(res-1), kernel=3, up=True)
+            with tf.variable_scope('Conv1'):
+                x = layer(x, layer_idx=res*2-4, fmaps=nf(res-1), kernel=3)
+            return x
+    def torgb(res, x): # res = 2..resolution_log2
+        with tf.variable_scope('ToRGB_lod%d' % (resolution_log2 - res)):
+            return apply_bias_act(modulated_conv2d_layer(x, dlatents_in[:, res*2-3], fmaps=num_channels, kernel=1, demodulate=False, fused_modconv=fused_modconv))
+
+    # Fixed structure: simple and efficient, but does not support progressive growing.
+    if structure == 'fixed':
+        for res in range(3, resolution_log2 + 1):
+            x = block(res, x)
+        images_out = torgb(resolution_log2, x)
+
+    # Linear structure: simple but inefficient.
+    if structure == 'linear':
+        images_out = torgb(2, x)
+        for res in range(3, resolution_log2 + 1):
+            lod = resolution_log2 - res
+            x = block(res, x)
+            img = torgb(res, x)
+            with tf.variable_scope('Upsample_lod%d' % lod):
+                images_out = upsample_2d(images_out)
+            with tf.variable_scope('Grow_lod%d' % lod):
+                images_out = tflib.lerp_clip(img, images_out, lod_in - lod)
+
+    # Recursive structure: complex but efficient.
+    # cset() chains tf.cond branches so that only the sub-graph matching the
+    # current lod_in is evaluated at runtime.
+    if structure == 'recursive':
+        def cset(cur_lambda, new_cond, new_lambda):
+            return lambda: tf.cond(new_cond, new_lambda, cur_lambda)
+        def grow(x, res, lod):
+            y = block(res, x)
+            img = lambda: naive_upsample_2d(torgb(res, y), factor=2**lod)
+            img = cset(img, (lod_in > lod), lambda: naive_upsample_2d(tflib.lerp(torgb(res, y), upsample_2d(torgb(res - 1, x)), lod_in - lod), factor=2**lod))
+            if lod > 0: img = cset(img, (lod_in < lod), lambda: grow(y, res + 1, lod - 1))
+            return img()
+        images_out = grow(x, 3, resolution_log2 - 3)
+
+    assert images_out.dtype == tf.as_dtype(dtype)
+    return tf.identity(images_out, name='images_out')
+
+#----------------------------------------------------------------------------
+# StyleGAN2 synthesis network (Figure 7).
+# Implements skip connections and residual nets (Figure 7), but no progressive growing.
+# Used in configs E-F (Table 1).
+
+def G_synthesis_stylegan2(
+    dlatents_in, # Input: Disentangled latents (W) [minibatch, num_layers, dlatent_size].
+    dlatent_size = 512, # Disentangled latent (W) dimensionality.
+    num_channels = 3, # Number of output color channels.
+    resolution = 1024, # Output resolution.
+    fmap_base = 16 << 10, # Overall multiplier for the number of feature maps.
+    fmap_decay = 1.0, # log2 feature map reduction when doubling the resolution.
+    fmap_min = 1, # Minimum number of feature maps in any layer.
+    fmap_max = 512, # Maximum number of feature maps in any layer.
+    randomize_noise = True, # True = randomize noise inputs every time (non-deterministic), False = read noise inputs from variables.
+    architecture = 'skip', # Architecture: 'orig', 'skip', 'resnet'.
+    nonlinearity = 'lrelu', # Activation function: 'relu', 'lrelu', etc.
+    dtype = 'float32', # Data type to use for activations and outputs.
+    resample_kernel = [1,3,3,1], # Low-pass filter to apply when resampling activations. None = no filtering.
+    fused_modconv = True, # Implement modulated_conv2d_layer() as a single fused op?
+    **_kwargs): # Ignore unrecognized keyword args.
+
+    resolution_log2 = int(np.log2(resolution))
+    assert resolution == 2**resolution_log2 and resolution >= 4
+    def nf(stage): return np.clip(int(fmap_base / (2.0 ** (stage * fmap_decay))), fmap_min, fmap_max) # Feature-map count for a given resolution stage.
+    assert architecture in ['orig', 'skip', 'resnet']
+    act = nonlinearity
+    num_layers = resolution_log2 * 2 - 2
+    images_out = None
+
+    # Primary inputs.
+    dlatents_in.set_shape([None, num_layers, dlatent_size])
+    dlatents_in = tf.cast(dlatents_in, dtype)
+
+    # Noise inputs.
+    noise_inputs = []
+    for layer_idx in range(num_layers - 1):
+        res = (layer_idx + 5) // 2
+        shape = [1, 1, 2**res, 2**res]
+        noise_inputs.append(tf.get_variable('noise%d' % layer_idx, shape=shape, initializer=tf.initializers.random_normal(), trainable=False))
+
+    # Single convolution layer with all the bells and whistles.
+    def layer(x, layer_idx, fmaps, kernel, up=False):
+        x = modulated_conv2d_layer(x, dlatents_in[:, layer_idx], fmaps=fmaps, kernel=kernel, up=up, resample_kernel=resample_kernel, fused_modconv=fused_modconv)
+        if randomize_noise:
+            noise = tf.random_normal([tf.shape(x)[0], 1, x.shape[2], x.shape[3]], dtype=x.dtype)
+        else:
+            noise = tf.cast(noise_inputs[layer_idx], x.dtype)
+        noise_strength = tf.get_variable('noise_strength', shape=[], initializer=tf.initializers.zeros())
+        x += noise * tf.cast(noise_strength, x.dtype)
+        return apply_bias_act(x, act=act)
+
+    # Building blocks for main layers.
+    def block(x, res): # res = 3..resolution_log2
+        t = x
+        with tf.variable_scope('Conv0_up'):
+            x = layer(x, layer_idx=res*2-5, fmaps=nf(res-1), kernel=3, up=True)
+        with tf.variable_scope('Conv1'):
+            x = layer(x, layer_idx=res*2-4, fmaps=nf(res-1), kernel=3)
+        if architecture == 'resnet':
+            with tf.variable_scope('Skip'):
+                t = conv2d_layer(t, fmaps=nf(res-1), kernel=1, up=True, resample_kernel=resample_kernel)
+                x = (x + t) * (1 / np.sqrt(2)) # Scale the residual sum to keep activation magnitudes roughly constant.
+        return x
+    def upsample(y):
+        with tf.variable_scope('Upsample'):
+            return upsample_2d(y, k=resample_kernel)
+    def torgb(x, y, res): # res = 2..resolution_log2
+        with tf.variable_scope('ToRGB'):
+            t = apply_bias_act(modulated_conv2d_layer(x, dlatents_in[:, res*2-3], fmaps=num_channels, kernel=1, demodulate=False, fused_modconv=fused_modconv))
+            return t if y is None else y + t # 'skip' accumulates RGB contributions across resolutions.
+
+    # Early layers.
+    y = None
+    with tf.variable_scope('4x4'):
+        with tf.variable_scope('Const'):
+            x = tf.get_variable('const', shape=[1, nf(1), 4, 4], initializer=tf.initializers.random_normal())
+            x = tf.tile(tf.cast(x, dtype), [tf.shape(dlatents_in)[0], 1, 1, 1])
+        with tf.variable_scope('Conv'):
+            x = layer(x, layer_idx=0, fmaps=nf(1), kernel=3)
+        if architecture == 'skip':
+            y = torgb(x, y, 2)
+
+    # Main layers.
+    # Grow from 8x8 up to the final resolution; 'skip' accumulates the RGB
+    # image at every resolution, other architectures convert only at the end.
+    for res in range(3, resolution_log2 + 1):
+        with tf.variable_scope('%dx%d' % (2**res, 2**res)):
+            x = block(x, res)
+            if architecture == 'skip':
+                y = upsample(y)
+            if architecture == 'skip' or res == resolution_log2:
+                y = torgb(x, y, res)
+    images_out = y
+
+    assert images_out.dtype == tf.as_dtype(dtype)
+    return tf.identity(images_out, name='images_out')
+
+#----------------------------------------------------------------------------
+# Original StyleGAN discriminator.
+# Used in configs B-D (Table 1).
+
+def D_stylegan(
+    images_in, # First input: Images [minibatch, channel, height, width].
+    labels_in, # Second input: Labels [minibatch, label_size].
+    num_channels = 3, # Number of input color channels. Overridden based on dataset.
+    resolution = 1024, # Input resolution. Overridden based on dataset.
+    label_size = 0, # Dimensionality of the labels, 0 if no labels. Overridden based on dataset.
+    fmap_base = 16 << 10, # Overall multiplier for the number of feature maps.
+    fmap_decay = 1.0, # log2 feature map reduction when doubling the resolution.
+    fmap_min = 1, # Minimum number of feature maps in any layer.
+    fmap_max = 512, # Maximum number of feature maps in any layer.
+    nonlinearity = 'lrelu', # Activation function: 'relu', 'lrelu', etc.
+    mbstd_group_size = 4, # Group size for the minibatch standard deviation layer, 0 = disable.
+    mbstd_num_features = 1, # Number of features for the minibatch standard deviation layer.
+    dtype = 'float32', # Data type to use for activations and outputs.
+    resample_kernel = [1,3,3,1], # Low-pass filter to apply when resampling activations. None = no filtering.
+    structure = 'auto', # 'fixed' = no progressive growing, 'linear' = human-readable, 'recursive' = efficient, 'auto' = select automatically.
+    is_template_graph = False, # True = template graph constructed by the Network class, False = actual evaluation.
+    **_kwargs): # Ignore unrecognized keyword args.
+
+    resolution_log2 = int(np.log2(resolution))
+    assert resolution == 2**resolution_log2 and resolution >= 4
+    def nf(stage): return np.clip(int(fmap_base / (2.0 ** (stage * fmap_decay))), fmap_min, fmap_max) # Feature-map count for a given resolution stage.
+    if structure == 'auto': structure = 'linear' if is_template_graph else 'recursive'
+    act = nonlinearity
+
+    images_in.set_shape([None, num_channels, resolution, resolution])
+    labels_in.set_shape([None, label_size])
+    images_in = tf.cast(images_in, dtype)
+    labels_in = tf.cast(labels_in, dtype)
+    lod_in = tf.cast(tf.get_variable('lod', initializer=np.float32(0.0), trainable=False), dtype)
+
+    # Building blocks for spatial layers.
+    def fromrgb(x, res): # res = 2..resolution_log2
+        with tf.variable_scope('FromRGB_lod%d' % (resolution_log2 - res)):
+            return apply_bias_act(conv2d_layer(x, fmaps=nf(res-1), kernel=1), act=act)
+    def block(x, res): # res = 2..resolution_log2
+        with tf.variable_scope('%dx%d' % (2**res, 2**res)):
+            with tf.variable_scope('Conv0'):
+                x = apply_bias_act(conv2d_layer(x, fmaps=nf(res-1), kernel=3), act=act)
+            with tf.variable_scope('Conv1_down'):
+                x = apply_bias_act(conv2d_layer(x, fmaps=nf(res-2), kernel=3, down=True, resample_kernel=resample_kernel), act=act)
+            return x
+
+    # Fixed structure: simple and efficient, but does not support progressive growing.
+    if structure == 'fixed':
+        x = fromrgb(images_in, resolution_log2)
+        for res in range(resolution_log2, 2, -1):
+            x = block(x, res)
+
+    # Linear structure: simple but inefficient.
+    if structure == 'linear':
+        img = images_in
+        x = fromrgb(img, resolution_log2)
+        for res in range(resolution_log2, 2, -1):
+            lod = resolution_log2 - res
+            x = block(x, res)
+            with tf.variable_scope('Downsample_lod%d' % lod):
+                img = downsample_2d(img)
+            y = fromrgb(img, res - 1)
+            with tf.variable_scope('Grow_lod%d' % lod):
+                x = tflib.lerp_clip(x, y, lod_in - lod)
+
+    # Recursive structure: complex but efficient.
+    # cset() chains tf.cond branches so that only the sub-graph matching the
+    # current lod_in is evaluated at runtime.
+    if structure == 'recursive':
+        def cset(cur_lambda, new_cond, new_lambda):
+            return lambda: tf.cond(new_cond, new_lambda, cur_lambda)
+        def grow(res, lod):
+            x = lambda: fromrgb(naive_downsample_2d(images_in, factor=2**lod), res)
+            if lod > 0: x = cset(x, (lod_in < lod), lambda: grow(res + 1, lod - 1))
+            x = block(x(), res); y = lambda: x
+            y = cset(y, (lod_in > lod), lambda: tflib.lerp(x, fromrgb(naive_downsample_2d(images_in, factor=2**(lod+1)), res - 1), lod_in - lod))
+            return y()
+        x = grow(3, resolution_log2 - 3)
+
+    # Final layers at 4x4 resolution.
+    with tf.variable_scope('4x4'):
+        if mbstd_group_size > 1:
+            with tf.variable_scope('MinibatchStddev'):
+                x = minibatch_stddev_layer(x, mbstd_group_size, mbstd_num_features)
+        with tf.variable_scope('Conv'):
+            x = apply_bias_act(conv2d_layer(x, fmaps=nf(1), kernel=3), act=act)
+        with tf.variable_scope('Dense0'):
+            x = apply_bias_act(dense_layer(x, fmaps=nf(0)), act=act)
+
+    # Output layer with label conditioning from "Which Training Methods for GANs do actually Converge?"
+    with tf.variable_scope('Output'):
+        x = apply_bias_act(dense_layer(x, fmaps=max(labels_in.shape[1], 1)))
+        if labels_in.shape[1] > 0:
+            x = tf.reduce_sum(x * labels_in, axis=1, keepdims=True)
+    scores_out = x
+
+    # Output.
+    assert scores_out.dtype == tf.as_dtype(dtype)
+    scores_out = tf.identity(scores_out, name='scores_out')
+    return scores_out
+
+#----------------------------------------------------------------------------
+# StyleGAN2 discriminator (Figure 7).
+# Implements skip connections and residual nets (Figure 7), but no progressive growing.
+# Used in configs E-F (Table 1).
+
+def D_stylegan2(
+    images_in, # First input: Images [minibatch, channel, height, width].
+    labels_in, # Second input: Labels [minibatch, label_size].
+    num_channels = 3, # Number of input color channels. Overridden based on dataset.
+    resolution = 1024, # Input resolution. Overridden based on dataset.
+    label_size = 0, # Dimensionality of the labels, 0 if no labels. Overridden based on dataset.
+    fmap_base = 16 << 10, # Overall multiplier for the number of feature maps.
+    fmap_decay = 1.0, # log2 feature map reduction when doubling the resolution.
+    fmap_min = 1, # Minimum number of feature maps in any layer.
+    fmap_max = 512, # Maximum number of feature maps in any layer.
+    architecture = 'resnet', # Architecture: 'orig', 'skip', 'resnet'.
+    nonlinearity = 'lrelu', # Activation function: 'relu', 'lrelu', etc.
+    mbstd_group_size = 4, # Group size for the minibatch standard deviation layer, 0 = disable.
+    mbstd_num_features = 1, # Number of features for the minibatch standard deviation layer.
+    dtype = 'float32', # Data type to use for activations and outputs.
+    resample_kernel = [1,3,3,1], # Low-pass filter to apply when resampling activations. None = no filtering.
+    **_kwargs): # Ignore unrecognized keyword args.
+
+    resolution_log2 = int(np.log2(resolution))
+    assert resolution == 2**resolution_log2 and resolution >= 4
+    def nf(stage): return np.clip(int(fmap_base / (2.0 ** (stage * fmap_decay))), fmap_min, fmap_max) # Feature-map count for a given resolution stage.
+    assert architecture in ['orig', 'skip', 'resnet']
+    act = nonlinearity
+
+    images_in.set_shape([None, num_channels, resolution, resolution])
+    labels_in.set_shape([None, label_size])
+    images_in = tf.cast(images_in, dtype)
+    labels_in = tf.cast(labels_in, dtype)
+
+    # Building blocks for main layers.
+    def fromrgb(x, y, res): # res = 2..resolution_log2
+        with tf.variable_scope('FromRGB'):
+            t = apply_bias_act(conv2d_layer(y, fmaps=nf(res-1), kernel=1), act=act)
+            return t if x is None else x + t # 'skip' adds the RGB projection onto the running features.
+    def block(x, res): # res = 2..resolution_log2
+        t = x
+        with tf.variable_scope('Conv0'):
+            x = apply_bias_act(conv2d_layer(x, fmaps=nf(res-1), kernel=3), act=act)
+        with tf.variable_scope('Conv1_down'):
+            x = apply_bias_act(conv2d_layer(x, fmaps=nf(res-2), kernel=3, down=True, resample_kernel=resample_kernel), act=act)
+        if architecture == 'resnet':
+            with tf.variable_scope('Skip'):
+                t = conv2d_layer(t, fmaps=nf(res-2), kernel=1, down=True, resample_kernel=resample_kernel)
+                x = (x + t) * (1 / np.sqrt(2)) # Scale the residual sum to keep activation magnitudes roughly constant.
+        return x
+    def downsample(y):
+        with tf.variable_scope('Downsample'):
+            return downsample_2d(y, k=resample_kernel)
+
+    # Main layers.
+    x = None
+    y = images_in
+    for res in range(resolution_log2, 2, -1):
+        with tf.variable_scope('%dx%d' % (2**res, 2**res)):
+            if architecture == 'skip' or res == resolution_log2:
+                x = fromrgb(x, y, res)
+            x = block(x, res)
+            if architecture == 'skip':
+                y = downsample(y)
+
+    # Final layers.
+    with tf.variable_scope('4x4'):
+        if architecture == 'skip':
+            x = fromrgb(x, y, 2)
+        if mbstd_group_size > 1:
+            with tf.variable_scope('MinibatchStddev'):
+                x = minibatch_stddev_layer(x, mbstd_group_size, mbstd_num_features)
+        with tf.variable_scope('Conv'):
+            x = apply_bias_act(conv2d_layer(x, fmaps=nf(1), kernel=3), act=act)
+        with tf.variable_scope('Dense0'):
+            x = apply_bias_act(dense_layer(x, fmaps=nf(0)), act=act)
+
+    # Output layer with label conditioning from "Which Training Methods for GANs do actually Converge?"
+    with tf.variable_scope('Output'):
+        x = apply_bias_act(dense_layer(x, fmaps=max(labels_in.shape[1], 1)))
+        if labels_in.shape[1] > 0:
+            x = tf.reduce_sum(x * labels_in, axis=1, keepdims=True)
+    scores_out = x
+
+    # Output.
+    assert scores_out.dtype == tf.as_dtype(dtype)
+    scores_out = tf.identity(scores_out, name='scores_out')
+    return scores_out
+
+#----------------------------------------------------------------------------
diff --git a/insightface/reconstruction/ostec/external/stylegan2/training/training_loop.py b/insightface/reconstruction/ostec/external/stylegan2/training/training_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2d88cf03b8448ff57caa98515918920c6cfab35
--- /dev/null
+++ b/insightface/reconstruction/ostec/external/stylegan2/training/training_loop.py
@@ -0,0 +1,356 @@
+# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, visit
+# https://nvlabs.github.io/stylegan2/license.html
+
+"""Main training script."""
+
+import numpy as np
+import tensorflow as tf
+import dnnlib
+import dnnlib.tflib as tflib
+from dnnlib.tflib.autosummary import autosummary
+
+from training import dataset
+from training import misc
+from metrics import metric_base
+
+#----------------------------------------------------------------------------
+# Just-in-time processing of training images before feeding them to the networks.
+
+def process_reals(x, labels, lod, mirror_augment, drange_data, drange_net):
+    with tf.name_scope('DynamicRange'):
+        x = tf.cast(x, tf.float32)
+        x = misc.adjust_dynamic_range(x, drange_data, drange_net)
+    if mirror_augment:
+        with tf.name_scope('MirrorAugment'):
+            x = tf.where(tf.random_uniform([tf.shape(x)[0]]) < 0.5, x, tf.reverse(x, [3]))
+    with tf.name_scope('FadeLOD'): # Smooth crossfade between consecutive levels-of-detail.
+        s = tf.shape(x)
+        y = tf.reshape(x, [-1, s[1], s[2]//2, 2, s[3]//2, 2])
+        y = tf.reduce_mean(y, axis=[3, 5], keepdims=True) # 2x2 box filter = one level-of-detail down.
+        y = tf.tile(y, [1, 1, 1, 2, 1, 2])
+        y = tf.reshape(y, [-1, s[1], s[2], s[3]])
+        x = tflib.lerp(x, y, lod - tf.floor(lod)) # Blend by the fractional part of lod.
+    with tf.name_scope('UpscaleLOD'): # Upscale to match the expected input/output size of the networks.
+        s = tf.shape(x)
+        factor = tf.cast(2 ** tf.floor(lod), tf.int32)
+        x = tf.reshape(x, [-1, s[1], s[2], 1, s[3], 1])
+        x = tf.tile(x, [1, 1, 1, factor, 1, factor])
+        x = tf.reshape(x, [-1, s[1], s[2] * factor, s[3] * factor])
+    return x, labels
+
+#----------------------------------------------------------------------------
+# Evaluate time-varying training parameters.
+
+def training_schedule(
+    cur_nimg,
+    training_set,
+    lod_initial_resolution = None, # Image resolution used at the beginning.
+    lod_training_kimg = 600, # Thousands of real images to show before doubling the resolution.
+    lod_transition_kimg = 600, # Thousands of real images to show when fading in new layers.
+    minibatch_size_base = 32, # Global minibatch size.
+    minibatch_size_dict = {}, # Resolution-specific overrides.
+    minibatch_gpu_base = 4, # Number of samples processed at a time by one GPU.
+    minibatch_gpu_dict = {}, # Resolution-specific overrides.
+    G_lrate_base = 0.002, # Learning rate for the generator.
+    G_lrate_dict = {}, # Resolution-specific overrides.
+    D_lrate_base = 0.002, # Learning rate for the discriminator.
+    D_lrate_dict = {}, # Resolution-specific overrides.
+    lrate_rampup_kimg = 0, # Duration of learning rate ramp-up.
+    tick_kimg_base = 4, # Default interval of progress snapshots.
+    tick_kimg_dict = {8:28, 16:24, 32:20, 64:16, 128:12, 256:8, 512:6, 1024:4}): # Resolution-specific overrides.
+
+    # Initialize result dict.
+    s = dnnlib.EasyDict()
+    s.kimg = cur_nimg / 1000.0
+
+    # Training phase.
+    phase_dur = lod_training_kimg + lod_transition_kimg
+    phase_idx = int(np.floor(s.kimg / phase_dur)) if phase_dur > 0 else 0
+    phase_kimg = s.kimg - phase_idx * phase_dur
+
+    # Level-of-detail and resolution.
+    # lod = 0.0 means full resolution (no progressive growing in effect).
+    if lod_initial_resolution is None:
+        s.lod = 0.0
+    else:
+        s.lod = training_set.resolution_log2
+        s.lod -= np.floor(np.log2(lod_initial_resolution))
+        s.lod -= phase_idx
+        if lod_transition_kimg > 0:
+            s.lod -= max(phase_kimg - lod_training_kimg, 0.0) / lod_transition_kimg
+        s.lod = max(s.lod, 0.0)
+    s.resolution = 2 ** (training_set.resolution_log2 - int(np.floor(s.lod)))
+
+    # Minibatch size.
+    s.minibatch_size = minibatch_size_dict.get(s.resolution, minibatch_size_base)
+    s.minibatch_gpu = minibatch_gpu_dict.get(s.resolution, minibatch_gpu_base)
+
+    # Learning rate.
+    s.G_lrate = G_lrate_dict.get(s.resolution, G_lrate_base)
+    s.D_lrate = D_lrate_dict.get(s.resolution, D_lrate_base)
+    if lrate_rampup_kimg > 0:
+        rampup = min(s.kimg / lrate_rampup_kimg, 1.0)
+        s.G_lrate *= rampup
+        s.D_lrate *= rampup
+
+    # Other parameters.
+    s.tick_kimg = tick_kimg_dict.get(s.resolution, tick_kimg_base)
+    return s
+
+#----------------------------------------------------------------------------
+# Main training script.
+
+def training_loop(
+    G_args = {}, # Options for generator network.
+    D_args = {}, # Options for discriminator network.
+    G_opt_args = {}, # Options for generator optimizer.
+    D_opt_args = {}, # Options for discriminator optimizer.
+    G_loss_args = {}, # Options for generator loss.
+    D_loss_args = {}, # Options for discriminator loss.
+    dataset_args = {}, # Options for dataset.load_dataset().
+    sched_args = {}, # Options for train.TrainingSchedule.
+    grid_args = {}, # Options for train.setup_snapshot_image_grid().
+    metric_arg_list = [], # Options for MetricGroup.
+    tf_config = {}, # Options for tflib.init_tf().
+    data_dir = None, # Directory to load datasets from.
+    G_smoothing_kimg = 10.0, # Half-life of the running average of generator weights.
+    minibatch_repeats = 4, # Number of minibatches to run before adjusting training parameters.
+    lazy_regularization = True, # Perform regularization as a separate training step?
+    G_reg_interval = 4, # How often to perform regularization for G? Ignored if lazy_regularization=False.
+    D_reg_interval = 16, # How often to perform regularization for D? Ignored if lazy_regularization=False.
+    reset_opt_for_new_lod = True, # Reset optimizer internal state (e.g. Adam moments) when new layers are introduced?
+    total_kimg = 25000, # Total length of the training, measured in thousands of real images.
+    mirror_augment = False, # Enable mirror augment?
+    drange_net = [-1,1], # Dynamic range used when feeding image data to the networks.
+    image_snapshot_ticks = 50, # How often to save image snapshots? None = only save 'reals.png' and 'fakes-init.png'.
+    network_snapshot_ticks = 50, # How often to save network snapshots? None = only save 'networks-final.pkl'.
+    save_tf_graph = False, # Include full TensorFlow computation graph in the tfevents file?
+    save_weight_histograms = False, # Include weight histograms in the tfevents file?
+    resume_pkl = None, # Network pickle to resume training from, None = train from scratch.
+    resume_kimg = 0.0, # Assumed training progress at the beginning. Affects reporting and training schedule.
+    resume_time = 0.0, # Assumed wallclock time at the beginning. Affects reporting.
+    resume_with_new_nets = False): # Construct new networks according to G_args and D_args before resuming training?
+
+    # Initialize dnnlib and TensorFlow.
+    tflib.init_tf(tf_config)
+    num_gpus = dnnlib.submit_config.num_gpus
+
+    # Load training set.
+    training_set = dataset.load_dataset(data_dir=dnnlib.convert_path(data_dir), verbose=True, **dataset_args)
+    grid_size, grid_reals, grid_labels = misc.setup_snapshot_image_grid(training_set, **grid_args)
+    misc.save_image_grid(grid_reals, dnnlib.make_run_dir_path('reals.png'), drange=training_set.dynamic_range, grid_size=grid_size)
+
+    # Construct or load networks.
+    with tf.device('/gpu:0'):
+        if resume_pkl is None or resume_with_new_nets:
+            print('Constructing networks...')
+            G = tflib.Network('G', num_channels=training_set.shape[0], resolution=training_set.shape[1], label_size=training_set.label_size, **G_args)
+            D = tflib.Network('D', num_channels=training_set.shape[0], resolution=training_set.shape[1], label_size=training_set.label_size, **D_args)
+            Gs = G.clone('Gs') # Gs = moving average of G's weights, used for snapshots/eval.
+        if resume_pkl is not None:
+            print('Loading networks from "%s"...' % resume_pkl)
+            rG, rD, rGs = misc.load_pkl(resume_pkl)
+            if resume_with_new_nets: G.copy_vars_from(rG); D.copy_vars_from(rD); Gs.copy_vars_from(rGs)
+            else: G = rG; D = rD; Gs = rGs
+
+    # Print layers and generate initial image snapshot.
+    G.print_layers(); D.print_layers()
+    sched = training_schedule(cur_nimg=total_kimg*1000, training_set=training_set, **sched_args)
+    grid_latents = np.random.randn(np.prod(grid_size), *G.input_shape[1:])
+    grid_fakes = Gs.run(grid_latents, grid_labels, is_validation=True, minibatch_size=sched.minibatch_gpu)
+    misc.save_image_grid(grid_fakes, dnnlib.make_run_dir_path('fakes_init.png'), drange=drange_net, grid_size=grid_size)
+
+    # Setup training inputs.
+    print('Building TensorFlow graph...')
+    with tf.name_scope('Inputs'), tf.device('/cpu:0'):
+        lod_in = tf.placeholder(tf.float32, name='lod_in', shape=[])
+        lrate_in = tf.placeholder(tf.float32, name='lrate_in', shape=[])
+        minibatch_size_in = tf.placeholder(tf.int32, name='minibatch_size_in', shape=[])
+        minibatch_gpu_in = tf.placeholder(tf.int32, name='minibatch_gpu_in', shape=[])
+        # NOTE(review): presumably used by tflib.Optimizer for gradient
+        # accumulation when the global minibatch exceeds num_gpus * per-GPU
+        # capacity -- confirm in dnnlib.tflib.optimizer.
+        minibatch_multiplier = minibatch_size_in // (minibatch_gpu_in * num_gpus)
+        Gs_beta = 0.5 ** tf.div(tf.cast(minibatch_size_in, tf.float32), G_smoothing_kimg * 1000.0) if G_smoothing_kimg > 0.0 else 0.0
+
+    # Setup optimizers.
+    G_opt_args = dict(G_opt_args)
+    D_opt_args = dict(D_opt_args)
+    for args, reg_interval in [(G_opt_args, G_reg_interval), (D_opt_args, D_reg_interval)]:
+        args['minibatch_multiplier'] = minibatch_multiplier
+        args['learning_rate'] = lrate_in
+        if lazy_regularization:
+            # Compensate hyperparameters for the regularization step running
+            # only every reg_interval minibatches.
+            mb_ratio = reg_interval / (reg_interval + 1)
+            args['learning_rate'] *= mb_ratio
+            if 'beta1' in args: args['beta1'] **= mb_ratio
+            if 'beta2' in args: args['beta2'] **= mb_ratio
+    G_opt = tflib.Optimizer(name='TrainG', **G_opt_args)
+    D_opt = tflib.Optimizer(name='TrainD', **D_opt_args)
+    G_reg_opt = tflib.Optimizer(name='RegG', share=G_opt, **G_opt_args)
+    D_reg_opt = tflib.Optimizer(name='RegD', share=D_opt, **D_opt_args)
+
+    # Build training graph for each GPU.
+    data_fetch_ops = []
+    for gpu in range(num_gpus):
+        with tf.name_scope('GPU%d' % gpu), tf.device('/gpu:%d' % gpu):
+
+            # Create GPU-specific shadow copies of G and D.
+            G_gpu = G if gpu == 0 else G.clone(G.name + '_shadow')
+            D_gpu = D if gpu == 0 else D.clone(D.name + '_shadow')
+
+            # Fetch training data via temporary variables.
+            with tf.name_scope('DataFetch'):
+                sched = training_schedule(cur_nimg=int(resume_kimg*1000), training_set=training_set, **sched_args)
+                reals_var = tf.Variable(name='reals', trainable=False, initial_value=tf.zeros([sched.minibatch_gpu] + training_set.shape))
+                labels_var = tf.Variable(name='labels', trainable=False, initial_value=tf.zeros([sched.minibatch_gpu, training_set.label_size]))
+                reals_write, labels_write = training_set.get_minibatch_tf()
+                reals_write, labels_write = process_reals(reals_write, labels_write, lod_in, mirror_augment, training_set.dynamic_range, drange_net)
+                # Keep the variable shape fixed: overwrite the first
+                # minibatch_gpu_in entries and retain the stale tail.
+                reals_write = tf.concat([reals_write, reals_var[minibatch_gpu_in:]], axis=0)
+                labels_write = tf.concat([labels_write, labels_var[minibatch_gpu_in:]], axis=0)
+                data_fetch_ops += [tf.assign(reals_var, reals_write)]
+                data_fetch_ops += [tf.assign(labels_var, labels_write)]
+                reals_read = reals_var[:minibatch_gpu_in]
+                labels_read = labels_var[:minibatch_gpu_in]
+
+            # Evaluate loss functions.
+            lod_assign_ops = []
+            if 'lod' in G_gpu.vars: lod_assign_ops += [tf.assign(G_gpu.vars['lod'], lod_in)]
+            if 'lod' in D_gpu.vars: lod_assign_ops += [tf.assign(D_gpu.vars['lod'], lod_in)]
+            with tf.control_dependencies(lod_assign_ops):
+                with tf.name_scope('G_loss'):
+                    G_loss, G_reg = dnnlib.util.call_func_by_name(G=G_gpu, D=D_gpu, opt=G_opt, training_set=training_set, minibatch_size=minibatch_gpu_in, **G_loss_args)
+                with tf.name_scope('D_loss'):
+                    D_loss, D_reg = dnnlib.util.call_func_by_name(G=G_gpu, D=D_gpu, opt=D_opt, training_set=training_set, minibatch_size=minibatch_gpu_in, reals=reals_read, labels=labels_read, **D_loss_args)
+
+            # Register gradients.
+ if not lazy_regularization: + if G_reg is not None: G_loss += G_reg + if D_reg is not None: D_loss += D_reg + else: + if G_reg is not None: G_reg_opt.register_gradients(tf.reduce_mean(G_reg * G_reg_interval), G_gpu.trainables) + if D_reg is not None: D_reg_opt.register_gradients(tf.reduce_mean(D_reg * D_reg_interval), D_gpu.trainables) + G_opt.register_gradients(tf.reduce_mean(G_loss), G_gpu.trainables) + D_opt.register_gradients(tf.reduce_mean(D_loss), D_gpu.trainables) + + # Setup training ops. + data_fetch_op = tf.group(*data_fetch_ops) + G_train_op = G_opt.apply_updates() + D_train_op = D_opt.apply_updates() + G_reg_op = G_reg_opt.apply_updates(allow_no_op=True) + D_reg_op = D_reg_opt.apply_updates(allow_no_op=True) + Gs_update_op = Gs.setup_as_moving_average_of(G, beta=Gs_beta) + + # Finalize graph. + with tf.device('/gpu:0'): + try: + peak_gpu_mem_op = tf.contrib.memory_stats.MaxBytesInUse() + except tf.errors.NotFoundError: + peak_gpu_mem_op = tf.constant(0) + tflib.init_uninitialized_vars() + + print('Initializing logs...') + summary_log = tf.summary.FileWriter(dnnlib.make_run_dir_path()) + if save_tf_graph: + summary_log.add_graph(tf.get_default_graph()) + if save_weight_histograms: + G.setup_weight_histograms(); D.setup_weight_histograms() + metrics = metric_base.MetricGroup(metric_arg_list) + + print('Training for %d kimg...\n' % total_kimg) + dnnlib.RunContext.get().update('', cur_epoch=resume_kimg, max_epoch=total_kimg) + maintenance_time = dnnlib.RunContext.get().get_last_update_interval() + cur_nimg = int(resume_kimg * 1000) + cur_tick = -1 + tick_start_nimg = cur_nimg + prev_lod = -1.0 + running_mb_counter = 0 + while cur_nimg < total_kimg * 1000: + if dnnlib.RunContext.get().should_stop(): break + + # Choose training parameters and configure training ops. 
+ sched = training_schedule(cur_nimg=cur_nimg, training_set=training_set, **sched_args) + assert sched.minibatch_size % (sched.minibatch_gpu * num_gpus) == 0 + training_set.configure(sched.minibatch_gpu, sched.lod) + if reset_opt_for_new_lod: + if np.floor(sched.lod) != np.floor(prev_lod) or np.ceil(sched.lod) != np.ceil(prev_lod): + G_opt.reset_optimizer_state(); D_opt.reset_optimizer_state() + prev_lod = sched.lod + + # Run training ops. + feed_dict = {lod_in: sched.lod, lrate_in: sched.G_lrate, minibatch_size_in: sched.minibatch_size, minibatch_gpu_in: sched.minibatch_gpu} + for _repeat in range(minibatch_repeats): + rounds = range(0, sched.minibatch_size, sched.minibatch_gpu * num_gpus) + run_G_reg = (lazy_regularization and running_mb_counter % G_reg_interval == 0) + run_D_reg = (lazy_regularization and running_mb_counter % D_reg_interval == 0) + cur_nimg += sched.minibatch_size + running_mb_counter += 1 + + # Fast path without gradient accumulation. + if len(rounds) == 1: + tflib.run([G_train_op, data_fetch_op], feed_dict) + if run_G_reg: + tflib.run(G_reg_op, feed_dict) + tflib.run([D_train_op, Gs_update_op], feed_dict) + if run_D_reg: + tflib.run(D_reg_op, feed_dict) + + # Slow path with gradient accumulation. + else: + for _round in rounds: + tflib.run(G_train_op, feed_dict) + if run_G_reg: + for _round in rounds: + tflib.run(G_reg_op, feed_dict) + tflib.run(Gs_update_op, feed_dict) + for _round in rounds: + tflib.run(data_fetch_op, feed_dict) + tflib.run(D_train_op, feed_dict) + if run_D_reg: + for _round in rounds: + tflib.run(D_reg_op, feed_dict) + + # Perform maintenance tasks once per tick. 
+ done = (cur_nimg >= total_kimg * 1000) + if cur_tick < 0 or cur_nimg >= tick_start_nimg + sched.tick_kimg * 1000 or done: + cur_tick += 1 + tick_kimg = (cur_nimg - tick_start_nimg) / 1000.0 + tick_start_nimg = cur_nimg + tick_time = dnnlib.RunContext.get().get_time_since_last_update() + total_time = dnnlib.RunContext.get().get_time_since_start() + resume_time + + # Report progress. + print('tick %-5d kimg %-8.1f lod %-5.2f minibatch %-4d time %-12s sec/tick %-7.1f sec/kimg %-7.2f maintenance %-6.1f gpumem %.1f' % ( + autosummary('Progress/tick', cur_tick), + autosummary('Progress/kimg', cur_nimg / 1000.0), + autosummary('Progress/lod', sched.lod), + autosummary('Progress/minibatch', sched.minibatch_size), + dnnlib.util.format_time(autosummary('Timing/total_sec', total_time)), + autosummary('Timing/sec_per_tick', tick_time), + autosummary('Timing/sec_per_kimg', tick_time / tick_kimg), + autosummary('Timing/maintenance_sec', maintenance_time), + autosummary('Resources/peak_gpu_mem_gb', peak_gpu_mem_op.eval() / 2**30))) + autosummary('Timing/total_hours', total_time / (60.0 * 60.0)) + autosummary('Timing/total_days', total_time / (24.0 * 60.0 * 60.0)) + + # Save snapshots. + if image_snapshot_ticks is not None and (cur_tick % image_snapshot_ticks == 0 or done): + grid_fakes = Gs.run(grid_latents, grid_labels, is_validation=True, minibatch_size=sched.minibatch_gpu) + misc.save_image_grid(grid_fakes, dnnlib.make_run_dir_path('fakes%06d.png' % (cur_nimg // 1000)), drange=drange_net, grid_size=grid_size) + if network_snapshot_ticks is not None and (cur_tick % network_snapshot_ticks == 0 or done): + pkl = dnnlib.make_run_dir_path('network-snapshot-%06d.pkl' % (cur_nimg // 1000)) + misc.save_pkl((G, D, Gs), pkl) + metrics.run(pkl, run_dir=dnnlib.make_run_dir_path(), data_dir=dnnlib.convert_path(data_dir), num_gpus=num_gpus, tf_config=tf_config) + + # Update summaries and RunContext. 
+ metrics.update_autosummaries() + tflib.autosummary.save_summaries(summary_log, cur_nimg) + dnnlib.RunContext.get().update('%.2f' % sched.lod, cur_epoch=cur_nimg // 1000, max_epoch=total_kimg) + maintenance_time = dnnlib.RunContext.get().get_last_update_interval() - tick_time + + # Save final snapshot. + misc.save_pkl((G, D, Gs), dnnlib.make_run_dir_path('network-final.pkl')) + + # All done. + summary_log.close() + training_set.close() + +#---------------------------------------------------------------------------- diff --git a/insightface/reconstruction/ostec/run_ostec.py b/insightface/reconstruction/ostec/run_ostec.py new file mode 100644 index 0000000000000000000000000000000000000000..e59bb8c136f99f6e0504068c9a433ea19e8e71f1 --- /dev/null +++ b/insightface/reconstruction/ostec/run_ostec.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. +# To view a copy of this license, see LICENSE + +import time +import os +import glob +from random import shuffle +import argparse +from argparse import Namespace +import menpo.io as mio +import menpo.image +import cv2 +import sys +sys.path.append("external/stylegan2") +sys.path.append("external/deep3dfacerecon") +sys.path.append("external/graphonomy") +from core.operator import Operator +from core.config import get_config +import numpy as np +from utils.utils import im_menpo2PIL, fix_obj +from external.face_detector.detect_face import Face_Detector +from FaceHairMask.MaskExtractor import MaskExtractor +from menpo.shape import TexturedTriMesh +import menpo3d.io as m3io + +def main(args): + source_dir = args.source_dir + save_dir = args.save_dir + os.makedirs(save_dir,exist_ok=True) + operator = Operator(args) + detector = Face_Detector() + if not args.ganfit: + from external.deep3dfacerecon.ostec_api import Deep3dModel + deep3dmodel = Deep3dModel() + maskExtractor = MaskExtractor() + + + # while True: + for ext in ['.png', '.jpg']: + 
print('Scanning paths...') + paths = glob.glob(source_dir + '/*' + ext) + shuffle(paths) + for path in paths: + # try: # To avoid detection errors on large datasets + save_path = path.replace(source_dir, save_dir) + pkl_path = path.replace(ext,'.pkl') + if not os.path.isfile(save_path.replace(ext, '.png')): + print('Started: ' + path) + start = time.time() + + img = menpo.image.Image(np.transpose(cv2.imread(path)[:,:,::-1],[2,0,1])/255.0) + + if args.ganfit and not os.path.isfile(pkl_path): + raise Exception('Reconstruction from GANfit mode is activated and no GANFit reconstruction pickle file has been found! Either Remove --ganfit flag or Run GANFit first.') + + if os.path.isfile(pkl_path): # GANFit mode + fitting = mio.import_pickle(pkl_path) + + else: # Deep3dReconstruction mode + _, lms = detector.face_detection((img.pixels_with_channels_at_back() * 255).astype(np.uint8)) + fitting = deep3dmodel.recontruct(im_menpo2PIL(img), lms) + img = menpo.image.Image(fitting['input']) + + _, face_mask = maskExtractor.main(img) + + final_uv, results_dict = operator.run(img, fitting, face_mask) + tmesh = TexturedTriMesh(fitting['vertices'],operator.tcoords.points,final_uv,operator.uv_trilist) + m3io.export_textured_mesh(tmesh,save_path.replace(ext, '.obj'),texture_extension='.png') + fix_obj(save_path.replace(ext, '.obj')) + # mio.export_image(final_uv, save_path.replace(ext, '.png')) + + if args.frontalize: + mio.export_image(results_dict['frontal'], save_path.replace(ext,'_frontal.png')) + if args.pickle: + mio.export_pickle(results_dict, save_path.replace(ext,'.pkl')) + + print('Total Processing Time : %.2f secs' % (time.time() - start)) + + # except Exception as inst: + # print(type(inst)) # the exception instance + # print(inst.args) # arguments stored in .args + # print(inst) # __str__ allows args to be printed directly, + +if __name__ == "__main__": + args, unparsed = get_config() + parser = argparse.ArgumentParser() + parser.add_argument('--source_dir', 
help='Directory of input 2D images') + parser.add_argument('--save_dir', help='Directory to save synthesized UVs') + args2 = parser.parse_args(unparsed) + args = Namespace(**vars(args), **vars(args2)) + main(args) diff --git a/insightface/reconstruction/ostec/utils/align2stylegan.py b/insightface/reconstruction/ostec/utils/align2stylegan.py new file mode 100644 index 0000000000000000000000000000000000000000..9c45953ead5777254e5ececfad6089bf464cb8ce --- /dev/null +++ b/insightface/reconstruction/ostec/utils/align2stylegan.py @@ -0,0 +1,234 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. +# To view a copy of this license, see LICENSE + +import numpy as np +import scipy.ndimage +import PIL.Image + +def create_perspective_transform_matrix(src, dst): + """ Creates a perspective transformation matrix which transforms points + in quadrilateral ``src`` to the corresponding points on quadrilateral + ``dst``. + + Will raise a ``np.linalg.LinAlgError`` on invalid input. + """ + # See: + # * http://xenia.media.mit.edu/~cwren/interpolator/ + # * http://stackoverflow.com/a/14178717/71522 + in_matrix = [] + for (x, y), (X, Y) in zip(src, dst): + in_matrix.extend([ + [x, y, 1, 0, 0, 0, -X * x, -X * y], + [0, 0, 0, x, y, 1, -Y * x, -Y * y], + ]) + + A = np.matrix(in_matrix, dtype=np.float) + B = np.array(dst).reshape(8) + af = np.dot(np.linalg.inv(A.T * A) * A.T, B) + return np.append(np.array(af).reshape(8), 1).reshape((3, 3)) + + +def create_perspective_transform(src, dst, round=False, splat_args=False): + """ Returns a function which will transform points in quadrilateral + ``src`` to the corresponding points on quadrilateral ``dst``:: + + >>> transform = create_perspective_transform( + ... [(0, 0), (10, 0), (10, 10), (0, 10)], + ... [(50, 50), (100, 50), (100, 100), (50, 100)], + ... 
) + >>> transform((5, 5)) + (74.99999999999639, 74.999999999999957) + + If ``round`` is ``True`` then points will be rounded to the nearest + integer and integer values will be returned. + + >>> transform = create_perspective_transform( + ... [(0, 0), (10, 0), (10, 10), (0, 10)], + ... [(50, 50), (100, 50), (100, 100), (50, 100)], + ... round=True, + ... ) + >>> transform((5, 5)) + (75, 75) + + If ``splat_args`` is ``True`` the function will accept two arguments + instead of a tuple. + + >>> transform = create_perspective_transform( + ... [(0, 0), (10, 0), (10, 10), (0, 10)], + ... [(50, 50), (100, 50), (100, 100), (50, 100)], + ... splat_args=True, + ... ) + >>> transform(5, 5) + (74.99999999999639, 74.999999999999957) + + If the input values yield an invalid transformation matrix an identity + function will be returned and the ``error`` attribute will be set to a + description of the error:: + + >>> tranform = create_perspective_transform( + ... np.zeros((4, 2)), + ... np.zeros((4, 2)), + ... 
) + >>> transform((5, 5)) + (5.0, 5.0) + >>> transform.error + 'invalid input quads (...): Singular matrix + """ + try: + transform_matrix = create_perspective_transform_matrix(src, dst) + error = None + except np.linalg.LinAlgError as e: + transform_matrix = np.identity(3, dtype=np.float) + error = "invalid input quads (%s and %s): %s" %(src, dst, e) + error = error.replace("\n", "") + + to_eval = "def perspective_transform(%s):\n" %( + splat_args and "*pt" or "pt", + ) + to_eval += " res = np.dot(transform_matrix, ((pt[0], ), (pt[1], ), (1, )))\n" + to_eval += " res = res / res[2]\n" + if round: + to_eval += " return (int(round(res[0][0])), int(round(res[1][0])))\n" + else: + to_eval += " return (res[0][0], res[1][0])\n" + locals = { + "transform_matrix": transform_matrix, + } + locals.update(globals()) + exec(to_eval,locals,locals) + res = locals["perspective_transform"] + res.matrix = transform_matrix + res.error = error + return res + + +def align_mesh2stylegan(temp_tcoords, transformation_params): + temp_tcoords = temp_tcoords.copy() + temp_tcoords[:, 0] = temp_tcoords[:, 0] - transformation_params['crop'][1] + temp_tcoords[:, 1] = temp_tcoords[:, 1] - transformation_params['crop'][0] + + temp_tcoords[:, 0] = temp_tcoords[:, 0] + transformation_params['pad'][1] + temp_tcoords[:, 1] = temp_tcoords[:, 1] + transformation_params['pad'][0] + + h, w = (4096, 4096) # transformation_params['new_size'] + transform = create_perspective_transform( + transformation_params['quad'], + [(0, 0), (0, h), (h, w), (w, 0)], + splat_args=True, + ) + for i in range(len(temp_tcoords)): + temp_tcoords[i, 1], temp_tcoords[i, 0] = transform(temp_tcoords[i, 1], temp_tcoords[i, 0]) + + new_tcoords = temp_tcoords[:, ::-1] / (h, w) # transformation_params['new_size'] + new_tcoords[:, 1] = 1 - new_tcoords[:, 1] + return new_tcoords + +def align_im2stylegan(src_im, src_mask, face_landmarks, output_size=1024, transform_size=4096, + enable_padding=True, x_scale=1, y_scale=1, em_scale=0.1, 
alpha=False): + # Align function from FFHQ dataset pre-processing step + # https://github.com/NVlabs/ffhq-dataset/blob/master/download_ffhq.py + + lm = np.array(face_landmarks) + lm_chin = lm[0: 17] # left-right + lm_eyebrow_left = lm[17: 22] # left-right + lm_eyebrow_right = lm[22: 27] # left-right + lm_nose = lm[27: 31] # top-down + lm_nostrils = lm[31: 36] # top-down + lm_eye_left = lm[36: 42] # left-clockwise + lm_eye_right = lm[42: 48] # left-clockwise + lm_mouth_outer = lm[48: 60] # left-clockwise + lm_mouth_inner = lm[60: 68] # left-clockwise + + # Calculate auxiliary vectors. + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + eye_avg = (eye_left + eye_right) * 0.5 + eye_to_eye = eye_right - eye_left + mouth_left = lm_mouth_outer[0] + mouth_right = lm_mouth_outer[6] + mouth_avg = (mouth_left + mouth_right) * 0.5 + eye_to_mouth = mouth_avg - eye_avg + + # Choose oriented crop rectangle. + x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + x /= np.hypot(*x) + x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8) + x *= x_scale + y = np.flipud(x) * [-y_scale, y_scale] + c = eye_avg + eye_to_mouth * em_scale + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + qsize = np.hypot(*x) * 2 + rsize = None + + img = src_im.convert('RGBA').convert('RGB') + + img_mask = src_mask.convert('L') + + img.putalpha(img_mask) + + # Shrink. + shrink = int(np.floor(qsize / output_size * 0.5)) + if shrink > 1: + rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink))) + img = img.resize(rsize, PIL.Image.ANTIALIAS) + quad /= shrink + qsize /= shrink + + # Crop. 
+ border = max(int(np.rint(qsize * 0.1)), 3) + crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]), + min(crop[3] + border, img.size[1])) + if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]: + img = img.crop(crop) + quad -= crop[0:2] + + # Pad. + pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0), + max(pad[3] - img.size[1] + border, 0)) + if enable_padding and max(pad) > border - 4: + pad = np.maximum(pad, int(np.rint(qsize * 0.3))) + img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'constant') + h, w, _ = img.shape + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]), + 1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3])) + blur = qsize * 0.02 + img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) + img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0) + img = np.uint8(np.clip(np.rint(img), 0, 255)) + if alpha: + mask = 1 - np.clip(3.0 * mask, 0.0, 1.0) + mask = np.uint8(np.clip(np.rint(mask * 255), 0, 255)) + img = np.concatenate((img, mask), axis=2) + img = PIL.Image.fromarray(img, 'RGBA') + else: + img = PIL.Image.fromarray(img, 'RGBA') + quad += pad[:2] + + # Transform. 
+ aligned_mask = PIL.Image.fromarray(np.uint8(img)[:, :, 3]) + img = PIL.Image.fromarray(np.uint8(img)[:, :, :3]) + + img = img.transform((transform_size, transform_size), PIL.Image.QUAD, (quad + 0.5).flatten(), + PIL.Image.BILINEAR) + aligned_mask = aligned_mask.transform((transform_size, transform_size), PIL.Image.QUAD, (quad + 0.5).flatten(), + PIL.Image.BILINEAR) + if output_size < transform_size: + img = img.resize((output_size, output_size), PIL.Image.ANTIALIAS) + aligned_mask = aligned_mask.resize((output_size, output_size), PIL.Image.ANTIALIAS) + + transformation_params = { + 'rsize': rsize, + 'crop': crop, + 'pad': pad, + 'quad': quad + 0.5, + 'new_size': (output_size, output_size) + } + # Save aligned image. + return img, aligned_mask, transformation_params diff --git a/insightface/reconstruction/ostec/utils/ganfit_camera.py b/insightface/reconstruction/ostec/utils/ganfit_camera.py new file mode 100644 index 0000000000000000000000000000000000000000..40bbe09d63d4760d9aaf3904e35ec8af96558cb9 --- /dev/null +++ b/insightface/reconstruction/ostec/utils/ganfit_camera.py @@ -0,0 +1,259 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. +# To view a copy of this license, see LICENSE + +import numpy as np +import math + +"""Collection of functions to adapt GANFit camera parameters""" + +GANFIT_CAMERA_CONSTANTS = [np.array([[0.0, 0.0, 6.0]], dtype=np.float32), + np.array([[0.0, 0.0, 0.0]], dtype=np.float32), + np.array([[0.0, 1.0, 0.0]], dtype=np.float32), + np.array([20.], dtype=np.float32)] + +def look_at(eye, center, world_up): + """Computes camera viewing matrices. + Functionality mimes gluLookAt (third_party/GL/glu/include/GLU/glu.h). + Args: + eye: 2-D float32 tensor with shape [batch_size, 3] containing the XYZ world + space position of the camera. + center: 2-D float32 tensor with shape [batch_size, 3] containing a position + along the center of the camera's gaze. 
+ world_up: 2-D float32 tensor with shape [batch_size, 3] specifying the + world's up direction; the output camera will have no tilt with respect + to this direction. + Returns: + A [batch_size, 4, 4] float tensor containing a right-handed camera + extrinsics matrix that maps points from world space to points in eye space. + """ + batch_size = center.shape[0] + vector_degeneracy_cutoff = 1e-6 + forward = center - eye + forward_norm = np.linalg.norm(forward, axis=1, keepdims=True) + forward = np.divide(forward, forward_norm) + + to_side = np.cross(forward, world_up) + to_side_norm = np.linalg.norm(to_side, axis=1, keepdims=True) + to_side = np.divide(to_side, to_side_norm) + cam_up = np.cross(to_side, forward) + + w_column = np.array( + batch_size * [[0., 0., 0., 1.]], dtype=np.float32) # [batch_size, 4] + w_column = np.reshape(w_column, [batch_size, 4, 1]) + view_rotation = np.stack( + [to_side, cam_up, -forward, + np.zeros_like(to_side, dtype=np.float32)], + axis=1) # [batch_size, 4, 3] matrix + view_rotation = np.concatenate( + [view_rotation, w_column], axis=2) # [batch_size, 4, 4] + + identity_batch = np.tile(np.expand_dims(np.eye(3), 0), [batch_size, 1, 1]) + view_translation = np.concatenate([identity_batch, np.expand_dims(-eye, 2)], 2) + view_translation = np.concatenate( + [view_translation, + np.reshape(w_column, [batch_size, 1, 4])], 1) + camera_matrices = np.matmul(view_rotation, view_translation) + return camera_matrices + +def perspective(aspect_ratio, fov_y, near_clip, far_clip): + """Computes perspective transformation matrices. + Functionality mimes gluPerspective (third_party/GL/glu/include/GLU/glu.h). + Args: + aspect_ratio: float value specifying the image aspect ratio (width/height). + fov_y: 1-D float32 Tensor with shape [batch_size] specifying output vertical + field of views in degrees. + near_clip: 1-D float32 Tensor with shape [batch_size] specifying near + clipping plane distance. 
+ far_clip: 1-D float32 Tensor with shape [batch_size] specifying far clipping + plane distance. + Returns: + A [batch_size, 4, 4] float tensor that maps from right-handed points in eye + space to left-handed points in clip space. + """ + focal_lengths_y = 1.0 / np.tan(fov_y * (math.pi / 360.0)) + depth_range = far_clip - near_clip + p_22 = -(far_clip + near_clip) / depth_range + p_23 = -2.0 * (far_clip * near_clip / depth_range) + + zeros = np.zeros_like(p_23, dtype=np.float32) + # pyformat: disable + perspective_transform = np.concatenate( + [ + focal_lengths_y / aspect_ratio, zeros, zeros, zeros, + zeros, focal_lengths_y, zeros, zeros, + zeros, zeros, p_22, p_23, + zeros, zeros, -np.ones_like(p_23, dtype=np.float32), zeros + ], axis=0) + # pyformat: enable + perspective_transform = np.reshape(perspective_transform, [4, 4, -1]) + return np.transpose(perspective_transform, [2, 0, 1]) + +def euler_matrices(angles): + """Computes a XYZ Tait-Bryan (improper Euler angle) rotation. + Returns 4x4 matrices for convenient multiplication with other transformations. + Args: + angles: a [batch_size, 3] tensor containing X, Y, and Z angles in radians. + Returns: + a [batch_size, 4, 4] tensor of matrices. + """ + s = np.sin(angles) + c = np.cos(angles) + # Rename variables for readability in the matrix definition below. + c0, c1, c2 = (c[:, 0], c[:, 1], c[:, 2]) + s0, s1, s2 = (s[:, 0], s[:, 1], s[:, 2]) + + zeros = np.zeros_like(s[:, 0]) + ones = np.ones_like(s[:, 0]) + + # pyformat: disable + flattened = np.concatenate( + [ + c2 * c1, c2 * s1 * s0 - c0 * s2, s2 * s0 + c2 * c0 * s1, zeros, + c1 * s2, c2 * c0 + s2 * s1 * s0, c0 * s2 * s1 - c2 * s0, zeros, + -s1, c1 * s0, c1 * c0, zeros, + zeros, zeros, zeros, ones + ], + axis=0) + # pyformat: enable + reshaped = np.reshape(flattened, [4, 4, -1]) + return np.transpose(reshaped, [2, 0, 1]) + +def transform_homogeneous(matrices, vertices): + """Applies batched 4x4 homogenous matrix transformations to 3-D vertices. 
+ The vertices are input and output as as row-major, but are interpreted as + column vectors multiplied on the right-hand side of the matrices. More + explicitly, this function computes (MV^T)^T. + Vertices are assumed to be xyz, and are extended to xyzw with w=1. + Args: + matrices: a [batch_size, 4, 4] tensor of matrices. + vertices: a [batch_size, N, 3] tensor of xyz vertices. + Returns: + a [batch_size, N, 4] tensor of xyzw vertices. + Raises: + ValueError: if matrices or vertices have the wrong number of dimensions. + """ + homogeneous_coord = np.ones( + [np.shape(vertices)[0], np.shape(vertices)[1], 1], dtype=np.float32) + vertices_homogeneous = np.concatenate([vertices, homogeneous_coord], 2) + + return np.matmul(vertices_homogeneous[0], matrices[0].T) + + +def P2sRt(P): + ''' decompositing camera matrix P + Args: + P: (3, 4). Affine Camera Matrix. + Returns: + s: scale factor. + R: (3, 3). rotation matrix. + t: (3,). translation. + ''' + t = P[:, 3] + R1 = P[0:1, :3] + R2 = P[1:2, :3] + s = (np.linalg.norm(R1) + np.linalg.norm(R2)) / 2.0 + r1 = R1 / np.linalg.norm(R1) + r2 = R2 / np.linalg.norm(R2) + r3 = np.cross(r1, r2) + + R = np.concatenate((r1, r2, r3), 0) + return s, R, t + + +# Ref: https://www.learnopencv.com/rotation-matrix-to-euler-angles/ +def isRotationMatrix(R): + ''' checks if a matrix is a valid rotation matrix(whether orthogonal or not) + ''' + Rt = np.transpose(R) + shouldBeIdentity = np.dot(Rt, R) + I = np.identity(3, dtype=R.dtype) + n = np.linalg.norm(I - shouldBeIdentity) + return n < 1e-6 + + +def matrix2angle(R): + ''' get three Euler angles from Rotation Matrix + Args: + R: (3,3). 
rotation matrix + Returns: + x: pitch + y: yaw + z: roll + ''' + assert (isRotationMatrix) + sy = math.sqrt(R[0, 0] * R[0, 0] + R[1, 0] * R[1, 0]) + + singular = sy < 1e-6 + + if not singular: + x = math.atan2(R[2, 1], R[2, 2]) + y = math.atan2(-R[2, 0], sy) + z = math.atan2(R[1, 0], R[0, 0]) + else: + x = math.atan2(-R[1, 2], R[1, 1]) + y = math.atan2(-R[2, 0], sy) + z = 0 + + # rx, ry, rz = np.rad2deg(x), np.rad2deg(y), np.rad2deg(z) + # rx, ry, rz = x * 180 / np.pi, y * 180 / np.pi, z * 180 / np.pi + # return rx, ry, rz + return x, y, z + +def get_camera_matrices(camera_params): + # defaults from ganfit + + camera_position = camera_params[0] + GANFIT_CAMERA_CONSTANTS[0] + camera_lookat = camera_params[1] + GANFIT_CAMERA_CONSTANTS[1] + camera_up = camera_params[2] + GANFIT_CAMERA_CONSTANTS[2] + + return look_at(camera_position, camera_lookat, camera_up) + +def get_pose(camera_params): + s, R, t = P2sRt(get_camera_matrices(camera_params)[0]) + return matrix2angle(R) # pitch:%.2f,\n yaw:%.2f \n , roll:%.2f \n + +def apply_camera_only3d(vertices, camera_params): + + camera_matrices = get_camera_matrices(camera_params) + + clip_space_vertices = transform_homogeneous(camera_matrices, np.array([vertices]))[:, 0:3] + + return clip_space_vertices + + +def apply_camera(camera_params, tmesh, fitting, ganfit_image_size = 512): + vertices = tmesh.points + + # defaults from ganfit + image_width = ganfit_image_size + image_height = ganfit_image_size + near_clip = np.array([0.01]) + far_clip = np.array([10.0]) + fov_y = camera_params[3] + GANFIT_CAMERA_CONSTANTS[3] + + camera_matrices = get_camera_matrices(camera_params) + perspective_transforms = perspective(image_width / image_height, fov_y, near_clip, far_clip) + clip_space_transforms = np.matmul(perspective_transforms, camera_matrices) + + clip_space_vertices = transform_homogeneous(clip_space_transforms, np.array([vertices])) + + _MINIMUM_PERSPECTIVE_DIVIDE_THRESHOLD = 1e-6 + clip_space_points_w = np.maximum( + 
np.abs(clip_space_vertices[:, 3:4]), + _MINIMUM_PERSPECTIVE_DIVIDE_THRESHOLD) * np.sign( + clip_space_vertices[:, 3:4]) + normalized_device_coordinates = ( + clip_space_vertices[:, 0:3] / clip_space_points_w) + + normalized_device_coordinates = ((normalized_device_coordinates[:, 0:2] * [1, -1]) / 2 + 0.5) * [image_width, + image_height] + + dense_lms_org = fitting['transformation'].apply(normalized_device_coordinates[:, ::-1])[:, ::-1] + + dense_lms_org /= [1024, 1024] + dense_lms_org[:, 1] = 1 - dense_lms_org[:, 1] + + return dense_lms_org + diff --git a/insightface/reconstruction/ostec/utils/generate_heatmap.py b/insightface/reconstruction/ostec/utils/generate_heatmap.py new file mode 100644 index 0000000000000000000000000000000000000000..63b5533c4df4ceaa18e71d590af921bdab8ed477 --- /dev/null +++ b/insightface/reconstruction/ostec/utils/generate_heatmap.py @@ -0,0 +1,129 @@ +import numpy as np +import math +import cv2 + +# Adapted from: https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py +def _gaussian(size=3, sigma=0.25, amplitude=1, normalize=False, width=None, height=None, sigma_horz=None, + sigma_vert=None, mean_horz=0.5, mean_vert=0.5): + """ Generate a guassian kernel. 
+ + Args: + size (int): The size of the kernel if the width or height are not specified + sigma (float): Standard deviation of the kernel if sigma_horz or sigma_vert are not specified + amplitude: The scale of the kernel + normalize: If True, the kernel will be normalized such as values will sum to one + width (int, optional): The width of the kernel + height (int, optional): The height of the kernel + sigma_horz (float, optional): Horizontal standard deviation of the kernel + sigma_vert (float, optional): Vertical standard deviation of the kernel + mean_horz (float): Horizontal mean of the kernel + mean_vert (float): Vertical mean of the kernel + + Returns: + np.array: The computed gaussian kernel + """ + # handle some defaults + if width is None: + width = size + if height is None: + height = size + if sigma_horz is None: + sigma_horz = sigma + if sigma_vert is None: + sigma_vert = sigma + center_x = mean_horz * width + 0.5 + center_y = mean_vert * height + 0.5 + gauss = np.empty((height, width), dtype=np.float32) + # generate kernel + for i in range(height): + for j in range(width): + gauss[i][j] = amplitude * math.exp(-(math.pow((j + 1 - center_x) / ( + sigma_horz * width), 2) / 2.0 + math.pow((i + 1 - center_y) / (sigma_vert * height), 2) / 2.0)) + if normalize: + gauss = gauss / np.sum(gauss) + + return gauss + + +# Adapted from: https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py +def draw_gaussian(image, point, sigma): + """ Draw gaussian circle at a point in an image. + + Args: + image (np.array): An image of shape (H, W) + point (np.array): The center point of the guassian circle + sigma (float): Standard deviation of the gaussian kernel + + Returns: + np.array: The image with the drawn gaussian. 
+ """ + # Check if the gaussian is inside + point[0] = round(point[0], 2) + point[1] = round(point[1], 2) + + ul = [math.floor(point[0] - 7.5 * sigma), math.floor(point[1] - 7.5 * sigma)] + br = [math.floor(point[0] + 7.5 * sigma), math.floor(point[1] + 7.5 * sigma)] + if (ul[0] > image.shape[1] or ul[1] > + image.shape[0] or br[0] < 1 or br[1] < 1): + return image + size = 15 * sigma + 1 + g = _gaussian(size, sigma=0.1) + g_x = [int(max(1, -ul[0])), int(min(br[0], image.shape[1])) - + int(max(1, ul[0])) + int(max(1, -ul[0]))] + g_y = [int(max(1, -ul[1])), int(min(br[1], image.shape[0])) - + int(max(1, ul[1])) + int(max(1, -ul[1]))] + img_x = [int(max(1, ul[0])), int(min(br[0], image.shape[1]))] + img_y = [int(max(1, ul[1])), int(min(br[1], image.shape[0]))] + assert (g_x[0] > 0 and g_y[1] > 0) + image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1]] = \ + image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1]] + g[g_y[0] - 1:g_y[1], g_x[0] - 1:g_x[1]] + image[image > 1] = 1 + + return image + + +# Adapted from: https://github.com/1adrianb/face-alignment/blob/master/face_alignment/api.py +def generate_heatmaps(height, width, points, sigma=None): + """ Generate heatmaps corresponding to a set of points. + + Args: + height (int): Heatmap height + width (int): Heatmap width + points (np.array): An array of points of shape (N, 2) + sigma (float, optional): Standard deviation of the gaussian kernel. If not specified it will be determined + from the width of the heatmap + + Returns: + np.array: The generated heatmaps. 
+ """ + sigma = max(1, int(np.round(width / 128.))) if sigma is None else sigma + heatmaps = np.zeros((points.shape[0], height, width), dtype=np.float32) + for i in range(points.shape[0]): + if points[i, 0] > 0: + heatmaps[i] = draw_gaussian( + heatmaps[i], points[i], sigma) + + return heatmaps + + + +if __name__ == "__main__": + + #you can use [X,2] matrix + points = np.array([ + [(30.2946)+8, 51.6963], + [(65.5318)+8, 51.5014], + [(48.0252)+8, 71.7366], + [(33.5493)+8, 92.3655], + [(62.7299)+8, 92.2041]], dtype=np.float32) + + + heatmaps = generate_heatmaps(width = 112, + height = 112, + points = points, + sigma = 3) + print(heatmaps.shape) + + final_heatmap = np.sum(heatmaps, axis=0) + cv2.imwrite("final_heatmap.png", final_heatmap*255) + print("end") \ No newline at end of file diff --git a/insightface/reconstruction/ostec/utils/image_rasterization.py b/insightface/reconstruction/ostec/utils/image_rasterization.py new file mode 100644 index 0000000000000000000000000000000000000000..3b46126b799d2e010e51b5d5adc5d0c67a920b18 --- /dev/null +++ b/insightface/reconstruction/ostec/utils/image_rasterization.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. 
+# To view a copy of this license, see LICENSE + +from menpo3d.rasterize import ( + rasterize_barycentric_coordinate_images, + rasterize_mesh_from_barycentric_coordinate_images) + +from menpo.transform import Rotation, Translation, Scale +from menpo3d.camera import PerspectiveProjection, PerspectiveCamera +import numpy as np + +def rotation_z(theta, degrees=True): + if degrees: + # convert to radians + theta = theta * np.pi / 180.0 + return Rotation(np.array([[np.cos(theta), -np.sin(theta), 0], + [np.sin(theta), np.cos(theta), 0], + [0, 0, 1]]), + skip_checks=True) + +def rotation_y(theta, degrees=True): + if degrees: + # convert to radians + theta = theta * np.pi / 180.0 + return Rotation(np.array([[np.cos(theta), 0, np.sin(theta)], + [0, 1, 0], + [-np.sin(theta), 0, np.cos(theta)]]), + skip_checks=True) + +def rotation_x(theta, degrees=True): + if degrees: + theta = np.deg2rad(theta) + return Rotation(np.array([[ 1, 0, 0], + [ 0, np.cos(theta), -np.sin(theta)], + [ 0, np.sin(theta), np.cos(theta)]]), + skip_checks=True) + +def perspective_camera_for_template(img_shape, focal_length_mult=2, + pose_angle_deg=[0,0,0], cam_dist = 7): + f = np.array(img_shape).max() * focal_length_mult + rot_z = rotation_z(180 + pose_angle_deg[2]) + rot_y = rotation_y(180 + pose_angle_deg[1]) + rot_x = rotation_x(pose_angle_deg[0]) + rotation = rot_z.compose_before(rot_y).compose_before(rot_x) + + translation = Translation([0, 0, +cam_dist]) + projection = PerspectiveProjection(f, img_shape) + return PerspectiveCamera(rotation, translation, projection) + +def align_mesh_to_template(source, target, scale_corrective=1.2): + scale = Scale((target.norm() / source.norm()) * scale_corrective, + n_dims=target.n_dims) + translation = Translation(target.centre() - source.centre()) + return translation.compose_before(scale) + +def rasterize_image( mesh, img_shape, pose_angle_deg=[0,0,0], cam_dist = 7, ): + camera = perspective_camera_for_template(img_shape, pose_angle_deg=pose_angle_deg, 
cam_dist= cam_dist) + + # Pre-process - align the mesh roughly with the template + # aligned_mesh = align_mesh_to_template(mesh, template).apply(mesh) + + mesh_in_img = camera.apply(mesh) + + bcs = rasterize_barycentric_coordinate_images(mesh_in_img, img_shape) + img = rasterize_mesh_from_barycentric_coordinate_images(mesh_in_img, *bcs) +# shape_img = rasterize_shape_image_from_barycentric_coordinate_images(mesh, *bcs) + img.pixels = np.clip(img.pixels,0.0,1.0) + return img, mesh_in_img.points \ No newline at end of file diff --git a/insightface/reconstruction/ostec/utils/shading.py b/insightface/reconstruction/ostec/utils/shading.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5d436ef5190783d826643c66bad559e464cf9a --- /dev/null +++ b/insightface/reconstruction/ostec/utils/shading.py @@ -0,0 +1,66 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. +# To view a copy of this license, see LICENSE + +""" Renders 3D faces in python with lambertian shading + Author: Stylianos Ploumpis """ + +import numpy as np +from menpo.transform import UniformScale, Translation + + +def l2_normalize(x, axis=0, epsilon=1e-12): + """ + Transforms an `ndarray` to have a unit l2 norm along + a given direction. + ---------- + x : `ndarray` + The array to be transformed. + axis : `int` + The axis that will be l2 unit normed. + epsilon: `float` + A small value such as to avoid division by zero. + + Returns + ------- + x : (D,) `ndarray` + The transformed array. 
+ """ + return x / np.maximum(np.linalg.norm(x, axis=axis), epsilon) + + +def mesh_in_unit_sphere(mesh): + scale = UniformScale(1 / mesh.norm(), mesh.n_dims) + translation = Translation(-scale.apply(mesh).centre()) + return translation.compose_after(scale) + + +def lambertian_shading(mesh, diffuse_colour=0.4, + albedo_weighting=0.6, ambient_colour=0.2, + light_positions=((45, 45, 1), (30, 60, 10),(30, 60, 0.5),(0, 0, 1))): + + diffuse_colour = np.asarray(diffuse_colour) + light_positions = l2_normalize(np.asarray(light_positions).reshape(-1, 3), + axis=0) + + unit_transform = mesh_in_unit_sphere(mesh) + mesh = unit_transform.apply(mesh) + + light_directions = l2_normalize(light_positions.reshape(-1, 1, 3) - + mesh.points[None, ...], axis=0) + + # Calculate the lambertian reflectance for each light source. + # This will be an `ndarray` of shape(num_light_sources, num_vertices) + lambertian = np.sum(light_directions * + mesh.vertex_normals()[None, ...], 2)[..., None] + + # Sum up the contribution of all the light sources and multiply by the + # diffusion colour. + lambertian = lambertian.sum(0) * diffuse_colour + ambient_colour + + mesh.colours[...] = np.clip(mesh.colours * albedo_weighting + + lambertian * (1 - albedo_weighting), + 0, 1) + + return unit_transform.pseudoinverse().apply(mesh) \ No newline at end of file diff --git a/insightface/reconstruction/ostec/utils/utils.py b/insightface/reconstruction/ostec/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6eb00eb33322b142fb3c8b33d51241945b13015d --- /dev/null +++ b/insightface/reconstruction/ostec/utils/utils.py @@ -0,0 +1,170 @@ +# Copyright (c) 2020, Baris Gecer. All rights reserved. +# +# This work is made available under the CC BY-NC-SA 4.0. 
+# To view a copy of this license, see LICENSE + +import PIL.Image +from skimage.morphology import binary_dilation, disk +from skimage.filters import gaussian +from scipy.interpolate import NearestNDInterpolator +import cv2 as cv +from utils.shading import lambertian_shading +from menpo3d.rasterize import rasterize_mesh +from menpo.shape import TriMesh, TexturedTriMesh, ColouredTriMesh +from menpo.image import Image +from utils.image_rasterization import * +import menpo.io as mio +import os + +gold = (244/256,178/256,88/256) +green = (26/256,140/256,57/256) +blue = (24/256,148/256,187/256) +red = (255/256,51/256,51/256) +purple = (102/256,0/256,204/256) +pink = (255/256,21/256,255/256) + +def perspective_camera(img_shape, focal_length_mult=2, + pose_angle_deg=[0,0,0], translation=4): + f = np.array(img_shape).max() * focal_length_mult + rot_z = rotation_z(180 + pose_angle_deg[2]) + rot_y = rotation_y(180 + pose_angle_deg[1]) + rot_x = rotation_x(pose_angle_deg[0]) + rotation = rot_z.compose_before(rot_y).compose_before(rot_x) + + translation = Translation([0, 0, translation]) + projection = PerspectiveProjection(f, img_shape) + return PerspectiveCamera(rotation, translation, projection) + +def rasterize_mesh_at_template(mesh, img_shape=(1024,1024), pose_angle_deg=[0, 0, 0], translation=4, shaded=True, **kwargs): + camera = perspective_camera(img_shape, pose_angle_deg=pose_angle_deg, translation=translation) + if shaded: + mesh = lambertian_shading(mesh, **kwargs) + mesh.points[mesh.points == 0] = None + return rasterize_mesh(camera.apply(mesh), img_shape) + +def export_tmesh(tmesh, path, pose_angle_deg=[0, 30, 0], translation=4): + render = rasterize_mesh_at_template(tmesh, pose_angle_deg=pose_angle_deg, shaded=False, translation=translation) + mio.export_image(render, path, overwrite=True) + +def export_shape(tmesh, path, pose_angle_deg=[0, 30, 0]): + mesh = ColouredTriMesh(tmesh.points, tmesh.trilist, + np.tile(np.array(blue), [len(tmesh.points), 1])) + render = 
rasterize_mesh_at_template(mesh, pose_angle_deg=pose_angle_deg, shaded=True) + mio.export_image(render, path, overwrite=True) + +def uv_color_normalize(img_uv_src, angle_uv_src, img_uv, angle_uv): + intersection = (angle_uv_src.pixels[0] < 0.6) & (angle_uv_src.pixels[0] > 0.3) & (angle_uv.pixels[0] > 0.7) + if np.any(intersection): + target_dist = img_uv_src.pixels[:, intersection] + target_mean = np.mean(target_dist, 1).reshape([3, -1]) + targed_std = np.std(target_dist, 1).reshape([3, -1]) + + source_dist = img_uv.pixels[:, intersection] + source_mean = np.mean(source_dist, 1).reshape([3, -1]) + source_std = np.std(source_dist, 1).reshape([3, -1]) + + temp = ((img_uv.pixels.reshape([3, -1]) - source_mean) / source_std) * targed_std + target_mean + return Image(temp.reshape(img_uv.pixels.shape)) + else: + return img_uv + +def uv_stiching(img_uv_list, angle_uv_list, smoothing_sigma = 10, seamless_clone=False): + max_ind = np.argmax(np.array(angle_uv_list).mean(axis=1), axis=0) + mask_out_all = np.max(np.array(angle_uv_list).mean(axis=1),axis=0) ==-1 + max_ind[mask_out_all] = len(img_uv_list) -1 + all_uvs = np.clip(np.array(img_uv_list), 0, 1) + + max_ind_one_hot = np.zeros((max_ind.size, len(img_uv_list))) + max_ind_one_hot[np.arange(max_ind.size), max_ind.flatten()] = 1 + max_ind_one_hot = max_ind_one_hot.reshape(max_ind.shape + (-1,)) + + if smoothing_sigma>0: + max_ind_one_hot_g = gaussian(max_ind_one_hot, sigma=smoothing_sigma, multichannel=True, mode='reflect') + else: + max_ind_one_hot_g = max_ind_one_hot + + max_ind_one_hot = np.tile(max_ind_one_hot, [3, 1, 1, 1]) + max_ind_one_hot = np.transpose(max_ind_one_hot, [3, 0, 1, 2]) + + max_ind_one_hot_g = np.tile(max_ind_one_hot_g, [3, 1, 1, 1]) + max_ind_one_hot_g = np.transpose(max_ind_one_hot_g, [3, 0, 1, 2]) + + if seamless_clone: + all_uvs_uint8 = (np.transpose(all_uvs,[0,2,3,1])*255).astype(np.uint8) + max_ind_one_uint8 = (np.transpose(1 - max_ind_one_hot,[0,2,3,1])*255).astype(np.uint8) + 
max_ind_one_uint8_g = (np.transpose(1 - max_ind_one_hot_g,[0,2,3,1])*255).astype(np.uint8) + + dst = (Image(np.sum(all_uvs * max_ind_one_hot_g, axis=0)).pixels_with_channels_at_back()*255).astype(np.uint8) + for i, (src, mask, mask_g) in enumerate(zip(all_uvs_uint8[1:], max_ind_one_uint8[1:], max_ind_one_uint8_g[1:])): + mask_inv = 255-mask.copy() + mask_inv[:,0:5] = 255 + mask_inv[:, -6:-1] = 255 + mask_inv[0:5,:] = 255 + mask_inv[-6:-1,:] = 255 + mask_inv_dilated = binary_dilation(mask_inv.astype(bool)[:,:,0], disk(50)).astype(np.uint8)*255 + mask_inv_dilated = np.transpose(np.tile(mask_inv_dilated,[3,1,1]),[1,2,0]) + + mask_inv_dilated_g = gaussian(mask_inv_dilated/255, sigma=100, multichannel=True, mode='reflect') + mixed = (src * mask_inv_dilated_g + dst * (1-mask_inv_dilated_g)).astype(np.uint8) + + im_clone = cv.seamlessClone(dst, mixed, mask_inv, (512,512), cv.NORMAL_CLONE) + dst = (im_clone * (1-mask_g/255) + dst* (mask_g/255)).astype(np.uint8) + final_uv = Image(np.transpose(dst.astype(float)/255,[2,0,1])) + else: + final_uv = Image(np.sum(all_uvs * max_ind_one_hot_g, axis=0)) + + final_uv.pixels = np.clip(final_uv.pixels,0.0, 1.0) + return final_uv, max_ind_one_hot + +def im_menpo2PIL(menpo_im): + return PIL.Image.fromarray((menpo_im.pixels_with_channels_at_back() * 255).astype(np.uint8)) + +def im_PIL2menpo(pil_im): + return Image.init_from_channels_at_back(np.array(pil_im).astype(float) / 255) + +def fill_UV(UV): + mask = np.sum(UV.pixels, 0) == 0 + xx, yy = np.meshgrid(np.arange(UV.shape[1]), np.arange(UV.shape[0])) + xym = np.vstack((np.ravel(xx[~mask]), np.ravel(yy[~mask]))).T + data = UV.pixels[:, ~mask] + for i in range(3): + interp = NearestNDInterpolator(xym, data[i]) + result = interp(np.ravel(xx[mask]), np.ravel(yy[mask])) + UV.pixels[i, mask] = result + return UV + +from keras.utils import get_file +import bz2 +def unpack_bz2(src_path): + data = bz2.BZ2File(src_path).read() + dst_path = src_path[:-4] + with open(dst_path, 'wb') as 
fp: + fp.write(data) + return dst_path + + +def fix_obj(fp): + os.path.dirname(fp) + template = """# Produced by Dimensional Imaging OBJ exporter +# http://www.di3d.com +# +# +newmtl merged_material +Ka 0.5 0.5 0.5 +Kd 0.5 0.5 0.5 +Ks 0.47 0.47 0.47 +d 1 +Ns 0 +illum 2 +map_Kd {}.png +# +# +# EOF""".format(os.path.splitext(os.path.basename(fp))[0]) + with open(os.path.join(os.path.dirname(fp), os.path.splitext(os.path.basename(fp))[0] + '.mtl'), 'w') as f: + f.write(template) + + with open(fp, 'r+') as f: + content = f.read() + f.seek(0, 0) + f.write('mtllib ' + os.path.splitext(os.path.basename(fp))[0] + '.mtl' + '\n' + content) + diff --git a/insightface/requirements.txt b/insightface/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8cdc0177ada72d5864eb93b3ba82df72fdaa0c7 --- /dev/null +++ b/insightface/requirements.txt @@ -0,0 +1,3 @@ +Cython>=0.29.28 +cmake>=3.22.3 +numpy>=1.22.3 diff --git a/insightface/tools/onnx2caffe/LICENSE b/insightface/tools/onnx2caffe/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..879e87b38e31ca313c0c3dd1f876ef21c209a695 --- /dev/null +++ b/insightface/tools/onnx2caffe/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 MTlab, Meitu Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/insightface/tools/onnx2caffe/MyCaffe.py b/insightface/tools/onnx2caffe/MyCaffe.py new file mode 100644 index 0000000000000000000000000000000000000000..0cbaf7b7230c886bed258893e07dd887185a32cd --- /dev/null +++ b/insightface/tools/onnx2caffe/MyCaffe.py @@ -0,0 +1,125 @@ +from collections import OrderedDict, Counter + +from caffe.proto import caffe_pb2 +from google import protobuf +import six + +def param_name_dict(): + """Find out the correspondence between layer names and parameter names.""" + + layer = caffe_pb2.LayerParameter() + # get all parameter names (typically underscore case) and corresponding + # type names (typically camel case), which contain the layer names + # (note that not all parameters correspond to layers, but we'll ignore that) + param_names = [f.name for f in layer.DESCRIPTOR.fields if f.name.endswith('_param')] + param_type_names = [type(getattr(layer, s)).__name__ for s in param_names] + # strip the final '_param' or 'Parameter' + param_names = [s[:-len('_param')] for s in param_names] + param_type_names = [s[:-len('Parameter')] for s in param_type_names] + return dict(zip(param_type_names, param_names)) + +def assign_proto(proto, name, val): + """Assign a Python object to a protobuf message, based on the Python + type (in recursive fashion). Lists become repeated fields/messages, dicts + become messages, and other types are assigned directly. 
For convenience, + repeated fields whose values are not lists are converted to single-element + lists; e.g., `my_repeated_int_field=3` is converted to + `my_repeated_int_field=[3]`.""" + + is_repeated_field = hasattr(getattr(proto, name), 'extend') + if is_repeated_field and not isinstance(val, list): + val = [val] + if isinstance(val, list): + if isinstance(val[0], dict): + for item in val: + proto_item = getattr(proto, name).add() + for k, v in six.iteritems(item): + assign_proto(proto_item, k, v) + else: + getattr(proto, name).extend(val) + elif isinstance(val, dict): + for k, v in six.iteritems(val): + assign_proto(getattr(proto, name), k, v) + else: + setattr(proto, name, val) + +class Function(object): + """A Function specifies a layer, its parameters, and its inputs (which + are Tops from other layers).""" + + def __init__(self, type_name, layer_name, inputs,outputs, **params): + self.type_name = type_name + self.inputs = inputs + self.outputs = outputs + self.params = params + self.layer_name = layer_name + self.ntop = self.params.get('ntop', 1) + # use del to make sure kwargs are not double-processed as layer params + if 'ntop' in self.params: + del self.params['ntop'] + self.in_place = self.params.get('in_place', False) + if 'in_place' in self.params: + del self.params['in_place'] + # self.tops = tuple(Top(self, n) for n in range(self.ntop))l + + def _get_name(self, names, autonames): + if self not in names and self.ntop > 0: + names[self] = self._get_top_name(self.tops[0], names, autonames) + elif self not in names: + autonames[self.type_name] += 1 + names[self] = self.type_name + str(autonames[self.type_name]) + return names[self] + + def _get_top_name(self, top, names, autonames): + if top not in names: + autonames[top.fn.type_name] += 1 + names[top] = top.fn.type_name + str(autonames[top.fn.type_name]) + return names[top] + + def _to_proto(self): + bottom_names = [] + for inp in self.inputs: + # inp._to_proto(layers, names, autonames) + 
bottom_names.append(inp) + layer = caffe_pb2.LayerParameter() + layer.type = self.type_name + layer.bottom.extend(bottom_names) + + if self.in_place: + layer.top.extend(layer.bottom) + else: + for top in self.outputs: + layer.top.append(top) + layer.name = self.layer_name + # print(self.type_name + "...") + for k, v in six.iteritems(self.params): + # special case to handle generic *params + # print("generating "+k+"...") + + if k.endswith('param'): + assign_proto(layer, k, v) + else: + try: + assign_proto(getattr(layer, + _param_names[self.type_name] + '_param'), k, v) + except (AttributeError, KeyError): + assign_proto(layer, k, v) + + return layer + +class Layers(object): + """A Layers object is a pseudo-module which generates functions that specify + layers; e.g., Layers().Convolution(bottom, kernel_size=3) will produce a Top + specifying a 3x3 convolution applied to bottom.""" + + def __getattr__(self, name): + def layer_fn(*args, **kwargs): + fn = Function(name, args, kwargs) + return fn + return layer_fn + + + + +_param_names = param_name_dict() + diff --git a/insightface/tools/onnx2caffe/README.md b/insightface/tools/onnx2caffe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..02b8238fe0ab849f98b18de2abda68b1fcbdee26 --- /dev/null +++ b/insightface/tools/onnx2caffe/README.md @@ -0,0 +1,37 @@ +# Convert ONNX to Caffe + +This tool is modified from [onnx2caffe](https://github.com/MTlab/onnx2caffe) by MTlab. + +We added some OPs to support one-stage mmdetection models. 
+ +### Dependencies +* pycaffe (with builtin Upsample and Permute layers) +* onnx + + +### How to use +To convert onnx model to caffe: +``` +python convertCaffe.py ./model/mmdet.onnx ./model/a.prototxt ./model/a.caffemodel +``` + +### Current support operation +* Conv +* ConvTranspose +* BatchNormalization +* MaxPool +* AveragePool +* Relu +* Sigmoid +* Dropout +* Gemm (InnerProduct only) +* Add +* Mul +* Reshape +* Upsample +* Concat +* Flatten +* **Resize** +* **Permute** +* **Scale** + diff --git a/insightface/tools/onnx2caffe/convertCaffe.py b/insightface/tools/onnx2caffe/convertCaffe.py new file mode 100644 index 0000000000000000000000000000000000000000..cd52776632dc22ab156229a3a59b66f2f37aac1a --- /dev/null +++ b/insightface/tools/onnx2caffe/convertCaffe.py @@ -0,0 +1,114 @@ +#from __future__ import print_function +import sys +import caffe +import onnx +import numpy as np +from caffe.proto import caffe_pb2 +caffe.set_mode_cpu() +from onnx2caffe._transformers import ConvAddFuser,ConstantsToInitializers +from onnx2caffe._graph import Graph + +import onnx2caffe._operators as cvt +import onnx2caffe._weightloader as wlr +from onnx2caffe._error_utils import ErrorHandling +from collections import OrderedDict +from onnx import shape_inference +import importlib + +USE_DECONV_AS_UPSAMPLE = True + +transformers = [ + ConstantsToInitializers(), + ConvAddFuser(), +] + +def convertToCaffe(graph, prototxt_save_path, caffe_model_save_path): + exist_edges = [] + layers = [] + exist_nodes = [] + err = ErrorHandling() + for i in graph.inputs: + edge_name = i[0] + input_layer = cvt.make_input(i) + layers.append(input_layer) + exist_edges.append(i[0]) + graph.channel_dims[edge_name] = graph.shape_dict[edge_name][1] + + + for id, node in enumerate(graph.nodes): + node_name = node.name + op_type = node.op_type + inputs = node.inputs + inputs_tensor = node.input_tensors + input_non_exist_flag = False + + for inp in inputs: + if inp not in exist_edges and inp not in inputs_tensor: + 
input_non_exist_flag = True + break + if input_non_exist_flag: + continue + + if op_type not in cvt._ONNX_NODE_REGISTRY: + err.unsupported_op(node) + continue + converter_fn = cvt._ONNX_NODE_REGISTRY[op_type] + layer = converter_fn(node,graph,err) + if type(layer)==tuple: + for l in layer: + layers.append(l) + else: + layers.append(layer) + outs = node.outputs + for out in outs: + exist_edges.append(out) + + net = caffe_pb2.NetParameter() + for id,layer in enumerate(layers): + layers[id] = layer._to_proto() + net.layer.extend(layers) + + with open(prototxt_save_path, 'w') as f: + print(net,file=f) + + caffe.set_mode_cpu() + deploy = prototxt_save_path + net = caffe.Net(deploy, + caffe.TEST) + + for id, node in enumerate(graph.nodes): + node_name = node.name + op_type = node.op_type + inputs = node.inputs + inputs_tensor = node.input_tensors + input_non_exist_flag = False + if op_type not in wlr._ONNX_NODE_REGISTRY: + err.unsupported_op(node) + continue + converter_fn = wlr._ONNX_NODE_REGISTRY[op_type] + converter_fn(net, node, graph, err) + + net.save(caffe_model_save_path) + return net + +def getGraph(onnx_path): + model = onnx.load(onnx_path) + output_names = [node.name for node in model.graph.output] + model = shape_inference.infer_shapes(model) + model_graph = model.graph + graph = Graph.from_onnx(model_graph) + graph = graph.transformed(transformers) + graph.channel_dims = {} + + return graph, output_names + +if __name__ == "__main__": + cvt.USE_DECONV_AS_UPSAMPLE = USE_DECONV_AS_UPSAMPLE + wlr.USE_DECONV_AS_UPSAMPLE = USE_DECONV_AS_UPSAMPLE + onnx_path = sys.argv[1] + prototxt_path = sys.argv[2] + caffemodel_path = sys.argv[3] + graph, output_names = getGraph(onnx_path) + convertToCaffe(graph, prototxt_path, caffemodel_path) + print('output_names:', output_names) + diff --git a/insightface/tools/onnx2caffe/onnx2caffe/__init__.py b/insightface/tools/onnx2caffe/onnx2caffe/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/insightface/tools/onnx2caffe/onnx2caffe/_error_utils.py b/insightface/tools/onnx2caffe/onnx2caffe/_error_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..46f34c05d06ddfe4c0baf2e1fb7e131fce35f485 --- /dev/null +++ b/insightface/tools/onnx2caffe/onnx2caffe/_error_utils.py @@ -0,0 +1,64 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from typing import Dict, Text, Any, Callable +from ._graph import Node, Graph + +class ErrorHandling(object): + ''' + To handle errors and addition of custom layers + ''' + + def __init__(self, + add_custom_layers = False, # type: bool + custom_conversion_functions = dict(), # type: Dict[Text, Any] + custom_layer_nodes = [], # type : List[Node] + ): + # type: (...) -> None + self.add_custom_layers = add_custom_layers + self.custom_conversion_functions = custom_conversion_functions + self.custom_layer_nodes = custom_layer_nodes + + + def unsupported_op(self, + node, # type: Node + ): + # type: (...) -> Callable[[Any, Node, Graph, ErrorHandling], None] + ''' + Either raise an error for an unsupported op type or return custom layer add function + ''' + if self.add_custom_layers: + from ._operators import _convert_custom + return _convert_custom + else: + raise TypeError( + "ONNX node of type {} is not supported.\n".format(node.op_type,) + ) + + + def unsupported_op_configuration(self, + node, # type: Node + err_message, # type: Text + ): + raise TypeError( + "Error while converting op of type: {}. Error message: {}\n".format(node.op_type, err_message, ) + ) + + + def missing_initializer(self, + node, # type: Node + err_message, # type: Text + ): + # type: (...) -> None + ''' + Missing initializer error + ''' + raise ValueError( + "Missing initializer error in op of type {}, with input name = {}, " + "output name = {}. Error message: {}\n". 
+ format(node.op_type, node.inputs[0], node.outputs[0], err_message) + ) + + + diff --git a/insightface/tools/onnx2caffe/onnx2caffe/_graph.py b/insightface/tools/onnx2caffe/onnx2caffe/_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..89e81dbc8e1fe177f5c49bbf29204b789815742b --- /dev/null +++ b/insightface/tools/onnx2caffe/onnx2caffe/_graph.py @@ -0,0 +1,225 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from onnx import numpy_helper, ValueInfoProto, AttributeProto, GraphProto, NodeProto, TensorProto, TensorShapeProto +from typing import Any, Text, Iterable, List, Dict, Sequence, Optional, Tuple, Union +from typing_extensions import Protocol +import numpy as np + + +class Transformer(Protocol): + def __call__(self, graph): # type: (Graph) -> Graph + pass + + +EdgeInfo = Tuple[Text, Any, TensorShapeProto] +AttributeValue = Any # TODO Union[Sequence[float], Sequence[int], Sequence[Text], Sequence[TensorProto], Sequence[GraphProto]] + +def _input_from_onnx_input(input): # type: (ValueInfoProto) -> EdgeInfo + name = input.name + type = input.type.tensor_type.elem_type + shape = tuple([d.dim_value for d in input.type.tensor_type.shape.dim]) + return (name, type, shape) + + +def _convertAttributeProto(onnx_arg): # type: (AttributeProto) -> AttributeValue + """ + Convert an ONNX AttributeProto into an appropriate Python object + for the type. 
+ NB: Tensor attribute gets returned as numpy array + """ + if onnx_arg.HasField('f'): + return onnx_arg.f + elif onnx_arg.HasField('i'): + return onnx_arg.i + elif onnx_arg.HasField('s'): + return onnx_arg.s + elif onnx_arg.HasField('t'): + return numpy_helper.to_array(onnx_arg.t) + elif len(onnx_arg.floats): + return list(onnx_arg.floats) + elif len(onnx_arg.ints): + return list(onnx_arg.ints) + elif len(onnx_arg.strings): + return list(onnx_arg.strings) + else: + raise ValueError("Unsupported ONNX attribute: {}".format(onnx_arg)) + + +class Attributes(Dict[Text, Any]): + @staticmethod + def from_onnx(args): # type: (Iterable[AttributeProto]) -> Attributes + d = Attributes() + for arg in args: + d[arg.name] = _convertAttributeProto(arg) + return d + + +class Node(object): + def __init__(self, + name, # type: Optional[Text] + op_type, # type: Text + attrs, # type: Dict[Text, AttributeValue] + inputs, # type: List[Text] + outputs, # type: List[Text] + ): + # type: (...) -> None + self.name = name + self.op_type = op_type + self.attrs = attrs + self.inputs = inputs + self.outputs = outputs + self.input_tensors = {} # type: Dict[Text, np._ArrayLike[Any]] + self.parents = [] # type: List[Node] + self.children = [] # type: List[Node] + self.metadata = {} # type: Dict[Any, Any] + + def add_parent(self, parent_node): # type: (Node) -> None + assert parent_node not in self.parents + self.parents.append(parent_node) + if self not in parent_node.children: + parent_node.children.append(self) + + def add_child(self, child_node): # type: (Node) -> None + assert child_node not in self.children + self.children.append(child_node) + if self not in child_node.parents: + child_node.parents.append(self) + + def get_only_parent(self): # type: () -> Node + if len(self.parents) != 1: + raise ValueError('Node ({}) expected to have 1 parent. Found {}.' 
+ .format(self, len(self.parents))) + return self.parents[0] + + @staticmethod + def from_onnx(node): # type: (NodeProto) -> Node + attrs = Attributes.from_onnx(node.attribute) + name = Text(node.name) + if len(name) == 0: + name = "_".join(node.output) + return Node( + name, node.op_type, attrs, list(node.input), list(node.output) + ) + + +class Graph(object): + def __init__(self, + nodes, # type: List[Node] + inputs, # type: List[EdgeInfo] + outputs, # type: List[EdgeInfo] + shape_dict, # type: Dict[Text,Tuple[int,...]] + ): + # type: (...) -> None + self.nodes = nodes + self.inputs = inputs + self.outputs = outputs + self.shape_dict = shape_dict # data blob name to its shape + + # data blob name to the list of op types it feeds into + self.blob_to_op_type = {} # type: Dict[Text, List[Text]] + # data blob name to the op_type that generates it + self.blob_from_op_type = {} # type: Dict[Text, Text] + + for node_ in nodes: + for input_ in node_.inputs: + if input_ in self.blob_to_op_type: + self.blob_to_op_type[input_].append(node_.op_type) + else: + self.blob_to_op_type[input_] = [node_.op_type] + for output_ in node_.outputs: + if output_ in self.blob_from_op_type: + raise ValueError("Data blob: %s, is generated by more than 1 op" %(output_)) + self.blob_from_op_type[output_] = node_.op_type + + + def transformed(self, transformers): # type: (Iterable[Transformer]) -> Graph + graph = self + for transformer in transformers: + graph = transformer(graph) + return graph + + def has_edge_name(self, name): # type: (Text) -> bool + ''' + Check if name is already used for graph inputs/outputs or for nodes + inputs/outputs + ''' + names = set() + for input in self.inputs: + names.add(input[0]) + for output in self.outputs: + names.add(output[0]) + for node in self.nodes: + names.update(node.inputs) + names.update(node.outputs) + return name in names + + def get_unique_edge_name(self, name): # type: (Text) -> Text + n_ = name + i = 0 + while self.has_edge_name(n_): + n_ = 
"{}_{}".format(name, i) + i += 1 + return n_ + + @staticmethod + def from_onnx(graph): # type: (GraphProto) -> Graph + input_tensors = { + t.name: numpy_helper.to_array(t) for t in graph.initializer + } + nodes_ = [] + nodes_by_input = {} # type: Dict[Text, List[Node]] + nodes_by_output = {} + for node in graph.node: + node_ = Node.from_onnx(node) + for input_ in node_.inputs: + if input_ in input_tensors: + node_.input_tensors[input_] = input_tensors[input_] + else: + if input_ in nodes_by_input: + input_nodes = nodes_by_input[input_] + else: + input_nodes = [] + nodes_by_input[input_] = input_nodes + input_nodes.append(node_) + for output_ in node_.outputs: + nodes_by_output[output_] = node_ + nodes_.append(node_) + + inputs = [] + for i in graph.input: + if i.name not in input_tensors: + inputs.append(_input_from_onnx_input(i)) + + outputs = [] + for o in graph.output: + outputs.append(_input_from_onnx_input(o)) + + for node_ in nodes_: + for input_ in node_.inputs: + if input_ in nodes_by_output: + node_.parents.append(nodes_by_output[input_]) + for output_ in node_.outputs: + if output_ in nodes_by_input: + node_.children.extend(nodes_by_input[output_]) + + # Dictionary to hold the "value_info" field from ONNX graph + shape_dict = {} # type: Dict[Text,Tuple[int,...]] + + def extract_value_info(shape_dict, # type: Dict[Text,Tuple[int,...]] + value_info, # type: ValueInfoProto[...] + ): + # type: (...) 
-> None + shape_dict[value_info.name] = tuple([int(dim.dim_value) for dim in value_info.type.tensor_type.shape.dim]) + + for value_info in graph.value_info: + extract_value_info(shape_dict, value_info) + for value_info in graph.input: + extract_value_info(shape_dict, value_info) + for value_info in graph.output: + extract_value_info(shape_dict, value_info) + + + return Graph(nodes_, inputs, outputs, shape_dict) diff --git a/insightface/tools/onnx2caffe/onnx2caffe/_operators.py b/insightface/tools/onnx2caffe/onnx2caffe/_operators.py new file mode 100644 index 0000000000000000000000000000000000000000..815ff963a6bfcc6118f677fbdb0d6e8b53266953 --- /dev/null +++ b/insightface/tools/onnx2caffe/onnx2caffe/_operators.py @@ -0,0 +1,485 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from caffe import params as P +import math +import numpy as np +from ._graph import Node, Graph +from MyCaffe import Function as myf + +USE_DECONV_AS_UPSAMPLE = False + +def _compare(a, b, encoding="utf8"): #type: (Text, Text, Text) -> bool + if isinstance(a, bytes): + a = a.decode(encoding) + if isinstance(b, bytes): + b = b.decode(encoding) + return a == b + +def make_input(input): + name = input[0] + output = input[0] + output = [output] + shape = input[2] + shape = list(shape) + input_layer = myf("Input", name, [], output, input_param=dict(shape=dict(dim=shape))) + return input_layer + +def _convert_conv(node, graph, err): + weight_name = node.inputs[1] + input_name = str(node.inputs[0]) + output_name = str(node.outputs[0]) + node_name = node.name + W = None + if weight_name in node.input_tensors: + W = node.input_tensors[weight_name] + else: + err.missing_initializer(node, + "Weight tensor: {} not found in the graph initializer".format(weight_name,)) + is_deconv = False + if node.op_type.endswith("Transpose"): + is_deconv = True + bias_flag = False + bias = None + if len(node.inputs) 
def _convert_conv(node, graph, err):
    """Convert an ONNX Conv node to a caffe Convolution layer."""
    weight_name = node.inputs[1]
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    node_name = node.name
    W = None
    if weight_name in node.input_tensors:
        W = node.input_tensors[weight_name]
    else:
        err.missing_initializer(node,
                                "Weight tensor: {} not found in the graph initializer".format(weight_name,))
    bias_flag = len(node.inputs) > 2
    dilations = node.attrs.get("dilations", [1, 1])
    groups = node.attrs.get("group", 1)
    kernel_shape = node.attrs["kernel_shape"]
    # NOTE(review): only pads[0]/pads[1] are forwarded; asymmetric ONNX padding
    # ([top, left, bottom, right]) is silently dropped.
    pads = node.attrs.get("pads", [0, 0, 0, 0])
    strides = node.attrs["strides"]

    layer = myf("Convolution", node_name, [input_name], [output_name],
                kernel_h=kernel_shape[0], kernel_w=kernel_shape[1],
                stride_h=strides[0], stride_w=strides[1], group=groups,
                pad_h=pads[0], pad_w=pads[1],
                num_output=W.shape[0], dilation=dilations[0], bias_term=bias_flag)

    graph.channel_dims[output_name] = W.shape[0]
    return layer


def _convert_relu(node, graph, err):
    """Convert Relu to a caffe ReLU layer (in-place when input==output)."""
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    name = str(node.name)
    inplace = input_name == output_name
    layer = myf("ReLU", name, [input_name], [output_name], in_place=inplace)
    graph.channel_dims[output_name] = graph.channel_dims[input_name]
    return layer


def _convert_sigmoid(node, graph, err):
    """Convert Sigmoid to a caffe Sigmoid layer (in-place when input==output)."""
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    name = str(node.name)
    inplace = input_name == output_name
    layer = myf("Sigmoid", name, [input_name], [output_name], in_place=inplace)
    graph.channel_dims[output_name] = graph.channel_dims[input_name]
    return layer


def _convert_BatchNorm(node, graph, err):
    """Convert BatchNormalization into caffe BatchNorm + Scale layer pair.

    Statistics are copied later by the weight loader; here only the layer
    structure is emitted (BatchNorm with use_global_stats, then an in-place
    Scale carrying gamma/beta).
    """
    epsilon = node.attrs.get("epsilon", 1e-5)
    node_name = node.name
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    inplace = input_name == output_name

    bn_layer = myf("BatchNorm", node_name + "_bn", [input_name], [output_name],
                   eps=epsilon, use_global_stats=True, in_place=inplace)
    scale_layer = myf("Scale", node_name, [output_name], [output_name],
                      in_place=True, bias_term=True)

    graph.channel_dims[output_name] = graph.channel_dims[input_name]
    return bn_layer, scale_layer


def _convert_Add(node, graph, err):
    """Convert Add: broadcast add -> Flatten+Bias pair, else Eltwise SUM."""
    input_name_list = [str(i) for i in node.inputs]
    output_name = str(node.outputs[0])
    node_name = node.name

    if 'broadcast' in node.attrs:
        if node.attrs['broadcast'] == 1:
            input_node_number = len(input_name_list)
            if input_node_number != 2:
                return err.unsupported_op_configuration(
                    node, "Broadcast Add must has 2 input, not {}".format(input_node_number))
            axis = node.attrs['axis']
            flat_layer = myf("Flatten", node_name + '_flat', [input_name_list[1]], [output_name + '_flat'])
            layer = myf("Bias", node_name, [input_name_list[0], output_name + '_flat'], [output_name], axis=axis)
            graph.channel_dims[output_name] = graph.channel_dims[input_name_list[0]]
            return flat_layer, layer

    layer = myf("Eltwise", node_name, input_name_list, [output_name], operation=P.Eltwise.SUM)
    graph.channel_dims[output_name] = graph.channel_dims[input_name_list[0]]
    return layer


def _convert_Mul(node, graph, err):
    """Convert Mul: constant scalar -> Scale, broadcast -> Flatten+Scale, else Eltwise PROD."""
    input_name_list = [str(i) for i in node.inputs]
    output_name = str(node.outputs[0])
    node_name = node.name
    print('Mul:', node.name, node.attrs, input_name_list, output_name)
    if len(node.attrs) == 0:
        # Attribute-free Mul with one bound initializer: multiply by a scalar.
        assert len(node.input_tensors) == 1
        assert len(input_name_list) == 2
        inp_tensor = node.input_tensors[input_name_list[1]]
        scale_value = float(inp_tensor)
        print(scale_value)
        layer = myf("Scale", node_name, [input_name_list[0]], [output_name], bias_term=False,
                    scale_param=dict(filler=dict(value=scale_value), bias_term=False))
        return layer

    if 'broadcast' in node.attrs:
        if node.attrs['broadcast'] == 1:
            input_node_number = len(input_name_list)
            if input_node_number != 2:
                return err.unsupported_op_configuration(
                    node, "Broadcast Mul must has 2 input, not {}".format(input_node_number))
            axis = node.attrs['axis']
            flat_layer = myf("Flatten", node_name + '_flat', [input_name_list[1]], [output_name + '_flat'])
            layer = myf("Scale", node_name, [input_name_list[0], output_name + '_flat'], [output_name],
                        bias_term=False, axis=axis)
            graph.channel_dims[output_name] = graph.channel_dims[input_name_list[0]]
            return flat_layer, layer

    layer = myf("Eltwise", node_name, input_name_list, [output_name], operation=P.Eltwise.PROD)
    graph.channel_dims[output_name] = graph.channel_dims[input_name_list[0]]
    return layer


def _convert_Reshape(node, graph, err):
    """Convert Reshape to a caffe Reshape layer.

    Target shape comes from the 'shape' attribute (1-input form) or from the
    bound second input tensor.
    """
    node_name = node.name
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    if len(node.inputs) == 1:
        shape = tuple(node.attrs.get('shape', ()))
    else:
        shape = tuple(node.input_tensors[node.inputs[1]])

    # NOTE(review): assumes shape has at least 2 dims and dim 1 is channels.
    graph.channel_dims[output_name] = shape[1]
    layer = myf("Reshape", node_name, [input_name], [output_name],
                reshape_param=dict(shape=dict(dim=list(shape))))
    return layer


def _convert_Flatten(node, graph, err):
    """Convert Flatten to a caffe Flatten layer (in-place when input==output)."""
    node_name = node.name
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    inplace = input_name == output_name
    layer = myf("Flatten", node_name, [input_name], [output_name], in_place=inplace)
    return layer


def _convert_pool(node, graph, err):
    """Convert MaxPool/AveragePool to a caffe Pooling layer."""
    node_name = node.name
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    if node.op_type.endswith("MaxPool"):
        pool_type = P.Pooling.MAX
    elif node.op_type.endswith("AveragePool"):
        pool_type = P.Pooling.AVE
    else:
        return err.unsupported_op_configuration(node, "Unsupported pool type")

    kernel_shape = node.attrs["kernel_shape"]
    strides = node.attrs.get('strides', [1, 1])
    # NOTE(review): asymmetric pads dropped (only pads[0]/pads[1] used).
    pads = node.attrs.get('pads', [0, 0, 0, 0])

    layer = myf("Pooling", node_name, [input_name], [output_name],
                pooling_param=dict(pool=pool_type,
                                   kernel_h=kernel_shape[0],
                                   kernel_w=kernel_shape[1],
                                   stride_h=strides[0],
                                   stride_w=strides[1],
                                   pad_h=pads[0],
                                   pad_w=pads[1]))
    graph.channel_dims[output_name] = graph.channel_dims[input_name]
    return layer


def _convert_dropout(node, graph, err):
    """Convert Dropout to a caffe Dropout layer."""
    node_name = node.name
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    ratio = node.attrs.get('ratio', 0.5)
    layer = myf("Dropout", node_name, [input_name], [output_name], dropout_ratio=ratio)
    graph.channel_dims[output_name] = graph.channel_dims[input_name]
    return layer


def _convert_gemm(node, graph, err):
    """Convert Gemm to a caffe InnerProduct layer.

    Only the fully-connected form (broadcast bias, transB=1, 2-D weight,
    1-D bias) is supported.
    """
    node_name = node.name
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    weight_name = node.inputs[1]
    if weight_name in node.input_tensors:
        W = node.input_tensors[weight_name]
    else:
        err.missing_initializer(node,
                                "Weight tensor: {} not found in the graph initializer".format(weight_name, ))
        return

    # Fix: opset >= 7 removed the 'broadcast' attribute (bias always
    # broadcasts), so direct indexing raised KeyError instead of reaching the
    # graceful unsupported-op path. Use spec defaults: broadcast=1, transB=0.
    if node.attrs.get("broadcast", 1) != 1 or node.attrs.get("transB", 0) != 1:
        return err.unsupported_op_configuration(node, "Gemm is supported only for inner_product layer")

    b = None
    bias_flag = False
    if len(node.inputs) > 2:
        b = node.input_tensors[node.inputs[2]]

    if len(W.shape) != 2 or (b is not None and len(b.shape) != 1):
        return err.unsupported_op_configuration(node, "Gemm is supported only for inner_product layer")
    if b is not None:
        bias_flag = True
        if W.shape[0] != b.shape[0]:
            return err.unsupported_op_configuration(node,
                                                    "Gemm is supported only for inner_product layer")

    layer = myf("InnerProduct", node_name, [input_name], [output_name],
                num_output=W.shape[0], bias_term=bias_flag)
    graph.channel_dims[output_name] = W.shape[0]
    return layer


def _convert_upsample(node, graph, err):
    """Convert Upsample to a caffe Deconvolution layer.

    Bilinear mode uses the bilinear_upsampling filler trick
    (https://github.com/pytorch/pytorch/issues/6900); other modes use a plain
    grouped deconvolution with kernel == stride == factor.
    """
    # NOTE(review): 'height_scale' only exists on old opsets (<9); newer models
    # carry scales as an input tensor and would KeyError here — confirm opset.
    factor = int(node.attrs["height_scale"])
    node_name = node.name
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    channels = graph.channel_dims[input_name]
    pad = int(math.ceil((factor - 1) / 2.))
    mode = node.attrs["mode"]
    if mode == "bilinear":
        layer = myf("Deconvolution", node_name, [input_name], [output_name],
                    convolution_param=dict(
                        num_output=channels,
                        kernel_size=2 * factor - factor % 2,
                        stride=factor,
                        pad=pad,
                        group=channels,
                        bias_term=False,
                        weight_filler=dict(type="bilinear_upsampling")
                    ))
    else:
        layer = myf("Deconvolution", node_name, [input_name], [output_name],
                    convolution_param=dict(
                        num_output=channels,
                        kernel_size=factor,
                        stride=factor,
                        group=channels,
                        bias_term=False,
                    ))

    graph.channel_dims[output_name] = graph.channel_dims[input_name]
    return layer


def _convert_resize(node, graph, err):
    """Convert Resize to an Upsample layer (or a Deconvolution when
    USE_DECONV_AS_UPSAMPLE is set).

    NOTE(review): the scale factor is hard-coded to 2 and the node's actual
    scales input is ignored — confirm all Resize nodes in the model are 2x.
    """
    if not USE_DECONV_AS_UPSAMPLE:
        node_name = node.name
        input_name = str(node.inputs[0])
        output_name = str(node.outputs[0])
        layer = myf("Upsample", node_name, [input_name], [output_name],
                    upsample_param=dict(
                        scale=2
                    ))
        graph.channel_dims[output_name] = graph.channel_dims[input_name]
    else:
        print('add resize deconv operator')
        factor = 2
        node_name = node.name
        input_name = str(node.inputs[0])
        output_name = str(node.outputs[0])
        channels = graph.channel_dims[input_name]
        layer = myf("Deconvolution", node_name, [input_name], [output_name],
                    convolution_param=dict(
                        num_output=channels,
                        kernel_size=factor,
                        stride=factor,
                        group=channels,
                        bias_term=False,
                    ))
        graph.channel_dims[output_name] = graph.channel_dims[input_name]
    return layer


def _convert_transpose(node, graph, err):
    """Convert Transpose to a caffe Permute layer using the ONNX perm attr."""
    node_name = node.name
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    layer = myf("Permute", node_name, [input_name], [output_name],
                permute_param=dict(
                    order=node.attrs['perm']
                ))
    graph.channel_dims[output_name] = graph.channel_dims[input_name]
    return layer


def _convert_softmax(node, graph, err):
    """Convert Softmax to a caffe Softmax layer along the given axis."""
    node_name = node.name
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    layer = myf("Softmax", node_name, [input_name], [output_name],
                softmax_param=dict(
                    axis=node.attrs['axis']
                ))
    graph.channel_dims[output_name] = graph.channel_dims[input_name]
    return layer


def _convert_concat(node, graph, err):
    """Convert Concat to a caffe Concat layer; channel dims sum on axis 1."""
    node_name = node.name
    input_name_list = [str(i) for i in node.inputs]
    output_name = str(node.outputs[0])
    axis = node.attrs.get("axis", 1)

    layer = myf('Concat', node_name, input_name_list, [output_name], axis=axis)
    if axis == 1:
        dim = 0
        for name in input_name_list:
            dim += graph.channel_dims[name]
        graph.channel_dims[output_name] = dim
    else:
        graph.channel_dims[output_name] = graph.channel_dims[input_name_list[0]]

    return layer


def _convert_conv_transpose(node, graph, err):
    """Convert ConvTranspose to a caffe Deconvolution layer.

    num_output uses W.shape[1] (ONNX deconv weights are laid out
    (C_in, C_out/groups, kH, kW)).
    """
    input_name = str(node.inputs[0])
    output_name = str(node.outputs[0])
    node_name = node.name
    weight_name = node.inputs[1]
    W = None
    if weight_name in node.input_tensors:
        W = node.input_tensors[weight_name]
    else:
        err.missing_initializer(node,
                                "Weight tensor: {} not found in the graph initializer".format(weight_name,))
    bias_flag = len(node.inputs) > 2
    groups = node.attrs.get("group", 1)
    kernel_shape = node.attrs["kernel_shape"]
    pads = node.attrs.get("pads", [0, 0, 0, 0])
    strides = node.attrs["strides"]

    layer = myf('Deconvolution', node_name, [input_name], [output_name],
                convolution_param=dict(
                    num_output=W.shape[1],
                    kernel_h=kernel_shape[0], kernel_w=kernel_shape[1],
                    stride_h=strides[0], stride_w=strides[1],
                    group=groups,
                    pad_h=pads[0], pad_w=pads[1],
                    bias_term=bias_flag,
                ))

    graph.channel_dims[output_name] = W.shape[1]
    return layer


# Dispatch table: ONNX op_type -> converter function.
_ONNX_NODE_REGISTRY = {
    "Conv": _convert_conv,
    "Relu": _convert_relu,
    "BatchNormalization": _convert_BatchNorm,
    "Add": _convert_Add,
    "Mul": _convert_Mul,
    "Reshape": _convert_Reshape,
    "MaxPool": _convert_pool,
    "AveragePool": _convert_pool,
    "Dropout": _convert_dropout,
    "Gemm": _convert_gemm,
    "Upsample": _convert_upsample,
    "Concat": _convert_concat,
    "ConvTranspose": _convert_conv_transpose,
    "Sigmoid": _convert_sigmoid,
    "Flatten": _convert_Flatten,
    "Resize": _convert_resize,
    "Transpose": _convert_transpose,
    "Softmax": _convert_softmax,
}
"Add": _convert_Add, + "Mul": _convert_Mul, + "Reshape": _convert_Reshape, + "MaxPool": _convert_pool, + "AveragePool": _convert_pool, + "Dropout": _convert_dropout, + "Gemm": _convert_gemm, + "Upsample": _convert_upsample, + "Concat": _convert_concat, + "ConvTranspose": _convert_conv_transpose, + "Sigmoid": _convert_sigmoid, + "Flatten": _convert_Flatten, + "Resize": _convert_resize, + "Transpose": _convert_transpose, + "Softmax": _convert_softmax, +} diff --git a/insightface/tools/onnx2caffe/onnx2caffe/_transformers.py b/insightface/tools/onnx2caffe/onnx2caffe/_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..4f8c8159b4af88481843eae854a74cac39dfb65a --- /dev/null +++ b/insightface/tools/onnx2caffe/onnx2caffe/_transformers.py @@ -0,0 +1,520 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from typing import Sequence, Text, Dict, List +import numpy as np + +from onnx import TensorProto + +from ._graph import Graph, Node + + +class NodesFuser(object): + ''' + An abstract helper for merging nodes + ''' + def __init__(self, + num_nodes, # type: int + ): + # type: (...) -> None + assert num_nodes >= 2, "Algorithm only works if fusing multiple nodes" + self.num_nodes = num_nodes + + def __call__(self, graph): # type: (Graph) -> Graph + nodes = graph.nodes + merged_nodes = {} + for node in nodes: + nodes_window = [] # type: List[Node] + n = node + for _ in range(self.num_nodes - 1): + if len(n.parents) != 1: + # We're only fusing nodes with single parents + break + p = n.get_only_parent() + if len(p.children) != 1: + # We can only fuse a node if its parent's + # value isn't used by any other node. 
+ break + nodes_window.insert(0, n) + n = p + if len(nodes_window) > 0: + # add parent of chained nodes + first = nodes_window[0] + p = first.get_only_parent() + if len(p.children) == 1: + nodes_window.insert(0, p) + if len(nodes_window) != self.num_nodes: + continue + if not self.is_eligible(graph, nodes_window): + continue + merged = self.merge(graph, nodes_window) + first, last = nodes_window[0], nodes_window[-1] + for parent in first.parents: + parent.children.remove(first) + if merged[0] not in parent.children: + parent.add_child(merged[0]) + for child in last.children: + child.parents.remove(last) + if merged[-1] not in child.parents: + child.add_parent(merged[-1]) + for n in nodes_window: + merged_nodes[n.name] = merged + + transformed_nodes = [] + added_merged = [] # type: List[Node] + for node in nodes: + if node.name in merged_nodes: + merged = merged_nodes[node.name] + if merged[0] not in added_merged: + for n in merged: + transformed_nodes.append(n) + added_merged.append(merged[0]) + else: + transformed_nodes.append(node) + return Graph(transformed_nodes, graph.inputs, graph.outputs, graph.shape_dict) + + def is_eligible(self, graph, nodes): # type: (Graph, Sequence[Node]) -> bool + '''Returns true if this subset of nodes is eligible for fusion.''' + raise NotImplementedError('Must be implemented by subclass.') + + def merge(self, graph, nodes): # type: (Graph, Sequence[Node]) -> Sequence[Node] + '''Merge nodes''' + nodes[0].outputs = nodes[-1].outputs + return [nodes[0]] + + +class ConvAddFuser(NodesFuser): + ''' + Fuses Add layer into parent convolution layer. 
+ ''' + def __init__(self): # type: () -> None + super(ConvAddFuser, self).__init__(2) + + def is_eligible(self, graph, nodes): # type: (Graph, Sequence[Node]) -> bool + parent, child = nodes[0], nodes[1] + if parent.op_type != 'Conv': + return False + if child.op_type != 'Add': + return False + if 'broadcast' not in child.attrs: + return False + if 'axis' not in child.attrs: + return False + if parent.inputs[1] not in parent.input_tensors: + return False + if len(parent.inputs) > 2 and parent.inputs[2] not in parent.input_tensors: + return False + if child.inputs[1] not in child.input_tensors: + return False + + broadcast = child.attrs['broadcast'] + if broadcast != 1: + return False + + axis = child.attrs['axis'] + if axis != 1: + return False + + return True + + def merge(self, graph, nodes): # type: (Graph, Sequence[Node]) -> Sequence[Node] + parent, child = nodes[0], nodes[1] + output_channels = parent.input_tensors[parent.inputs[1]].shape[0] + if len(parent.inputs) > 2: + bias_input_name = parent.inputs[2] + bias = parent.input_tensors[bias_input_name] + else: + bias_input_name = "{}_bias".format(parent.name,) + parent.inputs.append(bias_input_name) + bias = np.zeros( + (output_channels,), dtype=np.float32 + ) + parent.input_tensors[bias_input_name] = bias + bias = bias + child.input_tensors[child.inputs[1]] + parent.input_tensors[bias_input_name] = bias + parent.outputs = child.outputs + parent.children.remove(child) + child.parents.remove(parent) + return [parent] + + +class BNBroadcastedMulFuser(NodesFuser): + ''' + Fuses Mul into BatchNorm + ''' + def __init__(self): # type: () -> None + super(BNBroadcastedMulFuser, self).__init__(2) + + def is_eligible(self, graph, nodes): # type: (Graph, Sequence[Node]) -> bool + parent, child = nodes[0], nodes[1] + if parent.op_type != 'BatchNormalization': + return False + if child.op_type != 'Mul': + return False + if "broadcast" not in child.attrs: + return False + if child.attrs["broadcast"] != 1: + return False + 
if "axis" not in child.attrs: + return False + if child.attrs["axis"] != 1: + return False + if child.inputs[1] not in child.input_tensors: + return False + if parent.inputs[1] not in parent.input_tensors: + return False + if parent.inputs[2] not in parent.input_tensors: + return False + return True + + def merge(self, graph, nodes): # type: (Graph, Sequence[Node]) -> Sequence[Node] + parent, child = nodes[0], nodes[1] + weight = parent.input_tensors[parent.inputs[1]] + bias = parent.input_tensors[parent.inputs[2]] + W = child.input_tensors[child.inputs[1]] + parent.input_tensors[parent.inputs[1]] = np.multiply(weight, W) + parent.input_tensors[parent.inputs[2]] = np.multiply(bias, W) + parent.outputs = child.outputs + parent.children.remove(child) + child.parents.remove(parent) + return [parent] + + +class BNBroadcastedAddFuser(NodesFuser): + ''' + Fuses Add into BatchNorm + ''' + def __init__(self): # type: () -> None + super(BNBroadcastedAddFuser, self).__init__(2) + + def is_eligible(self, graph, nodes): # type: (Graph, Sequence[Node]) -> bool + parent, child = nodes[0], nodes[1] + if parent.op_type != 'BatchNormalization': + return False + if child.op_type != 'Add': + return False + if "broadcast" not in child.attrs: + return False + if child.attrs["broadcast"] != 1: + return False + if "axis" not in child.attrs: + return False + if child.attrs["axis"] != 1: + return False + if len(child.inputs) != 2: + return False + if child.inputs[1] not in child.input_tensors: + return False + if parent.inputs[2] not in parent.input_tensors: + return False + return True + + def merge(self, graph, nodes): # type: (Graph, Sequence[Node]) -> Sequence[Node] + parent, child = nodes[0], nodes[1] + bias = parent.input_tensors[parent.inputs[2]] + b = child.input_tensors[child.inputs[1]] + parent.input_tensors[parent.inputs[2]] = bias + b + parent.outputs = child.outputs + parent.children.remove(child) + child.parents.remove(parent) + return [parent] + + +class 
DropoutRemover(NodesFuser): + ''' + Removes Dropout layer + ''' + def __init__(self): # type: () -> None + super(DropoutRemover, self).__init__(2) + + def is_eligible(self, graph, nodes): # type: (Graph, Sequence[Node]) -> bool + child = nodes[1] + return child.op_type == "Dropout" + + def merge(self, graph, nodes): # type: (Graph, Sequence[Node]) -> Sequence[Node] + parent, child = nodes[0], nodes[1] + parent.children.remove(child) + child.parents.remove(parent) + parent.outputs = child.outputs + return [parent] + + +class ReshapeInitTensorFuser(object): + ''' + Fuses Reshape operator if it is used only to reshape blob in + graph initializer. We can reshape here instead of runtime. + ''' + + def __call__(self, graph): # type: (Graph) -> Graph + nodes = graph.nodes + removed = [] + for node in nodes: + if node.op_type != 'Reshape': + continue + if not (len(node.input_tensors) == 2 or len(node.input_tensors) == 1): + continue + tensor_name = node.inputs[0] + if tensor_name not in node.input_tensors: + continue + if len(node.inputs) > 1: + shape_name = node.inputs[1] + if shape_name not in node.input_tensors: + continue + is_non_constant_parent = False + if len(node.parents) > 0: + for parent in node.parents: + if parent.op_type != 'Constant': + is_non_constant_parent = True + break + if is_non_constant_parent: + continue + + removed.append(node) + output_name = node.outputs[0] + + tensor = node.input_tensors[tensor_name] + if 'shape' in node.attrs: + shape = tuple(node.attrs["shape"]) + else: + shape = node.input_tensors[shape_name] # type: ignore + + # ONNX spec supports setting dimension to '0', in which case + # it should be taken from old dimension. + # This isn't supported in numpy, so don't transform. + # TODO Should we support this case? 
+ if any([s == 0 for s in shape]): + continue + + reshaped_tensor = tensor.reshape(shape) + + for child in node.children: + child.parents.remove(node) + child.input_tensors[output_name] = reshaped_tensor + + transformed_nodes = [node for node in nodes if node not in removed] + return Graph(transformed_nodes, graph.inputs, graph.outputs, graph.shape_dict) + + +class OutputRenamer(object): + ''' + Rename outputs according to mapping + ''' + def __init__(self, + mapping, # type: Dict[Text, Text] + ): + # type: (...) -> None + self.mapping = mapping + + def __call__(self, graph): # type: (Graph) -> Graph + mapping = self.mapping.copy() + nodes = graph.nodes + for node in nodes: + for i in range(len(node.outputs)): + output = node.outputs[i] + if output not in mapping: + continue + node.outputs[i] = mapping[output] + for child in node.children: + for j in range(len(child.inputs)): + input_ = child.inputs[j] + if input_ != output: + continue + child.inputs[j] = mapping[output] + del mapping[output] + if len(mapping) == 0: + break + return graph + + +class PixelShuffleFuser(NodesFuser): + ''' + Fuses 3 operators reshape->transpose->reshape which is equivalent to + pytorch's pixel_shuffle layer + ''' + def __init__(self): # type: () -> None + super(PixelShuffleFuser, self).__init__(3) + self.num_added = 0 + + def is_eligible(self, graph, nodes): # type: (Graph, Sequence[Node]) -> bool + if nodes[0].op_type != 'Reshape': + return False + if nodes[1].op_type != 'Transpose': + return False + if nodes[2].op_type != 'Reshape': + return False + if nodes[0].inputs[1] not in nodes[0].input_tensors: + return False + if nodes[2].inputs[1] not in nodes[2].input_tensors: + return False + + shape = nodes[0].input_tensors[nodes[0].inputs[1]] + if len(shape) != 6: + return False + if shape[0] != 1 or shape[2] != shape[3]: + return False + + input_channels = shape[1] + scale_factor = shape[2] + input_height = shape[4] + input_width = shape[5] + + if nodes[1].attrs.get('perm', []) != [0, 
1, 4, 2, 5, 3]: + return False + + shape = nodes[2].input_tensors[nodes[2].inputs[1]] + if len(shape) != 4: + return False + + output_channels = shape[1] + output_height = shape[2] + output_width = shape[3] + if input_channels != output_channels: + return False + if (input_height * scale_factor) != output_height: + return False + if (input_width * scale_factor) != output_width: + return False + + return True + + def get_unique_edge_name(self, graph, name): # type: (Graph, Text) -> Text + self.num_added += 1 + return graph.get_unique_edge_name(name + '_' + str(self.num_added)) + + def merge(self, graph, nodes): # type: (Graph, Sequence[Node]) -> Sequence[Node] + ''' + Pixel shuffle is implemented using 3 operators: + - Reshape(1, channels, scale, scale, height, width) + - Transpose(0, 1, 4, 2, 5, 3) + - Reshape(1, channels, height * scale, width * scale) + CoreML Reshape and Transpose layers don't support tensors with more + than 4 dimensions. Thus we change above sequence of operators to the + following equivalent sequence: + - Reshape(channels, scale * scale, height, width) + - Transpose(0, 2, 1, 3) + - Reshape(channels * height, scale, scale, width) + - Transpose(0, 1, 3, 2) + - Reshape(1, channels, height * scale, width * scale) + ''' + reshape_1 = nodes[0] + transpose_1 = nodes[1] + transpose_1.children = [] + + shape = reshape_1.input_tensors[reshape_1.inputs[1]] + + channels = shape[1] + scale = shape[2] + height = shape[4] + width = shape[5] + + reshape_1.input_tensors[reshape_1.inputs[1]] = np.asarray([channels, scale * scale, height, width]) + transpose_1.attrs['perm'] = [0, 2, 1, 3] + + reshape_output_name = 'pixel_shuffle_reshape' + transpose_output_name = 'pixel_shuffle_transpose' + + transpose_1.outputs = [ + self.get_unique_edge_name(graph, transpose_output_name) + ] + + shape_name_second_reshape = self.get_unique_edge_name(graph, reshape_output_name) + output_name_second_reshape = self.get_unique_edge_name(graph, reshape_output_name) + reshape_2 = 
Node( + reshape_output_name, + 'Reshape', + {}, + [transpose_1.outputs[0], shape_name_second_reshape], + [output_name_second_reshape] + ) + reshape_2.input_tensors[shape_name_second_reshape] = np.asarray([channels * height, scale, scale, width]) + transpose_1.add_child(reshape_2) + + transpose_2 = Node( + transpose_output_name, + 'Transpose', + {'perm': [0, 1, 3, 2]}, + reshape_2.outputs, + [self.get_unique_edge_name(graph, transpose_output_name)] + ) + reshape_2.add_child(transpose_2) + + final_reshape = nodes[2] + final_reshape.inputs = [transpose_2.outputs[0], nodes[2].inputs[1]] + final_reshape.parents = [] + transpose_2.add_child(final_reshape) + return [reshape_1, transpose_1, reshape_2, transpose_2, final_reshape] + + +class AddModelInputsOutputs(object): + ''' + Expose hidden states of recurrent layers as model inputs and outputs + ''' + def __call__(self, graph): # type: (Graph) -> Graph + input_names = [str(input_[0]) for input_ in graph.inputs] + output_names = [str(output_[0]) for output_ in graph.outputs] + for node in graph.nodes: + if str(node.op_type) == 'LSTM': + input_h = node.inputs[5] if len(node.inputs) > 5 else node.inputs[0] + '_h_input' + input_c = node.inputs[6] if len(node.inputs) > 6 else node.inputs[0] + '_c_input' + output_h = node.outputs[1] if len(node.outputs) > 1 else node.outputs[0] + '_h_output' + output_c = node.outputs[2] if len(node.outputs) > 2 else node.outputs[0] + '_c_output' + h = node.attrs["hidden_size"] + for input_ in [str(input_h), str(input_c)]: + if input_ not in input_names: + graph.inputs.append(tuple((input_, TensorProto.FLOAT, (h,)))) #type: ignore + if input_ not in graph.blob_to_op_type: + graph.blob_to_op_type[input_] = ['LSTM'] + for output_ in [str(output_h), str(output_c)]: + if output_ not in output_names: + graph.outputs.append(tuple((output_, TensorProto.FLOAT, (h,)))) #type: ignore + graph.blob_from_op_type[output_] = 'LSTM' + return graph + + +class ConstantsToInitializers(object): + ''' + Takes onnx 
Constant nodes and puts the tensor into graph initializers instead. + ''' + def __call__(self, graph): # type: (Graph) -> Graph + output_names = [str(output_[0]) for output_ in graph.outputs] + remaining_nodes = [] + for node in graph.nodes: + if node.op_type != 'Constant' or node.name in output_names: + remaining_nodes.append(node) + continue + for child in node.children: + child.input_tensors[node.outputs[0]] = node.attrs["value"] + + graph.nodes = remaining_nodes + return graph + + +class ImageScalerRemover(object): + ''' + Removes ImageScaler layer if connected to a model input and single parent child nodes + ''' + + def __call__(self, graph): # type: (Graph) -> Graph + input_names = [str(input_[0]) for input_ in graph.inputs] + nodes_to_be_removed = [] + for node in graph.nodes: + if (node.op_type != 'ImageScaler') or (len(node.parents) != 0) or (node.inputs[0] not in input_names): + continue + is_eligible = True + for child in node.children: + if not (len(child.parents) == 1 and child.inputs[0] == node.outputs[0]): + is_eligible = False + break + child.inputs[0] = node.inputs[0] + child.parents = [] + if not is_eligible: + continue + nodes_to_be_removed.append(node.name) + + transformed_nodes = [] + for node in graph.nodes: + if node.name not in nodes_to_be_removed: + transformed_nodes.append(node) + return Graph(transformed_nodes, graph.inputs, graph.outputs, graph.shape_dict) \ No newline at end of file diff --git a/insightface/tools/onnx2caffe/onnx2caffe/_weightloader.py b/insightface/tools/onnx2caffe/onnx2caffe/_weightloader.py new file mode 100644 index 0000000000000000000000000000000000000000..0e1129ee96ab8ca8deddee84cba901687b272eed --- /dev/null +++ b/insightface/tools/onnx2caffe/onnx2caffe/_weightloader.py @@ -0,0 +1,162 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +# from caffe import params as P +import numpy as np +from ._graph import 
Node, Graph + +USE_DECONV_AS_UPSAMPLE = False + +def _convert_conv(net, node, graph, err): + weight_name = node.inputs[1] + input_name = str(node.inputs[0]) + output_name = str(node.outputs[0]) + node_name = node.name + W = None + if weight_name in node.input_tensors: + W = node.input_tensors[weight_name] + else: + err.missing_initializer(node, + "Weight tensor: {} not found in the graph initializer".format(weight_name,)) + bias_flag = False + bias = None + if len(node.inputs) > 2: + bias = node.input_tensors[node.inputs[2]] + bias_flag = True + # net.params[node_name][0].data = W + # if bias_flag: + # net.params[node_name][1].data = bias + np.copyto(net.params[node_name][0].data,W,casting='same_kind') + if bias_flag: + np.copyto(net.params[node_name][1].data, bias, casting='same_kind') + +def _convert_relu(net, node, graph, err): + pass + +def _convert_sigmoid(net, node, graph, err): + pass + +def _convert_BatchNorm(net, node, graph, err): + scale = node.input_tensors[node.inputs[1]] + bias = node.input_tensors[node.inputs[2]] + mean = node.input_tensors[node.inputs[3]] + var = node.input_tensors[node.inputs[4]] + node_name = node.name + np.copyto(net.params[node_name + '_bn'][0].data, mean, casting='same_kind') + np.copyto(net.params[node_name + '_bn'][1].data, var, casting='same_kind') + net.params[node_name + '_bn'][2].data[...] 
= 1.0 + np.copyto(net.params[node_name][0].data, scale, casting='same_kind') + np.copyto(net.params[node_name][1].data, bias, casting='same_kind') + # net.params[node_name+'_bn'][1].data = var + # net.params[node_name][0].data = scale + # net.params[node_name][1].data = bias + +def _convert_Add(net, node, graph, err): + pass + +def _convert_Mul(net, node, graph, err): + pass + +def _convert_Reshape(net, node, graph, err): + pass + +def _convert_Flatten(net, node, graph, err): + pass + +def _convert_pool(net, node, graph, err): + pass + +def _convert_dropout(net, node, graph, err): + pass + +def _convert_gemm(net, node, graph, err): + node_name = node.name + weight_name = node.inputs[1] + if weight_name in node.input_tensors: + W = node.input_tensors[weight_name] + else: + err.missing_initializer(node, + "Weight tensor: {} not found in the graph initializer".format(weight_name, )) + if node.attrs["broadcast"] != 1 or node.attrs["transB"] != 1: + return err.unsupported_op_configuration(node, "Gemm is supported only for inner_product layer") + b = None + if len(node.inputs) > 2: + b = node.input_tensors[node.inputs[2]] + if len(W.shape) != 2 or (b is not None and len(b.shape) != 1): + return err.unsupported_op_configuration(node, "Gemm is supported only for inner_product layer") + if b is not None: + if W.shape[0] != b.shape[0]: + return err.unsupported_op_configuration(node, "Gemm is supported only for inner_product layer") + net.params[node_name][0].data[...] = W + net.params[node_name][1].data[...] 
= b + +def _convert_upsample(net, node, graph, err): + mode = node.attrs["mode"] + node_name = node.name + if mode == "nearest": + caffe_params = net.params[node_name][0].data + weights = np.ones(caffe_params.shape).astype("float32") + np.copyto(net.params[node_name][0].data, weights, casting='same_kind') + # net.params[node_name][0].data[] + +def _convert_resize(net, node, graph, err): + if USE_DECONV_AS_UPSAMPLE: + print('add resize deconv param') + node_name = node.name + caffe_params = net.params[node_name][0].data + weights = np.ones(caffe_params.shape).astype("float32") + np.copyto(net.params[node_name][0].data, weights, casting='same_kind') + +def _convert_transpose(net, node, graph, err): + pass +def _convert_concat(net, node, graph, err): + pass +def _convert_softmax(net, node, graph, err): + pass + +def _convert_conv_transpose(net, node, graph, err): + weight_name = node.inputs[1] + input_name = str(node.inputs[0]) + output_name = str(node.outputs[0]) + node_name = node.name + W = None + if weight_name in node.input_tensors: + W = node.input_tensors[weight_name] + else: + err.missing_initializer(node, + "Weight tensor: {} not found in the graph initializer".format(weight_name,)) + bias_flag = False + bias = None + if len(node.inputs) > 2: + bias = node.input_tensors[node.inputs[2]] + bias_flag = True + # net.params[node_name][0].data = W + # if bias_flag: + # net.params[node_name][1].data = bias + np.copyto(net.params[node_name][0].data,W,casting='same_kind') + if bias_flag: + np.copyto(net.params[node_name][1].data, bias, casting='same_kind') + +_ONNX_NODE_REGISTRY = { + "Conv": _convert_conv, + "Relu": _convert_relu, + "BatchNormalization": _convert_BatchNorm, + "Add": _convert_Add, + "Mul": _convert_Mul, + "Reshape": _convert_Reshape, + "MaxPool": _convert_pool, + "AveragePool": _convert_pool, + "Dropout": _convert_dropout, + "Gemm": _convert_gemm, + "Upsample": _convert_upsample, + "Concat": _convert_concat, + "ConvTranspose": _convert_conv_transpose, 
+ "Sigmoid": _convert_sigmoid, + "Flatten": _convert_Flatten, + "Resize": _convert_resize, + "Transpose": _convert_transpose, + "Softmax": _convert_softmax, +} + + diff --git a/insightface/web-demos/README.md b/insightface/web-demos/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0d93214206ce9a118574b7b9e7e0fee8574ddbcb --- /dev/null +++ b/insightface/web-demos/README.md @@ -0,0 +1,18 @@ +InsightFace Web Demos +--- + +## UPDATE + +``2023.04.01``: Please use our swapping demo via Discord bot, see detail [here](swapping_discord/). + + +## Swapping Demo on Discord + +Check the tutorial [here](swapping_discord/). + + +## [Link to Face Recognition Demo](http://demo.insightface.ai:7008/) + + [](http://demo.insightface.ai:7007/) + + diff --git a/insightface/web-demos/src_recognition/arcface_onnx.py b/insightface/web-demos/src_recognition/arcface_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..870e7d641d13150918316767cdc545170f00a5c8 --- /dev/null +++ b/insightface/web-demos/src_recognition/arcface_onnx.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +# @Organization : insightface.ai +# @Author : Jia Guo +# @Time : 2021-05-04 +# @Function : + +import numpy as np +import cv2 +import onnx +import onnxruntime +import face_align + +__all__ = [ + 'ArcFaceONNX', +] + + +class ArcFaceONNX: + def __init__(self, model_file=None, session=None): + assert model_file is not None + self.model_file = model_file + self.session = session + self.taskname = 'recognition' + find_sub = False + find_mul = False + model = onnx.load(self.model_file) + graph = model.graph + for nid, node in enumerate(graph.node[:8]): + #print(nid, node.name) + if node.name.startswith('Sub') or node.name.startswith('_minus'): + find_sub = True + if node.name.startswith('Mul') or node.name.startswith('_mul'): + find_mul = True + if find_sub and find_mul: + #mxnet arcface model + input_mean = 0.0 + input_std = 1.0 + else: + input_mean = 127.5 + input_std = 
127.5 + self.input_mean = input_mean + self.input_std = input_std + #print('input mean and std:', self.input_mean, self.input_std) + if self.session is None: + self.session = onnxruntime.InferenceSession(self.model_file, providers=['CUDAExecutionProvider']) + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + input_name = input_cfg.name + self.input_size = tuple(input_shape[2:4][::-1]) + self.input_shape = input_shape + outputs = self.session.get_outputs() + output_names = [] + for out in outputs: + output_names.append(out.name) + self.input_name = input_name + self.output_names = output_names + assert len(self.output_names)==1 + self.output_shape = outputs[0].shape + + def prepare(self, ctx_id, **kwargs): + if ctx_id<0: + self.session.set_providers(['CPUExecutionProvider']) + + def get(self, img, kps): + aimg = face_align.norm_crop(img, landmark=kps, image_size=self.input_size[0]) + embedding = self.get_feat(aimg).flatten() + return embedding + + def compute_sim(self, feat1, feat2): + from numpy.linalg import norm + feat1 = feat1.ravel() + feat2 = feat2.ravel() + sim = np.dot(feat1, feat2) / (norm(feat1) * norm(feat2)) + return sim + + def get_feat(self, imgs): + if not isinstance(imgs, list): + imgs = [imgs] + input_size = self.input_size + + blob = cv2.dnn.blobFromImages(imgs, 1.0 / self.input_std, input_size, + (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] + return net_out + + def forward(self, batch_data): + blob = (batch_data - self.input_mean) / self.input_std + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] + return net_out + + diff --git a/insightface/web-demos/src_recognition/face_align.py b/insightface/web-demos/src_recognition/face_align.py new file mode 100644 index 0000000000000000000000000000000000000000..c4e9a7c90042c670411f414fd9724ad7e71d3709 --- /dev/null +++ 
b/insightface/web-demos/src_recognition/face_align.py @@ -0,0 +1,141 @@ +import cv2 +import numpy as np +from skimage import transform as trans + +src1 = np.array([[51.642, 50.115], [57.617, 49.990], [35.740, 69.007], + [51.157, 89.050], [57.025, 89.702]], + dtype=np.float32) +#<--left +src2 = np.array([[45.031, 50.118], [65.568, 50.872], [39.677, 68.111], + [45.177, 86.190], [64.246, 86.758]], + dtype=np.float32) + +#---frontal +src3 = np.array([[39.730, 51.138], [72.270, 51.138], [56.000, 68.493], + [42.463, 87.010], [69.537, 87.010]], + dtype=np.float32) + +#-->right +src4 = np.array([[46.845, 50.872], [67.382, 50.118], [72.737, 68.111], + [48.167, 86.758], [67.236, 86.190]], + dtype=np.float32) + +#-->right profile +src5 = np.array([[54.796, 49.990], [60.771, 50.115], [76.673, 69.007], + [55.388, 89.702], [61.257, 89.050]], + dtype=np.float32) + +src = np.array([src1, src2, src3, src4, src5]) +src_map = {112: src, 224: src * 2} + +arcface_src = np.array( + [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366], + [41.5493, 92.3655], [70.7299, 92.2041]], + dtype=np.float32) + +arcface_src = np.expand_dims(arcface_src, axis=0) + +# In[66]: + + +# lmk is prediction; src is template +def estimate_norm(lmk, image_size=112, mode='arcface'): + assert lmk.shape == (5, 2) + tform = trans.SimilarityTransform() + lmk_tran = np.insert(lmk, 2, values=np.ones(5), axis=1) + min_M = [] + min_index = [] + min_error = float('inf') + if mode == 'arcface': + if image_size == 112: + src = arcface_src + else: + src = float(image_size) / 112 * arcface_src + else: + src = src_map[image_size] + for i in np.arange(src.shape[0]): + tform.estimate(lmk, src[i]) + M = tform.params[0:2, :] + results = np.dot(M, lmk_tran.T) + results = results.T + error = np.sum(np.sqrt(np.sum((results - src[i])**2, axis=1))) + # print(error) + if error < min_error: + min_error = error + min_M = M + min_index = i + return min_M, min_index + + +def norm_crop(img, landmark, image_size=112, mode='arcface'): 
+ M, pose_index = estimate_norm(landmark, image_size, mode) + warped = cv2.warpAffine(img, M, (image_size, image_size), borderValue=0.0) + return warped + +def square_crop(im, S): + if im.shape[0] > im.shape[1]: + height = S + width = int(float(im.shape[1]) / im.shape[0] * S) + scale = float(S) / im.shape[0] + else: + width = S + height = int(float(im.shape[0]) / im.shape[1] * S) + scale = float(S) / im.shape[1] + resized_im = cv2.resize(im, (width, height)) + det_im = np.zeros((S, S, 3), dtype=np.uint8) + det_im[:resized_im.shape[0], :resized_im.shape[1], :] = resized_im + return det_im, scale + + +def transform(data, center, output_size, scale, rotation): + scale_ratio = scale + rot = float(rotation) * np.pi / 180.0 + #translation = (output_size/2-center[0]*scale_ratio, output_size/2-center[1]*scale_ratio) + t1 = trans.SimilarityTransform(scale=scale_ratio) + cx = center[0] * scale_ratio + cy = center[1] * scale_ratio + t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy)) + t3 = trans.SimilarityTransform(rotation=rot) + t4 = trans.SimilarityTransform(translation=(output_size / 2, + output_size / 2)) + t = t1 + t2 + t3 + t4 + M = t.params[0:2] + cropped = cv2.warpAffine(data, + M, (output_size, output_size), + borderValue=0.0) + return cropped, M + + +def trans_points2d(pts, M): + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i] = new_pt[0:2] + + return new_pts + + +def trans_points3d(pts, M): + scale = np.sqrt(M[0][0] * M[0][0] + M[0][1] * M[0][1]) + #print(scale) + new_pts = np.zeros(shape=pts.shape, dtype=np.float32) + for i in range(pts.shape[0]): + pt = pts[i] + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32) + new_pt = np.dot(M, new_pt) + #print('new_pt', new_pt.shape, new_pt) + new_pts[i][0:2] = new_pt[0:2] + new_pts[i][2] = pts[i][2] * scale + + 
return new_pts + + +def trans_points(pts, M): + if pts.shape[1] == 2: + return trans_points2d(pts, M) + else: + return trans_points3d(pts, M) + diff --git a/insightface/web-demos/src_recognition/main.py b/insightface/web-demos/src_recognition/main.py new file mode 100644 index 0000000000000000000000000000000000000000..818843373c98ad31966c70d824d4cafc9af35646 --- /dev/null +++ b/insightface/web-demos/src_recognition/main.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +import os +import os.path as osp +import argparse +import cv2 +import numpy as np +import onnxruntime +from scrfd import SCRFD +from arcface_onnx import ArcFaceONNX + +onnxruntime.set_default_logger_severity(3) + +assets_dir = osp.expanduser('~/.insightface/models/buffalo_l') + +detector = SCRFD(os.path.join(assets_dir, 'det_10g.onnx')) +detector.prepare(0) +model_path = os.path.join(assets_dir, 'w600k_r50.onnx') +rec = ArcFaceONNX(model_path) +rec.prepare(0) + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument('img1', type=str) + parser.add_argument('img2', type=str) + return parser.parse_args() + + +def func(args): + image1 = cv2.imread(args.img1) + image2 = cv2.imread(args.img2) + bboxes1, kpss1 = detector.autodetect(image1, max_num=1) + if bboxes1.shape[0]==0: + return -1.0, "Face not found in Image-1" + bboxes2, kpss2 = detector.autodetect(image2, max_num=1) + if bboxes2.shape[0]==0: + return -1.0, "Face not found in Image-2" + kps1 = kpss1[0] + kps2 = kpss2[0] + feat1 = rec.get(image1, kps1) + feat2 = rec.get(image2, kps2) + sim = rec.compute_sim(feat1, feat2) + if sim<0.2: + conclu = 'They are NOT the same person' + elif sim>=0.2 and sim<0.28: + conclu = 'They are LIKELY TO be the same person' + else: + conclu = 'They ARE the same person' + return sim, conclu + + + +if __name__ == '__main__': + args = parse_args() + output = func(args) + print('sim: %.4f, message: %s'%(output[0], output[1])) + diff --git 
a/insightface/web-demos/src_recognition/scrfd.py b/insightface/web-demos/src_recognition/scrfd.py new file mode 100644 index 0000000000000000000000000000000000000000..cc04996d296824b73dbc3611b6f261d64e48adb4 --- /dev/null +++ b/insightface/web-demos/src_recognition/scrfd.py @@ -0,0 +1,329 @@ + +from __future__ import division +import datetime +import numpy as np +#import onnx +import onnxruntime +import os +import os.path as osp +import cv2 +import sys + +def softmax(z): + assert len(z.shape) == 2 + s = np.max(z, axis=1) + s = s[:, np.newaxis] # necessary step to do broadcasting + e_x = np.exp(z - s) + div = np.sum(e_x, axis=1) + div = div[:, np.newaxis] # dito + return e_x / div + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. + """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return np.stack([x1, y1, x2, y2], axis=-1) + +def distance2kps(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. 
+ """ + preds = [] + for i in range(0, distance.shape[1], 2): + px = points[:, i%2] + distance[:, i] + py = points[:, i%2+1] + distance[:, i+1] + if max_shape is not None: + px = px.clamp(min=0, max=max_shape[1]) + py = py.clamp(min=0, max=max_shape[0]) + preds.append(px) + preds.append(py) + return np.stack(preds, axis=-1) + +class SCRFD: + def __init__(self, model_file=None, session=None): + import onnxruntime + self.model_file = model_file + self.session = session + self.taskname = 'detection' + self.batched = False + if self.session is None: + assert self.model_file is not None + assert osp.exists(self.model_file) + self.session = onnxruntime.InferenceSession(self.model_file, providers=['CUDAExecutionProvider']) + self.center_cache = {} + self.nms_thresh = 0.4 + self.det_thresh = 0.5 + self._init_vars() + + def _init_vars(self): + input_cfg = self.session.get_inputs()[0] + input_shape = input_cfg.shape + #print(input_shape) + if isinstance(input_shape[2], str): + self.input_size = None + else: + self.input_size = tuple(input_shape[2:4][::-1]) + #print('image_size:', self.image_size) + input_name = input_cfg.name + self.input_shape = input_shape + outputs = self.session.get_outputs() + if len(outputs[0].shape) == 3: + self.batched = True + output_names = [] + for o in outputs: + output_names.append(o.name) + self.input_name = input_name + self.output_names = output_names + self.input_mean = 127.5 + self.input_std = 128.0 + #print(self.output_names) + #assert len(outputs)==10 or len(outputs)==15 + self.use_kps = False + self._anchor_ratio = 1.0 + self._num_anchors = 1 + if len(outputs)==6: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + elif len(outputs)==9: + self.fmc = 3 + self._feat_stride_fpn = [8, 16, 32] + self._num_anchors = 2 + self.use_kps = True + elif len(outputs)==10: + self.fmc = 5 + self._feat_stride_fpn = [8, 16, 32, 64, 128] + self._num_anchors = 1 + elif len(outputs)==15: + self.fmc = 5 + self._feat_stride_fpn = [8, 
16, 32, 64, 128] + self._num_anchors = 1 + self.use_kps = True + + def prepare(self, ctx_id, **kwargs): + if ctx_id<0: + self.session.set_providers(['CPUExecutionProvider']) + nms_thresh = kwargs.get('nms_thresh', None) + if nms_thresh is not None: + self.nms_thresh = nms_thresh + det_thresh = kwargs.get('det_thresh', None) + if det_thresh is not None: + self.det_thresh = det_thresh + input_size = kwargs.get('input_size', None) + if input_size is not None: + if self.input_size is not None: + print('warning: det_size is already set in scrfd model, ignore') + else: + self.input_size = input_size + + def forward(self, img, threshold): + scores_list = [] + bboxes_list = [] + kpss_list = [] + input_size = tuple(img.shape[0:2][::-1]) + blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_outs = self.session.run(self.output_names, {self.input_name : blob}) + + input_height = blob.shape[2] + input_width = blob.shape[3] + fmc = self.fmc + for idx, stride in enumerate(self._feat_stride_fpn): + # If model support batch dim, take first output + if self.batched: + scores = net_outs[idx][0] + bbox_preds = net_outs[idx + fmc][0] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2][0] * stride + # If model doesn't support batching take output as is + else: + scores = net_outs[idx] + bbox_preds = net_outs[idx + fmc] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2] * stride + + height = input_height // stride + width = input_width // stride + K = height * width + key = (height, width, stride) + if key in self.center_cache: + anchor_centers = self.center_cache[key] + else: + #solution-1, c style: + #anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 ) + #for i in range(height): + # anchor_centers[i, :, 1] = i + #for i in range(width): + # anchor_centers[:, i, 0] = i + + #solution-2: + #ax = np.arange(width, 
dtype=np.float32) + #ay = np.arange(height, dtype=np.float32) + #xv, yv = np.meshgrid(np.arange(width), np.arange(height)) + #anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32) + + #solution-3: + anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) + #print(anchor_centers.shape) + + anchor_centers = (anchor_centers * stride).reshape( (-1, 2) ) + if self._num_anchors>1: + anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) ) + if len(self.center_cache)<100: + self.center_cache[key] = anchor_centers + + pos_inds = np.where(scores>=threshold)[0] + bboxes = distance2bbox(anchor_centers, bbox_preds) + pos_scores = scores[pos_inds] + pos_bboxes = bboxes[pos_inds] + scores_list.append(pos_scores) + bboxes_list.append(pos_bboxes) + if self.use_kps: + kpss = distance2kps(anchor_centers, kps_preds) + #kpss = kps_preds + kpss = kpss.reshape( (kpss.shape[0], -1, 2) ) + pos_kpss = kpss[pos_inds] + kpss_list.append(pos_kpss) + return scores_list, bboxes_list, kpss_list + + def detect(self, img, input_size = None, thresh=None, max_num=0, metric='default'): + assert input_size is not None or self.input_size is not None + input_size = self.input_size if input_size is None else input_size + + im_ratio = float(img.shape[0]) / img.shape[1] + model_ratio = float(input_size[1]) / input_size[0] + if im_ratio>model_ratio: + new_height = input_size[1] + new_width = int(new_height / im_ratio) + else: + new_width = input_size[0] + new_height = int(new_width * im_ratio) + det_scale = float(new_height) / img.shape[0] + resized_img = cv2.resize(img, (new_width, new_height)) + det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 ) + det_img[:new_height, :new_width, :] = resized_img + det_thresh = thresh if thresh is not None else self.det_thresh + + scores_list, bboxes_list, kpss_list = self.forward(det_img, det_thresh) + + scores = np.vstack(scores_list) + scores_ravel = scores.ravel() + order = 
scores_ravel.argsort()[::-1] + bboxes = np.vstack(bboxes_list) / det_scale + if self.use_kps: + kpss = np.vstack(kpss_list) / det_scale + pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False) + pre_det = pre_det[order, :] + keep = self.nms(pre_det) + det = pre_det[keep, :] + if self.use_kps: + kpss = kpss[order,:,:] + kpss = kpss[keep,:,:] + else: + kpss = None + if max_num > 0 and det.shape[0] > max_num: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img.shape[0] // 2, img.shape[1] // 2 + offsets = np.vstack([ + (det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0] + ]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + if metric=='max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + bindex = np.argsort( + values)[::-1] # some extra weight on the centering + bindex = bindex[0:max_num] + det = det[bindex, :] + if kpss is not None: + kpss = kpss[bindex, :] + return det, kpss + + def autodetect(self, img, max_num=0, metric='max'): + bboxes, kpss = self.detect(img, input_size=(640, 640), thresh=0.5) + bboxes2, kpss2 = self.detect(img, input_size=(128, 128), thresh=0.5) + bboxes_all = np.concatenate([bboxes, bboxes2], axis=0) + kpss_all = np.concatenate([kpss, kpss2], axis=0) + keep = self.nms(bboxes_all) + det = bboxes_all[keep,:] + kpss = kpss_all[keep,:] + if max_num > 0 and det.shape[0] > max_num: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img.shape[0] // 2, img.shape[1] // 2 + offsets = np.vstack([ + (det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0] + ]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + if metric=='max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + bindex = np.argsort( + values)[::-1] # some extra weight on the centering + bindex = bindex[0:max_num] + det = 
det[bindex, :] + if kpss is not None: + kpss = kpss[bindex, :] + return det, kpss + + def nms(self, dets): + thresh = self.nms_thresh + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + diff --git a/insightface/web-demos/swapping_discord/README.md b/insightface/web-demos/swapping_discord/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0978f621e29029c61a5438be4bd1bc506fc4e4d1 --- /dev/null +++ b/insightface/web-demos/swapping_discord/README.md @@ -0,0 +1,229 @@ +# Using Midjourney and the Picsi.AI by InsightFaceSwap Bot to create a personalized portrait + +We have named this highly realistic portrait creation tool as ``Picsi.AI``. You can use it for free, or head over to [Patreon](https://www.patreon.com/picsi) to subscribe and access more features and higher usage limits. + +## Important Links + +1. Discord bot invitation link: https://discord.com/api/oauth2/authorize?client_id=1090660574196674713&permissions=274877945856&scope=bot +2. Discord discussion server(to get help): https://discord.gg/Ym3X8U59ZN +3. 
Patreon subscription: https://www.patreon.com/picsi + +## ChangeLog + +**`2023-09-09`**: **Introducing HiFidelity Mode (BETA) and Sharpen Options** +1) Use the ``-f`` command to enable HiFidelity mode: ``/setid example -f`` or ``/swapid example -f`` +2) Add the ``-s`` command for an extra layer of sharpness: ``/setid joey -f -s`` or ``/swapid joey -f -s`` +3) These new features can be combined with existing functionalities like Oldify for even more jaw-dropping results! For instance: ``/swapid pamela -o -f -s`` +4) Sample outputs(zoom in to view the details): +
+ +
+5) For examples in detail, please jump to https://www.patreon.com/posts/89036144. + +**`2023-08-27`** +1) **Enhanced GIF Quality**: Improved resolution, reduced noise, and enhanced sharpness for GIF outputs. This aims to provide a clearer and better visual experience. +2) **Increased GIF Size Limit**: The maximum allowable GIF file size has been increased from 7MB to 10MB, allowing for more detailed and creative GIFs. +3) **Extended Frame Limit for Pro Members**: Pro members can now utilize up to 75 frames for GIFs at a flat rate of 30 credits. This expands the possibilities for more complex and intricate GIFs. +4) **GIF Support for Basic Members**: Basic members now have access to GIF support, limited to 20 frames at a cost of 20 credits. +5) **URL Support for GIF**: Added the ability to directly work on GIFs using URL links, eliminating the need to download and re-upload GIF files. Provides an easier and faster way to create funny GIFs. +6) For examples in detail, please jump to https://www.patreon.com/posts/88351201. + + + +**`2023-08-25`** + Time Travel Has Never Been So Easy! Introducing Oldifying Faces. + 1) Use a saved face and transfer it into your target image, then apply the oldifying effect. For instance: + + ``/swapid johndoe --oldify 300`` + + This will take the saved face named johndoe, and then oldify it with an intensity of 300. + + Note that we can use ``-o`` as a shorthand for the ``--oldify`` argument. + 3) You can directly oldify a face in the attached picture without transfer it with one of your saved faces: + + ``/swapid _ --oldify 200`` + + 4) Use the --oldify option to set the transformation intensity, ranging from 1 to 1000. The default intensity is 300 if none is specified. + + ``/swapid _ --oldify`` + + 5) Special reminder: Due to the additional arguments parsing, please make sure that the input for idname does not contain any spaces. For example, ``/setid A,B`` is allowed, but ``/setid A, B`` is incorrect. +
+ +
+ +**`2023-08-02`** + We have deployed a new model and optimized three aspects: + 1) The new model performs better in handling skin shading under complex lighting conditions, reducing the likelihood of generating black or white erroneous pixels on the skin. + 2) We have optimized the handling of glasses in the Saved/Source photo. When the Source photo contains glasses, we will generate the image based on the version without glasses to avoid any ghosting effects caused by glasses in the resulting photo. For target images that originally have glasses or sunglasses, this process will not affect the final results. + 3) We have optimized the handling of bangs/fringe. When the source photo has thick bangs/fringe, we will try to minimize the impact on the generated result. +
+ +
+ +**`2023-06-02`** + 1) The length limit for idname has been increased to 10, and the maximum number of idnames that can be saved has been increased to 20. + 2) Remove the 'greedy' prefer option, now '--nogreedy' and '--greedy' produce the same result. + 3) The feature of ID mixing has been added. You can use the symbol "+" to link multiple idnames (up to 3) to generate interesting results. For example, ``/setid father+mother`` might generate an image similar to their son, and ``/setid mother+father`` might generate a photo like their daughter (that is, the order of the "+" link will affect the result). You can also use ``/setid mother+mother+father`` to enhance the features of the mother ID. There's an example [here](https://raw.githubusercontent.com/nttstar/insightface-resources/master/images/240_14.jpeg) + +**`2023-05-17`** + 1) The maximum command usage per image is set to 2, meaning that even if there are 4 faces in a single image, it will only consume 2 commands. + 2) Now we use a queue in our backend. When there are too many users online, the requests will be queued and processed one by one which may slow to respond. + 3) The support for GIFs has been temporarily removed, in order to ensure fast response time. + +**`2023-05-13`** + Now we support the **greedy** mode as the default option, which can provide higher identity similarity. You can use the ``/setid --nogreedy``(put ``--nogreedy`` in the ``idname`` field) command to disable it (and use ``/setid --greedy`` to enable again). In addition, the ``/listid`` command can be used to view the current ID name and prefer options. For more information, please refer to the instruction of the ``/setid`` command on this page. + +**`2023-05-08`** + 1) The maximum pixel output has now been changed to 2048, previously it was 1920. + 2) The number of command statistics have been changed from the number of images to the number of faces (i.e. if there are 2 faces in one image, it will consume 2 commands). 
+ +**`2023-04-27`** + 1) Now we support swapping on GIFs. The usage is the same as static images. A few extra key points: 1) Uploaded gifs cannot exceed 5MB in size; 2) Performing one gif face swap will consume 5 command opportunities (i.e. a maximum of 10 gifs can be operated per day); 3) Up to the first 15 frames can be operated; 4) Supports single-person swapping only in GIFs; 5) The frames may be dynamically resized to a lower resolution. + 2) Add FAQ. + +**`2023-04-18`** + Now we support Discord application commands (AKA slash commands), please remember to join our [Discord group](https://discord.gg/Ym3X8U59ZN) to get notifications. + +## Disclaimer + +By using this service, you acknowledge that you have read, understood, and agreed to the terms and conditions outlined in this disclaimer. + +We would like to emphasize that our service is intended for research and legal AI creation purposes only. We do not condone or promote the use of our service for any illegal or unethical activities. We strictly prohibit the use of our service to process the facial features of individuals without their express permission or consent. Additionally, we do not allow the usage of features of political figures, public officials, or any other public figures without their permission. + +We also do not assume any responsibility or liability for the consequences that may arise from the use of our service. Our service is provided on an "as is" basis, and we do not guarantee the accuracy, completeness, or reliability of the results obtained through the use of our service. + +By using our service, you agree to indemnify and hold us harmless from any claim or demand, including reasonable attorneys' fees, made by any third party due to or arising out of your use of the service, your violation of these terms and conditions, or your violation of any rights of another. 
+ +In summary, we strictly prohibit the use of our service for any illegal or unethical activities and we are not responsible for any consequences that may arise from the use of our service. If you agree to these terms and conditions, please proceed to use our service. + +## License + +Our service does not claim any intellectual property rights over the original images or the transformed AI-generated images. Any use of these AI-generated images should respect the copyrights and trademarks of the original images and should not infringe upon the rights of the original copyright owners. + +As long as the images do not infringe on any copyrights, paid users can use the generated images for commercial purposes. Free members can not. It is crucial to indicate that these images were altered and generated by Picsi.Ai - Powered by InsightFace, in a visible and accessible manner, to ensure compliance with our licensing terms, legal obligations, and ethical considerations. If a digital picture, this must also be included in the meta and exif data of the photo. + +## Introduction + +For over 99% of people, using Midjourney to create your own portraits is not feasible unless you're a famous celebrity with thousands or millions of photos online. But now, with the InsightFaceSwap Discord bot, you can accomplish this task easily with just a few steps. + +
+ +
+ +## Discord Slash Commands + +InsightFaceSwap bot can help you with the following commands: + +### /saveid ``name`` ``upload-ID-image`` + +Used to upload and register your own ID photo or numpy feature for subsequent facial replacement and editing. You can upload up to 10 instances permanently and use them without having to upload them repeatedly. + +(Front-view, high quality, no glasses, no heavy bangs ID photos are preferred.) + +### /setid ``name/prefer`` + +This command can be used to do two things. + +1) Set default identity name(s), for image generation using context menu. If you need to set multiple ID names, please use commas to separate them. +2) Set prefer options, e.g. use ``/setid --greedy`` to enable greedy mode and ``/setid --nogreedy`` to disable. (The prefer options are placed in the ``idname`` field of ``/setid`` command, don't worry about it) + +Note that you cannot set current id names and prefer options in one ``/setid`` command simultaneously, but call them separately. + +### /listid + +List all registered identity names, default identity names and prefer options. + +### /delid ``name`` + +Delete a specific identity name. + +### /delall + +Delete all registered names. + +### /swapid ``name(s)`` ``upload-image`` + +Replace the face with the registered identity name(s) on target image. + +## Discord Context Menu + +### Apps/INSwapper + +Replace the face with the current/default identity name(s) on target image. Current/default identity name(s) can be set via ``/saveid`` and ``/setid`` slash commands. + + + + +## Step-by-step guide: + +1. Refer to [this link](https://docs.midjourney.com/docs/invite-the-bot) to register Discord app, create a new chat room, and invite the Midjourney bot to the chat room. +2. Invite the InsightFaceSwap bot to the chat room by this link: .
+ +
+3. Use ``/saveid`` command to register your identity name and feature. Here 'mnls' is the registered name, which can be any alphabets or numbers up to 8 characters long. If everything goes well, the bot will tell you that the save was successful. Note that the newly created identity will be automatically set as the default identity. +
+ +
+4. Next, we can experiment with creating the portrait. Let's start chanting the Midjourney prompt and enlarge one of the outputs. +
+ +
+5. After the enlargement is complete, we can simply use the ``INSwapper`` context menu to generate our portrait. Right click on the target image and then select ``Apps-INSwapper`` menu. Note that we can also use ``/setid`` command to change the default identity name. +
+ +
+6. Generally, the task is completed in less than a second and we can see the result. +
+ +
+7. In addition to processing photos generated by Midjourney, we can also process locally uploaded photos by using ``/swapid`` command explicitly. +
+ +
+8. Hit to complete! +
+ +
+9. Note that the ``INSwapper`` context menu can also work on user-uploaded images in your Discord channel. + + +## FAQ + +Q: Why "application did not respond"? + +A: This error indicates that the server was overloaded at the time. Please try again. + +Q: Why is the service sometimes slow to respond? + +A: We use a queue in our backend. When there are too many users online, the requests will be queued and processed one by one. + +Q: Can I list my registered ID list? + +A: Yes, use ``/listid`` command. + +Q: Are there any restrictions on ID names? + +A: All ID names can only be alphabets and numbers, and cannot exceed 10 characters. The total number of registered IDs cannot exceed 20. + +Q: Can I delete my registered IDs? + +A: You can use ``/delid`` and ``/delall`` commands to delete registered IDs. + +Q: Support multi-facial replacement? + +A: Yes, you can input a comma-separated idname list, such as ``/setid me,you,him,her``. You can also use the ``_`` symbol to indicate no-replacement (e.g. ``/setid me,_,him``). + +Q: How to get good results? + +A: 1) Select front-view, high quality, no glasses, no heavy bangs ID photos; 2) Try greedy mode if you need higher identity similarity; 3) For the target image, please ensure that the facial features are proportionate to those of real humans, otherwise it may cause overflow effects. + +## Other notes: + +1. Front-view, high quality, no glasses, no heavy bangs ID photos are preferred. +2. Each Discord account can execute 50 commands per day. +3. This is in an early development stage, so we cannot guarantee that the result will be great in every case. +4. 
If there's any problem, please join our Discord group: [link](https://discord.gg/Ym3X8U59ZN) + + diff --git a/insightface/web-demos/swapping_discord/privacy.md b/insightface/web-demos/swapping_discord/privacy.md new file mode 100644 index 0000000000000000000000000000000000000000..58f1520f930c78792cf50b9af7c76cf7687a90ce --- /dev/null +++ b/insightface/web-demos/swapping_discord/privacy.md @@ -0,0 +1,24 @@ +## Privacy Policy +This Privacy Policy describes how your personal information is collected and used in our Discord bot application. + +## Information We Collect +When you visit our bot, we collect certain information about your USER-ID and your manually uploaded identity features, the identity images will not be saved. +We collect the information using the API of Discord application. + +## How We Use Your Information +We use the information that we collect to help process the image generation. + +## Sharing Your Information +We will not share your information with any external organizations, individuals, or companies. + +## Behavioural Advertising +We will not do any advertisements or marketing communications by using your information. + +## Your Rights +You can list your saved information and delete your information in any time. + +## Changes +We may update this privacy policy from time to time in order to reflect, for example, changes to our practices or for other operational, legal or regulatory reasons. + +## Contact Us +For more information about our privacy practices, if you have questions, or if you would like to make a complaint, please email us at ``contact#insightface.ai``. diff --git a/requirements.txt b/requirements.txt index 5d4b35df8065e1812e5781a3c7b4891fe8b9409a..9ff68fc99be7b23b0b35abdcfb356167ba3f7eb9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,7 @@ gradio controlnet_aux gdown peft -mediapipe \ No newline at end of file +mediapipe +Cython>=0.29.28 +cmake>=3.22.3 +numpy>=1.22.3 \ No newline at end of file